import re

import requests
from bs4 import BeautifulSoup


class TagGetter:
    """Extracts article tags from supported French news sites.

    Each supported domain is mapped to a dedicated scraper method; call
    get_tags_from_url() with a full article URL to get a comma-separated
    tag string.
    """

    def __init__(self):
        # Registry of per-site scrapers, keyed by the domain part of the URL.
        self.get_tags = {
            'www.usinenouvelle.com': self.get_tags_Usine_Nouvelle,
            'www.blast-info.fr': self.get_tags_Blast,
            'www.courrierinternational.com': self.get_tags_CourrierInter,
            'www.liberation.fr': self.get_tags_Liberation,
            'www.francetvinfo.fr': self.get_tags_FranceInfo,
            'www.mediapart.fr': self.get_tags_Mediapart,
            'www.monde-diplomatique.fr': self.get_tags_Diplo,
            'lesjours.fr': self.get_tags_LesJours,
            'www.streetpress.com': self.get_tags_StreetPress,
            'nvo.fr': self.get_tags_nvo,
            'korii.slate.fr': self.get_tags_korii,
            'usbeketrica.com': self.get_tags_UsbekRica,
            'www.la-croix.com': self.get_tags_LaCroix,
            'www.lefigaro.fr': self.get_tags_Figaro,
            'www.numerama.com': self.get_tags_Numerama,
            'www.huffingtonpost.fr': self.get_tags_LeHuffPost,
            'www.humanite.fr': self.get_tags_Huma,
            'basta.media': self.get_tags_Basta,
        }

    def get_tags_from_url(self, url):
        """Dispatch to the scraper registered for the URL's domain."""
        domain = url.split('/')[2]
        if domain in self.get_tags:
            try:
                tags = self.get_tags[domain](url)
            except Exception:
                tags = "Can't find site tags for this entry"
            return tags
        else:
            return "Not yet implemented"
            # raise ValueError(f"No subject-getter found for domain: {domain}")

    def get_tags_Usine_Nouvelle(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Tags are exposed in the DC.Subject meta tag.
        meta_tag = soup.find('meta', {'name': 'DC.Subject', 'lang': 'fr'})
        return meta_tag['content']

    def get_tags_Blast(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        story_content_div = soup.find('div', class_='story-content')
        ul = story_content_div.find('ul')
        tag_list = []
        for a in ul.find_all('a'):
            tag_list.append(a.text)
        # Strip the bullet character Blast prepends to each tag.
        tag_string = ', '.join(tag_list).replace("● ", "")
        return tag_string

    def get_tags_CourrierInter(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='tags-list')
        tag_list = []
        for a in tags_div.find_all('a', class_='item'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Liberation(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_FranceInfo(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tag_list = []
        ul = soup.find('ul', class_='related-tags__tags')
        for s in ul.find_all('span'):
            tag_list.append(s.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Mediapart(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'news_keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_Diplo(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='tags')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_LesJours(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'news_keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_StreetPress(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('ul', class_='pager')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_nvo(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='barre_tags')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_korii(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Tags are embedded in the "config" attribute of the ad-init script.
        script_tag = soup.find('script', {'id': 'optidigital-ad-init'})
        config_str = script_tag['config']
        tags_str = re.search(r'"tags": \[(.*?)\]', config_str).group(1)
        list_of_strings = tags_str.split(',')
        list_of_words = [s.strip('"').capitalize() for s in list_of_strings]
        tag_string = ', '.join(list_of_words)
        return tag_string

    def get_tags_UsbekRica(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tag_div = soup.find('div', {'id': 'tags'})
        tag_list = []
        for a in tag_div.find_all('a'):
            # Tags are rendered as "#Tag"; drop the leading hash.
            tag_list.append(a.text.strip().lstrip('#'))
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_LaCroix(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'news_keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_Figaro(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tag_div = soup.find('ul', class_='fig-tag__list')
        tag_list = []
        for a in tag_div.find_all('a'):
            tag_list.append(a.text.strip().lstrip('#'))
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Numerama(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'parsely-tags'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_LeHuffPost(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'property': 'article:tag'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_Huma(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='field-name-field-news-etiquettes')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Basta(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('ul', class_='c-tag')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string
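

# Minimal usage sketch (assumption: the module is run as a script; the URL
# below is a placeholder, not a real article, so substitute an article from
# one of the supported domains to get actual tags back).
if __name__ == '__main__':
    getter = TagGetter()
    example_url = 'https://www.liberation.fr/politique/example-article/'  # placeholder URL
    print(getter.get_tags_from_url(example_url))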