# pyrss/tag_dict.py
import requests
import re
from bs4 import BeautifulSoup
class TagGetter:
    """Extract article tags from supported news sites.

    Maps an article URL's domain to a site-specific scraper method; every
    scraper fetches the page and returns its tags as a single
    comma-separated string.
    """

    # Seconds before an HTTP fetch is abandoned, so one unresponsive site
    # cannot hang the whole feed run (the original had no timeout at all).
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Domain -> bound scraper method.  Keys are the netloc part of the
        # article URL, exactly as produced by get_tags_from_url().
        self.get_tags = {
            'www.usinenouvelle.com': self.get_tags_Usine_Nouvelle,
            'www.blast-info.fr': self.get_tags_Blast,
            'www.courrierinternational.com': self.get_tags_CourrierInter,
            'www.liberation.fr': self.get_tags_Liberation,
            'www.francetvinfo.fr': self.get_tags_FranceInfo,
            'www.mediapart.fr': self.get_tags_Mediapart,
            'www.monde-diplomatique.fr': self.get_tags_Diplo,
            'lesjours.fr': self.get_tags_LesJours,
            'www.streetpress.com': self.get_tags_StreetPress,
            'nvo.fr': self.get_tags_nvo,
            'korii.slate.fr': self.get_tags_korii,
            'usbeketrica.com': self.get_tags_UsbekRica,
            'www.la-croix.com': self.get_tags_LaCroix,
            'www.lefigaro.fr': self.get_tags_Figaro,
            'www.numerama.com': self.get_tags_Numerama,
            'www.huffingtonpost.fr': self.get_tags_LeHuffPost,
            'www.humanite.fr': self.get_tags_Huma,
            'basta.media': self.get_tags_Basta,
        }

    def get_tags_from_url(self, url):
        """Return the tag string for *url*, or a human-readable fallback.

        Unknown domains yield "Not yet implemented"; any scraper failure
        (network error, page-layout change, missing element, ...) yields an
        error message instead of raising, preserving the original
        best-effort contract.
        """
        domain = url.split('/')[2]
        if domain not in self.get_tags:
            return "Not yet implemented"
        try:
            return self.get_tags[domain](url)
        except Exception:  # narrowed from a bare except (no longer traps SystemExit)
            return "Can't find site tags for this entry"

    # ------------------------------------------------------------------
    # Shared scraping helpers
    # ------------------------------------------------------------------

    def _soup(self, url):
        """Fetch *url* and return the parsed BeautifulSoup document."""
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.content, 'html.parser')

    def _tags_from_meta(self, url, attrs):
        """Read a comma-separated keyword list from a <meta> tag matching
        *attrs* and return it with each keyword capitalized."""
        keywords = self._soup(url).find('meta', attrs).get('content')
        return ', '.join(word.capitalize() for word in keywords.split(','))

    @staticmethod
    def _join_anchor_texts(container, strip_hash=False):
        """Join the text of every <a> inside *container*; with *strip_hash*
        also drop surrounding whitespace and a leading '#' (hashtag-style
        tags)."""
        texts = (a.text for a in container.find_all('a'))
        if strip_hash:
            texts = (t.strip().lstrip('#') for t in texts)
        return ', '.join(texts)

    # ------------------------------------------------------------------
    # Per-site scrapers (one per domain in self.get_tags)
    # ------------------------------------------------------------------

    def get_tags_Usine_Nouvelle(self, url):
        """Tags come pre-joined in a Dublin Core subject <meta> tag."""
        meta_tag = self._soup(url).find('meta', {'name': 'DC.Subject', 'lang': 'fr'})
        return meta_tag['content']

    def get_tags_Blast(self, url):
        """Tags are links in a <ul> inside the story body.

        The original appended a no-op ``.replace("", "")``; removed.
        """
        ul = self._soup(url).find('div', class_='story-content').find('ul')
        return self._join_anchor_texts(ul)

    def get_tags_CourrierInter(self, url):
        """Tags are <a class="item"> links inside the tags-list block."""
        tags_div = self._soup(url).find('div', class_='tags-list')
        return ', '.join(a.text for a in tags_div.find_all('a', class_='item'))

    def get_tags_Liberation(self, url):
        """Tags come from the standard keywords <meta> tag."""
        return self._tags_from_meta(url, {'name': 'keywords'})

    def get_tags_FranceInfo(self, url):
        """Tags are <span> labels inside the related-tags list."""
        ul = self._soup(url).find('ul', class_='related-tags__tags')
        return ', '.join(s.text for s in ul.find_all('span'))

    def get_tags_Mediapart(self, url):
        # BUG FIX: the original joined an undefined `tag_list` (NameError on
        # every call); it now joins the parsed keywords as intended.
        return self._tags_from_meta(url, {'name': 'news_keywords'})

    def get_tags_Diplo(self, url):
        """Tags are links inside the tags <div>."""
        return self._join_anchor_texts(self._soup(url).find('div', class_='tags'))

    def get_tags_LesJours(self, url):
        # BUG FIX: same undefined-`tag_list` NameError as Mediapart; fixed.
        return self._tags_from_meta(url, {'name': 'news_keywords'})

    def get_tags_StreetPress(self, url):
        """Tags are links inside the pager <ul>."""
        return self._join_anchor_texts(self._soup(url).find('ul', class_='pager'))

    def get_tags_nvo(self, url):
        """Tags are links inside the tag bar <div>."""
        return self._join_anchor_texts(self._soup(url).find('div', class_='barre_tags'))

    def get_tags_korii(self, url):
        """Tags live in a JSON-ish `config` attribute of an ad-init <script>."""
        script_tag = self._soup(url).find('script', {'id': 'optidigital-ad-init'})
        tags_str = re.search(r'"tags": \[(.*?)\]', script_tag['config']).group(1)
        return ', '.join(s.strip('"').capitalize() for s in tags_str.split(','))

    def get_tags_UsbekRica(self, url):
        """Hashtag-style links (#tag) inside the #tags container."""
        tag_div = self._soup(url).find('div', {'id': 'tags'})
        return self._join_anchor_texts(tag_div, strip_hash=True)

    def get_tags_LaCroix(self, url):
        """Tags come from the news_keywords <meta> tag."""
        return self._tags_from_meta(url, {'name': 'news_keywords'})

    def get_tags_Figaro(self, url):
        """Hashtag-style links inside the fig-tag list."""
        tag_div = self._soup(url).find('ul', class_='fig-tag__list')
        return self._join_anchor_texts(tag_div, strip_hash=True)

    def get_tags_Numerama(self, url):
        """Tags come from the Parse.ly tags <meta> tag."""
        return self._tags_from_meta(url, {'name': 'parsely-tags'})

    def get_tags_LeHuffPost(self, url):
        """Tags come from the Open Graph article:tag <meta> property."""
        return self._tags_from_meta(url, {'property': 'article:tag'})

    def get_tags_Huma(self, url):
        """Tags are links inside the etiquettes field <div>."""
        div = self._soup(url).find('div', class_='field-name-field-news-etiquettes')
        return self._join_anchor_texts(div)

    def get_tags_Basta(self, url):
        """Tags are links inside the c-tag <ul>."""
        return self._join_anchor_texts(self._soup(url).find('ul', class_='c-tag'))