import re
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


class TagGetter:
    """Looks up article tags on a set of supported French news sites."""

    def __init__(self):
        # Map each supported domain to its site-specific scraper method.
        self.get_tags = {
            'www.usinenouvelle.com': self.get_tags_Usine_Nouvelle,
            'www.blast-info.fr': self.get_tags_Blast,
            'www.courrierinternational.com': self.get_tags_CourrierInter,
            'www.liberation.fr': self.get_tags_Liberation,
            'www.francetvinfo.fr': self.get_tags_FranceInfo,
            'www.mediapart.fr': self.get_tags_Mediapart,
            'www.monde-diplomatique.fr': self.get_tags_Diplo,
            'lesjours.fr': self.get_tags_LesJours,
            'www.streetpress.com': self.get_tags_StreetPress,
            'nvo.fr': self.get_tags_nvo,
            'korii.slate.fr': self.get_tags_korii,
            'usbeketrica.com': self.get_tags_UsbekRica,
            'www.la-croix.com': self.get_tags_LaCroix,
            'www.lefigaro.fr': self.get_tags_Figaro,
            'www.numerama.com': self.get_tags_Numerama,
            'www.huffingtonpost.fr': self.get_tags_LeHuffPost,
            'www.humanite.fr': self.get_tags_Huma,
            'basta.media': self.get_tags_Basta,
        }

    def get_tags_from_url(self, url):
        """Dispatch to the scraper registered for the URL's domain."""
        domain = urlparse(url).netloc
        if domain in self.get_tags:
            try:
                tags = self.get_tags[domain](url)
            except Exception:
                tags = "Can't find site tags for this entry"
            return tags
        else:
            return "Not yet implemented"
            # raise ValueError(f"No subject-getter found for domain: {domain}")

    def get_tags_Usine_Nouvelle(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Tags are published in a Dublin Core 'DC.Subject' meta tag.
        meta_tag = soup.find('meta', {'name': 'DC.Subject', 'lang': 'fr'})
        return meta_tag['content']

    def get_tags_Blast(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        story_content_div = soup.find('div', class_='story-content')
        ul = story_content_div.find('ul')
        tag_list = []
        for a in ul.find_all('a'):
            tag_list.append(a.text)
        # Drop the bullet glyph that prefixes each tag on Blast.
        tag_string = ', '.join(tag_list).replace("● ", "")
        return tag_string

    def get_tags_CourrierInter(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='tags-list')
        tag_list = []
        for a in tags_div.find_all('a', class_='item'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Liberation(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_FranceInfo(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tag_list = []
        ul = soup.find('ul', class_='related-tags__tags')
        for s in ul.find_all('span'):
            tag_list.append(s.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Mediapart(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'news_keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_Diplo(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='tags')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_LesJours(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'news_keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_StreetPress(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # StreetPress lists its tag links inside a ul with class 'pager'.
        tags_div = soup.find('ul', class_='pager')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_nvo(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='barre_tags')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_korii(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        # Tags aren't in the visible markup; they sit in the JSON 'config'
        # attribute of the ad-init script tag, so pull them out with a regex.
        script_tag = soup.find('script', {'id': 'optidigital-ad-init'})
        config_str = script_tag['config']
        tags_str = re.search(r'"tags": \[(.*?)\]', config_str).group(1)
        list_of_strings = tags_str.split(',')
        # Trim surrounding whitespace before the quotes, then the quotes.
        list_of_words = [s.strip().strip('"').capitalize() for s in list_of_strings]
        tag_string = ', '.join(list_of_words)
        return tag_string

    def get_tags_UsbekRica(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tag_div = soup.find('div', {'id': 'tags'})
        tag_list = []
        for a in tag_div.find_all('a'):
            tag_list.append(a.text.strip().lstrip('#'))
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_LaCroix(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'news_keywords'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_Figaro(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tag_div = soup.find('ul', class_='fig-tag__list')
        tag_list = []
        for a in tag_div.find_all('a'):
            tag_list.append(a.text.strip().lstrip('#'))
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Numerama(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'name': 'parsely-tags'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_LeHuffPost(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        meta_tag = soup.find('meta', {'property': 'article:tag'})
        keywords = meta_tag.get('content')
        tags = [word.capitalize() for word in keywords.split(',')]
        tag_string = ', '.join(tags)
        return tag_string

    def get_tags_Huma(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('div', class_='field-name-field-news-etiquettes')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string

    def get_tags_Basta(self, url):
        response = requests.get(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        tags_div = soup.find('ul', class_='c-tag')
        tag_list = []
        for a in tags_div.find_all('a'):
            tag_list.append(a.text)
        tag_string = ', '.join(tag_list)
        return tag_string
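
if __name__ == '__main__':
    # Minimal usage sketch, separate from the scraping logic above. The URL
    # below is a hypothetical placeholder: any article URL from a domain in
    # self.get_tags should work, assuming the site's markup hasn't changed.
    getter = TagGetter()
    example_url = 'https://www.liberation.fr/some-article'  # hypothetical
    print(getter.get_tags_from_url(example_url))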