pyrss/get.py

266 lines
6.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import datetime
import hashlib
import json
import os
import time

import feedparser
# Feeds to harvest. Each (display name, feed URL) pair below is expanded into
# a {"name": ..., "url": ...} dict; the order here is the order in which the
# feeds are fetched.
journals = [
    {"name": feed_name, "url": feed_url}
    for feed_name, feed_url in (
        ("L'Humanité", "https://www.humanite.fr/rss/actu.rss"),
        ("Alternative Economiques", "https://www.alternatives-economiques.fr/rss.xml"),
        ("Basta", "https://basta.media/spip.php?page=backend"),
        ("Bondy Blog", "https://www.bondyblog.fr/feed"),
        ("La Déferlante", "https://revueladeferlante.fr/feed/"),
        ("OFF Investigation", "https://www.off-investigation.fr/feed/"),
        ("Frustration", "https://www.frustrationmagazine.fr/feed"),
        ("Lundi Matin", "https://lundi.am/spip.php?page=backend"),
        ("Courrier international", "https://www.courrierinternational.com/feed/all/rss.xml"),
        ("Les Économistes Atterrés", "https://www.atterres.org/feed"),
        ("Acrimed", "https://www.acrimed.org/spip.php?page=backend"),
        ("Libération", "https://www.liberation.fr/arc/outboundfeeds/rss/?outputType=xml"),
        ("Médiapart", "https://www.mediapart.fr/articles/feed"),
        ("Franceinfo", "https://www.francetvinfo.fr/titres.rss"),
        ("Le Monde Diplomatique", "https://www.monde-diplomatique.fr/spip.php?page=backend&id_mot="),
        ("Le Monde", "https://www.lemonde.fr/rss/une.xml"),
        ("BLAST", "https://api.blast-info.fr/rss.xml"),
        ("Les Jours", "https://lesjours.fr/rss.xml"),
        ("Arrêt sur Images", "https://api.arretsurimages.net/api/public/rss/all-content"),
        ("Élucid", "https://elucid.media/feed"),
        ("reflets", "https://reflets.info/feeds/public"),
        ("StreetPress", "https://backend.streetpress.com/rss.xml"),
        ("Presse-citron", "https://www.presse-citron.net/feed/"),
        ("korii", "https://korii.slate.fr/rss.xml"),
        ("Slate", "https://www.slate.fr/rss.xml"),
        ("Le HuffPost", "https://www.huffingtonpost.fr/rss/all_headline.xml"),
        ("Numerama", "https://www.numerama.com/feed/"),
        ("Le Figaro", "https://www.lefigaro.fr/rss/figaro_actualites-a-la-une.xml"),
        ("La Croix", "https://www.la-croix.com/RSS/UNIVERS"),
        ("nvo", "https://nvo.fr/feed/?post_type=post"),
        ("Usine Nouvelle", "https://www.usinenouvelle.com/rss/"),
        ("Fakir", "http://www.fakirpresse.info/spip.php?page=backend"),
        ("CQFD", "https://cqfd-journal.org/spip.php?page=backend"),
        ("Politis", "https://www.politis.fr/flux-rss-politis-fr/"),
        ("afriqueXXI", "https://afriquexxi.info/?page=backend&lang=fr"),
        ("Rapports de Force", "https://rapportsdeforce.fr/feed"),
        ("Reporterre", "https://reporterre.net/spip.php?page=backend-simple"),
        ("Science Critique", "https://sciences-critiques.fr/feed/"),
        ("Socialter", "https://www.socialter.fr/rss"),
        ("Terrestres", "https://www.terrestres.org/feed/"),
        ("Miroir Social", "https://www.miroirsocial.com/rss.xml"),
        ("Le Vent Se Lève", "https://lvsl.fr/feed/"),
        ("Le Media", "https://api.lemediatv.fr/rss.xml"),
        ("Là-bas si j'y suis", "https://la-bas.org/spip.php?page=backend"),
        ("Lenvolée", "https://lenvolee.net/feed/"),
        ("Jef Klak", "http://jefklak.org/?feed=rss2"),
        ("Disclose", "https://disclose.ngo/feed/"),
        ("Observatoire des multinationales", "https://multinationales.org/fr/page/backend"),
        ("Next INpact", "https://www.nextinpact.com/rss/news.xml"),
        ("Usbek & Rica", "https://usbeketrica.com/fr/rss"),
    )
]
# Blank journal record with the same two keys as the entries of `journals`.
# NOTE(review): not referenced by any code visible in this file.
empty = dict(name="", url="")

# Accumulator for harvested articles: parse() appends one dict per feed entry
# and the end of the script dumps the whole list to JSON.
data = list()

# A list holding a single empty dict.
# NOTE(review): not referenced by any code visible in this file.
problem = [dict()]
def parsedatostring(timestamp):
    """Format a time tuple as a ``YYYY-MM-DD`` date string.

    ``timestamp`` is handed straight to ``time.strftime``, so it has to be a
    ``time.struct_time`` or an equivalent 9-item tuple.
    """
    day_format = "%Y-%m-%d"
    return time.strftime(day_format, timestamp)
def getdate(entry):
    """Return the best available date of a feed entry as ``YYYY-MM-DD``.

    The entry's ``published_parsed``, ``updated_parsed`` and
    ``created_parsed`` attributes are tried in that order; the first one that
    actually holds a value is formatted with ``parsedatostring``.

    An attribute that exists but is empty (for example ``None``) is skipped
    instead of being formatted: ``time.strftime`` raises ``TypeError`` when
    given ``None``, and a later candidate may still carry a usable date.

    Returns:
        The formatted date, or the placeholder ``"???"`` when none of the
        candidates yields a value.
    """
    for field in ("published_parsed", "updated_parsed", "created_parsed"):
        parsed = getattr(entry, field, None)
        if parsed:
            return parsedatostring(parsed)
    return "???"
def parse(url, journal):
    """Fetch one feed and append its entries to the module-level ``data``.

    Args:
        url: Address of the feed, passed to ``feedparser.parse``.
        journal: Display name of the source, stored with every article.

    Each entry is stored as a dict with the keys ``id`` (SHA-256 hex digest
    of the article link), ``journal``, ``title``, ``description``, ``url``,
    ``date`` (as returned by ``getdate``) and ``tags`` (always ``"news"``).

    A missing ``title`` or ``description`` defaults to an empty string.
    Entries without a link are reported and skipped, because both ``url``
    and ``id`` are derived from it; one malformed entry therefore no longer
    aborts the whole harvest with an ``AttributeError``.
    """
    news_feed = feedparser.parse(url)
    for entry in news_feed.entries:
        link = getattr(entry, "link", None)
        if not link:
            print(journal + ": entry without link skipped")
            continue
        # The link doubles as the identity of the article: hashing it gives
        # a fixed-length id that is the same on every run.
        article_id = hashlib.sha256(link.encode("utf-8")).hexdigest()
        data.append({
            "id": article_id,
            "journal": journal,
            "title": getattr(entry, "title", ""),
            "description": getattr(entry, "description", ""),
            "url": link,
            "date": getdate(entry),
            "tags": "news",
        })
# Harvest every configured feed; parse() accumulates the articles in `data`.
for j in journals:
    journal = j["name"]
    print(journal)  # progress indicator: one line per feed
    url = j["url"]
    parse(url, journal)

# Dump everything into a file named after the current local time
# (json/YYYY-MM-DD-HH-MM.json). The directory is created on demand so that a
# missing json/ folder cannot discard a complete harvest with
# FileNotFoundError at the very last step.
now = datetime.datetime.now()
fn = now.strftime("json/%Y-%m-%d-%H-%M.json")
os.makedirs(os.path.dirname(fn), exist_ok=True)
with open(fn, "w", encoding="utf-8") as f:
    # ensure_ascii=False keeps accented characters readable in the output.
    json.dump(data, f, indent=4, ensure_ascii=False)