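# pyrss/sort.py
#
# Sorts the entries found in the JSON files under ./json/ into one JSON file
# per entry date under ./by_date, skipping entries whose id is already stored.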

import os
import glob
import json
from tag_dict import TagGetter
import html
from bs4 import BeautifulSoup
# Tag lookup helper; add_entry_to_file() calls its get_tags_from_url() method.
tagetter = TagGetter()

# Directory that receives one "<date>.json" file per entry date.
output_directory = "./by_date"
# Directory containing the JSON files whose entries still need to be sorted.
source_path = "./json/"

# Ids of every article already stored under output_directory, so that entries
# sorted during an earlier run are not written a second time.
existing_entry_ids = set()

# exist_ok avoids the check-then-create race of exists() followed by makedirs().
os.makedirs(output_directory, exist_ok=True)

# Build the pattern from output_directory so the two cannot drift apart.
for filename in glob.glob(os.path.join(output_directory, "*.json")):
    # add_entry_to_file() treats a zero-byte file as "no entries yet"; do the
    # same here instead of letting json.load() fail on an empty file.
    if os.path.getsize(filename) == 0:
        continue
    with open(filename, "r") as f:
        data = json.load(f)
    for article in data:
        existing_entry_ids.add(article["id"])

# Map of date -> open file handle. Nothing in the visible code adds to it;
# it is only read by the close-out loop at the end of the script.
output_files = {}
def read_json_file(filepath):
    """Parse the JSON document stored at ``filepath`` and return the result."""
    with open(filepath) as handle:
        return json.load(handle)
def add_entry_to_file(entry):
    """Append ``entry`` to the JSON file for its date unless it is already stored.

    The entry dict is modified in place before it is written: its
    ``description`` is HTML-unescaped, reduced to plain text and truncated at
    the first ``-20``, and a ``tags_from_site`` key is set to the result of
    ``tagetter.get_tags_from_url(entry["url"])``.

    Once the entry has been written, its id is recorded in
    ``existing_entry_ids`` so that the same article showing up again later in
    the same run (for example in another source file) is not stored twice.

    Args:
        entry: dict with at least the keys ``id``, ``date``, ``url`` and
            ``description``.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    entry_id = entry["id"]
    if entry_id in existing_entry_ids:
        return

    date = entry["date"]
    url = entry["url"]

    # Descriptions arrive as escaped HTML: unescape first, then strip the markup.
    description = html.unescape(entry['description'])
    soup = BeautifulSoup(description, 'html.parser')
    description = soup.get_text(strip=True)
    # split('-20')[0] is just a dirty fix for Le Diplo.
    # NOTE: it truncates every description at its first "-20", whatever the source.
    entry['description'] = description.split('-20')[0]

    entry["tags_from_site"] = tagetter.get_tags_from_url(url)

    filepath = os.path.join(output_directory, f"{date}.json")

    # A missing or zero-byte file means there are no stored entries for this date.
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        with open(filepath, "r") as f:
            entries = json.load(f)
    else:
        entries = []

    entries.append(entry)
    with open(filepath, "w") as f:
        json.dump(entries, f, ensure_ascii=False, indent=4)

    # Record the id only after a successful write so a failed write can be retried.
    existing_entry_ids.add(entry_id)
# Feed every entry of every ".json" file in source_path to add_entry_to_file().
for filename in os.listdir(source_path):
    if not filename.endswith(".json"):
        continue
    filepath = os.path.join(source_path, filename)
    print("Now sorting entries from " + filename)
    entries = read_json_file(filepath)
    for entry in entries:
        add_entry_to_file(entry)

# Terminate and close whatever handles are registered in output_files.
for handle in output_files.values():
    handle.write("\n]\n")
    handle.close()