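"""Sort scraped article entries into one JSON file per day.

Every *.json file in ./json/ is read, each entry's description is reduced to
plain text, site tags are looked up for its URL, and the entry is appended to
./by_date/<date>.json. Entries whose id already appears in the per-date files
are skipped, so the script can be re-run safely.
"""
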
import glob
import html
import json
import os

from bs4 import BeautifulSoup

from tag_dict import TagGetter
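
# Assumption: TagGetter (from the local tag_dict module) looks up the tags a
# source site uses for a given article URL; only its get_tags_from_url() method
# is called below.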
tagetter = TagGetter()

output_directory = "./by_date"
source_path = "./json/"

# Ids of entries that have already been written to the per-date files; used to
# skip duplicates when the script is re-run.
existing_entry_ids = set()

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Remember every id already present in ./by_date so those entries are not added twice.
for filename in glob.glob('./by_date/*.json'):
    with open(filename, 'r') as f:
        data = json.load(f)
        for article in data:
            existing_entry_ids.add(article['id'])

# Only the commented-out streaming variant inside add_entry_to_file uses this
# dict; with the current read-append-rewrite approach it stays empty.
output_files = {}


def read_json_file(filepath):
    """Load a JSON file and return its parsed contents."""
    with open(filepath) as f:
        data = json.load(f)
    return data


def add_entry_to_file(entry):
    """Clean up one entry and append it to the JSON file for its date."""
    entry_id = entry["id"]
    if entry_id in existing_entry_ids:
        return
    date = entry["date"]
    title = entry["title"]
    url = entry["url"]

    # Unescape HTML entities, then strip any remaining markup so only the
    # plain-text description is kept.
    description = html.unescape(entry['description'])
    soup = BeautifulSoup(description, 'html.parser')
    description = soup.get_text(strip=True)
    # split('-20')[0] is just a dirty fix for Le Diplo: it cuts the description
    # at the first "-20...", presumably a trailing date fragment that
    # Le Monde diplomatique appends.
    entry['description'] = description.split('-20')[0]

    tags_from_site = tagetter.get_tags_from_url(url)
    entry["tags_from_site"] = tags_from_site

    filename = f"{date}.json"
    filepath = os.path.join(output_directory, filename)
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        # A file for this date already exists: load it, append, and rewrite it.
        with open(filepath, "r") as f:
            entries = json.load(f)
        entries.append(entry)
        with open(filepath, "w") as f:
            json.dump(entries, f, ensure_ascii=False, indent=4)
    else:
        # First entry for this date: create the file with a one-element list.
        with open(filepath, "w") as f:
            json.dump([entry], f, ensure_ascii=False, indent=4)

    # Disabled alternative kept in the source: stream entries into per-date file
    # handles stored in output_files and write the JSON array delimiters by hand.
    """
    if date not in output_files:
        filename = f"{date}.json"
        filepath = os.path.join(output_directory, filename)
        output_files[date] = open(filepath, "a")
        output_files[date].write("[\n")
    else:
        output_files[date].write(",\n")
    json.dump(entry, output_files[date], ensure_ascii=False, indent=4)
    """


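# Each file in ./json/ is expected to hold a list of entry objects with at least
# the fields read above (id, date, title, url, description); for example (the
# values here are purely illustrative, not taken from the real data):
#
# [
#     {
#         "id": "some-unique-id",
#         "date": "2024-05-04",
#         "title": "Article title",
#         "url": "https://example.org/article",
#         "description": "<p>HTML description&hellip;</p>"
#     }
# ]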
for filename in os.listdir(source_path):
    if filename.endswith(".json"):
        filepath = os.path.join(source_path, filename)
        print("Now sorting entries from " + filename)
        entries = read_json_file(filepath)
        for entry in entries:
            add_entry_to_file(entry)

# Close all the output files opened by the streaming variant; with that code
# disabled, output_files is empty and this loop does nothing.
for f in output_files.values():
    f.write("\n]\n")
    f.close()