"""Sort article entries into one JSON file per date.

Reads every ``*.json`` file found in ``source_path`` (each is expected to
hold a list of entry dicts), and appends every entry whose ``"id"`` is not
yet present in ``output_directory`` to ``<output_directory>/<date>.json``.

Each entry must provide the keys ``"id"``, ``"date"``, ``"url"`` and
``"description"``.  Before being stored, the description is HTML-unescaped
and stripped of markup, and a ``"tags_from_site"`` key is added.
"""
import glob
import html
import json
import os

from bs4 import BeautifulSoup

from tag_dict import TagGetter

tagetter = TagGetter()
output_directory = "./by_date"
source_path = "./json/"

# Ids of every entry already stored in the output directory.  Also updated
# while the script runs so an id seen twice in one run is stored only once.
existing_entry_ids = set()

os.makedirs(output_directory, exist_ok=True)


def read_json_file(filepath):
    """Return the parsed JSON content of *filepath* (read as UTF-8)."""
    with open(filepath, encoding="utf-8") as f:
        return json.load(f)


for filename in glob.glob('./by_date/*.json'):
    # A zero-length file is treated as "no entries yet", matching the check
    # done in add_entry_to_file(), instead of failing in json.load().
    if os.path.getsize(filename) == 0:
        continue
    existing_entry_ids.update(
        article['id'] for article in read_json_file(filename)
    )


def add_entry_to_file(entry):
    """Append *entry* to the output file for its date unless already stored.

    The entry is modified in place: ``entry["description"]`` is replaced by
    its cleaned plain-text form and ``entry["tags_from_site"]`` is set to the
    value returned by ``tagetter.get_tags_from_url`` for the entry's URL.

    Args:
        entry: dict with at least the keys ``"id"``, ``"date"``, ``"url"``
            and ``"description"``.

    Raises:
        KeyError: if one of the required keys is missing.
    """
    entry_id = entry["id"]
    if entry_id in existing_entry_ids:
        return

    date = entry["date"]
    url = entry["url"]

    # Descriptions arrive HTML-escaped and may contain markup; keep text only.
    description = html.unescape(entry['description'])
    description = BeautifulSoup(description, 'html.parser').get_text(strip=True)
    # split('-20')[0] is just a dirty fix for Le Diplo
    entry['description'] = description.split('-20')[0]

    entry["tags_from_site"] = tagetter.get_tags_from_url(url)

    filepath = os.path.join(output_directory, f"{date}.json")
    # A missing or empty file simply means no entries for that date yet.
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        entries = read_json_file(filepath)
    else:
        entries = []
    entries.append(entry)

    # ensure_ascii=False emits raw non-ASCII characters, so the encoding is
    # pinned to UTF-8 rather than left to the platform default.
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(entries, f, ensure_ascii=False, indent=4)

    # Remember the id so a duplicate later in the same run is skipped.
    existing_entry_ids.add(entry_id)


for filename in os.listdir(source_path):
    if not filename.endswith(".json"):
        continue
    filepath = os.path.join(source_path, filename)
    print("Now sorting entries from " + filename)
    for entry in read_json_file(filepath):
        add_entry_to_file(entry)