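"""Sort scraped article entries into one JSON file per day.

Every *.json file in ./json/ is read, each entry's description is reduced to
plain text, site tags are looked up for its URL, and the entry is appended to
./by_date/<date>.json. Entries whose id already appears in the per-date files
are skipped, so the script can be re-run safely.
"""
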
import glob
import html
import json
import os

from bs4 import BeautifulSoup

from tag_dict import TagGetter
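
# Assumption: TagGetter (from the local tag_dict module) looks up the tags a
# source site uses for a given article URL; only its get_tags_from_url() method
# is called below.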
tagetter = TagGetter()

output_directory = "./by_date"
source_path = "./json/"

# Ids of entries that have already been written to the per-date files; used to
# skip duplicates when the script is re-run.
existing_entry_ids = set()

if not os.path.exists(output_directory):
    os.makedirs(output_directory)

# Remember every id already present in ./by_date so those entries are not added twice.
for filename in glob.glob('./by_date/*.json'):
    with open(filename, 'r') as f:
        data = json.load(f)
        for article in data:
            existing_entry_ids.add(article['id'])

# Only the commented-out streaming variant inside add_entry_to_file uses this
# dict; with the current read-append-rewrite approach it stays empty.
output_files = {}


def read_json_file(filepath):
    """Load a JSON file and return its parsed contents."""
    with open(filepath) as f:
        data = json.load(f)
    return data


def add_entry_to_file(entry):
    """Clean up one entry and append it to the JSON file for its date."""
    entry_id = entry["id"]
    if entry_id in existing_entry_ids:
        return
    date = entry["date"]
    title = entry["title"]
    url = entry["url"]

    # Unescape HTML entities, then strip any remaining markup so only the
    # plain-text description is kept.
    description = html.unescape(entry['description'])
    soup = BeautifulSoup(description, 'html.parser')
    description = soup.get_text(strip=True)
    # split('-20')[0] is just a dirty fix for Le Diplo: it cuts the description
    # at the first "-20...", presumably a trailing date fragment that
    # Le Monde diplomatique appends.
    entry['description'] = description.split('-20')[0]

    tags_from_site = tagetter.get_tags_from_url(url)
    entry["tags_from_site"] = tags_from_site

    filename = f"{date}.json"
    filepath = os.path.join(output_directory, filename)
    if os.path.exists(filepath) and os.path.getsize(filepath) > 0:
        # A file for this date already exists: load it, append, and rewrite it.
        with open(filepath, "r") as f:
            entries = json.load(f)
        entries.append(entry)
        with open(filepath, "w") as f:
            json.dump(entries, f, ensure_ascii=False, indent=4)
    else:
        # First entry for this date: create the file with a one-element list.
        with open(filepath, "w") as f:
            json.dump([entry], f, ensure_ascii=False, indent=4)

    # Disabled alternative kept in the source: stream entries into per-date file
    # handles stored in output_files and write the JSON array delimiters by hand.
    """
    if date not in output_files:
        filename = f"{date}.json"
        filepath = os.path.join(output_directory, filename)
        output_files[date] = open(filepath, "a")
        output_files[date].write("[\n")
    else:
        output_files[date].write(",\n")
    json.dump(entry, output_files[date], ensure_ascii=False, indent=4)
    """


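# Each file in ./json/ is expected to hold a list of entry objects with at least
# the fields read above (id, date, title, url, description); for example (the
# values here are purely illustrative, not taken from the real data):
#
# [
#     {
#         "id": "some-unique-id",
#         "date": "2024-05-04",
#         "title": "Article title",
#         "url": "https://example.org/article",
#         "description": "<p>HTML description&hellip;</p>"
#     }
# ]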
for filename in os.listdir(source_path):
    if filename.endswith(".json"):
        filepath = os.path.join(source_path, filename)
        print("Now sorting entries from " + filename)
        entries = read_json_file(filepath)
        for entry in entries:
            add_entry_to_file(entry)

# Close all the output files opened by the streaming variant; with that code
# disabled, output_files is empty and this loop does nothing.
for f in output_files.values():
    f.write("\n]\n")
    f.close()