From 32cddd0b1ee34a66b0ab91da492bfc34e54e32ad Mon Sep 17 00:00:00 2001
From: ed
Date: Mon, 24 Jul 2023 10:53:57 +0200
Subject: [PATCH] Add tldw, translate and complete

---
 complete.py  |  0
 main.py      | 61 +++++++++++++++++++++++++++++++++++++++++++++++-
 tldw.py      | 66 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 translate.py | 17 ++++++++++++++
 4 files changed, 143 insertions(+), 1 deletion(-)
 create mode 100644 complete.py
 create mode 100644 tldw.py
 create mode 100644 translate.py

diff --git a/complete.py b/complete.py
new file mode 100644
index 0000000..e69de29
diff --git a/main.py b/main.py
index c0abcea..56f403f 100644
--- a/main.py
+++ b/main.py
@@ -2,7 +2,10 @@
 from flask import Flask, request, jsonify
 from dotenv import load_dotenv
 load_dotenv()
 from langchain.chat_models import ChatOpenAI
-from langchain.schema import HumanMessage
+from langchain.schema import HumanMessage, SystemMessage
+from translate import translate, models
+from tldw import GetVideo
+import os
 app = Flask(__name__)
 # set fixed port for app
@@ -28,6 +31,54 @@ def openai():
         'body': resp.content
     }
 
+@app.route('/tldw', methods=['POST'])
+def tldw():
+    content_type = request.headers.get('Content-Type')
+    if content_type == 'application/json':
+        json_payload = request.json
+        url = json_payload['url']
+    else:
+        return jsonify({'error': 'Invalid content type'}), 400
+    video = GetVideo(url)
+    print("received request for " + url)
+    if not video.ytgenerated:
+        return {
+            'statusCode': 200,
+            'body': {
+                'url': video.url,
+                'creator': video.creator,
+                'title': video.title,
+                'transcript': video.transcript,
+                'summary': 'This video doesn\'t have available subtitles :-/'
+            }
+        }
+    temperature = 0.9
+    model_name = 'gpt-3.5-turbo'
+    summary = ""
+    llm = ChatOpenAI(temperature=temperature, model_name=model_name)
+
+    system = "You are a professional note taker. The user will provide part of the transcript of a YouTube video. You will reply with bullet points, one simple sentence each."
+
+    for part in video.parts:  # summarize the transcript chunk by chunk
+        prompt = part
+        resp = llm([HumanMessage(content=prompt), SystemMessage(content=system)])
+        summary += resp.content + "\n"
+
+    hashtagerSystem = "You are a professional note taker. The user will provide bullet points from the transcript of a video. You will reply by suggesting simple, short hashtags."
+
+    hashtags = llm([HumanMessage(content=summary), SystemMessage(content=hashtagerSystem)])
+    return {
+        'statusCode': 200,
+        'body': {
+            'url': video.url,
+            'creator': video.creator,
+            'title': video.title,
+            'transcript': video.transcript,
+            'summary': summary,
+            'hashtags': hashtags.content
+        }
+    }
+
 """
 curl test:
 curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
@@ -67,7 +118,15 @@ def home():
     """
 
 
+@app.route('/translate/<lang>', methods=['POST'])
+def translate_text(lang):
+    if lang not in models:
+        return jsonify(error=f"No model for language: {lang}"), 400
+    content = request.json
+    text = content['text']
+    translated_text = translate(text, lang)
+    return jsonify(translated_text=translated_text)
 
 
 if __name__ == '__main__':
     app.run(debug=True)
diff --git a/tldw.py b/tldw.py
new file mode 100644
index 0000000..b8893ac
--- /dev/null
+++ b/tldw.py
@@ -0,0 +1,66 @@
+import yt_dlp as youtube_dl
+import os
+
+MAXWORDCOUNT = 2000  # approximate number of words per transcript chunk sent to the LLM
+LANG = "en"
+
+ydl_opts = {
+    'writesubtitles': True,
+    'writeautomaticsub': True,
+    'subtitlesformat': 'vtt',
+    'subtitleslangs': ['en'],
+    'outtmpl': 'output/%(id)s',
+    'skip_download': True,
+    'writethumbnail': True,
+    'progress': False,
+    'noprogress': True,
+    'quiet': True
+}
+
+ydl = youtube_dl.YoutubeDL(ydl_opts)
+
+class Video:
+    def __init__(self, url, creator, title, transcript, parts):
+        self.url = url
+        self.creator = creator
+        self.title = title
+        self.transcript = transcript
+        self.parts = parts
+        self.ytgenerated = False
+
+def GetVideo(url):
+    ydl.download([url])
+    info = ydl.extract_info(url, download=False)
+    creator = info['uploader']
+    title = info['title']
+    id = info['id']
+    subFn = "output/" + id + "." + LANG + ".vtt"
+    if not os.path.exists(subFn):
+        video = Video(url, creator, title, "This video doesn't have available subtitles :-/", [])
+        return video
+    txt = ""
+    with open(subFn, "r") as f:
+        lines = f.readlines()
+
+    for i in range(len(lines)):  # keep only caption text lines of the VTT file, skipping timestamps and metadata
+        if i < len(lines) - 1 and ">" not in lines[i] and ":" not in lines[i] and lines[i].strip() != "" and lines[i + 1].strip() == "":
+            txt += lines[i]
+    txt = txt.replace("\n", " ")
+    words = txt.split()
+    sList = []
+    wCount = 0
+    maxWordCount = MAXWORDCOUNT
+    currentString = ""
+    for w in words:  # split the transcript into chunks of at most MAXWORDCOUNT words
+        wCount += 1
+        currentString += w + " "
+        if wCount == maxWordCount:
+            sList.append(currentString)
+            wCount = 0
+            currentString = ""
+    if currentString:
+        sList.append(currentString)
+    blocks = len(sList)
+    video = Video(url, creator, title, txt, sList)
+    video.ytgenerated = True
+    return video
\ No newline at end of file
diff --git a/translate.py b/translate.py
new file mode 100644
index 0000000..fba114e
--- /dev/null
+++ b/translate.py
@@ -0,0 +1,17 @@
+from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+models = {
+    "en_ru": "Helsinki-NLP/opus-mt-en-ru",
+    "ru_en": "Helsinki-NLP/opus-mt-ru-en",
+    # Add more models as needed
+}
+
+tokenizers = {lang: AutoTokenizer.from_pretrained(model) for lang, model in models.items()}
+translation_models = {lang: AutoModelForSeq2SeqLM.from_pretrained(model) for lang, model in models.items()}
+
+def translate(text, lang):
+    tokenizer = tokenizers[lang]
+    model = translation_models[lang]
+    inputs = tokenizer(text, return_tensors="pt")
+    outputs = model.generate(**inputs, max_new_tokens=200)
+    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
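
Quick smoke test for the new routes, in the style of the existing curl test in main.py. This is a sketch, not part of the patch: it assumes the app is running locally on port 5000, VIDEO_ID is a placeholder for a real YouTube video id, and en_ru is one of the language pairs defined in models in translate.py.

curl -XPOST --header "Content-Type: application/json" -d "{\"url\":\"https://www.youtube.com/watch?v=VIDEO_ID\"}" http://localhost:5000/tldw
curl -XPOST --header "Content-Type: application/json" -d "{\"text\":\"What is the best way to learn a language?\"}" http://localhost:5000/translate/en_ru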