tldw translate and complete added
This commit is contained in:
parent
1b7a1b3962
commit
32cddd0b1e
61
main.py
61
main.py
|
@ -2,7 +2,10 @@ from flask import Flask, request, jsonify
|
|||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.schema import HumanMessage
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
from translate import translate, models
|
||||
from tldw import GetVideo
|
||||
import os
|
||||
|
||||
app = Flask(__name__)
|
||||
# set fixed port for app
|
||||
|
@ -28,6 +31,54 @@ def openai():
|
|||
'body': resp.content
|
||||
}
|
||||
|
||||
@app.route('/tldw', methods=['POST'])
def tldw():
    """Summarize a YouTube video from its subtitles.

    Expects a JSON body ``{"url": "<youtube url>"}``.  Downloads the
    video's subtitles, summarizes each transcript chunk with the LLM,
    then asks the LLM for hashtags over the combined summary.

    Returns a dict with ``statusCode`` and a ``body`` containing the
    video metadata, transcript, summary, and hashtags.  Responds 400 on
    a non-JSON request or a missing ``url`` field.
    """
    content_type = request.headers.get('Content-Type')
    if content_type == 'application/json':
        json_payload = request.json
        url = json_payload.get('url')
        if not url:
            # Previously a missing key raised KeyError -> HTTP 500.
            return jsonify({'error': 'Missing "url" field'}), 400
    else:
        return jsonify({'error': 'Invalid content type'}), 400

    video = GetVideo(url)
    print("received request for " + url)

    if not video.ytgenerated:
        # No subtitles available: return the stub transcript without
        # spending any LLM calls.
        return {
            'statusCode': 200,
            'body': {
                'url': video.url,
                'creator': video.creator,
                'title': video.title,
                'transcript': video.transcript,
                'summary': 'This video doesn\'t have available subtitles :-/'
            }
        }

    temperature = 0.9
    model_name = 'gpt-3.5-turbo'
    summary = ""
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)

    system = "You are a professional note taker. User will provide a part of the transcript of a Youtube video. You will reply by listing bullet points with 1 simple sentence each time."

    for part in video.parts:
        # System message must come FIRST so the model treats it as
        # instructions rather than as part of the user transcript.
        resp = llm([SystemMessage(content=system), HumanMessage(content=part)])
        summary += resp.content + "\n"

    hashtagerSystem = "You are a professional note taker. User will provide bullet points from the transcript of a video. You will reply by suggesting simple short hashtags"

    hashtags = llm([SystemMessage(content=hashtagerSystem), HumanMessage(content=summary)])
    return {
        'statusCode': 200,
        'body': {
            'url': video.url,
            'creator': video.creator,
            'title': video.title,
            'transcript': video.transcript,
            'summary': summary,
            'hashtags': hashtags.content
        }
    }
|
||||
|
||||
"""
|
||||
curl test:
|
||||
curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
|
||||
|
@ -67,7 +118,15 @@ def home():
|
|||
</html>
|
||||
"""
|
||||
|
||||
@app.route('/translate/<string:lang>', methods=['POST'])
def translate_text(lang):
    """Translate JSON-supplied text with the model registered for *lang*.

    Expects a JSON body ``{"text": "..."}``; *lang* must be a key of the
    ``models`` mapping (e.g. ``en_ru``).  Returns 400 for an unknown
    language pair or a missing ``text`` field, otherwise a JSON object
    with the ``translated_text``.
    """
    if lang not in models:
        return jsonify(error=f"No model for language: {lang}"), 400

    content = request.json
    if not content or 'text' not in content:
        # Previously a missing field raised KeyError -> HTTP 500.
        return jsonify(error="Missing 'text' field"), 400
    translated_text = translate(content['text'], lang)
    return jsonify(translated_text=translated_text)
|
||||
|
||||
# Run the Flask development server when this module is executed directly
# (debug mode enables the reloader and interactive traceback page —
# development only, never production).
if __name__ == '__main__':
    app.run(debug=True)
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
import yt_dlp as youtube_dl
|
||||
import os
|
||||
|
||||
# Maximum number of words per transcript chunk handed to the summarizer.
MAXWORDCOUNT = 2000
# Subtitle language code used both for download and for locating the
# resulting .vtt file on disk.
LANG = "en"

# yt-dlp options: fetch manual and auto-generated English subtitles (VTT)
# plus the thumbnail, skip the actual media download, write everything to
# output/<video id>.*, and keep console output quiet.
ydl_opts = {
    'writesubtitles': True,
    'writeautomaticsub': True,
    'subtitlesformat': 'vtt',
    'subtitleslangs': ['en'],
    'outtmpl': 'output/%(id)s',
    'skip_download': True,
    'writethumbnail': True,
    'progress' : False,
    'noprogress': True,
    'quiet': True
}

# Shared downloader instance reused by every GetVideo() call.
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
|
||||
class Video:
    """Container for a YouTube video's metadata and transcript.

    ``ytgenerated`` starts False and is flipped to True by the caller
    (``GetVideo``) only once subtitles were successfully found and parsed.
    """

    def __init__(self, url: str, creator: str, title: str,
                 transcript: str, parts: list):
        self.url = url                  # original video URL
        self.creator = creator          # uploader name
        self.title = title              # video title
        self.transcript = transcript    # full caption text, or a fallback message
        self.parts = parts              # transcript split into word-limited chunks
        # False until subtitles are confirmed available and parsed.
        self.ytgenerated = False

    def __repr__(self) -> str:
        return (f"Video(url={self.url!r}, title={self.title!r}, "
                f"ytgenerated={self.ytgenerated})")
|
||||
|
||||
def GetVideo(url):
    """Fetch subtitles and metadata for *url* and return a Video.

    Downloads the video's .vtt subtitles into output/, extracts the plain
    caption text, and splits it into MAXWORDCOUNT-word chunks for the
    summarizer.  If no subtitle file was produced, returns a stub Video
    whose ``ytgenerated`` flag stays False.
    """
    # Writes the subtitle file (and thumbnail) to output/; media itself is
    # skipped via the 'skip_download' option.
    ydl.download([url])
    info = ydl.extract_info(url, download=False)
    creator = info['uploader']
    title = info['title']
    video_id = info['id']  # renamed from `id` to avoid shadowing the builtin

    sub_path = "output/" + video_id + "." + LANG + ".vtt"
    if not os.path.exists(sub_path):
        # No subtitles were produced: return a stub (ytgenerated stays False).
        return Video(url, creator, title,
                     "This video doesn't have available subtitles :-/", [])

    transcript = _read_vtt_text(sub_path)
    parts = _chunk_words(transcript, MAXWORDCOUNT)

    video = Video(url, creator, title, transcript, parts)
    video.ytgenerated = True
    return video


def _read_vtt_text(path):
    """Extract plain caption text from a .vtt subtitle file at *path*.

    Keeps a line only when it looks like caption text in the auto-generated
    VTT layout: it contains neither ">" nor ":" (ruling out timestamps and
    cue markup), is non-blank, and is immediately followed by a blank line
    (the cue terminator).  Newlines are flattened to spaces.
    """
    txt = ""
    with open(path, "r") as f:
        lines = f.readlines()

    for i in range(len(lines)):
        if (i < len(lines) - 1 and ">" not in lines[i] and ":" not in lines[i]
                and lines[i].strip() != "" and lines[i + 1].strip() == ""):
            txt += lines[i]
    return txt.replace("\n", " ")


def _chunk_words(text, max_words):
    """Split *text* into chunks of at most *max_words* whitespace-separated
    words.  Each chunk keeps a trailing space (matching the original
    concatenation behavior); an empty *text* yields an empty list."""
    chunks = []
    current = ""
    count = 0
    for word in text.split():
        count += 1
        current += word + " "
        if count == max_words:
            chunks.append(current)
            count = 0
            current = ""
    if current:
        chunks.append(current)
    return chunks
|
|
@ -0,0 +1,17 @@
|
|||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
|
||||
# Mapping of language-pair keys ("src_dst") to Hugging Face model names.
# Route handlers use membership in this dict to validate the `lang` path
# parameter.
models = {
    "en_ru": "Helsinki-NLP/opus-mt-en-ru",
    "ru_en": "Helsinki-NLP/opus-mt-ru-en",
    # Add more models as needed
}

# Eagerly load one tokenizer and one model per language pair at import time.
# NOTE(review): this downloads/loads every model when the module is first
# imported, which is slow and memory-heavy — confirm this is intended.
tokenizers = {lang: AutoTokenizer.from_pretrained(model) for lang, model in models.items()}
translation_models = {lang: AutoModelForSeq2SeqLM.from_pretrained(model) for lang, model in models.items()}
|
||||
|
||||
def translate(text, lang):
    """Translate *text* with the preloaded model for the *lang* pair.

    *lang* must be a key of the module-level ``tokenizers`` /
    ``translation_models`` maps (e.g. ``"en_ru"``).  Generation is capped
    at 200 new tokens; returns the decoded translation string.
    """
    selected_tokenizer = tokenizers[lang]
    selected_model = translation_models[lang]

    encoded = selected_tokenizer(text, return_tensors="pt")
    generated = selected_model.generate(**encoded, max_new_tokens=200)
    decoded = selected_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
|
Loading…
Reference in New Issue