tldw translate and complete added

This commit is contained in:
ed barz 2023-07-24 10:53:57 +02:00
parent 1b7a1b3962
commit 32cddd0b1e
4 changed files with 143 additions and 1 deletions

0
complete.py Normal file
View File

61
main.py
View File

@@ -2,7 +2,10 @@ from flask import Flask, request, jsonify
from dotenv import load_dotenv
load_dotenv()
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from translate import translate, models
from tldw import GetVideo
import os
app = Flask(__name__)
# set fixed port for app
@@ -28,6 +31,54 @@ def openai():
'body': resp.content
}
@app.route('/tldw', methods=['POST'])
def tldw():
    """Summarize a YouTube video ("too long; didn't watch").

    Expects a JSON body {"url": ...}. Downloads the video's subtitles via
    GetVideo, summarizes each transcript chunk with the chat model, then
    asks the model for hashtags over the combined summary.

    Returns a dict with statusCode and a body containing url/creator/title/
    transcript/summary (plus hashtags when subtitles were available), or a
    400 JSON error for a non-JSON request.
    """
    content_type = request.headers.get('Content-Type')
    if content_type == 'application/json':
        json_payload = request.json
        url = json_payload['url']
    else:
        return jsonify({'error': 'Invalid content type'}), 400

    video = GetVideo(url)
    print("received request for " + url)

    # No subtitles were found: return the placeholder transcript unchanged.
    if not video.ytgenerated:
        return {
            'statusCode': 200,
            'body': {
                'url': video.url,
                'creator': video.creator,
                'title': video.title,
                'transcript': video.transcript,
                'summary': 'This video doesn\'t have available subtitles :-/'
            }
        }

    temperature = 0.9
    model_name = 'gpt-3.5-turbo'
    summary = ""
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)
    system = "You are a professional note taker. User will provide a part of the transcript of a Youtube video. You will reply by listing bullet points with 1 simple sentence each time."
    for part in video.parts:
        # BUG FIX: the SystemMessage must precede the HumanMessage so the
        # model receives the instructions as a system prompt rather than as
        # a follow-up turn after the transcript.
        resp = llm([SystemMessage(content=system), HumanMessage(content=part)])
        summary += resp.content + "\n"

    hashtagerSystem = "You are a professional note taker. User will provide bullet points from the transcript of a video. You will reply by suggesting simple short hashtags"
    # Same ordering fix as above: system prompt first, then the user content.
    hashtags = llm([SystemMessage(content=hashtagerSystem), HumanMessage(content=summary)])

    return {
        'statusCode': 200,
        'body': {
            'url': video.url,
            'creator': video.creator,
            'title': video.title,
            'transcript': video.transcript,
            'summary': summary,
            'hashtags': hashtags.content
        }
    }
"""
curl test:
curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
@@ -67,7 +118,15 @@ def home():
</html>
"""
@app.route('/translate/<string:lang>', methods=['POST'])
def translate_text(lang):
    """Translate the posted JSON {"text": ...} into the direction *lang*.

    *lang* must be a key of the `models` registry (e.g. "en_ru"); unknown
    directions get a 400 JSON error.
    """
    if lang in models:
        payload = request.json
        result = translate(payload['text'], lang)
        return jsonify(translated_text=result)
    return jsonify(error=f"No model for language: {lang}"), 400
if __name__ == '__main__':
    app.run(debug=True)

66
tldw.py Normal file
View File

@@ -0,0 +1,66 @@
import yt_dlp as youtube_dl
import os
# Maximum number of words per transcript chunk handed to the LLM.
MAXWORDCOUNT = 2000
# Subtitle language to request from YouTube.
LANG = "en"
# yt-dlp options: fetch English subtitles (manual or auto-generated) in VTT
# format, skip the actual video download, write files to output/<video id>,
# and stay quiet on stdout.
ydl_opts = {
'writesubtitles': True,
'writeautomaticsub': True,
'subtitlesformat': 'vtt',
'subtitleslangs': ['en'],
'outtmpl': 'output/%(id)s',
'skip_download': True,
'writethumbnail': True,
'progress' : False,
'noprogress': True,
'quiet': True
}
# Single module-level downloader instance reused by GetVideo.
ydl = youtube_dl.YoutubeDL(ydl_opts)
class Video:
    """Container for a YouTube video's metadata and transcript chunks."""

    def __init__(self, url, creator, title, transcript, parts, ytgenerated=False):
        """Store video metadata.

        Args:
            url: original YouTube URL.
            creator: channel/uploader name.
            title: video title.
            transcript: full transcript text (or a placeholder message).
            parts: transcript split into word-count-limited chunks.
            ytgenerated: True when subtitles were found and parsed.
                Kept as a keyword with the old default (False) so existing
                callers that assign it after construction still work.
        """
        self.url = url
        self.creator = creator
        self.title = title
        self.transcript = transcript
        self.parts = parts
        self.ytgenerated = ytgenerated

    def __repr__(self):
        return f"Video(url={self.url!r}, title={self.title!r}, ytgenerated={self.ytgenerated})"
def GetVideo(url):
    """Download subtitles and metadata for *url* and build a Video.

    Uses the module-level `ydl` downloader: first downloads the subtitle/
    thumbnail files (skip_download=True suppresses the media itself), then
    extracts metadata without re-downloading.

    Returns:
        A Video with ytgenerated=True and the transcript split into chunks
        of at most MAXWORDCOUNT words, or — when no English .vtt file was
        produced — a Video with a placeholder transcript, empty parts, and
        ytgenerated left False.
    """
    ydl.download([url])
    info = ydl.extract_info(url, download=False)
    creator = info['uploader']
    title = info['title']
    # Renamed from `id`: don't shadow the builtin.
    video_id = info['id']

    sub_fn = "output/" + video_id + "." + LANG + ".vtt"
    if not os.path.exists(sub_fn):
        return Video(url, creator, title, "This video doesn't have available subtitles :-/", [])

    # Pull cue text out of the VTT file: keep lines that contain no
    # timestamp/tag markers (">" or ":"), are non-blank, and are immediately
    # followed by a blank line (the end of a cue).
    txt = ""
    with open(sub_fn, "r") as f:
        lines = f.readlines()
    for i in range(len(lines) - 1):
        line = lines[i]
        if ">" not in line and ":" not in line and line.strip() != "" and lines[i + 1].strip() == "":
            txt += line
    txt = txt.replace("\n", " ")

    # Split the transcript into chunks of at most MAXWORDCOUNT words each.
    # (join + trailing space reproduces the original word-by-word concat;
    # the unused `blocks` counter from the original version is dropped.)
    words = txt.split()
    parts = []
    for start in range(0, len(words), MAXWORDCOUNT):
        parts.append(" ".join(words[start:start + MAXWORDCOUNT]) + " ")

    video = Video(url, creator, title, txt, parts)
    video.ytgenerated = True
    return video

17
translate.py Normal file
View File

@@ -0,0 +1,17 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Supported translation directions -> Hugging Face model ids.
models = {
"en_ru": "Helsinki-NLP/opus-mt-en-ru",
"ru_en": "Helsinki-NLP/opus-mt-ru-en",
# Add more models as needed
}
# NOTE: every tokenizer and model is loaded eagerly at import time — startup
# is slow (and may download weights on first run), but per-request latency
# stays low.
tokenizers = {lang: AutoTokenizer.from_pretrained(model) for lang, model in models.items()}
translation_models = {lang: AutoModelForSeq2SeqLM.from_pretrained(model) for lang, model in models.items()}
def translate(text, lang, max_new_tokens=200):
    """Translate *text* using the model registered for *lang*.

    Args:
        text: source text to translate.
        lang: key into the module-level `models`/`tokenizers` registries
            (e.g. "en_ru"); raises KeyError for an unknown direction.
        max_new_tokens: cap on generated tokens. Previously hard-coded to
            200; exposed as a keyword (same default) so callers can
            translate longer texts without truncated output.

    Returns:
        The decoded translation (first sequence), special tokens stripped.
    """
    tokenizer = tokenizers[lang]
    model = translation_models[lang]
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]