tldw translate and complete added

This commit is contained in:
ed barz 2023-07-24 10:53:57 +02:00
parent 1b7a1b3962
commit 32cddd0b1e
4 changed files with 143 additions and 1 deletions

0
complete.py Normal file
View File

61
main.py
View File

@@ -2,7 +2,10 @@ from flask import Flask, request, jsonify
from dotenv import load_dotenv
load_dotenv()
from langchain.chat_models import ChatOpenAI
from langchain.schema import HumanMessage, SystemMessage
from translate import translate, models
from tldw import GetVideo
import os
app = Flask(__name__)
# set fixed port for app
@@ -28,6 +31,54 @@ def openai():
'body': resp.content
}
@app.route('/tldw', methods=['POST'])
def tldw():
    """Summarize a YouTube video ("too long; didn't watch").

    Expects a JSON body {"url": ...}. Downloads the video's subtitles via
    GetVideo, summarizes each transcript chunk with the chat model, then
    asks the model for hashtags over the combined summary.

    Returns a dict with statusCode and a body containing url/creator/title/
    transcript/summary (plus hashtags when subtitles were available), or a
    400 JSON error for a non-JSON request.
    """
    content_type = request.headers.get('Content-Type')
    if content_type == 'application/json':
        json_payload = request.json
        url = json_payload['url']
    else:
        return jsonify({'error': 'Invalid content type'}), 400

    video = GetVideo(url)
    print("received request for " + url)

    # No subtitles were found: return the placeholder transcript unchanged.
    if not video.ytgenerated:
        return {
            'statusCode': 200,
            'body': {
                'url': video.url,
                'creator': video.creator,
                'title': video.title,
                'transcript': video.transcript,
                'summary': 'This video doesn\'t have available subtitles :-/'
            }
        }

    temperature = 0.9
    model_name = 'gpt-3.5-turbo'
    summary = ""
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)
    system = "You are a professional note taker. User will provide a part of the transcript of a Youtube video. You will reply by listing bullet points with 1 simple sentence each time."
    for part in video.parts:
        # BUG FIX: the SystemMessage must precede the HumanMessage so the
        # model receives the instructions as a system prompt rather than as
        # a follow-up turn after the transcript.
        resp = llm([SystemMessage(content=system), HumanMessage(content=part)])
        summary += resp.content + "\n"

    hashtagerSystem = "You are a professional note taker. User will provide bullet points from the transcript of a video. You will reply by suggesting simple short hashtags"
    # Same ordering fix as above: system prompt first, then the user content.
    hashtags = llm([SystemMessage(content=hashtagerSystem), HumanMessage(content=summary)])

    return {
        'statusCode': 200,
        'body': {
            'url': video.url,
            'creator': video.creator,
            'title': video.title,
            'transcript': video.transcript,
            'summary': summary,
            'hashtags': hashtags.content
        }
    }
"""
curl test:
curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
@@ -67,7 +118,15 @@ def home():
</html>
"""
@app.route('/translate/<string:lang>', methods=['POST'])
def translate_text(lang):
    """Translate the posted JSON {"text": ...} into the direction *lang*.

    *lang* must be a key of the `models` registry (e.g. "en_ru"); unknown
    directions get a 400 JSON error.
    """
    if lang in models:
        payload = request.json
        result = translate(payload['text'], lang)
        return jsonify(translated_text=result)
    return jsonify(error=f"No model for language: {lang}"), 400
if __name__ == '__main__':
    app.run(debug=True)

66
tldw.py Normal file
View File

@@ -0,0 +1,66 @@
import yt_dlp as youtube_dl
import os
# Maximum number of words per transcript chunk handed to the LLM.
MAXWORDCOUNT = 2000
# Subtitle language to request from YouTube.
LANG = "en"
# yt-dlp options: fetch English subtitles (manual or auto-generated) in VTT
# format, skip the actual video download, write files to output/<video id>,
# and stay quiet on stdout.
ydl_opts = {
'writesubtitles': True,
'writeautomaticsub': True,
'subtitlesformat': 'vtt',
'subtitleslangs': ['en'],
'outtmpl': 'output/%(id)s',
'skip_download': True,
'writethumbnail': True,
'progress' : False,
'noprogress': True,
'quiet': True
}
# Single module-level downloader instance reused by GetVideo.
ydl = youtube_dl.YoutubeDL(ydl_opts)
class Video:
    """Container for a YouTube video's metadata and transcript chunks."""

    def __init__(self, url, creator, title, transcript, parts, ytgenerated=False):
        """Store video metadata.

        Args:
            url: original YouTube URL.
            creator: channel/uploader name.
            title: video title.
            transcript: full transcript text (or a placeholder message).
            parts: transcript split into word-count-limited chunks.
            ytgenerated: True when subtitles were found and parsed.
                Kept as a keyword with the old default (False) so existing
                callers that assign it after construction still work.
        """
        self.url = url
        self.creator = creator
        self.title = title
        self.transcript = transcript
        self.parts = parts
        self.ytgenerated = ytgenerated

    def __repr__(self):
        return f"Video(url={self.url!r}, title={self.title!r}, ytgenerated={self.ytgenerated})"
def GetVideo(url):
    """Download subtitles and metadata for *url* and build a Video.

    Uses the module-level `ydl` downloader: first downloads the subtitle/
    thumbnail files (skip_download=True suppresses the media itself), then
    extracts metadata without re-downloading.

    Returns:
        A Video with ytgenerated=True and the transcript split into chunks
        of at most MAXWORDCOUNT words, or — when no English .vtt file was
        produced — a Video with a placeholder transcript, empty parts, and
        ytgenerated left False.
    """
    ydl.download([url])
    info = ydl.extract_info(url, download=False)
    creator = info['uploader']
    title = info['title']
    # Renamed from `id`: don't shadow the builtin.
    video_id = info['id']

    sub_fn = "output/" + video_id + "." + LANG + ".vtt"
    if not os.path.exists(sub_fn):
        return Video(url, creator, title, "This video doesn't have available subtitles :-/", [])

    # Pull cue text out of the VTT file: keep lines that contain no
    # timestamp/tag markers (">" or ":"), are non-blank, and are immediately
    # followed by a blank line (the end of a cue).
    txt = ""
    with open(sub_fn, "r") as f:
        lines = f.readlines()
    for i in range(len(lines) - 1):
        line = lines[i]
        if ">" not in line and ":" not in line and line.strip() != "" and lines[i + 1].strip() == "":
            txt += line
    txt = txt.replace("\n", " ")

    # Split the transcript into chunks of at most MAXWORDCOUNT words each.
    # (join + trailing space reproduces the original word-by-word concat;
    # the unused `blocks` counter from the original version is dropped.)
    words = txt.split()
    parts = []
    for start in range(0, len(words), MAXWORDCOUNT):
        parts.append(" ".join(words[start:start + MAXWORDCOUNT]) + " ")

    video = Video(url, creator, title, txt, parts)
    video.ytgenerated = True
    return video

17
translate.py Normal file
View File

@@ -0,0 +1,17 @@
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
# Supported translation directions -> Hugging Face model ids.
models = {
"en_ru": "Helsinki-NLP/opus-mt-en-ru",
"ru_en": "Helsinki-NLP/opus-mt-ru-en",
# Add more models as needed
}
# NOTE: every tokenizer and model is loaded eagerly at import time — startup
# is slow (and may download weights on first run), but per-request latency
# stays low.
tokenizers = {lang: AutoTokenizer.from_pretrained(model) for lang, model in models.items()}
translation_models = {lang: AutoModelForSeq2SeqLM.from_pretrained(model) for lang, model in models.items()}
def translate(text, lang, max_new_tokens=200):
    """Translate *text* using the model registered for *lang*.

    Args:
        text: source text to translate.
        lang: key into the module-level `models`/`tokenizers` registries
            (e.g. "en_ru"); raises KeyError for an unknown direction.
        max_new_tokens: cap on generated tokens. Previously hard-coded to
            200; exposed as a keyword (same default) so callers can
            translate longer texts without truncated output.

    Returns:
        The decoded translation (first sequence), special tokens stripped.
    """
    tokenizer = tokenizers[lang]
    model = translation_models[lang]
    inputs = tokenizer(text, return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)
    return tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]