tldw translate and complete added
This commit is contained in:
parent
1b7a1b3962
commit
32cddd0b1e
61
main.py
61
main.py
|
@ -2,7 +2,10 @@ from flask import Flask, request, jsonify
|
|||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
from langchain.chat_models import ChatOpenAI
|
||||
from langchain.schema import HumanMessage
|
||||
from langchain.schema import HumanMessage, SystemMessage
|
||||
from translate import translate, models
|
||||
from tldw import GetVideo
|
||||
import os
|
||||
|
||||
app = Flask(__name__)
|
||||
# set fixed port for app
|
||||
|
@ -28,6 +31,54 @@ def openai():
|
|||
'body': resp.content
|
||||
}
|
||||
|
||||
@app.route('/tldw', methods=['POST'])
def tldw():
    """Summarize a YouTube video from its subtitles.

    Expects a JSON body ``{"url": "<youtube url>"}``.  Downloads the
    video's subtitles, summarizes each transcript chunk with the LLM,
    then asks the LLM for hashtags over the combined summary.

    Returns a dict with ``statusCode`` and a ``body`` containing the
    video metadata, transcript, summary, and hashtags.  Responds 400 on
    a non-JSON request or a missing ``url`` field.
    """
    content_type = request.headers.get('Content-Type')
    if content_type == 'application/json':
        json_payload = request.json
        url = json_payload.get('url')
        if not url:
            # Previously a missing key raised KeyError -> HTTP 500.
            return jsonify({'error': 'Missing "url" field'}), 400
    else:
        return jsonify({'error': 'Invalid content type'}), 400

    video = GetVideo(url)
    print("received request for " + url)

    if not video.ytgenerated:
        # No subtitles available: return the stub transcript without
        # spending any LLM calls.
        return {
            'statusCode': 200,
            'body': {
                'url': video.url,
                'creator': video.creator,
                'title': video.title,
                'transcript': video.transcript,
                'summary': 'This video doesn\'t have available subtitles :-/'
            }
        }

    temperature = 0.9
    model_name = 'gpt-3.5-turbo'
    summary = ""
    llm = ChatOpenAI(temperature=temperature, model_name=model_name)

    system = "You are a professional note taker. User will provide a part of the transcript of a Youtube video. You will reply by listing bullet points with 1 simple sentence each time."

    for part in video.parts:
        # System message must come FIRST so the model treats it as
        # instructions rather than as part of the user transcript.
        resp = llm([SystemMessage(content=system), HumanMessage(content=part)])
        summary += resp.content + "\n"

    hashtagerSystem = "You are a professional note taker. User will provide bullet points from the transcript of a video. You will reply by suggesting simple short hashtags"

    hashtags = llm([SystemMessage(content=hashtagerSystem), HumanMessage(content=summary)])
    return {
        'statusCode': 200,
        'body': {
            'url': video.url,
            'creator': video.creator,
            'title': video.title,
            'transcript': video.transcript,
            'summary': summary,
            'hashtags': hashtags.content
        }
    }
|
||||
|
||||
"""
|
||||
curl test:
|
||||
curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
|
||||
|
@ -67,7 +118,15 @@ def home():
|
|||
</html>
|
||||
"""
|
||||
|
||||
@app.route('/translate/<string:lang>', methods=['POST'])
def translate_text(lang):
    """Translate JSON-supplied text with the model registered for *lang*.

    Expects a JSON body ``{"text": "..."}``; *lang* must be a key of the
    ``models`` mapping (e.g. ``en_ru``).  Returns 400 for an unknown
    language pair or a missing ``text`` field, otherwise a JSON object
    with the ``translated_text``.
    """
    if lang not in models:
        return jsonify(error=f"No model for language: {lang}"), 400

    content = request.json
    if not content or 'text' not in content:
        # Previously a missing field raised KeyError -> HTTP 500.
        return jsonify(error="Missing 'text' field"), 400
    translated_text = translate(content['text'], lang)
    return jsonify(translated_text=translated_text)
|
||||
|
||||
# Run the Flask development server when this module is executed directly
# (debug mode enables the reloader and interactive traceback page —
# development only, never production).
if __name__ == '__main__':
    app.run(debug=True)
|
||||
|
|
|
@ -0,0 +1,66 @@
|
|||
import yt_dlp as youtube_dl
|
||||
import os
|
||||
|
||||
# Maximum number of words per transcript chunk handed to the summarizer.
MAXWORDCOUNT = 2000
# Subtitle language code used both for download and for locating the
# resulting .vtt file on disk.
LANG = "en"

# yt-dlp options: fetch manual and auto-generated English subtitles (VTT)
# plus the thumbnail, skip the actual media download, write everything to
# output/<video id>.*, and keep console output quiet.
ydl_opts = {
    'writesubtitles': True,
    'writeautomaticsub': True,
    'subtitlesformat': 'vtt',
    'subtitleslangs': ['en'],
    'outtmpl': 'output/%(id)s',
    'skip_download': True,
    'writethumbnail': True,
    'progress' : False,
    'noprogress': True,
    'quiet': True
}

# Shared downloader instance reused by every GetVideo() call.
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||
|
||||
class Video:
    """Container for a YouTube video's metadata and transcript.

    ``ytgenerated`` starts False and is flipped to True by the caller
    (``GetVideo``) only once subtitles were successfully found and parsed.
    """

    def __init__(self, url: str, creator: str, title: str,
                 transcript: str, parts: list):
        self.url = url                  # original video URL
        self.creator = creator          # uploader name
        self.title = title              # video title
        self.transcript = transcript    # full caption text, or a fallback message
        self.parts = parts              # transcript split into word-limited chunks
        # False until subtitles are confirmed available and parsed.
        self.ytgenerated = False

    def __repr__(self) -> str:
        return (f"Video(url={self.url!r}, title={self.title!r}, "
                f"ytgenerated={self.ytgenerated})")
|
||||
|
||||
def GetVideo(url):
    """Fetch subtitles and metadata for *url* and return a Video.

    Downloads the video's .vtt subtitles into output/, extracts the plain
    caption text, and splits it into MAXWORDCOUNT-word chunks for the
    summarizer.  If no subtitle file was produced, returns a stub Video
    whose ``ytgenerated`` flag stays False.
    """
    # Writes the subtitle file (and thumbnail) to output/; media itself is
    # skipped via the 'skip_download' option.
    ydl.download([url])
    info = ydl.extract_info(url, download=False)
    creator = info['uploader']
    title = info['title']
    video_id = info['id']  # renamed from `id` to avoid shadowing the builtin

    sub_path = "output/" + video_id + "." + LANG + ".vtt"
    if not os.path.exists(sub_path):
        # No subtitles were produced: return a stub (ytgenerated stays False).
        return Video(url, creator, title,
                     "This video doesn't have available subtitles :-/", [])

    transcript = _read_vtt_text(sub_path)
    parts = _chunk_words(transcript, MAXWORDCOUNT)

    video = Video(url, creator, title, transcript, parts)
    video.ytgenerated = True
    return video


def _read_vtt_text(path):
    """Extract plain caption text from a .vtt subtitle file at *path*.

    Keeps a line only when it looks like caption text in the auto-generated
    VTT layout: it contains neither ">" nor ":" (ruling out timestamps and
    cue markup), is non-blank, and is immediately followed by a blank line
    (the cue terminator).  Newlines are flattened to spaces.
    """
    txt = ""
    with open(path, "r") as f:
        lines = f.readlines()

    for i in range(len(lines)):
        if (i < len(lines) - 1 and ">" not in lines[i] and ":" not in lines[i]
                and lines[i].strip() != "" and lines[i + 1].strip() == ""):
            txt += lines[i]
    return txt.replace("\n", " ")


def _chunk_words(text, max_words):
    """Split *text* into chunks of at most *max_words* whitespace-separated
    words.  Each chunk keeps a trailing space (matching the original
    concatenation behavior); an empty *text* yields an empty list."""
    chunks = []
    current = ""
    count = 0
    for word in text.split():
        count += 1
        current += word + " "
        if count == max_words:
            chunks.append(current)
            count = 0
            current = ""
    if current:
        chunks.append(current)
    return chunks
|
|
@ -0,0 +1,17 @@
|
|||
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
||||
|
||||
# Mapping of language-pair keys ("src_dst") to Hugging Face model names.
# Route handlers use membership in this dict to validate the `lang` path
# parameter.
models = {
    "en_ru": "Helsinki-NLP/opus-mt-en-ru",
    "ru_en": "Helsinki-NLP/opus-mt-ru-en",
    # Add more models as needed
}

# Eagerly load one tokenizer and one model per language pair at import time.
# NOTE(review): this downloads/loads every model when the module is first
# imported, which is slow and memory-heavy — confirm this is intended.
tokenizers = {lang: AutoTokenizer.from_pretrained(model) for lang, model in models.items()}
translation_models = {lang: AutoModelForSeq2SeqLM.from_pretrained(model) for lang, model in models.items()}
|
||||
|
||||
def translate(text, lang):
    """Translate *text* with the preloaded model for the *lang* pair.

    *lang* must be a key of the module-level ``tokenizers`` /
    ``translation_models`` maps (e.g. ``"en_ru"``).  Generation is capped
    at 200 new tokens; returns the decoded translation string.
    """
    selected_tokenizer = tokenizers[lang]
    selected_model = translation_models[lang]

    encoded = selected_tokenizer(text, return_tensors="pt")
    generated = selected_model.generate(**encoded, max_new_tokens=200)
    decoded = selected_tokenizer.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
|
Loading…
Reference in New Issue