tldw and translate endpoints added and completed
This commit is contained in:
parent
1b7a1b3962
commit
32cddd0b1e
61
main.py
61
main.py
|
@ -2,7 +2,10 @@ from flask import Flask, request, jsonify
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
from langchain.chat_models import ChatOpenAI
|
from langchain.chat_models import ChatOpenAI
|
||||||
from langchain.schema import HumanMessage
|
from langchain.schema import HumanMessage, SystemMessage
|
||||||
|
from translate import translate, models
|
||||||
|
from tldw import GetVideo
|
||||||
|
import os
|
||||||
|
|
||||||
app = Flask(__name__)
|
app = Flask(__name__)
|
||||||
# set fixed port for app
|
# set fixed port for app
|
||||||
|
@ -28,6 +31,54 @@ def openai():
|
||||||
'body': resp.content
|
'body': resp.content
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@app.route('/tldw', methods=['POST'])
def tldw():
    """Summarize a YouTube video.

    Expects a JSON body {"url": "<youtube url>"}; fetches the transcript via
    GetVideo, asks the chat model for bullet-point notes per transcript chunk,
    then derives hashtags from the combined summary.

    Returns a dict with 'statusCode' and a 'body' describing the video, or a
    (jsonify, 400) pair on a bad request.
    """
    # Accept "application/json" with optional parameters (e.g. "; charset=utf-8"),
    # which a strict equality check would wrongly reject.
    content_type = request.headers.get('Content-Type') or ''
    if not content_type.startswith('application/json'):
        return jsonify({'error': 'Invalid content type'}), 400

    json_payload = request.json
    url = json_payload.get('url')
    if not url:
        # A missing key previously raised KeyError -> HTTP 500; answer 400 instead.
        return jsonify({'error': 'Missing "url" field'}), 400

    video = GetVideo(url)
    print("received request for " + url)

    if not video.ytgenerated:
        # No subtitles were found; return metadata with a placeholder summary.
        return _video_response(video, 'This video doesn\'t have available subtitles :-/')

    temperature = 0.9
    model_name = 'gpt-3.5-turbo'
    llm = ChatOpenAI(temperature = temperature, model_name = model_name)

    system = "You are a professional note taker. User will provide a part of the transcript of a Youtube video. You will reply by listing bullet points with 1 simple sentence each time."

    # Summarize each bounded transcript chunk independently, then concatenate.
    summary = ""
    for part in video.parts:
        resp = llm([HumanMessage(content=part), SystemMessage(content=system)])
        summary += resp.content + "\n"

    hashtagerSystem = "You are a professional note taker. User will provide bullet points from the transcript of a video. You will reply by suggesting simple short hashtags"
    hashtags = llm([HumanMessage(content=summary), SystemMessage(content=hashtagerSystem)])

    return _video_response(video, summary, hashtags.content)


def _video_response(video, summary, hashtags=None):
    """Build the /tldw response dict shared by both the subtitle-less and
    fully summarized code paths."""
    body = {
        'url': video.url,
        'creator': video.creator,
        'title': video.title,
        'transcript': video.transcript,
        'summary': summary
    }
    if hashtags is not None:
        body['hashtags'] = hashtags
    return {
        'statusCode': 200,
        'body': body
    }
|
||||||
|
|
||||||
"""
|
"""
|
||||||
curl test:
|
curl test:
|
||||||
curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
|
curl -XPOST --header "Content-Type: application/json" -d "{\"prompt\":\"What is the best way to learn a language?\"}" http://localhost:5000/openai
|
||||||
|
@ -67,7 +118,15 @@ def home():
|
||||||
</html>
|
</html>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@app.route('/translate/<string:lang>', methods=['POST'])
def translate_text(lang):
    """Translate posted text using the model pair keyed by *lang* (e.g. "en_ru").

    Expects a JSON body {"text": "..."}; returns {"translated_text": "..."}.
    Responds 400 for an unknown language pair or a missing/invalid body.
    """
    if lang not in models:
        return jsonify(error=f"No model for language: {lang}"), 400

    content = request.json
    # Guard against a missing body or missing "text" key instead of letting a
    # KeyError/TypeError surface as an HTTP 500.
    if not content or 'text' not in content:
        return jsonify(error="Missing 'text' field in JSON body"), 400

    translated_text = translate(content['text'], lang)
    return jsonify(translated_text=translated_text)
||||||
# Script entry point: start Flask's built-in development server.
# NOTE(review): debug=True enables the interactive debugger/reloader — not for production.
if __name__ == '__main__':
    app.run(debug=True)
|
||||||
|
|
|
@ -0,0 +1,66 @@
|
||||||
|
import yt_dlp as youtube_dl
import os

# Maximum words per transcript chunk handed to the LLM in a single prompt.
MAXWORDCOUNT = 2000
# Subtitle language to fetch; forms part of the .vtt filename below.
LANG = "en"

# yt-dlp configuration: fetch only subtitles (manual or auto-generated) and the
# thumbnail — never the media itself — and keep console output quiet.
ydl_opts = {
    'writesubtitles': True,      # prefer uploader-provided subtitles
    'writeautomaticsub': True,   # fall back to auto-generated captions
    'subtitlesformat': 'vtt',
    'subtitleslangs': ['en'],    # NOTE(review): duplicates LANG — keep in sync
    'outtmpl': 'output/%(id)s',  # files land in output/<video id>.*
    'skip_download': True,       # no video/audio download
    'writethumbnail': True,
    'progress' : False,
    'noprogress': True,
    'quiet': True
}

# Single shared downloader instance reused across calls to GetVideo.
ydl = youtube_dl.YoutubeDL(ydl_opts)
|
||||||
|
|
||||||
|
class Video:
    """Container for a fetched YouTube video's metadata and transcript.

    ``ytgenerated`` starts out False; the caller (GetVideo) flips it to True
    once a subtitle transcript has actually been extracted.
    """

    def __init__(self, url, creator, title, transcript, parts):
        # Source metadata.
        self.url, self.creator, self.title = url, creator, title
        # Full transcript text plus its word-count-bounded chunks.
        self.transcript, self.parts = transcript, parts
        # No real transcript until the caller says otherwise.
        self.ytgenerated = False
||||||
|
def GetVideo(url):
    """Fetch subtitles/metadata for *url* via the shared ydl instance and
    return a Video.

    If yt-dlp produced no English .vtt file, the returned Video has
    ytgenerated == False and a placeholder transcript; otherwise it carries
    the cleaned transcript text and its MAXWORDCOUNT-word chunks.
    """
    ydl.download([url])
    info = ydl.extract_info(url, download=False)
    creator = info['uploader']
    title = info['title']
    video_id = info['id']  # renamed from `id` to avoid shadowing the builtin

    subFn = "output/" + video_id + "." + LANG + ".vtt"
    if not os.path.exists(subFn):
        # No (auto-)subtitles were written; return a stub result.
        video = Video(url, creator, title, "This video doesn't have available subtitles :-/", [])
        return video

    txt = _extract_text(subFn)
    sList = _chunk_words(txt, MAXWORDCOUNT)

    video = Video(url, creator, title, txt, sList)
    video.ytgenerated = True
    return video


def _extract_text(subFn):
    """Strip VTT timing/markup lines from *subFn*, returning caption text
    joined by spaces."""
    txt = ""
    with open(subFn, "r") as f:
        lines = f.readlines()
    for i in range(len(lines)):
        # Heuristic: a caption line contains no cue markup (">", ":"), is
        # non-blank, and is the last non-blank line of its cue block.
        if i < len(lines) - 1 and ">" not in lines[i] and ":" not in lines[i] \
                and lines[i].strip() != "" and lines[i + 1].strip() == "":
            txt += lines[i]
    return txt.replace("\n", " ")


def _chunk_words(txt, max_words):
    """Split *txt* into strings of at most *max_words* whitespace-separated
    words (each word followed by a single trailing space)."""
    sList = []
    currentString = ""
    wCount = 0
    for w in txt.split():
        wCount += 1
        currentString += w + " "
        if wCount == max_words:
            sList.append(currentString)
            wCount = 0
            currentString = ""
    if currentString:
        sList.append(currentString)
    return sList
|
@ -0,0 +1,17 @@
|
||||||
|
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Language-pair key ("src_dst") -> Hugging Face model id (Helsinki-NLP MarianMT).
models = {
    "en_ru": "Helsinki-NLP/opus-mt-en-ru",
    "ru_en": "Helsinki-NLP/opus-mt-ru-en",
    # Add more models as needed
}

# NOTE: every tokenizer and model is downloaded/loaded eagerly at import time —
# first import is slow (and needs network for uncached models), but request
# handling stays fast.
tokenizers = {lang: AutoTokenizer.from_pretrained(model) for lang, model in models.items()}
translation_models = {lang: AutoModelForSeq2SeqLM.from_pretrained(model) for lang, model in models.items()}
||||||
|
def translate(text, lang):
    """Translate *text* with the preloaded model for the *lang* pair.

    Output is capped at 200 generated tokens, so very long inputs may come
    back truncated. Assumes *lang* is a valid key of ``models`` — callers
    are expected to validate first.
    """
    tok = tokenizers[lang]
    encoded = tok(text, return_tensors="pt")
    generated = translation_models[lang].generate(**encoded, max_new_tokens=200)
    decoded = tok.batch_decode(generated, skip_special_tokens=True)
    return decoded[0]
|
Loading…
Reference in New Issue