import os

import yt_dlp as youtube_dl

# Maximum number of words placed in each transcript part.
MAXWORDCOUNT = 2000
# Subtitle language code; used for the yt-dlp request and the subtitle file name.
LANG = "en"
# Directory yt-dlp writes into; GetVideo looks for the subtitle file here.
OUTPUT_DIR = "output"

ydl_opts = {
    'writesubtitles': True,
    'writeautomaticsub': True,
    'subtitlesformat': 'vtt',
    'subtitleslangs': [LANG],
    # GetVideo expects the subtitle file at "<OUTPUT_DIR>/<id>.<LANG>.vtt".
    'outtmpl': OUTPUT_DIR + '/%(id)s',
    'skip_download': True,
    'writethumbnail': True,
    # NOTE(review): 'noprogress' is the documented yt-dlp switch; 'progress'
    # may be ignored by yt-dlp - kept unchanged, confirm before removing.
    'progress': False,
    'noprogress': True,
    'quiet': True,
}

# Shared downloader instance used by GetVideo.
ydl = youtube_dl.YoutubeDL(ydl_opts)


class Video:
    """Metadata and transcript of a single video.

    Attributes:
        url: The URL the video was requested with.
        creator: The uploader name reported by yt-dlp.
        title: The video title.
        transcript: The full transcript text, or a fallback message when no
            subtitle file was found.
        parts: The transcript split into chunks of at most MAXWORDCOUNT words.
        ytgenerated: False by default; GetVideo sets it to True when the
            transcript was read from a downloaded subtitle file.
    """

    def __init__(self, url, creator, title, transcript, parts):
        self.url = url
        self.creator = creator
        self.title = title
        self.transcript = transcript
        self.parts = parts
        self.ytgenerated = False


def _read_caption_text(path):
    """Return the caption text of the VTT file at *path* as one string.

    A line is kept only when it is non-blank, contains neither ">" nor ":",
    and is immediately followed by a blank line. The last line of the file is
    therefore never kept, and caption text that itself contains a colon is
    dropped. Newlines are replaced by spaces, so the result ends with a space
    whenever at least one line was kept.
    """
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with open(path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    kept = [
        line
        for line, following in zip(lines, lines[1:])
        if ">" not in line
        and ":" not in line
        and line.strip()
        and not following.strip()
    ]
    return "".join(kept).replace("\n", " ")


def _split_into_parts(words, max_words):
    """Group *words* into strings of at most *max_words* words each.

    Every word is followed by a single space, so each part ends with a
    trailing space. An empty *words* list yields an empty list.
    """
    if not words:
        return []
    if max_words <= 0:
        # A non-positive limit is never reached, so everything is one part.
        return [" ".join(words) + " "]
    return [
        " ".join(words[start:start + max_words]) + " "
        for start in range(0, len(words), max_words)
    ]


def GetVideo(url):
    """Fetch metadata and subtitles for *url* and return a Video.

    yt-dlp writes the subtitle (and thumbnail) files under OUTPUT_DIR; the
    media itself is skipped via ``skip_download``. When no subtitle file for
    LANG exists afterwards, the returned Video carries a fallback message as
    its transcript, an empty ``parts`` list and ``ytgenerated`` left False.

    Args:
        url: URL of the video to process.

    Returns:
        A Video whose ``parts`` hold the transcript in chunks of at most
        MAXWORDCOUNT words.

    Raises:
        KeyError: If the info returned by yt-dlp lacks 'uploader', 'title'
            or 'id'.
    """
    # One call both writes the subtitle files and returns the metadata,
    # instead of extracting the same URL twice.
    info = ydl.extract_info(url, download=True)
    creator = info['uploader']
    title = info['title']
    video_id = info['id']

    sub_path = OUTPUT_DIR + "/" + video_id + "." + LANG + ".vtt"
    if not os.path.exists(sub_path):
        return Video(url, creator, title, "This video doesn't have available subtitles :-/", [])

    transcript = _read_caption_text(sub_path)
    parts = _split_into_parts(transcript.split(), MAXWORDCOUNT)

    video = Video(url, creator, title, transcript, parts)
    video.ytgenerated = True
    return video