"""Print a video's subtitles as numbered transcript parts.

Usage:
    python <this script> <video-url> <subtitle-language>

The subtitles (manual or automatic) are fetched with youtube_dl as a WebVTT
file under ``output/``, reduced to plain caption words, split into parts of at
most ``MAX_WORDS_PER_PART`` words, and each part is printed to stdout with a
"This is part i/n ..." header line.
"""
import os
import sys

try:
    import youtube_dl
except ImportError:
    # The text helpers below do not need youtube_dl, so a missing install is
    # reported only when a download is actually attempted (see
    # download_subtitles) instead of failing at import time.
    youtube_dl = None

OUTPUT_DIR = "output"
MAX_WORDS_PER_PART = 1000


def extract_caption_words(lines):
    """Return the caption words contained in the lines of a WebVTT file.

    A line is kept only when it is non-blank, contains neither ``>`` nor
    ``:``, and is immediately followed by a blank line.  That filter drops
    timing lines (``-->``), ``Key: value`` header lines and lines carrying
    inline ``<...>`` tags, and keeps only the last text line of each
    blank-line-terminated group.  The final line of the input is never kept
    because nothing follows it.

    NOTE(review): caption text that itself contains ``:`` or ``>`` is dropped
    by this filter as well; that matches the original behaviour.

    Args:
        lines: Sequence of lines (with or without trailing newlines).

    Returns:
        A flat list of whitespace-separated words, in file order.
    """
    # The "WEBVTT" signature line is file metadata, never caption text; when
    # it is directly followed by a blank line the filter below would keep it.
    if lines and lines[0].lstrip("\ufeff").startswith("WEBVTT"):
        lines = lines[1:]

    words = []
    for line, following in zip(lines, lines[1:]):
        if ">" in line or ":" in line:
            continue
        if line.strip() and not following.strip():
            words.extend(line.split())
    return words


def chunk_words(words, max_words=MAX_WORDS_PER_PART):
    """Join *words* into space-separated strings of at most *max_words* words.

    Args:
        words: Sequence of words.
        max_words: Maximum number of words per returned string (>= 1).

    Returns:
        A list of strings; empty when *words* is empty.

    Raises:
        ValueError: If *max_words* is smaller than 1.
    """
    if max_words < 1:
        raise ValueError("max_words must be at least 1")
    return [
        " ".join(words[start:start + max_words])
        for start in range(0, len(words), max_words)
    ]


def format_parts(title, chunks):
    """Return one printable message per chunk, each prefixed with "part i/n"."""
    total = len(chunks)
    return [
        f'This is part {number}/{total} of the transcript of a video named: '
        f'"{title}": \n{chunk}'
        for number, chunk in enumerate(chunks, start=1)
    ]


def download_subtitles(url, lang):
    """Download the *lang* subtitles of *url* as WebVTT, skipping the video.

    Args:
        url: Video URL understood by youtube_dl.
        lang: Subtitle language code, e.g. ``"en"``.

    Returns:
        A ``(title, subtitle_path)`` tuple.  The path is where the subtitle
        file is expected given the output template; it is not checked for
        existence here.
    """
    if youtube_dl is None:
        raise SystemExit(
            "youtube_dl is not installed; install it with `pip install youtube_dl`"
        )

    ydl_opts = {
        'writesubtitles': True,
        'writeautomaticsub': True,
        'subtitlesformat': 'vtt',
        'subtitleslangs': [lang],
        'outtmpl': os.path.join(OUTPUT_DIR, '%(id)s'),
        'skip_download': True,
        'writethumbnail': False,
        'quiet': True,
    }
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        # One call both writes the subtitle file and returns the metadata, so
        # the video page is only extracted once.
        info = ydl.extract_info(url, download=True)

    subtitle_path = os.path.join(OUTPUT_DIR, f"{info['id']}.{lang}.vtt")
    return info['title'], subtitle_path


def main(argv=None):
    """Run the command-line tool and return a process exit status.

    Args:
        argv: Argument list without the program name; defaults to
            ``sys.argv[1:]``.  Arguments beyond the first two are ignored.

    Returns:
        0 on success, 1 when the subtitle file was not produced, 2 on a
        usage error.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    if len(args) < 2:
        program = os.path.basename(sys.argv[0])
        print(f"usage: {program} <video-url> <subtitle-language>", file=sys.stderr)
        return 2
    url, lang = args[0], args[1]

    title, subtitle_path = download_subtitles(url, lang)

    try:
        # WebVTT is UTF-8 (optionally with a BOM), independent of the
        # platform's default encoding.
        with open(subtitle_path, encoding="utf-8-sig") as subtitle_file:
            lines = subtitle_file.readlines()
    except FileNotFoundError:
        print(
            f'No "{lang}" subtitles were written for this video '
            f"(expected {subtitle_path}).",
            file=sys.stderr,
        )
        return 1

    chunks = chunk_words(extract_caption_words(lines))
    for message in format_parts(title, chunks):
        print(message)
    return 0


if __name__ == "__main__":
    sys.exit(main())