-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyoutube.py
73 lines (57 loc) · 2.13 KB
/
youtube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
"""
Usage: youtube.py <youtube_id> [<question>]

Without a question, prints the transcript of the video.
With a question, prints YouTube links to the points in the video where the question is answered.
Idea inspired by Yuvi: https://github.com/yvrjsharma
"""
from youtube_transcript_api import YouTubeTranscriptApi
import sys
# Fetch the captions of the video whose id is the first command-line argument.
# Only the lookup itself sits inside the try: a missing argument or any
# retrieval failure ends the run with the same message as before, while
# Ctrl-C and SystemExit are no longer swallowed.
try:
    # Acceptable caption languages: English, German, French.
    transcript = YouTubeTranscriptApi.get_transcript(sys.argv[1], languages=['en', 'de', 'fr'])
except Exception as err:
    print("No transcipt found")
    # Keep the actual cause visible without changing what goes to stdout.
    print(f"reason: {type(err).__name__}", file=sys.stderr)
    sys.exit()

# Flatten the caption entries into one string, blanking out "[Music]" markers.
transcriptAsString = ' '.join(entry['text'] for entry in transcript).replace('[Music]', ' ')
# Split on single spaces only, so the words re-join to exactly the original
# text when the context windows are built later.
transcriptAsWords = transcriptAsString.split(' ')
tAWSize = len(transcriptAsWords)
# The question is the optional second command-line argument. Without it the
# script only dumps the transcript and stops, so test for it explicitly
# instead of relying on a catch-all exception handler.
if len(sys.argv) < 3:
    print("As no question was asked, we are done,\n", transcriptAsString, "\n bye")
    sys.exit()

question = sys.argv[2]
# str.split(' ') always returns at least one element, so qAWSize is >= 1.
questionAsWords = question.split(' ')
qAWSize = len(questionAsWords)
print("Now loading NLP, be patient")
# Imported here rather than at the top of the file, so runs that exit
# earlier never import these packages.
from transformers import AutoTokenizer, pipeline, AutoModelForQuestionAnswering
import pandas as pd

# Checkpoint used for both the question-answering model and its tokenizer.
# Other checkpoints that can be tried instead:
#   "deepset/minilm-uncased-squad2"
#   "distilbert-base-cased-distilled-squad"
checkPointQA = "deepset/roberta-base-squad2"
print("NLP loaded")
# Cut the transcript into overlapping word windows. Each window is five
# question-lengths long and starts four question-lengths after the previous
# one, so neighbouring windows share one question-length of words.
window_words = 5 * qAWSize
step_words = qAWSize * 4
contexts = [
    ' '.join(transcriptAsWords[start:start + window_words])
    for start in range(0, tAWSize, step_words)
]
# Load the model first and the tokenizer second, both from the same
# checkpoint, then wire them into a question-answering pipeline.
qa_model = AutoModelForQuestionAnswering.from_pretrained(checkPointQA)
qa_tokenizer = AutoTokenizer.from_pretrained(checkPointQA)
pipe = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer)
# Ask the question against every context window and tabulate the results.
# Each result supplies the 'score' and 'answer' fields used below.
answers = pd.DataFrame(
    [pipe(question=question, context=window) for window in contexts]
)
# Keep the answer texts of the two highest-scoring results.
top_scored = answers.sort_values(by='score', ascending=False).head(2)
goodAnswers = top_scored['answer'].values
# Turn the raw caption entries into a table and add a 'good' column,
# initialised to None, that is filled in once the answers are matched.
transcript = pd.DataFrame(transcript).assign(good=None)
def contains(a, b):
    """Return True when at least one non-empty string from *b* occurs in *a*.

    Args:
        a: The text to search in.
        b: An iterable of candidate substrings.

    Returns:
        True if any non-empty element of ``b`` is a substring of ``a``;
        False otherwise, including when ``b`` is empty.
    """
    # An empty string is a substring of every text, so a single empty
    # candidate would report a match everywhere; skip such candidates.
    return any(needle in a for needle in b if needle)
def _mentions_good_answer(row):
    """Tell whether this caption row's text contains one of the selected answers."""
    return contains(row['text'], goodAnswers)


# Flag every caption row whose text contains a selected answer, then print a
# link for each flagged row with its 'start' value (truncated to an integer)
# as the ?t= offset.
transcript['good'] = transcript.apply(_mentions_good_answer, axis=1)
matching_starts = transcript.loc[transcript['good'], 'start'].values
for start_offset in matching_starts:
    print(f"https://youtu.be/{sys.argv[1]}?t={int(start_offset)}")