-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtranscript_api.py
67 lines (59 loc) · 1.89 KB
/
transcript_api.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import json
import requests
def parse_player_info(video_id, timeout=10):
    """Fetch the player metadata for *video_id* from the youtubei player endpoint.

    Sends a POST to ``https://www.youtube-nocookie.com/youtubei/v1/player``
    identifying itself as the ``WEB_EMBEDDED_PLAYER`` client.

    Args:
        video_id: The video identifier placed in the request's ``videoId`` field.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` would block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response. The HTTP status is not checked,
        so an error response that carries a JSON body is returned as-is.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON (raised by
            ``response.json()``).
    """
    player_params = {"key": "AIzaSyAO_FJ2SlqU8Q4STEHLGCilw_Y9_11qcW8"}
    payload = {
        "videoId": video_id,
        "context": {
            "client": {
                "clientName": "WEB_EMBEDDED_PLAYER",
                "clientVersion": "1.20211019.01.00",
            },
        },
    }
    response = requests.post(
        "https://www.youtube-nocookie.com/youtubei/v1/player",
        params=player_params,
        # `json=` serialises the payload and sets the JSON Content-Type header.
        json=payload,
        timeout=timeout,
    )
    return response.json()
def _words_from_events(events):
    """Flatten json3 caption events into ``{"word", "start_ts"}`` dicts."""
    words = []
    for event in events:
        # Events without "segs" carry no text and are skipped.
        if "segs" not in event:
            continue
        start_ms = event["tStartMs"]
        for seg in event["segs"]:
            # "tOffsetMs" is optional; when absent the segment starts with the event.
            seg_ms = start_ms + seg.get("tOffsetMs", 0)
            words.append({"word": seg["utf8"], "start_ts": seg_ms / 1000})
    return words


def get_transcript(video_id, timeout=10):
    """Return the timed caption segments of a video's first ``"asr"`` track.

    Looks up the caption tracks via ``parse_player_info``, picks the first
    track whose ``kind`` is ``"asr"`` (presumably the auto-generated
    speech-recognition captions), downloads it in ``json3`` format and
    flattens it into one entry per segment. If the track's language code is
    not ``"en"``, ``&tlang=en`` is appended to the URL (presumably requesting
    an English translation).

    Args:
        video_id: The video identifier passed to ``parse_player_info``.
        timeout: Seconds to wait for the caption-track download. The
            player-info lookup is performed by ``parse_player_info`` and is
            not affected by this value.

    Returns:
        A list of ``{"word": <segment text>, "start_ts": <start time>}`` dicts,
        where ``start_ts`` is the millisecond timestamp divided by 1000. The
        list is empty when the response has no caption information, no
        ``"asr"`` track, or no events.

    Raises:
        requests.exceptions.RequestException: On connection failure or timeout
            while downloading the caption track.
    """
    from urllib.parse import urljoin

    player_info = parse_player_info(video_id)

    # Any of these levels may be missing; treat that as "no captions"
    # rather than raising KeyError.
    captions = player_info.get("captions") or {}
    renderer = captions.get("playerCaptionsTracklistRenderer") or {}
    caption_tracks = renderer.get("captionTracks") or []

    asr_track = next(
        (track for track in caption_tracks if track.get("kind") == "asr"),
        None,
    )
    if asr_track is None:
        return []

    # urljoin keeps an already-absolute baseUrl intact and resolves a relative
    # one against the site root; plain concatenation breaks on absolute URLs.
    json3_url = urljoin("https://www.youtube.com", asr_track["baseUrl"]) + "&fmt=json3"
    if asr_track.get("languageCode") != "en":
        json3_url += "&tlang=en"

    response = requests.get(json3_url, timeout=timeout)
    return _words_from_events(response.json().get("events") or [])
if __name__ == "__main__":
    # API limit test: request the same transcript 1,000 times from 4 worker
    # processes.
    import multiprocessing

    video_id = "-wJOUAuKZm8"
    video_ids = [video_id] * 1_000
    # The context manager shuts the worker processes down on exit; the pool
    # was previously never closed. `map` blocks until every task has finished,
    # so no work is cut short.
    with multiprocessing.Pool(processes=4) as pool:
        results = pool.map(get_transcript, video_ids)