mirror of https://git.sr.ht/~cadence/NewLeaf
Support auto-generated captions
The caption extraction is now entirely in our own hands.
This commit is contained in:
parent
aaf7d65b32
commit
1d52fca3a0
|
@@ -5,7 +5,10 @@ from urllib.parse import urlencode
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
|
|
||||||
def extract_captions(id, **kwargs):
|
def extract_captions(id, **kwargs):
|
||||||
captions = extract_captions_from_api(id)
|
if "label" in kwargs and "auto-generated" in kwargs["label"]:
|
||||||
|
captions = extract_captions_from_video(id)
|
||||||
|
else:
|
||||||
|
captions = extract_captions_from_api(id)
|
||||||
return extract_captions_from_dict(captions, **kwargs)
|
return extract_captions_from_dict(captions, **kwargs)
|
||||||
|
|
||||||
# Return captions for the language specified,
|
# Return captions for the language specified,
|
||||||
|
@@ -19,15 +22,9 @@ def extract_captions_from_dict(captions, *, lang=None, label=None):
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return r
|
return r
|
||||||
|
|
||||||
# Currently unused in favour of extract_captions_from_api.
|
# List of captions directly from youtube, but no automatic
|
||||||
def extract_captions_from_video(id):
|
|
||||||
return {
|
|
||||||
"captions": extract_video(id)["captions"]
|
|
||||||
}
|
|
||||||
|
|
||||||
# no automatic captions
|
|
||||||
def extract_captions_from_api(id):
|
def extract_captions_from_api(id):
|
||||||
url = "https://video.google.com/timedtext?hl=en&type=list&v=%s" % id
|
url = "https://video.google.com/timedtext?hl=en&type=list&v={}".format(id)
|
||||||
with requests.get(url) as r:
|
with requests.get(url) as r:
|
||||||
if r.status_code == 404:
|
if r.status_code == 404:
|
||||||
return {
|
return {
|
||||||
|
@@ -67,3 +64,9 @@ def extract_captions_from_api(id):
|
||||||
})
|
})
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
# We'll fall back to this function for auto-captions.
|
||||||
|
def extract_captions_from_video(id):
|
||||||
|
return {
|
||||||
|
"captions": extract_video(id)["captions"]
|
||||||
|
}
|
||||||
|
|
|
@@ -10,6 +10,7 @@ from tools.converters import *
|
||||||
from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response
|
from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response
|
||||||
import tools.files as files
|
import tools.files as files
|
||||||
from math import floor
|
from math import floor
|
||||||
|
from urllib.parse import parse_qs, urlparse, urlencode
|
||||||
from cachetools import TTLCache
|
from cachetools import TTLCache
|
||||||
|
|
||||||
video_cache = TTLCache(maxsize=50, ttl=300)
|
video_cache = TTLCache(maxsize=50, ttl=300)
|
||||||
|
@@ -165,22 +166,6 @@ def extract_video(id):
|
||||||
"second__height": format["height"]
|
"second__height": format["height"]
|
||||||
})
|
})
|
||||||
|
|
||||||
if info.get("requested_subtitles"):
|
|
||||||
for language_code, subtitle in info["requested_subtitles"].items():
|
|
||||||
if language_code == "live_chat":
|
|
||||||
continue
|
|
||||||
|
|
||||||
subtitle_url = subtitle["url"]
|
|
||||||
label = get_language_label_from_url(subtitle_url)
|
|
||||||
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
|
|
||||||
result["captions"].append({
|
|
||||||
"label": label if label != "" else language_code,
|
|
||||||
"languageCode": language_code,
|
|
||||||
"url": subtitle_api_url,
|
|
||||||
"second__subtitleUrl": subtitle_url # Direct YouTube url
|
|
||||||
})
|
|
||||||
|
|
||||||
|
|
||||||
result = get_more_stuff_from_file(info["id"], result)
|
result = get_more_stuff_from_file(info["id"], result)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
@@ -300,6 +285,27 @@ def get_more_stuff_from_file(id, result):
|
||||||
f["qualityLabel"] = label
|
f["qualityLabel"] = label
|
||||||
f["second__order"] = format_order(f)
|
f["second__order"] = format_order(f)
|
||||||
|
|
||||||
|
for track in player_response["captions"]["playerCaptionsTracklistRenderer"]["captionTracks"]:
|
||||||
|
# safely editing the track format by taking apart the url...
|
||||||
|
url = track["baseUrl"]
|
||||||
|
parts = urlparse(url)
|
||||||
|
qs = parse_qs(parts.query)
|
||||||
|
qs["format"] = ["vtt"]
|
||||||
|
qs = urlencode(qs, doseq=True)
|
||||||
|
# ...and putting it back together...
|
||||||
|
parts = parts._replace(query=qs)
|
||||||
|
url = parts.geturl()
|
||||||
|
# now make the caption object
|
||||||
|
label = combine_runs(track["name"])
|
||||||
|
language_code = track["languageCode"]
|
||||||
|
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
|
||||||
|
result["captions"].append({
|
||||||
|
"label": label,
|
||||||
|
"languageCode": language_code,
|
||||||
|
"url": subtitle_api_url,
|
||||||
|
"second__remoteUrl": url
|
||||||
|
})
|
||||||
|
|
||||||
except Exception:
|
except Exception:
|
||||||
print("messed up extracting recommendations.")
|
print("messed up extracting recommendations.")
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
|
|
@@ -217,7 +217,7 @@ def get_subtitle_api_url(id, label, language_code):
|
||||||
subtitle_api_url = "/api/v1/captions/{}?".format(id)
|
subtitle_api_url = "/api/v1/captions/{}?".format(id)
|
||||||
params = {}
|
params = {}
|
||||||
|
|
||||||
if label:
|
if label and "auto-generated" in label:
|
||||||
params["label"] = label
|
params["label"] = label
|
||||||
else:
|
else:
|
||||||
params["lang"] = language_code
|
params["lang"] = language_code
|
||||||
|
|
Loading…
Reference in New Issue