Implement captions

Automatic subtitles are not supported, because youtube_dlc does not
provide them.
This commit is contained in:
bopol 2021-01-17 23:59:14 +01:00 committed by Cadence Ember
parent 985f0c1c32
commit 6709aa30c2
No known key found for this signature in database
GPG Key ID: BC1C2C61CF521B17
5 changed files with 126 additions and 2 deletions

View File

@ -13,6 +13,7 @@ These endpoints are somewhat implemented:
- `/api/v1/channels/{part}/{ucid}`
- `/api/v1/search?q={search}`
- `/api/v1/search/suggestions?q={search}`
- `/api/v1/captions/{id}`
- `/vi/{id}/{file}`
- `/api/manifest/dash/id/{id}`

73
extractors/captions.py Normal file
View File

@ -0,0 +1,73 @@
import requests
from extractors.video import extract_video
from tools.converters import escape_html_textcontent, get_subtitle_api_url
from urllib.parse import urlencode
import xml.etree.ElementTree as ET
def extract_captions(id, **kwargs):
    """Look up the caption tracks of a video and answer a captions request.

    The track list for ``id`` is obtained from extract_captions_from_api and
    handed, together with the keyword arguments, to
    extract_captions_from_dict, whose return value is passed through.
    """
    return extract_captions_from_dict(extract_captions_from_api(id), **kwargs)
# Return the captions for the specified language (or label),
# or the full captions list if neither is specified.
def extract_captions_from_dict(captions, **kwargs):
    """Select one caption track from a captions dict and download it.

    Args:
        captions: A dict with a "captions" list. Each entry is read for its
            "languageCode", "label" and "second__subtitleUrl" keys.
        **kwargs: "lang" selects a track by language code; otherwise "label"
            selects a track by label. "lang" takes precedence when both are
            given.

    Returns:
        ``captions`` itself when neither "lang" nor "label" was passed.
        Otherwise the body of the first matching track's subtitle URL,
        decoded as UTF-8, or None when no track matches.

    Raises:
        requests.HTTPError: If downloading the matching track fails.
    """
    if "lang" in kwargs:
        wanted_lang, wanted_label = kwargs["lang"], None
    elif "label" in kwargs:
        wanted_lang, wanted_label = None, kwargs["label"]
    else:
        # No selector given: the caller wants the track list itself.
        return captions

    for track in captions["captions"]:
        if wanted_lang != track["languageCode"] and wanted_label != track["label"]:
            continue
        with requests.get(track["second__subtitleUrl"]) as response:
            response.raise_for_status()
            return response.content.decode("utf8")
# Currently unused in favour of extract_captions_from_api.
def extract_captions_from_video(id):
    """Return the captions of a video as extracted by extract_video.

    The result is a dict with a single "captions" key holding the
    "captions" value of the extract_video result for ``id``.
    """
    video = extract_video(id)
    return {"captions": video["captions"]}
# no automatic captions
def extract_captions_from_api(id):
    """Fetch the list of caption tracks for a video from the timedtext API.

    Requests the track list XML for ``id`` and builds one entry for every
    ``track`` element found directly under the document root.

    Args:
        id: The video ID. It is URL-encoded before being placed in the
            query string.

    Returns:
        A dict of the form ``{"captions": [...]}``. Each entry has the keys
        "label", "languageCode", "url" (built by get_subtitle_api_url) and
        "second__subtitleUrl" (a direct YouTube timedtext URL requesting the
        "vtt" format).

    Raises:
        requests.HTTPError: If the track list request returns an error status.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
        KeyError: If a track has no "lang_code" attribute.
    """
    # Encode the query instead of interpolating `id` directly, so characters
    # such as "&" or "#" in the ID cannot alter the request.
    list_params = urlencode({
        "hl": "en",
        "type": "list",
        "v": id
    })
    url = "https://video.google.com/timedtext?" + list_params
    with requests.get(url) as r:
        r.raise_for_status()
        transcript = ET.fromstring(r.content.decode("utf8"))

    captions = []
    for track in transcript.findall("track"):
        language_code = track.attrib["lang_code"]
        # Only a missing "name" attribute falls back to the language code
        # here; a name that is present but empty stays "" and is handled
        # below and by get_subtitle_api_url.
        label = track.get("name", default=language_code)
        subtitle_api_url = get_subtitle_api_url(id, label, language_code)
        params = urlencode({
            "lang": language_code,
            "v": id,
            "fmt": "vtt",
            "name": label
        })
        subtitle_url = "https://www.youtube.com/api/timedtext?" + params
        captions.append({
            "label": label if label != "" else language_code,
            "languageCode": language_code,
            "url": subtitle_api_url,
            "second__subtitleUrl": subtitle_url
        })
    return {
        "captions": captions
    }

View File

@ -19,7 +19,9 @@ ytdl_opts = {
"playlist_items": "1-100",
"extract_flat": "in_playlist",
"write_pages": True,
"source_address": "0.0.0.0"
"source_address": "0.0.0.0",
"writesubtitles": True,
"allsubtitles": True,
}
ytdl = youtube_dlc.YoutubeDL(ytdl_opts)
@ -172,6 +174,23 @@ def extract_video(id):
"second__height": format["height"]
})
if "requested_subtitles" in info and info["requested_subtitles"]:
for language_code, subtitle in info["requested_subtitles"].items():
if language_code != "live_chat":
subtitle_url = subtitle["url"]
label = get_language_label_from_url(subtitle_url)
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
result["captions"].append({
"label": label if label != "" else language_code,
"languageCode": language_code,
"url": subtitle_api_url,
"second__subtitleUrl": subtitle_url # Direct YouTube url
})
result = get_more_stuff_from_file(info["id"], result)
return result

View File

@ -7,6 +7,7 @@ from extractors.channel import extract_channel, extract_channel_videos, extract_
from extractors.manifest import extract_manifest
from extractors.search import extract_search
from extractors.suggestions import extract_search_suggestions
from extractors.captions import extract_captions
@cherrypy.tools.register("before_finalize", priority=60)
def custom_headers():
@ -22,7 +23,8 @@ class Second(object):
endpoints = [
["channels", 1, 2],
["videos", 1, 1],
["search", 0, 1]
["search", 0, 1],
["captions", 1, 1]
]
for e in endpoints:
if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
@ -91,6 +93,17 @@ class Second(object):
def suggestions(self, *, q, **kwargs):
    """Return search suggestions for the query ``q``.

    Any additional keyword arguments are accepted and ignored.
    """
    suggestion_result = extract_search_suggestions(q)
    return suggestion_result
@cherrypy.expose
def captions(self, id, **kwargs):
    """Serve the caption track list, or one caption track, for a video.

    Args:
        id: The video ID.
        **kwargs: Passed through to extract_captions (for example "lang"
            or "label" to select a single track).

    Returns:
        The UTF-8 encoded JSON track list when extract_captions returns a
        dict; otherwise the extract_captions result unchanged, served with
        a text/vtt content type.
    """
    result = extract_captions(id, **kwargs)
    # isinstance rather than an exact type comparison, so dict subclasses
    # are also served as JSON.
    if isinstance(result, dict):
        cherrypy.response.headers["content-type"] = "application/json"
        return bytes(json.dumps(result), "utf8")
    # NOTE(review): result is None when a "lang"/"label" was given but no
    # track matched; that is returned as-is with the text/vtt header.
    cherrypy.response.headers["content-type"] = "text/vtt; charset=UTF-8"
    return result
@cherrypy.expose
def vi(self, id, file):
with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:

View File

@ -2,6 +2,7 @@ import configuration
import datetime
import re
import time
from urllib.parse import urlparse, parse_qs, quote_plus
def length_text_to_seconds(text):
s = text.split(":")
@ -205,3 +206,20 @@ def time_to_past_text(timestamp):
number = diff // unit_value
plural_unit = unit_name if number == 1 else unit_name + "s"
return "{} {} ago".format(number, plural_unit)
def get_language_label_from_url(url_string):
    """Return the first "name" query parameter of a URL, or "" if absent.

    Args:
        url_string: The URL to inspect.

    Returns:
        The first value of the "name" parameter, or an empty string when
        the URL has no such parameter or its value is blank.
    """
    query = urlparse(url_string).query
    # parse_qs drops blank values by default, so "name=" shows up here as a
    # missing key rather than an empty string.
    names = parse_qs(query).get("name")
    if names is None:
        return ""
    return names[0]
def get_subtitle_api_url(id, label, language_code):
    """Build this server's captions API URL for one caption track.

    Args:
        id: The video ID, inserted into the URL path as given.
        label: The track label. An empty string means the track is
            addressed by language code instead.
        language_code: The track's language code.

    Returns:
        ``<website_origin>/api/v1/captions/<id>?`` followed by
        ``lang=<language_code>`` when ``label`` is empty, or
        ``label=<label>`` otherwise, with the value form-encoded.
    """
    base = "{}/api/v1/captions/{}?".format(configuration.website_origin, id)
    if label == "":
        return base + "lang=" + quote_plus(language_code)
    return base + "label=" + quote_plus(label)