Implement captions
Automatic subtitles are not supported, because youtube_dlc does not provide them.
This commit is contained in:
parent
985f0c1c32
commit
6709aa30c2
@ -13,6 +13,7 @@ These endpoints are somewhat implemented:
|
||||
- `/api/v1/channels/{part}/{ucid}`
|
||||
- `/api/v1/search?q={search}`
|
||||
- `/api/v1/search/suggestions?q={search}`
|
||||
- `/api/v1/captions/{id}`
|
||||
- `/vi/{id}/{file}`
|
||||
- `/api/manifest/dash/id/{id}`
|
||||
|
||||
|
73
extractors/captions.py
Normal file
73
extractors/captions.py
Normal file
@ -0,0 +1,73 @@
|
||||
import requests
|
||||
from extractors.video import extract_video
|
||||
from tools.converters import escape_html_textcontent, get_subtitle_api_url
|
||||
from urllib.parse import urlencode
|
||||
import xml.etree.ElementTree as ET
|
||||
|
||||
def extract_captions(id, **kwargs):
    """Look up the caption tracks of a video and resolve the request against them.

    The track listing is obtained from ``extract_captions_from_api`` and then
    handed, together with any keyword arguments, to
    ``extract_captions_from_dict``.

    Args:
        id: Video id whose captions are requested.
        **kwargs: Forwarded unchanged; ``lang`` or ``label`` selects a single
            track.

    Returns:
        Whatever ``extract_captions_from_dict`` returns for the listing.
    """
    return extract_captions_from_dict(extract_captions_from_api(id), **kwargs)
|
||||
|
||||
# Return captions for the language specified,
|
||||
# The captions list otherwise
|
||||
def extract_captions_from_dict(captions, **kwargs):
    """Resolve a caption request against an already-built captions listing.

    Args:
        captions: Dict with a ``"captions"`` list; each entry is read for
            ``"languageCode"``, ``"label"`` and ``"second__subtitleUrl"``.
        **kwargs: ``lang`` selects a track by language code; otherwise
            ``label`` selects one by label. ``lang`` wins when both are given.

    Returns:
        The ``captions`` argument itself when neither ``lang`` nor ``label``
        is supplied; the downloaded subtitle text (decoded as UTF-8) of the
        first matching track otherwise; ``None`` when no track matches.

    Raises:
        requests.HTTPError: If downloading the matching track fails
            (raised by ``raise_for_status``).
    """
    if "lang" in kwargs:
        wanted_lang, wanted_label = kwargs["lang"], None
    elif "label" in kwargs:
        wanted_lang, wanted_label = None, kwargs["label"]
    else:
        # No selector given: hand the listing back untouched.
        return captions

    for track in captions["captions"]:
        # Skip tracks that match neither the language code nor the label.
        if wanted_lang != track["languageCode"] and wanted_label != track["label"]:
            continue

        with requests.get(track["second__subtitleUrl"]) as response:
            response.raise_for_status()
            return response.content.decode("utf8")

    # Nothing matched the requested language/label.
    return None
|
||||
|
||||
# Currently unused in favour of extract_captions_from_api.
|
||||
def extract_captions_from_video(id):
    """Build a captions listing from the full video extraction.

    Args:
        id: Video id passed through to ``extract_video``.

    Returns:
        A dict with a single ``"captions"`` key holding the ``"captions"``
        entry of the ``extract_video`` result.
    """
    video = extract_video(id)
    return {"captions": video["captions"]}
|
||||
|
||||
# Automatic (auto-generated) captions are not included in this listing.
|
||||
def extract_captions_from_api(id):
    """List the caption tracks of a video via the timedtext list endpoint.

    Args:
        id: Video id. It is URL-encoded before being placed in the query
            string, so reserved characters cannot add or truncate parameters.

    Returns:
        A dict ``{"captions": [...]}`` with one entry per ``<track>`` element
        of the response. Each entry has:
            ``"label"``: the track's ``name`` attribute, or the language code
                when the name is missing or empty;
            ``"languageCode"``: the track's ``lang_code`` attribute;
            ``"url"``: the value of ``get_subtitle_api_url`` for the track;
            ``"second__subtitleUrl"``: the direct YouTube timedtext URL
                requesting the track in VTT format.

    Raises:
        requests.HTTPError: If the listing request fails
            (raised by ``raise_for_status``).
    """
    # Build the query with urlencode instead of "%s" interpolation: the id
    # comes from the request, and a raw "&" or "#" in it would otherwise
    # alter the query sent upstream.
    list_url = "https://video.google.com/timedtext?" + urlencode({
        "hl": "en",
        "type": "list",
        "v": id,
    })

    with requests.get(list_url) as r:
        r.raise_for_status()
        transcript = ET.fromstring(r.content.decode("utf8"))

    captions = []
    for track in transcript.findall("track"):
        language_code = track.attrib["lang_code"]
        label = track.get("name", default=language_code)
        subtitle_api_url = get_subtitle_api_url(id, label, language_code)

        params = urlencode({
            "lang": language_code,
            "v": id,
            "fmt": "vtt",
            "name": label,
        })
        subtitle_url = "https://www.youtube.com/api/timedtext?" + params

        captions.append({
            # An empty name attribute falls back to the language code.
            "label": label if label != "" else language_code,
            "languageCode": language_code,
            "url": subtitle_api_url,
            "second__subtitleUrl": subtitle_url,
        })

    return {"captions": captions}
|
@ -19,7 +19,9 @@ ytdl_opts = {
|
||||
"playlist_items": "1-100",
|
||||
"extract_flat": "in_playlist",
|
||||
"write_pages": True,
|
||||
"source_address": "0.0.0.0"
|
||||
"source_address": "0.0.0.0",
|
||||
"writesubtitles": True,
|
||||
"allsubtitles": True,
|
||||
}
|
||||
ytdl = youtube_dlc.YoutubeDL(ytdl_opts)
|
||||
|
||||
@ -172,6 +174,23 @@ def extract_video(id):
|
||||
"second__height": format["height"]
|
||||
})
|
||||
|
||||
if "requested_subtitles" in info and info["requested_subtitles"]:
|
||||
|
||||
for language_code, subtitle in info["requested_subtitles"].items():
|
||||
|
||||
if language_code != "live_chat":
|
||||
subtitle_url = subtitle["url"]
|
||||
label = get_language_label_from_url(subtitle_url)
|
||||
subtitle_api_url = get_subtitle_api_url(id, label, language_code)
|
||||
|
||||
result["captions"].append({
|
||||
"label": label if label != "" else language_code,
|
||||
"languageCode": language_code,
|
||||
"url": subtitle_api_url,
|
||||
"second__subtitleUrl": subtitle_url # Direct YouTube url
|
||||
})
|
||||
|
||||
|
||||
result = get_more_stuff_from_file(info["id"], result)
|
||||
|
||||
return result
|
||||
|
15
index.py
15
index.py
@ -7,6 +7,7 @@ from extractors.channel import extract_channel, extract_channel_videos, extract_
|
||||
from extractors.manifest import extract_manifest
|
||||
from extractors.search import extract_search
|
||||
from extractors.suggestions import extract_search_suggestions
|
||||
from extractors.captions import extract_captions
|
||||
|
||||
@cherrypy.tools.register("before_finalize", priority=60)
|
||||
def custom_headers():
|
||||
@ -22,7 +23,8 @@ class Second(object):
|
||||
endpoints = [
|
||||
["channels", 1, 2],
|
||||
["videos", 1, 1],
|
||||
["search", 0, 1]
|
||||
["search", 0, 1],
|
||||
["captions", 1, 1]
|
||||
]
|
||||
for e in endpoints:
|
||||
if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
|
||||
@ -91,6 +93,17 @@ class Second(object):
|
||||
def suggestions(self, *, q, **kwargs):
|
||||
return extract_search_suggestions(q)
|
||||
|
||||
@cherrypy.expose
def captions(self, id, **kwargs):
    """Serve the captions endpoint for a video.

    Calls ``extract_captions`` with the id and query parameters. A dict
    result is sent as JSON; any other result is returned as-is with a
    WebVTT content type.
    """
    result = extract_captions(id, **kwargs)

    # Exact type check (not isinstance), as in the original dispatch.
    if type(result) is not dict:
        cherrypy.response.headers["content-type"] = "text/vtt; charset=UTF-8"
        return result

    cherrypy.response.headers["content-type"] = "application/json"
    return json.dumps(result).encode("utf8")
|
||||
|
||||
|
||||
@cherrypy.expose
|
||||
def vi(self, id, file):
|
||||
with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:
|
||||
|
@ -2,6 +2,7 @@ import configuration
|
||||
import datetime
|
||||
import re
|
||||
import time
|
||||
from urllib.parse import urlparse, parse_qs, quote_plus
|
||||
|
||||
def length_text_to_seconds(text):
|
||||
s = text.split(":")
|
||||
@ -205,3 +206,20 @@ def time_to_past_text(timestamp):
|
||||
number = diff // unit_value
|
||||
plural_unit = unit_name if number == 1 else unit_name + "s"
|
||||
return "{} {} ago".format(number, plural_unit)
|
||||
|
||||
def get_language_label_from_url(url_string):
    """Return the value of the ``name`` query parameter of a URL.

    Args:
        url_string: URL whose query string is inspected.

    Returns:
        The first ``name`` value, or ``""`` when the parameter is absent.
        ``parse_qs`` drops blank-valued parameters by default, so an empty
        ``name=`` is treated the same as a missing one.
    """
    query = urlparse(url_string).query
    names = parse_qs(query).get("name")
    return names[0] if names else ""
|
||||
|
||||
def get_subtitle_api_url(id, label, language_code):
    """Build this site's captions API URL for one caption track.

    Args:
        id: Video id placed in the URL path.
        label: Track label; when it is the empty string the track is
            addressed by language code instead.
        language_code: Language code of the track.

    Returns:
        ``<website_origin>/api/v1/captions/<id>?`` followed by
        ``lang=<language_code>`` when ``label`` is empty, or
        ``label=<label>`` otherwise; the value is URL-quoted in both cases.
    """
    base = "{}/api/v1/captions/{}?".format(configuration.website_origin, id)

    if label == "":
        return base + "lang=" + quote_plus(language_code)
    return base + "label=" + quote_plus(label)
|
||||
|
Loading…
Reference in New Issue
Block a user