mirror of https://git.sr.ht/~cadence/NewLeaf
Remove extraneous " align:start position:0%" on auto-generated captions
This commit is contained in:
parent
1d52fca3a0
commit
be8a2dad5f
|
@ -1,3 +1,4 @@
|
||||||
|
import re
|
||||||
import requests
|
import requests
|
||||||
from extractors.video import extract_video
|
from extractors.video import extract_video
|
||||||
from tools.converters import escape_html_textcontent, get_subtitle_api_url
|
from tools.converters import escape_html_textcontent, get_subtitle_api_url
|
||||||
|
@ -20,6 +21,9 @@ def extract_captions_from_dict(captions, *, lang=None, label=None):
|
||||||
url = next(caption["second__remoteUrl"] for caption in captions["captions"] if caption["languageCode"] == lang or caption["label"] == label)
|
url = next(caption["second__remoteUrl"] for caption in captions["captions"] if caption["languageCode"] == lang or caption["label"] == label)
|
||||||
with requests.get(url) as r:
|
with requests.get(url) as r:
|
||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
|
# remove extraneous " align:start position:0%" from timestamp lines in auto-generated captions
|
||||||
|
if (lang and "auto-generated" in lang) or (label and "auto-generated" in label):
|
||||||
|
return re.sub(r"^([0-9:.]+ --> [0-9:.]+).*$", r"\1", r.content.decode("utf8"), flags=re.MULTILINE)
|
||||||
return r
|
return r
|
||||||
|
|
||||||
# List of captions directly from youtube, but no automatic
|
# List of captions directly from youtube, but no automatic
|
||||||
|
|
Loading…
Reference in New Issue