Improve ytInitialData extraction

This commit is contained in:
Cadence Ember 2020-12-03 17:00:06 +13:00
parent ba88c53857
commit 554cd8cc3a
No known key found for this signature in database
GPG Key ID: BC1C2C61CF521B17
2 changed files with 9 additions and 6 deletions

View File

@ -28,6 +28,11 @@ def get_created_files(id):
id = "_" + id[1:] # youtube-dl changes - to _ at the start, presumably to not accidentally trigger switches with * in shell id = "_" + id[1:] # youtube-dl changes - to _ at the start, presumably to not accidentally trigger switches with * in shell
return (f for f in os.listdir() if f.startswith("{}_".format(id))) return (f for f in os.listdir() if f.startswith("{}_".format(id)))
def clean_up_temp_files(id):
    """Unlink every file that get_created_files(id) yields."""
    # The file names come directly from get_created_files and are removed
    # one at a time, in the order they are produced.
    for leftover in get_created_files(id):
        os.unlink(leftover)
def format_order(format): def format_order(format):
# most significant to least significant # most significant to least significant
# key, max, order, transform # key, max, order, transform
@ -172,6 +177,8 @@ def extract_video(id):
return result return result
except youtube_dlc.DownloadError as e: except youtube_dlc.DownloadError as e:
clean_up_temp_files(id)
if isinstance(e.exc_info[1], urllib.error.HTTPError): if isinstance(e.exc_info[1], urllib.error.HTTPError):
if e.exc_info[1].code == 429: if e.exc_info[1].code == 429:
result = { result = {
@ -192,9 +199,7 @@ def extract_video(id):
print("messed up in original transform.") print("messed up in original transform.")
finally: finally:
created_files = get_created_files(id) clean_up_temp_files(id)
for file in created_files:
os.unlink(file)
return result return result
def get_more_stuff_from_file(id, result): def get_more_stuff_from_file(id, result):

View File

@ -1,13 +1,11 @@
import re import re
import json import json
r_yt_initial_data = re.compile(r"""(?:\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+\});</script>""") r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
def extract_yt_initial_data(content): def extract_yt_initial_data(content):
content = content.replace("\n", "")
m_yt_initial_data = re.search(r_yt_initial_data, content) m_yt_initial_data = re.search(r_yt_initial_data, content)
if m_yt_initial_data: if m_yt_initial_data:
print(m_yt_initial_data.group(1))
yt_initial_data = json.loads(m_yt_initial_data.group(1)) yt_initial_data = json.loads(m_yt_initial_data.group(1))
return yt_initial_data return yt_initial_data
else: else: