Compare commits

...

2 Commits

Author SHA1 Message Date
Cadence Ember e3854a6050
Extract fact check notices to second__clarification 2021-11-04 02:01:52 +13:00
Cadence Ember 65bb7a2c4c
Fix recommended extraction when fact check notice 2021-11-04 01:59:50 +13:00
2 changed files with 22 additions and 5 deletions

View File

@ -7,7 +7,7 @@ import traceback
import yt_dlp import yt_dlp
import urllib.error import urllib.error
from tools.converters import * from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response from tools.extractors import extract_yt_initial_data, extract_yt_initial_player_response, deep_get
import tools.files as files import tools.files as files
from math import floor from math import floor
from urllib.parse import parse_qs, urlparse, urlencode from urllib.parse import parse_qs, urlparse, urlencode
@ -219,7 +219,11 @@ def get_more_stuff_from_file(id, result):
yt_initial_data = extract_yt_initial_data(content) yt_initial_data = extract_yt_initial_data(content)
main_video = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][0]["videoPrimaryInfoRenderer"] # result = yt_initial_data
# return result
main_sections = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"]
main_video = next(s["videoPrimaryInfoRenderer"] for s in main_sections if "videoPrimaryInfoRenderer" in s)
views = main_video["viewCount"]["videoViewCountRenderer"] views = main_video["viewCount"]["videoViewCountRenderer"]
result["second__viewCountText"] = get_view_count_text_or_recommended(views) result["second__viewCountText"] = get_view_count_text_or_recommended(views)
if "shortViewCount" in views: if "shortViewCount" in views:
@ -234,9 +238,6 @@ def get_more_stuff_from_file(id, result):
recommendations = yt_initial_data["contents"]["twoColumnWatchNextResults"]["secondaryResults"]\ recommendations = yt_initial_data["contents"]["twoColumnWatchNextResults"]["secondaryResults"]\
["secondaryResults"]["results"] ["secondaryResults"]["results"]
# result = yt_initial_data
# return result
def get_useful_recommendation_data(r): def get_useful_recommendation_data(r):
if "compactVideoRenderer" in r: if "compactVideoRenderer" in r:
return r["compactVideoRenderer"] return r["compactVideoRenderer"]
@ -316,6 +317,12 @@ def get_more_stuff_from_file(id, result):
"second__remoteUrl": url "second__remoteUrl": url
}) })
# fact check notices! aka "clarifications".
# for now, we just return the data as-is for the renderer to deal with (or not).
def get_clarification(section):
return deep_get(section, ["itemSectionRenderer", "contents", 0, "clarificationRenderer"])
result["second__clarification"] = next((get_clarification(s) for s in main_sections if get_clarification(s)), None)
except Exception: except Exception:
print("messed up extracting recommendations.") print("messed up extracting recommendations.")
traceback.print_exc() traceback.print_exc()

View File

@ -1,6 +1,7 @@
import re import re
import json import json
import random import random
from functools import reduce
r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M) r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M) r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M)
@ -30,3 +31,12 @@ def extract_yt_cfg(content):
def eu_consent_cookie(): def eu_consent_cookie():
return {"CONSENT": "YES+cb.20210509-17-p0.en+F+{}".format(random.randint(100, 999))} return {"CONSENT": "YES+cb.20210509-17-p0.en+F+{}".format(random.randint(100, 999))}
def is_in(o, key):
    """Return whether `key` can be used to look something up in `o`.

    For a list, `key` must be a plain `int` (exact type, so bools do not
    count) that is a valid non-negative index. For anything else the
    answer is whatever the `in` operator reports for `key in o`.
    """
    if not isinstance(o, list):
        return key in o
    # Exact type check on purpose: True/False must not pass as indexes.
    return type(key) is int and 0 <= key < len(o)
def deep_get(o, properties):
    """Walk a nested dict/list structure and return the value at a path.

    `properties` is a sequence of keys applied in order: string (or other
    hashable) keys for dicts, plain non-negative ints for lists. If any
    step cannot be taken - the key is absent, the index is out of range,
    negative or not an exact `int`, or the current value is not a dict or
    list - the result is None. An empty `properties` returns `o` itself.

    The value found at the end of the path is returned as-is, including
    falsy values such as 0, "", False, [] and {}.
    """
    current = o
    for key in properties:
        if isinstance(current, list):
            # Exact type check on purpose: bools and negative indexes are
            # not accepted as list positions.
            if type(key) is not int or not 0 <= key < len(current):
                return None
        elif not isinstance(current, dict) or key not in current:
            # Scalars, None and other non-containers cannot be descended.
            return None
        current = current[key]
    return current