mirror of https://git.sr.ht/~cadence/NewLeaf
Retrieve the first 20 comments of a video on /api/v1/comments/:videoid
Takes some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of the Invidious API: continuation is not implemented (so only the first 20 comments can be retrieved and comment replies are unavailable), and like and reply counts are also missing.
This commit is contained in:
parent
1ea86101fd
commit
3f57d50893
|
@ -0,0 +1,47 @@
|
|||
import json
import urllib.parse

import cherrypy
import requests

from tools.converters import *
from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
def extract_comments(id, **kwargs):
	"""Return the first page (about 20) of comments for the video *id*.

	Scrapes the watch page for the comment-section continuation token and
	the XSRF token, then POSTs to comment_service_ajax the same way the
	desktop web client does. Continuations are not followed, so only the
	first batch of comments is returned and replies are unavailable.

	Returns an Invidious-style dict {"videoId": ..., "comments": [...]},
	or an error dict (with HTTP status 500) when the XSRF token cannot be
	found in the page.
	"""
	s = requests.session()
	s.headers.update({"accept-language": "en-US,en;q=0.9"})
	# Bypass the EU consent interstitial, which would otherwise replace the
	# watch page (and its embedded ytInitialData) with a consent form.
	s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))

	with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
		r.raise_for_status()
		page = r.content.decode("utf8")  # decode once; parsed twice below

	yt_initial_data = extract_yt_initial_data(page)
	# NOTE(review): the comment section is assumed to be the third entry of
	# the watch-next results — fragile if YouTube reorders the page.
	item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
	next_data = item["continuations"][0]["nextContinuationData"]
	continuation = next_data["continuation"]
	itct = next_data["clickTrackingParams"]

	xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
	if not xsrf_token:
		cherrypy.response.status = 500
		return {
			"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
			"identifier": "XSRF_TOKEN_NOT_FOUND"
		}

	url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(
		continuation, continuation, urllib.parse.quote_plus(itct))
	# Impersonate the desktop web client; the endpoint rejects requests
	# without these headers.
	headers = {
		"x-youtube-client-name": "1",
		"x-youtube-client-version": "2.20210422.04.00"
	}
	with s.post(url, headers=headers, data={"session_token": xsrf_token}) as rr:
		data = json.loads(rr.content.decode("utf8"))

	return {
		"videoId": id,
		"comments": [
			_format_comment(c)
			for c in data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
		]
	}

def _format_comment(c):
	"""Flatten one commentThreadRenderer item into the Invidious comment shape."""
	comment = c["commentThreadRenderer"]["comment"]["commentRenderer"]
	published = "".join(run["text"] for run in comment["publishedTimeText"]["runs"])
	content = "".join(run["text"] for run in comment["contentText"]["runs"])
	endpoint = comment["authorEndpoint"]["browseEndpoint"]
	return {
		"author": comment["authorText"]["simpleText"],
		"authorThumbnails": list(comment["authorThumbnail"]["thumbnails"]),
		"authorId": endpoint["browseId"],
		"authorUrl": endpoint["canonicalBaseUrl"],
		# YouTube appends " (edited)" to the timestamp of edited comments.
		"isEdited": " (edited)" in published,
		"content": content,
		"contentHtml": escape_html_textcontent(content),
		"publishedText": published,
		# "likeCount": int(comment["voteCount"]["simpleText"].replace(",", ""))
		"commentId": comment["commentId"],
		"authorIsChannelOwner": comment["authorIsChannelOwner"],
		# "replies": {"replyCount": comment["replyCount"]}  # continuation not implemented
	}
|
9
index.py
9
index.py
|
@ -9,6 +9,7 @@ from extractors.manifest import extract_manifest
|
|||
from extractors.search import extract_search
|
||||
from extractors.suggestions import extract_search_suggestions
|
||||
from extractors.captions import extract_captions
|
||||
from extractors.comments import extract_comments
|
||||
import configuration
|
||||
|
||||
@cherrypy.tools.register("before_finalize", priority=60)
|
||||
|
@ -26,7 +27,8 @@ class NewLeaf(object):
|
|||
["channels", 1, 2],
|
||||
["videos", 1, 1],
|
||||
["search", 0, 1],
|
||||
["captions", 1, 1]
|
||||
["captions", 1, 1],
|
||||
["comments", 1, 1]
|
||||
]
|
||||
for e in endpoints:
|
||||
if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
|
||||
|
@ -114,6 +116,11 @@ class NewLeaf(object):
|
|||
"identifier": "NO_MATCHING_CAPTIONS"
|
||||
}), "utf8")
|
||||
|
||||
@cherrypy.expose
|
||||
@cherrypy.tools.json_out()
|
||||
def comments(self, id, **kwargs):
|
||||
return extract_comments(id)
|
||||
|
||||
@cherrypy.expose
|
||||
def vi(self, id, file):
|
||||
with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:
|
||||
|
|
|
@ -4,6 +4,7 @@ import random
|
|||
|
||||
# Inline JSON blobs embedded in YouTube's watch-page HTML. Each pattern
# captures the object literal in group 1.
# Matches either the legacy `window["ytInitialData"] = {...};` assignment
# or the newer `var ytInitialData = {...};` form.
r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
# Same two forms for ytInitialPlayerResponse; additionally tolerates a
# following `var ` declaration as the terminator.
r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M)
# Argument of a `ytcfg.set({...});` call. The capture is non-greedy, so it
# ends at the first `});` — assumes the config object contains no nested
# `});` sequence.
r_yt_cfg = re.compile(r"""ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;""")
|
||||
|
||||
def extract_yt_initial_data(content):
|
||||
m_yt_initial_data = re.search(r_yt_initial_data, content)
|
||||
|
@ -21,5 +22,11 @@ def extract_yt_initial_player_response(content):
|
|||
else:
|
||||
raise Exception("Could not match ytInitialPlayerResponse in content")
|
||||
|
||||
def extract_yt_cfg(content):
	"""Parse the first ytcfg.set({...}) payload in *content* into a dict."""
	match = r_yt_cfg.search(content)
	if not match:
		raise Exception("Could not match ytcfg in content")
	return json.loads(match.group(1))
|
||||
|
||||
def eu_consent_cookie():
	"""Return a cookies dict carrying a randomised EU CONSENT value.

	The trailing three-digit number mimics the randomised suffix the real
	consent flow produces; its exact value does not matter.
	"""
	suffix = random.randint(100, 999)
	value = "YES+cb.20210509-17-p0.en+F+{}".format(suffix)
	return {"CONSENT": value}
|
||||
|
|
Loading…
Reference in New Issue