Retrieve the first 20 comments of a video on /api/v1/comments/:videoid

Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of the Invidious API: continuation is not implemented (so only the first 20 comments can be retrieved, and comment replies cannot be retrieved at all), and the like and reply counts are also missing.
2025-05-23 03:41:34 +00:00 · 2021-06-27 04:06:13 +02:00 · 2021-06-27 04:06:13 +02:00 · 3f57d50893
commit 3f57d50893
parent 1ea86101fd
3 changed files with 62 additions and 1 deletions
--- a/extractors/comments.py
+++ b/extractors/comments.py
@ -0,0 +1,47 @@
 import json
 import requests
 import urllib.parse
 from tools.converters import *
 from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie
 def _comment_from_thread(thread):
 	"""Convert one comment thread item from YouTube's response into an API comment dict."""
 	renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
 	channel = renderer["authorEndpoint"]["browseEndpoint"]
 	content = "".join(run["text"] for run in renderer["contentText"]["runs"])
 	published_text = "".join(run["text"] for run in renderer["publishedTimeText"]["runs"])
 	# likeCount and replies.replyCount are not populated yet.
 	return {
 		"author": renderer["authorText"]["simpleText"],
 		"authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
 		"authorId": channel["browseId"],
 		"authorUrl": channel["canonicalBaseUrl"],
 		# Edited comments carry the " (edited)" marker inside the published time text.
 		"isEdited": " (edited)" in published_text,
 		"content": content,
 		"contentHtml": escape_html_textcontent(content),
 		"publishedText": published_text,
 		"commentId": renderer["commentId"],
 		"authorIsChannelOwner": renderer["authorIsChannelOwner"],
 	}

 def extract_comments(id, **kwargs):
 	"""Fetch the first page of comments for a YouTube video.

 	Loads the watch page to read the comment continuation token, its click
 	tracking params and the XSRF token, then posts them to YouTube's
 	comment_service_ajax endpoint and reshapes the JSON response.

 	Args:
 		id: YouTube video id, interpolated into the watch page URL.
 		**kwargs: Accepted for call compatibility; not used.

 	Returns:
 		A dict with "videoId" and "comments" (one entry per comment thread in
 		the response). If ytcfg contains no XSRF_TOKEN, the CherryPy response
 		status is set to 500 and a dict with "error" and "identifier" is
 		returned instead.

 	Raises:
 		requests.HTTPError: if either HTTP request returns an error status.
 		KeyError, IndexError: if the page data lacks the expected structure.
 	"""
 	# Use the session as a context manager so its connections are always released.
 	with requests.session() as s:
 		s.headers.update({"accept-language": "en-US,en;q=0.9"})
 		s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))
 		with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
 			r.raise_for_status()
 			# Decode once; both extractors below work on the same page text.
 			page = r.content.decode("utf8")

 		yt_initial_data = extract_yt_initial_data(page)
 		item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
 		next_data = item["continuations"][0]["nextContinuationData"]
 		continuation = next_data["continuation"]
 		itct = next_data["clickTrackingParams"]

 		xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
 		if not xsrf_token:
 			# Imported here because this module does not import cherrypy at the
 			# top; without it this error path would fail with a NameError.
 			import cherrypy
 			cherrypy.response.status = 500
 			return {
 				"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
 				"identifier": "XSRF_TOKEN_NOT_FOUND"
 			}

 		url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
 		with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
 			# Fail with a clear HTTP error instead of a confusing JSON parse error.
 			rr.raise_for_status()
 			data = json.loads(rr.content.decode("utf8"))

 		threads = data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
 		return {
 			"videoId": id,
 			"comments": [_comment_from_thread(c) for c in threads]
 		}
--- a/index.py
+++ b/index.py
@ -9,6 +9,7 @@ from extractors.manifest import extract_manifest
 from extractors.search import extract_search
 from extractors.suggestions import extract_search_suggestions
 from extractors.captions import extract_captions
 from extractors.comments import extract_comments
 import configuration
@cherrypy.tools.register("before_finalize", priority=60)
@ -26,7 +27,8 @@ class NewLeaf(object):
 				["channels", 1, 2],
 				["videos", 1, 1],
 				["search", 0, 1],
-				["captions", 1, 1]
+				["captions", 1, 1],
 				["comments", 1, 1]
 			]
 			for e in endpoints:
 				if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3:
@ -114,6 +116,11 @@ class NewLeaf(object):
 				"identifier": "NO_MATCHING_CAPTIONS"
 			}), "utf8")
 	@cherrypy.expose
 	@cherrypy.tools.json_out()
 	def comments(self, id, **kwargs):
 		"""Return the extracted comments for video `id`; serialized by the json_out tool.

 		Extra keyword arguments are accepted but not forwarded to the extractor.
 		"""
 		comment_data = extract_comments(id)
 		return comment_data
 	@cherrypy.expose
 	def vi(self, id, file):
 		with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r:
--- a/tools/extractors.py
+++ b/tools/extractors.py
@ -4,6 +4,7 @@ import random
 # Captures the object literal assigned to `window["ytInitialData"]` (at the start
 # of a line) or to `var ytInitialData`, terminated by `;` at end of line or by
 # `;</script>`. DOTALL lets the lazy `.+?` span newlines; MULTILINE makes `^`/`$`
 # match at line boundaries.
 r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M)
 # Same shape for ytInitialPlayerResponse; the match may additionally be
 # terminated by a following `var ` declaration.
 r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M)
 # Captures the object literal passed to `ytcfg.set({...});`. No flags are set, so
 # `.` does not match newlines and the whole call must sit on a single line.
 r_yt_cfg = re.compile(r"""ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;""")
 def extract_yt_initial_data(content):
 	m_yt_initial_data = re.search(r_yt_initial_data, content)
@ -21,5 +22,11 @@ def extract_yt_initial_player_response(content):
 	else:
 		raise Exception("Could not match ytInitialPlayerResponse in content")
 def extract_yt_cfg(content):
 	"""Parse the first `ytcfg.set({...});` object found in `content`.

 	Returns the decoded JSON object; raises Exception when the page text
 	contains no match for the ytcfg pattern.
 	"""
 	found = r_yt_cfg.search(content)
 	if not found:
 		raise Exception("Could not match ytcfg in content")
 	raw_cfg = found.group(1)
 	return json.loads(raw_cfg)
 def eu_consent_cookie():
 	"""Build a CONSENT cookie dict whose value ends in a random 3-digit number."""
 	suffix = random.randint(100, 999)
 	consent_value = "YES+cb.20210509-17-p0.en+F+{}".format(suffix)
 	return {"CONSENT": consent_value}