mirror of
				https://git.sr.ht/~cadence/NewLeaf
				synced 2025-10-26 01:38:16 +00:00 
			
		
		
		
	Retrieve the first 20 comments of a video on /api/v1/comments/:videoid
Got some inspiration from https://github.com/nlitsme/youtube_tool (for the x-youtube-client-X headers). This is not a complete reimplementation of the Invidious API: continuation is not implemented (it is needed to retrieve more than the first 20 comments, as well as comment replies), and the like and reply counts are also missing.
This commit is contained in:
		
							parent
							
								
									1ea86101fd
								
							
						
					
					
						commit
						3f57d50893
					
				
							
								
								
									
										47
									
								
								extractors/comments.py
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								extractors/comments.py
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,47 @@ | ||||
| import json | ||||
| import requests | ||||
| import urllib.parse | ||||
| from tools.converters import * | ||||
| from tools.extractors import extract_yt_initial_data, extract_yt_cfg, eu_consent_cookie | ||||
| 
 | ||||
def _comment_thread_to_dict(thread):
	"""Reshape one commentThreadRenderer entry into the comment dict returned to clients."""
	renderer = thread["commentThreadRenderer"]["comment"]["commentRenderer"]
	browse_endpoint = renderer["authorEndpoint"]["browseEndpoint"]
	# Both text fields arrive as lists of runs; join each of them exactly once.
	published_text = "".join([run["text"] for run in renderer["publishedTimeText"]["runs"]])
	content = "".join([run["text"] for run in renderer["contentText"]["runs"]])
	# likeCount and replies.replyCount are not emitted yet.
	return {
		"author": renderer["authorText"]["simpleText"],
		"authorThumbnails": list(renderer["authorThumbnail"]["thumbnails"]),
		"authorId": browse_endpoint["browseId"],
		"authorUrl": browse_endpoint["canonicalBaseUrl"],
		"isEdited": " (edited)" in published_text,
		"content": content,
		"contentHtml": escape_html_textcontent(content),
		"publishedText": published_text,
		"commentId": renderer["commentId"],
		"authorIsChannelOwner": renderer["authorIsChannelOwner"],
	}


def extract_comments(id, **kwargs):
	"""Fetch the first batch of comments for a YouTube video.

	Loads the watch page to read the comment section's continuation token,
	its click-tracking params and the XSRF token from ytcfg, then posts to
	comment_service_ajax and reshapes every returned comment thread.

	Parameters:
		id: video ID; interpolated into the watch URL and echoed back as "videoId".
		**kwargs: accepted for call compatibility and ignored.

	Returns:
		{"videoId": id, "comments": [...]} on success. When ytcfg carries no
		XSRF_TOKEN, the CherryPy response status is set to 500 and a dict with
		"error" and "identifier" keys is returned instead.

	Raises:
		requests.HTTPError: if the watch page or the comment request returns
			an error status.
		KeyError, IndexError: if the extracted data does not have the expected shape.
		Exception: propagated from extract_yt_cfg when ytcfg cannot be matched.
	"""
	# The session is a context manager, so its pooled connections are released on exit.
	with requests.session() as s:
		s.headers.update({"accept-language": "en-US,en;q=0.9"})
		s.cookies.set("CONSENT", eu_consent_cookie().get("CONSENT"))
		with s.get("https://www.youtube.com/watch?v={}".format(id)) as r:
			r.raise_for_status()
			# Decode once; both extractors below work on the same text.
			page = r.content.decode("utf8")

		yt_initial_data = extract_yt_initial_data(page)
		# NOTE(review): the comment section is assumed to be the third entry of the results list.
		item = yt_initial_data["contents"]["twoColumnWatchNextResults"]["results"]["results"]["contents"][2]["itemSectionRenderer"]
		next_data = item["continuations"][0]["nextContinuationData"]
		continuation = next_data["continuation"]
		itct = next_data["clickTrackingParams"]

		xsrf_token = extract_yt_cfg(page).get("XSRF_TOKEN", None)
		if not xsrf_token:
			# cherrypy is not imported at module level here; import it locally
			# so this error path does not fail with a NameError.
			import cherrypy
			cherrypy.response.status = 500
			return {
				"error": "NewLeaf was unable to obtain XSRF_TOKEN from ytcfg.",
				"identifier": "XSRF_TOKEN_NOT_FOUND"
			}

		url = "https://www.youtube.com/comment_service_ajax?action_get_comments=1&pbj=1&ctoken={}&continuation={}&type=next&itct={}".format(continuation, continuation, urllib.parse.quote_plus(itct))
		with s.post(url, headers={"x-youtube-client-name": "1", "x-youtube-client-version": "2.20210422.04.00"}, data={"session_token": xsrf_token}) as rr:
			# Fail on HTTP errors instead of trying to parse an error body as JSON.
			rr.raise_for_status()
			data = json.loads(rr.content.decode("utf8"))

	threads = data["response"]["continuationContents"]["itemSectionContinuation"]["contents"]
	return {
		"videoId": id,
		"comments": [_comment_thread_to_dict(thread) for thread in threads]
	}
							
								
								
									
										9
									
								
								index.py
									
									
									
									
									
								
							
							
						
						
									
										9
									
								
								index.py
									
									
									
									
									
								
							| @ -9,6 +9,7 @@ from extractors.manifest import extract_manifest | ||||
| from extractors.search import extract_search | ||||
| from extractors.suggestions import extract_search_suggestions | ||||
| from extractors.captions import extract_captions | ||||
| from extractors.comments import extract_comments | ||||
| import configuration | ||||
| 
 | ||||
| @cherrypy.tools.register("before_finalize", priority=60) | ||||
| @ -26,7 +27,8 @@ class NewLeaf(object): | ||||
| 				["channels", 1, 2], | ||||
| 				["videos", 1, 1], | ||||
| 				["search", 0, 1], | ||||
| 				["captions", 1, 1] | ||||
| 				["captions", 1, 1], | ||||
| 				["comments", 1, 1] | ||||
| 			] | ||||
| 			for e in endpoints: | ||||
| 				if vpath[2] == e[0] and len(vpath) >= e[1]+3 and len(vpath) <= e[2]+3: | ||||
| @ -114,6 +116,11 @@ class NewLeaf(object): | ||||
| 				"identifier": "NO_MATCHING_CAPTIONS" | ||||
| 			}), "utf8") | ||||
| 
 | ||||
| 	@cherrypy.expose | ||||
| 	@cherrypy.tools.json_out() | ||||
| 	def comments(self, id, **kwargs): | ||||
| 		return extract_comments(id) | ||||
| 
 | ||||
| 	@cherrypy.expose | ||||
| 	def vi(self, id, file): | ||||
| 		with requests.get("https://i.ytimg.com/vi/{}/{}".format(id, file)) as r: | ||||
|  | ||||
| @ -4,6 +4,7 @@ import random | ||||
| 
 | ||||
| r_yt_initial_data = re.compile(r"""(?:^\s*window\["ytInitialData"\]|var ytInitialData) = (\{.+?\});(?:\s*$|</script>)""", re.S + re.M) | ||||
| r_yt_initial_player_response = re.compile(r"""(?:^\s*window\["ytInitialPlayerResponse"\]|var ytInitialPlayerResponse) = (\{.+?\});(?:\s*$|</script>|var )""", re.S + re.M) | ||||
| r_yt_cfg = re.compile(r"""ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;""") | ||||
| 
 | ||||
| def extract_yt_initial_data(content): | ||||
| 	m_yt_initial_data = re.search(r_yt_initial_data, content) | ||||
| @ -21,5 +22,11 @@ def extract_yt_initial_player_response(content): | ||||
| 	else: | ||||
| 		raise Exception("Could not match ytInitialPlayerResponse in content") | ||||
| 
 | ||||
def extract_yt_cfg(content):
	"""Return the object passed to the first ytcfg.set(...) call matched in `content`.

	The captured text is parsed with json.loads. Raises Exception when the
	r_yt_cfg pattern does not match anywhere in `content`.
	"""
	found = r_yt_cfg.search(content)
	if found is None:
		raise Exception("Could not match ytcfg in content")
	return json.loads(found.group(1))
| 
 | ||||
def eu_consent_cookie():
	"""Build a CONSENT cookie dict whose value ends in a random integer from 100 to 999."""
	suffix = random.randint(100, 999)
	value = "YES+cb.20210509-17-p0.en+F+" + str(suffix)
	return {"CONSENT": value}
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user