From ee81ca4a950d5a79b8d1863372dcd593280c96bc Mon Sep 17 00:00:00 2001 From: jakeogh Date: Tue, 23 Apr 2024 12:16:30 -0700 Subject: [PATCH 1/9] apply patch from issues/9358#issuecomment-2072600506 --- yt_dlp/extractor/youtube.py | 61 +++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 30 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e553fff9f..e2cbc7764 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3307,23 +3307,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'value': ('intensityScoreNormalized', {float_or_none}), })) or None - def _extract_comment(self, comment_renderer, parent=None): - comment_id = comment_renderer.get('commentId') - if not comment_id: - return + def _extract_comment(self, view_model, entity, parent=None): + entity_payload = entity['payload']['commentEntityPayload'] + comment_id = entity_payload.get('properties').get('commentId') info = { 'id': comment_id, - 'text': self._get_text(comment_renderer, 'contentText'), - 'like_count': self._get_count(comment_renderer, 'voteCount'), - 'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})), - 'author': self._get_text(comment_renderer, 'authorText'), - 'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})), + 'text': self._get_text(entity_payload, ('properties', 'content', 'contetn')), + 'like_count': self._get_count(entity_payload, ('toolbar', 'likeCountNotliked')), + 'author_id': traverse_obj(entity_payload, ('author', 'channelId', {self.ucid_or_none})), + 'author': self._get_text(entity_payload, ('author', 'displayName')), + 'author_thumbnail': traverse_obj(entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})), 'parent': parent or 'root', } # Timestamp is an estimate calculated from the current time and time_text - time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' + time_text = self._get_text(entity_payload, ('properties', 'publishedTime')) or '' timestamp = self._parse_time_text(time_text) info.update({ @@ -3333,25 +3332,23 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) info['author_url'] = urljoin( - 'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', ( - ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))), + 'https://www.youtube.com', traverse_obj(entity_payload, + ('author', 'channelCommand', 'innertubeCommand', 'browseEndpoint', 'canonicalBaseUrl'), expected_type=str, get_all=False)) - author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner') + author_is_uploader = traverse_obj(entity_payload, ('author', 'isCreator')) if author_is_uploader is not None: info['author_is_uploader'] = author_is_uploader comment_abr = traverse_obj( - comment_renderer, ('actionButtons', 'commentActionButtonsRenderer'), expected_type=dict) + entity, ('payload', 'engagementToolbarStateEntityPayload', 'heartState'), expected_type=str) if comment_abr is not None: - info['is_favorited'] = 'creatorHeart' in comment_abr + info['is_favorited'] = comment_abr == 'TOOLBAR_HEART_STATE_HEARTED' - badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')]) - if self._has_badge(badges, BadgeType.VERIFIED): - info['author_is_verified'] = True + info['author_is_verified'] = traverse_obj(entity_payload, ('author', 'isVerified')) == 'true' - is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') - if is_pinned: + pinned_text = traverse_obj(view_model, 'pinnedText') + if pinned_text: info['is_pinned'] = True return info @@ -3388,21 +3385,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break return _continuation - def extract_thread(contents): + def extract_thread(contents, entity_payloads): if not parent: tracker['current_page_thread'] = 0 for content in contents: if not parent and tracker['total_parent_comments'] >= max_parents: yield comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) - comment_renderer = get_first( - (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], - expected_type=dict, default={}) - - comment = self._extract_comment(comment_renderer, parent) - if not comment: + view_model = traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel')) + if not view_model: + view_model = content.get('commentViewModel') + if not view_model: continue - comment_id = comment['id'] + comment_id = view_model['commentId'] + for entity in entity_payloads: + if traverse_obj(entity, ('payload', 'commentEntityPayload', 'properties', 'commentId')) == comment_id: + entity = entity + break + + comment = self._extract_comment(view_model, entity, parent) if comment.get('is_pinned'): tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. @@ -3495,7 +3496,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): check_get_keys = None if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0): check_get_keys = [[*continuation_items_path, ..., ( - 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]] + 'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel'))]] try: response = self._extract_response( item_id=None, query=continuation, @@ -3527,7 +3528,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break continue - for entry in extract_thread(continuation_items): + for entry in extract_thread(continuation_items, response['frameworkUpdates']['entityBatchUpdate']['mutations']): if not entry: return yield entry From 16cb4fedbe9a58f884bd8a0097f81670c0f37138 Mon Sep 17 00:00:00 2001 From: jakeogh Date: Tue, 23 Apr 2024 12:31:41 -0700 Subject: [PATCH 2/9] fix typo in previous patch, like count, and use direct dict access --- yt_dlp/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e2cbc7764..0be52a77a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3313,16 +3313,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info = { 'id': comment_id, - 'text': self._get_text(entity_payload, ('properties', 'content', 'contetn')), - 'like_count': self._get_count(entity_payload, ('toolbar', 'likeCountNotliked')), + 'text': entity_payload['properties']['content']['content'], + 'like_count': entity_payload['toolbar']['likeCountLiked'], 'author_id': traverse_obj(entity_payload, ('author', 'channelId', {self.ucid_or_none})), - 'author': self._get_text(entity_payload, ('author', 'displayName')), + 'author': entity_payload['author']['displayName'], 'author_thumbnail': traverse_obj(entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})), 'parent': parent or 'root', } # Timestamp is an estimate calculated from the current time and time_text - time_text = self._get_text(entity_payload, ('properties', 'publishedTime')) or '' + time_text = entity_payload['properties']['publishedTime'] or '' timestamp = self._parse_time_text(time_text) info.update({ From 6083596d50ce35ec837d88544525b9306fc8a2e0 Mon Sep 17 00:00:00 2001 From: jakeogh Date: Tue, 23 Apr 2024 14:53:00 -0700 Subject: [PATCH 3/9] handle KeyError: 'frameworkUpdates' when the old comment format is served --- yt_dlp/extractor/youtube.py | 91 ++++++++++++++++++++++++++++++++----- 1 file changed, 80 insertions(+), 11 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 0be52a77a..7f3995d9a 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3353,6 +3353,55 @@ class YoutubeIE(YoutubeBaseInfoExtractor): return info + def _extract_comment_old(self, comment_renderer, parent=None): + comment_id = comment_renderer.get('commentId') + if not comment_id: + return + + info = { + 'id': comment_id, + 'text': self._get_text(comment_renderer, 'contentText'), + 'like_count': self._get_count(comment_renderer, 'voteCount'), + 'author_id': traverse_obj(comment_renderer, ('authorEndpoint', 'browseEndpoint', 'browseId', {self.ucid_or_none})), + 'author': self._get_text(comment_renderer, 'authorText'), + 'author_thumbnail': traverse_obj(comment_renderer, ('authorThumbnail', 'thumbnails', -1, 'url', {url_or_none})), + 'parent': parent or 'root', + } + + # Timestamp is an estimate calculated from the current time and time_text + time_text = self._get_text(comment_renderer, 'publishedTimeText') or '' + timestamp = self._parse_time_text(time_text) + + info.update({ + # FIXME: non-standard, but we need a way of showing that it is an estimate. + '_time_text': time_text, + 'timestamp': timestamp, + }) + + info['author_url'] = urljoin( + 'https://www.youtube.com', traverse_obj(comment_renderer, ('authorEndpoint', ( + ('browseEndpoint', 'canonicalBaseUrl'), ('commandMetadata', 'webCommandMetadata', 'url'))), + expected_type=str, get_all=False)) + + author_is_uploader = traverse_obj(comment_renderer, 'authorIsChannelOwner') + if author_is_uploader is not None: + info['author_is_uploader'] = author_is_uploader + + comment_abr = traverse_obj( + comment_renderer, ('actionButtons', 'commentActionButtonsRenderer'), expected_type=dict) + if comment_abr is not None: + info['is_favorited'] = 'creatorHeart' in comment_abr + + badges = self._extract_badges([traverse_obj(comment_renderer, 'authorCommentBadge')]) + if self._has_badge(badges, BadgeType.VERIFIED): + info['author_is_verified'] = True + + is_pinned = traverse_obj(comment_renderer, 'pinnedCommentBadge') + if is_pinned: + info['is_pinned'] = True + + return info + def _comment_entries(self, root_continuation_data, ytcfg, video_id, parent=None, tracker=None): get_single_config_arg = lambda c: self._configuration_arg(c, [''])[0] @@ -3392,16 +3441,30 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if not parent and tracker['total_parent_comments'] >= max_parents: yield comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer']) - view_model = traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel')) - if not view_model: - view_model = content.get('commentViewModel') - if not view_model: - continue - comment_id = view_model['commentId'] - for entity in entity_payloads: - if traverse_obj(entity, ('payload', 'commentEntityPayload', 'properties', 'commentId')) == comment_id: - entity = entity - break + + # old comment format + if entity_payloads is None: + comment_renderer = get_first( + (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]], + expected_type=dict, default={}) + + comment = self._extract_comment_old(comment_renderer, parent) + if not comment: + continue + comment_id = comment['id'] + + # new comment format + else: + view_model = traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel')) + if not view_model: + view_model = content.get('commentViewModel') + if not view_model: + continue + comment_id = view_model['commentId'] + for entity in entity_payloads: + if traverse_obj(entity, ('payload', 'commentEntityPayload', 'properties', 'commentId')) == comment_id: + entity = entity + break comment = self._extract_comment(view_model, entity, parent) if comment.get('is_pinned'): @@ -3528,10 +3591,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): break continue - for entry in extract_thread(continuation_items, response['frameworkUpdates']['entityBatchUpdate']['mutations']): + if 'frameworkUpdates' in response: + _iterator = extract_thread(continuation_items, response['frameworkUpdates']['entityBatchUpdate']['mutations']) + else: + _iterator = extract_thread(continuation_items, None) + + for entry in _iterator: if not entry: return yield entry + continuation = self._extract_continuation({'contents': continuation_items}) if continuation: break From 2ef6563fb1d6e182461514bcb3178d066832bfd8 Mon Sep 17 00:00:00 2001 From: jakeogh Date: Tue, 23 Apr 2024 16:02:03 -0700 Subject: [PATCH 4/9] fix old comment extraction --- yt_dlp/extractor/youtube.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 7f3995d9a..82d421ad4 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3466,7 +3466,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): entity = entity break - comment = self._extract_comment(view_model, entity, parent) + comment = self._extract_comment(view_model, entity, parent) + if comment.get('is_pinned'): tracker['pinned_comment_ids'].add(comment_id) # Sometimes YouTube may break and give us infinite looping comments. From 4da1db9d1ee00e06b9b403dc7ea7ba978788735b Mon Sep 17 00:00:00 2001 From: jakeogh Date: Tue, 23 Apr 2024 20:32:37 -0700 Subject: [PATCH 5/9] fix like_count --- yt_dlp/extractor/youtube.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 82d421ad4..1978933d4 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -17,7 +17,6 @@ import threading import time import traceback import urllib.parse - from .common import InfoExtractor, SearchInfoExtractor from .openload import PhantomJSwrapper from ..compat import functools @@ -3314,7 +3313,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info = { 'id': comment_id, 'text': entity_payload['properties']['content']['content'], - 'like_count': entity_payload['toolbar']['likeCountLiked'], + 'like_count': entity_payload['toolbar']['likeCountNotliked'], 'author_id': traverse_obj(entity_payload, ('author', 'channelId', {self.ucid_or_none})), 'author': entity_payload['author']['displayName'], 'author_thumbnail': traverse_obj(entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})), From 800906c9ceef02e823b002e2600d53f36f2ef4ba Mon Sep 17 00:00:00 2001 From: jakeogh Date: Wed, 24 Apr 2024 00:03:23 -0700 Subject: [PATCH 6/9] fix indent --- yt_dlp/extractor/youtube.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1978933d4..582862d65 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3456,7 +3456,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): else: view_model = traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel')) if not view_model: - view_model = content.get('commentViewModel') + view_model = content.get('commentViewModel') if not view_model: continue comment_id = view_model['commentId'] From 276347381c3a1908964ee3216c340f3f4ab0e826 Mon Sep 17 00:00:00 2001 From: jakeogh Date: Wed, 24 Apr 2024 00:07:55 -0700 Subject: [PATCH 7/9] fix another indent --- yt_dlp/extractor/youtube.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 582862d65..fcf852bd7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3331,9 +3331,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor): }) info['author_url'] = urljoin( - 'https://www.youtube.com', traverse_obj(entity_payload, - ('author', 'channelCommand', 'innertubeCommand', 'browseEndpoint', 'canonicalBaseUrl'), - expected_type=str, get_all=False)) + 'https://www.youtube.com', + traverse_obj(entity_payload, + ('author', + 'channelCommand', + 'innertubeCommand', + 'browseEndpoint', + 'canonicalBaseUrl'), + expected_type=str, get_all=False)) author_is_uploader = traverse_obj(entity_payload, ('author', 'isCreator')) if author_is_uploader is not None: From 17bb4434c2e712498d047778e262033bdd2c8d61 Mon Sep 17 00:00:00 2001 From: jakeogh Date: Wed, 24 Apr 2024 01:53:19 -0700 Subject: [PATCH 8/9] replace dict access with try_get() --- yt_dlp/extractor/youtube.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index fcf852bd7..1c12c2320 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3312,16 +3312,16 @@ class YoutubeIE(YoutubeBaseInfoExtractor): info = { 'id': comment_id, - 'text': entity_payload['properties']['content']['content'], - 'like_count': entity_payload['toolbar']['likeCountNotliked'], + 'text': try_get(entity_payload, lambda x: x['properties']['content']['content'], str), + 'like_count': str_to_int(try_get(entity_payload, lambda x: x['toolbar']['likeCountNotliked'], str)) or 0, 'author_id': traverse_obj(entity_payload, ('author', 'channelId', {self.ucid_or_none})), - 'author': entity_payload['author']['displayName'], + 'author': try_get(entity_payload, lambda x: x['author']['displayName'], str), 'author_thumbnail': traverse_obj(entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})), 'parent': parent or 'root', } # Timestamp is an estimate calculated from the current time and time_text - time_text = entity_payload['properties']['publishedTime'] or '' + time_text = try_get(entity_payload, lambda x: x['properties']['publishedTime'], str) or '' timestamp = self._parse_time_text(time_text) info.update({ From 3ef651718e3050252b5f9eb9f46906ef01b8e924 Mon Sep 17 00:00:00 2001 From: jakeogh Date: Wed, 24 Apr 2024 11:10:13 -0700 Subject: [PATCH 9/9] replace dict access with traverse_obj() and use likeCountA11y --- yt_dlp/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 1c12c2320..b4f33e7f7 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -3307,13 +3307,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): })) or None def _extract_comment(self, view_model, entity, parent=None): - entity_payload = entity['payload']['commentEntityPayload'] + entity_payload = traverse_obj(entity, ('payload', 'commentEntityPayload', {dict})) comment_id = entity_payload.get('properties').get('commentId') info = { 'id': comment_id, 'text': try_get(entity_payload, lambda x: x['properties']['content']['content'], str), - 'like_count': str_to_int(try_get(entity_payload, lambda x: x['toolbar']['likeCountNotliked'], str)) or 0, + 'like_count': self._search_regex(r'^([\d]+)', try_get(entity_payload, lambda x: x['toolbar']['likeCountA11y'], str), 'like_count', fatal=False) or 0, 'author_id': traverse_obj(entity_payload, ('author', 'channelId', {self.ucid_or_none})), 'author': try_get(entity_payload, lambda x: x['author']['displayName'], str), 'author_thumbnail': traverse_obj(entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})),