diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py
index e553fff9f..b4f33e7f7 100644
--- a/yt_dlp/extractor/youtube.py
+++ b/yt_dlp/extractor/youtube.py
@@ -3307,7 +3307,59 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                 'value': ('intensityScoreNormalized', {float_or_none}),
             })) or None
 
-    def _extract_comment(self, comment_renderer, parent=None):
+    def _extract_comment(self, view_model, entity, parent=None):
+        """Build a comment info dict from the new (view model based) comment format.
+
+        @param view_model  commentViewModel dict; only its 'pinnedText' key is read here
+        @param entity      entity mutation whose payload holds the 'commentEntityPayload'
+        @param parent      id of the parent comment, or None for a top-level comment
+        @returns           the comment info dict, or None if no comment id is found
+        """
+        entity_payload = traverse_obj(entity, ('payload', 'commentEntityPayload', {dict}))
+        comment_id = traverse_obj(entity_payload, ('properties', 'commentId', {str}))
+        if not comment_id:
+            return
+
+        # Only the leading digits of the label are parsed; a missing label counts as 0
+        like_count_text = traverse_obj(entity_payload, ('toolbar', 'likeCountA11y', {str})) or ''
+        # A real boolean is accepted as well as the string form
+        is_verified = traverse_obj(entity_payload, ('author', 'isVerified'))
+        # Timestamp is an estimate calculated from the current time and time_text
+        time_text = traverse_obj(entity_payload, ('properties', 'publishedTime', {str})) or ''
+
+        info = {
+            'id': comment_id,
+            'text': traverse_obj(entity_payload, ('properties', 'content', 'content', {str})),
+            'like_count': int(self._search_regex(r'^(\d+)', like_count_text, 'like_count', default='0')),
+            'author_id': traverse_obj(entity_payload, ('author', 'channelId', {self.ucid_or_none})),
+            'author': traverse_obj(entity_payload, ('author', 'displayName', {str})),
+            'author_thumbnail': traverse_obj(entity_payload, ('author', 'avatarThumbnailUrl', {url_or_none})),
+            'author_url': urljoin('https://www.youtube.com', traverse_obj(entity_payload, (
+                'author', 'channelCommand', 'innertubeCommand', 'browseEndpoint', 'canonicalBaseUrl', {str}))),
+            'author_is_verified': is_verified is True or is_verified == 'true',
+            'parent': parent or 'root',
+            # FIXME: non-standard, but we need a way of showing that it is an estimate.
+            '_time_text': time_text,
+            'timestamp': self._parse_time_text(time_text),
+        }
+
+        author_is_uploader = traverse_obj(entity_payload, ('author', 'isCreator'))
+        if author_is_uploader is not None:
+            info['author_is_uploader'] = author_is_uploader
+
+        # NOTE(review): heartState is looked up on the same entity as the comment payload;
+        # confirm that this entity also carries 'engagementToolbarStateEntityPayload'
+        heart_state = traverse_obj(
+            entity, ('payload', 'engagementToolbarStateEntityPayload', 'heartState', {str}))
+        if heart_state is not None:
+            info['is_favorited'] = heart_state == 'TOOLBAR_HEART_STATE_HEARTED'
+
+        if traverse_obj(view_model, 'pinnedText'):
+            info['is_pinned'] = True
+
+        return info
+
+    def _extract_comment_old(self, comment_renderer, parent=None):
         comment_id = comment_renderer.get('commentId')
         if not comment_id:
             return
@@ -3388,21 +3440,41 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                     break
             return _continuation
 
-        def extract_thread(contents):
+        def extract_thread(contents, entity_payloads):
+            # Index the new-format entity mutations by comment id once, so that every
+            # view model is paired with its own payload instead of a leftover loop variable
+            entities_by_id = {}
+            for mutation in entity_payloads or ():
+                mutation_comment_id = traverse_obj(
+                    mutation, ('payload', 'commentEntityPayload', 'properties', 'commentId', {str}))
+                if mutation_comment_id:
+                    entities_by_id.setdefault(mutation_comment_id, mutation)
+
             if not parent:
                 tracker['current_page_thread'] = 0
             for content in contents:
                 if not parent and tracker['total_parent_comments'] >= max_parents:
                     yield
                 comment_thread_renderer = try_get(content, lambda x: x['commentThreadRenderer'])
-                comment_renderer = get_first(
-                    (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
-                    expected_type=dict, default={})
 
-                comment = self._extract_comment(comment_renderer, parent)
-                if not comment:
-                    continue
-                comment_id = comment['id']
+                if entity_payloads is None:
+                    # Old comment format: the comment data is inside a commentRenderer
+                    comment_renderer = get_first(
+                        (comment_thread_renderer, content), [['commentRenderer', ('comment', 'commentRenderer')]],
+                        expected_type=dict, default={})
+                    comment = self._extract_comment_old(comment_renderer, parent)
+                else:
+                    # New comment format: the comment data is read from the entity mutation
+                    # that has the view model's comment id; unmatched view models are skipped
+                    view_model = (
+                        traverse_obj(comment_thread_renderer, ('commentViewModel', 'commentViewModel'))
+                        or traverse_obj(content, 'commentViewModel'))
+                    entity = entities_by_id.get(traverse_obj(view_model, ('commentId', {str})))
+                    comment = self._extract_comment(view_model, entity, parent) if entity else None
+
+                if not comment:
+                    continue
+                comment_id = comment['id']
                 if comment.get('is_pinned'):
                     tracker['pinned_comment_ids'].add(comment_id)
                 # Sometimes YouTube may break and give us infinite looping comments.
@@ -3495,7 +3567,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
             check_get_keys = None
             if not is_forced_continuation and not (tracker['est_total'] == 0 and tracker['running_total'] == 0):
                 check_get_keys = [[*continuation_items_path, ..., (
-                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentRenderer'))]]
+                    'commentsHeaderRenderer' if is_first_continuation else ('commentThreadRenderer', 'commentViewModel', 'commentRenderer'))]]
             try:
                 response = self._extract_response(
                     item_id=None, query=continuation,
@@ -3527,10 +3599,15 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
                         break
                     continue
 
-                for entry in extract_thread(continuation_items):
+                # Entity mutations are only looked up for the new comment format; when they
+                # are missing, None makes extract_thread use the old commentRenderer format
+                mutations = traverse_obj(
+                    response, ('frameworkUpdates', 'entityBatchUpdate', 'mutations', {list}))
+                for entry in extract_thread(continuation_items, mutations):
                     if not entry:
                         return
                     yield entry
+
                 continuation = self._extract_continuation({'contents': continuation_items})
                 if continuation:
                     break