From 7565fcb271239d0af12cc191686574e30a2fc125 Mon Sep 17 00:00:00 2001 From: JerryZhouSirui <1806430169@qq.com> Date: Fri, 26 Apr 2024 13:48:19 -0400 Subject: [PATCH 1/2] Modified tele5 extractor --- supportedsites.md | 1 + yt_dlp/extractor/tele5.py | 110 ++++++++++++++++++++++++++++++++------ 2 files changed, 95 insertions(+), 16 deletions(-) diff --git a/supportedsites.md b/supportedsites.md index ba77c0feb..ec2517dbe 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -503,6 +503,7 @@ - **gem.cbc.ca**: [*cbcgem*](## "netrc machine") - **gem.cbc.ca:live** - **gem.cbc.ca:playlist** + - **generic**: Generic downloader that works on some sites - **Genius** - **GeniusLyrics** - **GetCourseRu**: [*getcourseru*](## "netrc machine") diff --git a/yt_dlp/extractor/tele5.py b/yt_dlp/extractor/tele5.py index 72f67e402..2e2e7ed92 100644 --- a/yt_dlp/extractor/tele5.py +++ b/yt_dlp/extractor/tele5.py @@ -1,17 +1,68 @@ +import re + +import requests + from .dplay import DPlayIE from ..compat import compat_urlparse from ..utils import ( ExtractorError, - extract_attributes, ) +def _generate_video_specific_cache_url(slug, parent_slug): + """ + Generate the MAGIC string for the video specific cache url. + :param slug: The part of the url that identifies the video by title. + :param parent_slug: The part of the url that identifies the PARENT directory. + :return: The generated url. + """ + return 'https://de-api.loma-cms.com/feloma/page/{0}/?environment=tele5&parent_slug={1}&v=2'.format(slug, + parent_slug) +def _do_cached_post(s: requests.session, + referer: str, + url: str) -> dict: + """ + Do the API call to CACHED json endpoint. + It is likely connected to the new "loma-cms" API. + :param s: The session we use. + :param referer: The referer url. + :param url: The url to retrieve the cached data for. + :return: The json dict from the response. + """ + r = s.post(url='https://tele5.de/cached', + headers={ + 'Origin': 'https://tele5.de', + 'Referer': referer, + # Referer is a mandatory key, + 'User-Agent': 'Youtube-DL', + # User-Agent is a mandatory key, it can be anything! + }, + json={'path': url} + ) + r.raise_for_status() + return r.json() class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE _WORKING = False _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P[^/?#&]+)' _GEO_COUNTRIES = ['DE'] _TESTS = [{ + 'url': 'https://tele5.de/mediathek/sorority-babes-in-the-slimeball-bowl-o-rama', + 'info_dict': { + 'id': '5582852', + 'title': 'Sorority Babes in the Slimeball Bowl-O-Rama', + 'ext': 'mp4', + 'series': 'Sorority Babes in the Slimeball Bowl-O-Rama', + 'duration': 4779.88, + 'description': 'md5:1d8d30ed3d221613861aaefa8d7e887e', + 'timestamp': 1697839800, + 'upload_date': '20231020', + 'creator': 'Tele5', + 'tags': [], + 'thumbnail': 'https://eu1-prod-images.disco-api.com/2023/10/02/501fa839-d3ac-3c04-aa61-57f98802c532.jpeg', + }, + }, { 'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416', + 'only_matching': True, 'info_dict': { 'id': '1549416', 'ext': 'mp4', @@ -26,6 +77,7 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE }, { # jwplatform, nexx unavailable 'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/', + 'only_matching': True, 'info_dict': { 'id': 'WJuiOlUp', 'ext': 'mp4', @@ -40,6 +92,7 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE 'skip': 'No longer available, redirects to Filme page', }, { 'url': 'https://tele5.de/mediathek/angel-of-mine/', + 'only_matching': True, 'info_dict': { 'id': '1252360', 'ext': 'mp4', @@ -72,18 +125,43 @@ class Tele5IE(DPlayIE): # XXX: Do not subclass from concrete IE }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - player_element = self._search_regex(r'(]+?>)', webpage, 'video player') - player_info = extract_attributes(player_element) - asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', )) - endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname - source_type = player_info.get('sourcetype') - if source_type: - endpoint = '%s-%s' % (source_type, endpoint) - try: - return self._get_disco_api_info(url, asset_id, endpoint, realm, country) - except ExtractorError as e: - if getattr(e, 'message', '') == 'Missing deviceId in context': - self.report_drm(video_id) - raise + content_regex = re.compile(r'https?://(?:www\.)?(?P[^.]+)\.de/(?P[^/]+)/(?P[^/?#&]+)') + m = content_regex.search(url) + if m is not None: + environment, parent_slug, slug = m.groups() + s = requests.session() + headers_for_origin = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'} + r = s.get(url=url, + headers=headers_for_origin) + r.raise_for_status() + + cached_base = _do_cached_post(s=s, + referer=url, + url='https://de-api.loma-cms.com/feloma/configurations/?environment={0}'.format(environment)) + + site_info = cached_base.get('data').get('settings').get('site') + player_info = site_info.get('player') + + sonic_realm = player_info['sonicRealm'] + sonic_endpoint = compat_urlparse.urlparse(player_info['sonicEndpoint']).hostname + country = site_info['info']['country'] + + cached_video_specific = _do_cached_post(s=s, referer=url, + url=_generate_video_specific_cache_url( + slug=slug, + parent_slug=parent_slug)) + + video_id = cached_video_specific['data']['blocks'][1]['videoId'] + + try: + return self._get_disco_api_info(url=url, + display_id=video_id, + disco_host=sonic_endpoint, + realm=sonic_realm, + country=country, + api_version=3, + ) + except ExtractorError as e: + if getattr(e, 'message', '') == 'Missing deviceId in context': + self.report_drm(video_id) + raise From a4d480929845b3722ea5bbb5ece87db3260eeb3e Mon Sep 17 00:00:00 2001 From: JerryZhouSirui <1806430169@qq.com> Date: Fri, 26 Apr 2024 14:32:00 -0400 Subject: [PATCH 2/2] Fix beatport extractor --- yt_dlp/extractor/beatport.py | 78 ++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 43 deletions(-) diff --git a/yt_dlp/extractor/beatport.py b/yt_dlp/extractor/beatport.py index 0aecbd089..5c50f5620 100644 --- a/yt_dlp/extractor/beatport.py +++ b/yt_dlp/extractor/beatport.py @@ -2,7 +2,7 @@ import re from .common import InfoExtractor from ..compat import compat_str -from ..utils import int_or_none +from ..utils import int_or_none, ExtractorError class BeatportIE(InfoExtractor): @@ -43,55 +43,47 @@ class BeatportIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - playables = self._parse_json( - self._search_regex( - r'window\.Playables\s*=\s*({.+?});', webpage, - 'playables info', flags=re.DOTALL), - track_id) + try: + playables_json = self._search_regex( + r'window\.Playables\s*=\s*({.+?})\s*;', webpage, + 'playables info', default='{}', flags=re.DOTALL) + playables = self._parse_json(playables_json, track_id) + except re.error: + raise ExtractorError('Failed to extract playables information. The page structure may have changed.') - track = next(t for t in playables['tracks'] if t['id'] == int(track_id)) + if not playables or 'tracks' not in playables: + raise ExtractorError('No playable tracks found in the extracted information.') - title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name'] - if track['mix']: + track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None) + if not track: + raise ExtractorError(f'No track with ID {track_id} found.') + + title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name'] + if track.get('mix'): title += ' (' + track['mix'] + ')' formats = [] - for ext, info in track['preview'].items(): - if not info['url']: - continue - fmt = { - 'url': info['url'], - 'ext': ext, - 'format_id': ext, - 'vcodec': 'none', - } - if ext == 'mp3': - fmt['acodec'] = 'mp3' - fmt['abr'] = 96 - fmt['asr'] = 44100 - elif ext == 'mp4': - fmt['acodec'] = 'aac' - fmt['abr'] = 96 - fmt['asr'] = 44100 - formats.append(fmt) + for ext, info in track.get('preview', {}).items(): + url = info.get('url') + if url: + fmt = { + 'url': url, + 'ext': ext, + 'format_id': ext, + 'vcodec': 'none', + 'acodec': 'mp3' if ext == 'mp3' else 'aac', + 'abr': 96, + 'asr': 44100 + } + formats.append(fmt) - images = [] - for name, info in track['images'].items(): - image_url = info.get('url') - if name == 'dynamic' or not image_url: - continue - image = { - 'id': name, - 'url': image_url, - 'height': int_or_none(info.get('height')), - 'width': int_or_none(info.get('width')), - } - images.append(image) + images = [{'id': name, 'url': info['url'], 'height': int_or_none(info.get('height')), 'width': int_or_none(info.get('width'))} + for name, info in track.get('images', {}).items() if name != 'dynamic' and info.get('url')] return { - 'id': compat_str(track.get('id')) or track_id, - 'display_id': track.get('slug') or display_id, + 'id': compat_str(track.get('id', track_id)), + 'display_id': track.get('slug', display_id), 'title': title, 'formats': formats, - 'thumbnails': images, - } + 'thumbnails': images + } \ No newline at end of file