Merge a4d4809298 into 64766459e3

2024-04-27 19:32:14 +10:00 · 2024-04-27 19:32:14 +10:00 · 736c819149
commit 736c819149
--- a/supportedsites.md
+++ b/supportedsites.md
@ -503,6 +503,7 @@
 - **gem.cbc.ca**: [*cbcgem*](## "netrc machine")
 - **gem.cbc.ca:live**
 - **gem.cbc.ca:playlist**
+ - **generic**: Generic downloader that works on some sites
 - **Genius**
 - **GeniusLyrics**
 - **GetCourseRu**: [*getcourseru*](## "netrc machine")
--- a/yt_dlp/extractor/beatport.py
+++ b/yt_dlp/extractor/beatport.py
@ -2,7 +2,7 @@ import re

 from .common import InfoExtractor
 from ..compat import compat_str
-from ..utils import int_or_none
+from ..utils import int_or_none, ExtractorError


 class BeatportIE(InfoExtractor):
@ -43,55 +43,47 @@ class BeatportIE(InfoExtractor):

        webpage = self._download_webpage(url, display_id)

-        playables = self._parse_json(
-            self._search_regex(
-                r'window\.Playables\s*=\s*({.+?});', webpage,
-                'playables info', flags=re.DOTALL),
-            track_id)
+        try:
+            playables_json = self._search_regex(
+                r'window\.Playables\s*=\s*({.+?})\s*;', webpage,
+                'playables info', default='{}', flags=re.DOTALL)
+            playables = self._parse_json(playables_json, track_id)
+        except re.error:
+            raise ExtractorError('Failed to extract playables information. The page structure may have changed.')

-        track = next(t for t in playables['tracks'] if t['id'] == int(track_id))
+        if not playables or 'tracks' not in playables:
+            raise ExtractorError('No playable tracks found in the extracted information.')

-        title = ', '.join((a['name'] for a in track['artists'])) + ' - ' + track['name']
-        if track['mix']:
+        track = next((t for t in playables['tracks'] if t['id'] == int(track_id)), None)
+        if not track:
+            raise ExtractorError(f'No track with ID {track_id} found.')
+
+        title = ', '.join(a['name'] for a in track['artists']) + ' - ' + track['name']
+        if track.get('mix'):
            title += ' (' + track['mix'] + ')'

        formats = []
-        for ext, info in track['preview'].items():
-            if not info['url']:
-                continue
-            fmt = {
-                'url': info['url'],
-                'ext': ext,
-                'format_id': ext,
-                'vcodec': 'none',
-            }
-            if ext == 'mp3':
-                fmt['acodec'] = 'mp3'
-                fmt['abr'] = 96
-                fmt['asr'] = 44100
-            elif ext == 'mp4':
-                fmt['acodec'] = 'aac'
-                fmt['abr'] = 96
-                fmt['asr'] = 44100
-            formats.append(fmt)
+        for ext, info in track.get('preview', {}).items():
+            url = info.get('url')
+            if url:
+                fmt = {
+                    'url': url,
+                    'ext': ext,
+                    'format_id': ext,
+                    'vcodec': 'none',
+                    'acodec': 'mp3' if ext == 'mp3' else 'aac',
+                    'abr': 96,
+                    'asr': 44100
+                }
+                formats.append(fmt)

-        images = []
-        for name, info in track['images'].items():
-            image_url = info.get('url')
-            if name == 'dynamic' or not image_url:
-                continue
-            image = {
-                'id': name,
-                'url': image_url,
-                'height': int_or_none(info.get('height')),
-                'width': int_or_none(info.get('width')),
-            }
-            images.append(image)
+        images = [{'id': name, 'url': info['url'], 'height': int_or_none(info.get('height')), 'width': int_or_none(info.get('width'))}
+                  for name, info in track.get('images', {}).items() if name != 'dynamic' and info.get('url')]

        return {
-            'id': compat_str(track.get('id')) or track_id,
-            'display_id': track.get('slug') or display_id,
+            'id': compat_str(track.get('id', track_id)),
+            'display_id': track.get('slug', display_id),
            'title': title,
            'formats': formats,
-            'thumbnails': images,
-        }
+            'thumbnails': images
+        }
--- a/yt_dlp/extractor/tele5.py
+++ b/yt_dlp/extractor/tele5.py
@ -1,17 +1,68 @@
+import re
+
+import requests
+
 from .dplay import DPlayIE
 from ..compat import compat_urlparse
 from ..utils import (
    ExtractorError,
-    extract_attributes,
 )

+def _generate_video_specific_cache_url(slug, parent_slug):
+    """Build the loma-cms "feloma/page" URL for one video page (environment fixed to tele5).
+    :param slug: The part of the url that identifies the video by title.
+    :param parent_slug: The part of the url that identifies the parent directory.
+    :return: The generated url.
+    """
+    page_url_template = 'https://de-api.loma-cms.com/feloma/page/{0}/?environment=tele5&parent_slug={1}&v=2'
+    return page_url_template.format(slug, parent_slug)
+
+def _do_cached_post(s: requests.Session, referer: str, url: str) -> dict:
+    """
+    POST ``{'path': url}`` to https://tele5.de/cached and return the decoded JSON.
+
+    :param s: The requests session used to send the request.
+    :param referer: Value sent as the Referer header.
+    :param url: The upstream API URL whose cached response is requested.
+    :return: The JSON body of the response.
+    :raises requests.HTTPError: If the response has an error status.
+    """
+    r = s.post(url='https://tele5.de/cached',
+               headers={
+                   'Origin': 'https://tele5.de',
+                   # Referer and User-Agent are both mandatory for this endpoint;
+                   # the User-Agent value itself can be anything.
+                   'Referer': referer,
+                   'User-Agent': 'Youtube-DL',
+               },
+               json={'path': url},
+               timeout=20)  # requests waits indefinitely unless a timeout is given
+    r.raise_for_status()
+    return r.json()
+

 class Tele5IE(DPlayIE):  # XXX: Do not subclass from concrete IE
    _WORKING = False
    _VALID_URL = r'https?://(?:www\.)?tele5\.de/(?:[^/]+/)*(?P<id>[^/?#&]+)'
    _GEO_COUNTRIES = ['DE']
    _TESTS = [{
+        'url': 'https://tele5.de/mediathek/sorority-babes-in-the-slimeball-bowl-o-rama',
+        'info_dict': {
+            'id': '5582852',
+            'title': 'Sorority Babes in the Slimeball Bowl-O-Rama',
+            'ext': 'mp4',
+            'series': 'Sorority Babes in the Slimeball Bowl-O-Rama',
+            'duration': 4779.88,
+            'description': 'md5:1d8d30ed3d221613861aaefa8d7e887e',
+            'timestamp': 1697839800,
+            'upload_date': '20231020',
+            'creator': 'Tele5',
+            'tags': [],
+            'thumbnail': 'https://eu1-prod-images.disco-api.com/2023/10/02/501fa839-d3ac-3c04-aa61-57f98802c532.jpeg',
+        },
+    }, {
        'url': 'https://www.tele5.de/mediathek/filme-online/videos?vid=1549416',
+        'only_matching': True,
        'info_dict': {
            'id': '1549416',
            'ext': 'mp4',
@ -26,6 +77,7 @@ class Tele5IE(DPlayIE):  # XXX: Do not subclass from concrete IE
    }, {
        # jwplatform, nexx unavailable
        'url': 'https://www.tele5.de/filme/ghoul-das-geheimnis-des-friedhofmonsters/',
+        'only_matching': True,
        'info_dict': {
            'id': 'WJuiOlUp',
            'ext': 'mp4',
@ -40,6 +92,7 @@ class Tele5IE(DPlayIE):  # XXX: Do not subclass from concrete IE
        'skip': 'No longer available, redirects to Filme page',
    }, {
        'url': 'https://tele5.de/mediathek/angel-of-mine/',
+        'only_matching': True,
        'info_dict': {
            'id': '1252360',
            'ext': 'mp4',
@ -72,18 +125,43 @@ class Tele5IE(DPlayIE):  # XXX: Do not subclass from concrete IE
    }]

    def _real_extract(self, url):
-        video_id = self._match_id(url)
-        webpage = self._download_webpage(url, video_id)
-        player_element = self._search_regex(r'(<hyoga-player\b[^>]+?>)', webpage, 'video player')
-        player_info = extract_attributes(player_element)
-        asset_id, country, realm = (player_info[x] for x in ('assetid', 'locale', 'realm', ))
-        endpoint = compat_urlparse.urlparse(player_info['endpoint']).hostname
-        source_type = player_info.get('sourcetype')
-        if source_type:
-            endpoint = '%s-%s' % (source_type, endpoint)
-        try:
-            return self._get_disco_api_info(url, asset_id, endpoint, realm, country)
-        except ExtractorError as e:
-            if getattr(e, 'message', '') == 'Missing deviceId in context':
-                self.report_drm(video_id)
-            raise
+        """Extract a tele5.de video by resolving its id and player settings through https://tele5.de/cached."""
+        m = re.search(
+            r'https?://(?:www\.)?(?P<environment>[^.]+)\.de/(?P<parent_slug>[^/]+)/(?P<slug>[^/?#&]+)', url)
+        if m is None:
+            # URLs accepted by _VALID_URL with a single path segment do not match here.
+            raise ExtractorError('Unable to extract parent slug and slug from URL: %s' % url)
+        environment, parent_slug, slug = m.groups()
+
+        # The context manager closes the session's pooled connections once the lookups are done.
+        with requests.Session() as s:
+            # Load the page itself first with a browser User-Agent; only the status is checked.
+            r = s.get(url=url, timeout=20, headers={
+                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/113.0'})
+            r.raise_for_status()
+
+            # Site-wide configuration supplies the realm, API host and country passed to the disco API.
+            cached_base = _do_cached_post(
+                s=s, referer=url,
+                url='https://de-api.loma-cms.com/feloma/configurations/?environment={0}'.format(environment))
+            site_info = cached_base.get('data').get('settings').get('site')
+            player_info = site_info.get('player')
+            sonic_realm = player_info['sonicRealm']
+            sonic_endpoint = compat_urlparse.urlparse(player_info['sonicEndpoint']).hostname
+            country = site_info['info']['country']
+
+            # Page-specific data; the video id is read from the second block of the page.
+            cached_video_specific = _do_cached_post(
+                s=s, referer=url,
+                url=_generate_video_specific_cache_url(slug=slug, parent_slug=parent_slug))
+            video_id = cached_video_specific['data']['blocks'][1]['videoId']
+
+        try:
+            return self._get_disco_api_info(
+                url=url, display_id=video_id, disco_host=sonic_endpoint,
+                realm=sonic_realm, country=country, api_version=3)
+        except ExtractorError as e:
+            # This specific API error message is reported through report_drm before re-raising.
+            if getattr(e, 'message', '') == 'Missing deviceId in context':
+                self.report_drm(video_id)
+            raise