diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index da5f26257..ebda09c8c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -125,11 +125,12 @@ jobs: sudo apt -y install zip pandoc man sed cat > ./requirements.txt << EOF python=3.10.* + pyinstaller brotli-python EOF python devscripts/install_deps.py --print \ --exclude brotli --exclude brotlicffi \ - --include secretstorage --include pyinstaller >> ./requirements.txt + --include secretstorage >> ./requirements.txt mamba create -n build --file ./requirements.txt - name: Prepare @@ -247,13 +248,13 @@ jobs: run: | brew install coreutils python3 devscripts/install_deps.py --user -o --include build - python3 devscripts/install_deps.py --print --include pyinstaller_macos > requirements.txt + python3 devscripts/install_deps.py --print --include pyinstaller > requirements.txt # We need to ignore wheels otherwise we break universal2 builds python3 -m pip install -U --user --no-binary :all: -r requirements.txt # We need to fuse our own universal2 wheels for curl_cffi python3 -m pip install -U --user delocate mkdir curl_cffi_whls curl_cffi_universal2 - python3 devscripts/install_deps.py --print -o --include curl_cffi > requirements.txt + python3 devscripts/install_deps.py --print -o --include curl-cffi > requirements.txt for platform in "macosx_11_0_arm64" "macosx_11_0_x86_64"; do python3 -m pip download \ --only-binary=:all: \ @@ -319,7 +320,7 @@ jobs: run: | brew install coreutils python3 devscripts/install_deps.py --user -o --include build - python3 devscripts/install_deps.py --user --include pyinstaller_macos --include curl_cffi + python3 devscripts/install_deps.py --user --include pyinstaller - name: Prepare run: | @@ -361,7 +362,7 @@ jobs: - name: Install Requirements run: | # Custom pyinstaller built with https://github.com/yt-dlp/pyinstaller-builds python devscripts/install_deps.py -o --include build - python devscripts/install_deps.py --include py2exe --include 
curl_cffi + python devscripts/install_deps.py --include py2exe --include curl-cffi python -m pip install -U "https://yt-dlp.github.io/Pyinstaller-Builds/x86_64/pyinstaller-5.8.0-py3-none-any.whl" - name: Prepare @@ -466,8 +467,9 @@ jobs: - name: Make SHA2-SUMS files run: | cd ./artifact/ - sha256sum * > ../SHA2-256SUMS - sha512sum * > ../SHA2-512SUMS + # make sure SHA sums are also printed to stdout + sha256sum * | tee ../SHA2-256SUMS + sha512sum * | tee ../SHA2-512SUMS - name: Make Update spec run: | diff --git a/.github/workflows/core.yml b/.github/workflows/core.yml index 076f785bf..70769f967 100644 --- a/.github/workflows/core.yml +++ b/.github/workflows/core.yml @@ -53,7 +53,7 @@ jobs: with: python-version: ${{ matrix.python-version }} - name: Install test requirements - run: python3 ./devscripts/install_deps.py --include dev --include curl_cffi + run: python3 ./devscripts/install_deps.py --include dev --include curl-cffi - name: Run tests continue-on-error: False run: | diff --git a/.github/workflows/quick-test.yml b/.github/workflows/quick-test.yml index 3114e7bdd..24b34911f 100644 --- a/.github/workflows/quick-test.yml +++ b/.github/workflows/quick-test.yml @@ -27,6 +27,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 + with: + python-version: '3.8' - name: Install flake8 run: python3 ./devscripts/install_deps.py -o --include dev - name: Make lazy extractors diff --git a/CONTRIBUTORS b/CONTRIBUTORS index 6ee3baa3d..8b5d19a64 100644 --- a/CONTRIBUTORS +++ b/CONTRIBUTORS @@ -600,3 +600,13 @@ xpadev-net Xpl0itU YoshichikaAAA zhijinwuu +alb +hruzgar +kasper93 +leoheitmannruiz +luiso1979 +nipotan +Offert4324 +sta1us +Tomoka1 +trwstin diff --git a/Changelog.md b/Changelog.md index 45a9cef3f..6cf08beab 100644 --- a/Changelog.md +++ b/Changelog.md @@ -4,6 +4,101 @@ # To create a release, dispatch the https://github.com/yt-dlp/yt-dlp/actions/workflows/release.yml workflow on master --> +### 2024.04.09 + +#### Important changes +- 
Security: [[CVE-2024-22423](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-22423)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-hjq6-52gw-2g7p) + - The shell escape function now properly escapes `%`, `\` and `\n`. + - `utils.Popen` has been patched accordingly. + +#### Core changes +- [Add new option `--progress-delta`](https://github.com/yt-dlp/yt-dlp/commit/9590cc6b4768e190183d7d071a6c78170889116a) ([#9082](https://github.com/yt-dlp/yt-dlp/issues/9082)) by [Grub4K](https://github.com/Grub4K) +- [Add new options `--impersonate` and `--list-impersonate-targets`](https://github.com/yt-dlp/yt-dlp/commit/0b81d4d252bd065ccd352722987ea34fe17f9244) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- [Add option `--no-break-on-existing`](https://github.com/yt-dlp/yt-dlp/commit/16be117729150b2784f3b17755c886cb0cf73374) ([#9610](https://github.com/yt-dlp/yt-dlp/issues/9610)) by [bashonly](https://github.com/bashonly) +- [Fix `filesize_approx` calculation](https://github.com/yt-dlp/yt-dlp/commit/86e3b82261e8ebc6c6707c09544c9dfb8907c0fd) ([#9560](https://github.com/yt-dlp/yt-dlp/issues/9560)) by [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) +- [Infer `acodec` for single-codec containers](https://github.com/yt-dlp/yt-dlp/commit/86a972033e05fea80e5fe7f2aff6723dbe2f3952) by [pukkandan](https://github.com/pukkandan) +- [Prevent RCE when using `--exec` with `%q` (CVE-2024-22423)](https://github.com/yt-dlp/yt-dlp/commit/ff07792676f404ffff6ee61b5638c9dc1a33a37a) by [Grub4K](https://github.com/Grub4K) +- **cookies**: [Add `--cookies-from-browser` support for Firefox Flatpak](https://github.com/yt-dlp/yt-dlp/commit/2ab2651a4a7be18939e2b4cb21be79fe477c797a) ([#9619](https://github.com/yt-dlp/yt-dlp/issues/9619)) by [un-def](https://github.com/un-def) +- 
**utils** + - `traverse_obj` + - [Allow unbranching using `all` and `any`](https://github.com/yt-dlp/yt-dlp/commit/3699eeb67cad333272b14a42dd3843d93fda1a2e) ([#9571](https://github.com/yt-dlp/yt-dlp/issues/9571)) by [Grub4K](https://github.com/Grub4K) + - [Convenience improvements](https://github.com/yt-dlp/yt-dlp/commit/32abfb00bdbd119ca675fdc6d1719331f0a2741a) ([#9577](https://github.com/yt-dlp/yt-dlp/issues/9577)) by [Grub4K](https://github.com/Grub4K) + +#### Extractor changes +- [Add extractor impersonate API](https://github.com/yt-dlp/yt-dlp/commit/50c29352312f5662acf9a64b0012766f5c40af61) ([#9474](https://github.com/yt-dlp/yt-dlp/issues/9474)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) +- **afreecatv** + - [Overhaul extractor](https://github.com/yt-dlp/yt-dlp/commit/9415f1a5ef88482ebafe3083e8bcb778ac512df7) ([#9566](https://github.com/yt-dlp/yt-dlp/issues/9566)) by [bashonly](https://github.com/bashonly), [Tomoka1](https://github.com/Tomoka1) + - live: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/9073ae6458f4c6a832aa832c67174c61852869be) ([#9348](https://github.com/yt-dlp/yt-dlp/issues/9348)) by [hui1601](https://github.com/hui1601) +- **asobistage**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/0284f1fee202302a78888420f933deae19d9f4e1) ([#8735](https://github.com/yt-dlp/yt-dlp/issues/8735)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **box**: [Support URLs without file IDs](https://github.com/yt-dlp/yt-dlp/commit/07f5b2f7570fd9ac85aed17f4c0118f6eac77beb) ([#9504](https://github.com/yt-dlp/yt-dlp/issues/9504)) by [shreyasminocha](https://github.com/shreyasminocha) +- **cbc.ca**: player: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/b49d5ffc53a72d8245ba319ff07bdc5b8c6a4f0c) ([#9561](https://github.com/yt-dlp/yt-dlp/issues/9561)) by [trainman261](https://github.com/trainman261) +- **crunchyroll** + - [Extract `vo_adaptive_hls` formats 
by default](https://github.com/yt-dlp/yt-dlp/commit/be77923ffe842f667971019460f6005f3cad01eb) ([#9447](https://github.com/yt-dlp/yt-dlp/issues/9447)) by [bashonly](https://github.com/bashonly) + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/954e57e405f79188450eb30103a9308732cd318f) ([#9615](https://github.com/yt-dlp/yt-dlp/issues/9615)) by [bytedream](https://github.com/bytedream) +- **dropbox**: [Fix formats extraction](https://github.com/yt-dlp/yt-dlp/commit/a48cc86d6f6b20427553620c2ddb990ede6a4b41) ([#9627](https://github.com/yt-dlp/yt-dlp/issues/9627)) by [bashonly](https://github.com/bashonly) +- **fathom**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/bc2b8c0596fd6b75af24822c4f0f1da6783d71f7) ([#9495](https://github.com/yt-dlp/yt-dlp/issues/9495)) by [src-tinkerer](https://github.com/src-tinkerer) +- **gofile**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/0da66980d3193cad3dae0120cddddbfcabddf7a1) ([#9446](https://github.com/yt-dlp/yt-dlp/issues/9446)) by [jazz1611](https://github.com/jazz1611) +- **imgur**: [Fix extraction](https://github.com/yt-dlp/yt-dlp/commit/86d2f4d24849af0d1f3af7c0e2ac43bf8a058f74) ([#9471](https://github.com/yt-dlp/yt-dlp/issues/9471)) by [trwstin](https://github.com/trwstin) +- **jiosaavn** + - [Extract artists](https://github.com/yt-dlp/yt-dlp/commit/0ae16ceb1846cc4e609b70ce7c5d8e7458efceb2) ([#9612](https://github.com/yt-dlp/yt-dlp/issues/9612)) by [bashonly](https://github.com/bashonly) + - [Fix format extensions](https://github.com/yt-dlp/yt-dlp/commit/443e206ec41e64ca2aef61d8ef91640fb69b3113) ([#9609](https://github.com/yt-dlp/yt-dlp/issues/9609)) by [bashonly](https://github.com/bashonly) + - [Support playlists](https://github.com/yt-dlp/yt-dlp/commit/2e94602f241f6e41bdc48576c61089435529339b) ([#9622](https://github.com/yt-dlp/yt-dlp/issues/9622)) by [bashonly](https://github.com/bashonly) +- **joqrag**: [Fix live status 
detection](https://github.com/yt-dlp/yt-dlp/commit/f2fd449b46c4058222e1744f7a35caa20b2d003d) ([#9624](https://github.com/yt-dlp/yt-dlp/issues/9624)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **kick**: [Support browser impersonation](https://github.com/yt-dlp/yt-dlp/commit/c8a61a910096c77ce08dad5e1b2fbda5eb964156) ([#9611](https://github.com/yt-dlp/yt-dlp/issues/9611)) by [bashonly](https://github.com/bashonly) +- **loom**: [Add extractors](https://github.com/yt-dlp/yt-dlp/commit/f859ed3ba1e8b129ae6a467592c65687e73fbca1) ([#8686](https://github.com/yt-dlp/yt-dlp/issues/8686)) by [bashonly](https://github.com/bashonly), [hruzgar](https://github.com/hruzgar) +- **medici**: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4cd9e251b9abada107b10830de997bf4d79ca369) ([#9518](https://github.com/yt-dlp/yt-dlp/issues/9518)) by [Offert4324](https://github.com/Offert4324) +- **mixch** + - [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/4c3b7a0769706f7f0ea24adf1f219d5ae82d2b07) ([#9608](https://github.com/yt-dlp/yt-dlp/issues/9608)) by [bashonly](https://github.com/bashonly), [nipotan](https://github.com/nipotan) + - archive: [Fix extractor](https://github.com/yt-dlp/yt-dlp/commit/c59de48e2bb4c681b03b93b584a05f52609ce4a0) ([#8761](https://github.com/yt-dlp/yt-dlp/issues/8761)) by [pzhlkj6612](https://github.com/pzhlkj6612) +- **nhk**: [Fix NHK World extractors](https://github.com/yt-dlp/yt-dlp/commit/4af9d5c2f6aa81403ae2a8a5ae3cc824730f0b86) ([#9623](https://github.com/yt-dlp/yt-dlp/issues/9623)) by [bashonly](https://github.com/bashonly) +- **patreon**: [Do not extract dead embed URLs](https://github.com/yt-dlp/yt-dlp/commit/36b240f9a72af57eb2c9d927ebb7fd1c917ebf18) ([#9613](https://github.com/yt-dlp/yt-dlp/issues/9613)) by [johnvictorfs](https://github.com/johnvictorfs) +- **radio1be**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/36baaa10e06715ccba06b78885b2042c4844c826) ([#9122](https://github.com/yt-dlp/yt-dlp/issues/9122)) by 
[HobbyistDev](https://github.com/HobbyistDev) +- **sharepoint**: [Add extractor](https://github.com/yt-dlp/yt-dlp/commit/ff349ff94aae0b2b148bd3670f7c91d39c2f1d8e) ([#6531](https://github.com/yt-dlp/yt-dlp/issues/6531)) by [bashonly](https://github.com/bashonly), [C0D3D3V](https://github.com/C0D3D3V) +- **sonylivseries**: [Fix season extraction](https://github.com/yt-dlp/yt-dlp/commit/f2868b26e917354203f82a370ad2396646edb813) ([#9423](https://github.com/yt-dlp/yt-dlp/issues/9423)) by [bashonly](https://github.com/bashonly) +- **soundcloud** + - [Adjust format sorting](https://github.com/yt-dlp/yt-dlp/commit/a2d0840739cddd585d24e0ce4796394fc8a4fa2e) ([#9584](https://github.com/yt-dlp/yt-dlp/issues/9584)) by [bashonly](https://github.com/bashonly) + - [Support cookies](https://github.com/yt-dlp/yt-dlp/commit/97362712a1f2b04e735bdf54f749ad99165a62fe) ([#9586](https://github.com/yt-dlp/yt-dlp/issues/9586)) by [bashonly](https://github.com/bashonly) + - [Support retries for API rate-limit](https://github.com/yt-dlp/yt-dlp/commit/246571ae1d867df8bf31a056bdf3bbbfd398366a) ([#9585](https://github.com/yt-dlp/yt-dlp/issues/9585)) by [bashonly](https://github.com/bashonly) +- **thisoldhouse**: [Support Brightcove embeds](https://github.com/yt-dlp/yt-dlp/commit/0df63cce69026d2f4c0cbb4dd36163e83eac93dc) ([#9576](https://github.com/yt-dlp/yt-dlp/issues/9576)) by [bashonly](https://github.com/bashonly) +- **tiktok** + - [Fix API extraction](https://github.com/yt-dlp/yt-dlp/commit/cb61e20c266facabb7a30f9ce53bd79dfc158475) ([#9548](https://github.com/yt-dlp/yt-dlp/issues/9548)) by [bashonly](https://github.com/bashonly), [Grub4K](https://github.com/Grub4K) + - [Prefer non-bytevc2 formats](https://github.com/yt-dlp/yt-dlp/commit/63f685f341f35f6f02b0368d1ba53bdb5b520410) ([#9575](https://github.com/yt-dlp/yt-dlp/issues/9575)) by [bashonly](https://github.com/bashonly) + - [Restore `carrier_region` API 
parameter](https://github.com/yt-dlp/yt-dlp/commit/fc53ec13ff1ee926a3e533a68cfca8acc887b661) ([#9637](https://github.com/yt-dlp/yt-dlp/issues/9637)) by [bashonly](https://github.com/bashonly) + - [Update API hostname](https://github.com/yt-dlp/yt-dlp/commit/8c05b3ebae23c5b444857549a85b84004c01a536) ([#9444](https://github.com/yt-dlp/yt-dlp/issues/9444)) by [bashonly](https://github.com/bashonly) +- **twitch**: [Extract AV1 and HEVC formats](https://github.com/yt-dlp/yt-dlp/commit/02f93ff51b3ff9436d60c4993562b366eaae8851) ([#9158](https://github.com/yt-dlp/yt-dlp/issues/9158)) by [kasper93](https://github.com/kasper93) +- **vkplay**: [Fix `_VALID_URL`](https://github.com/yt-dlp/yt-dlp/commit/b15b0c1d2106437ec61a5c436c543e8760eac160) ([#9636](https://github.com/yt-dlp/yt-dlp/issues/9636)) by [bashonly](https://github.com/bashonly) +- **xvideos**: [Support new URL format](https://github.com/yt-dlp/yt-dlp/commit/aa7e9ae4f48276bd5d0173966c77db9484f65a0a) ([#9502](https://github.com/yt-dlp/yt-dlp/issues/9502)) by [sta1us](https://github.com/sta1us) +- **youtube** + - [Calculate more accurate `filesize`](https://github.com/yt-dlp/yt-dlp/commit/a25a424323267e3f6f9f63c0b62df499bd7b8d46) by [pukkandan](https://github.com/pukkandan) + - [Update `android` params](https://github.com/yt-dlp/yt-dlp/commit/e7b17fce14775bd2448695c8eb7379b8d31d3537) by [pukkandan](https://github.com/pukkandan) + - search: [Fix params for uncensored results](https://github.com/yt-dlp/yt-dlp/commit/17d248a58781e2588d18a5ebe00c441d10011fcd) ([#9456](https://github.com/yt-dlp/yt-dlp/issues/9456)) by [alb](https://github.com/alb), [pukkandan](https://github.com/pukkandan) + +#### Downloader changes +- **ffmpeg**: [Accept output args from info dict](https://github.com/yt-dlp/yt-dlp/commit/9c42b7eef547e826e9fcc7beb6706a2523949d05) ([#9278](https://github.com/yt-dlp/yt-dlp/issues/9278)) by [bashonly](https://github.com/bashonly) + +#### Networking changes +- [Respect `SSLKEYLOGFILE` environment 
variable](https://github.com/yt-dlp/yt-dlp/commit/79a451e5763eda8b10d00684d5d3378f3255ee01) ([#9543](https://github.com/yt-dlp/yt-dlp/issues/9543)) by [luiso1979](https://github.com/luiso1979) +- **Request Handler** + - curlcffi: [Add support for `curl_cffi`](https://github.com/yt-dlp/yt-dlp/commit/52f5be1f1e0dc45bb397ab950f564721976a39bf) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan) + - websockets: [Workaround race condition causing issues on PyPy](https://github.com/yt-dlp/yt-dlp/commit/e5d4f11104ce7ea1717a90eea82c0f7d230ea5d5) ([#9514](https://github.com/yt-dlp/yt-dlp/issues/9514)) by [coletdjnz](https://github.com/coletdjnz) + +#### Misc. changes +- **build** + - [Do not include `curl_cffi` in `macos_legacy`](https://github.com/yt-dlp/yt-dlp/commit/b19ae095fdddd43c2a2c67d10fbe0d9a645bb98f) ([#9653](https://github.com/yt-dlp/yt-dlp/issues/9653)) by [bashonly](https://github.com/bashonly) + - [Optional dependencies cleanup](https://github.com/yt-dlp/yt-dlp/commit/58dd0f8d1eee6bc9fdc57f1923bed772fa3c946d) ([#9550](https://github.com/yt-dlp/yt-dlp/issues/9550)) by [bashonly](https://github.com/bashonly) + - [Print SHA sums to GHA logs](https://github.com/yt-dlp/yt-dlp/commit/e8032503b9517465b0e86d776fc1e60d8795d673) ([#9582](https://github.com/yt-dlp/yt-dlp/issues/9582)) by [bashonly](https://github.com/bashonly) + - [Update changelog for tarball and sdist](https://github.com/yt-dlp/yt-dlp/commit/17b96974a334688f76b57d350e07cae8cda46877) ([#9425](https://github.com/yt-dlp/yt-dlp/issues/9425)) by [bashonly](https://github.com/bashonly) +- **cleanup** + - [Standardize `import datetime as dt`](https://github.com/yt-dlp/yt-dlp/commit/c305a25c1b16bcf7a5ec499c3b786ed1e2c748da) ([#8978](https://github.com/yt-dlp/yt-dlp/issues/8978)) by [pukkandan](https://github.com/pukkandan) + - ie: [No `from` stdlib imports in 
extractors](https://github.com/yt-dlp/yt-dlp/commit/e3a3ed8a981d9395c4859b6ef56cd02bc3148db2) by [pukkandan](https://github.com/pukkandan) + - Miscellaneous: [216f6a3](https://github.com/yt-dlp/yt-dlp/commit/216f6a3cb57824e6a3c859649ce058c199b1b247) by [bashonly](https://github.com/bashonly), [pukkandan](https://github.com/pukkandan) +- **docs** + - [Update yt-dlp tagline](https://github.com/yt-dlp/yt-dlp/commit/388c979ac63a8774339fac2516fe1cc852b4276e) ([#9481](https://github.com/yt-dlp/yt-dlp/issues/9481)) by [bashonly](https://github.com/bashonly), [coletdjnz](https://github.com/coletdjnz), [Grub4K](https://github.com/Grub4K), [pukkandan](https://github.com/pukkandan), [seproDev](https://github.com/seproDev) + - [Various manpage fixes](https://github.com/yt-dlp/yt-dlp/commit/df0e138fc02ae2764a44f2f59fc93c756c4d3ee2) by [leoheitmannruiz](https://github.com/leoheitmannruiz) +- **test** + - [Workaround websocket server hanging](https://github.com/yt-dlp/yt-dlp/commit/f849d77ab54788446b995d256e1ee0894c4fb927) ([#9467](https://github.com/yt-dlp/yt-dlp/issues/9467)) by [coletdjnz](https://github.com/coletdjnz) + - `traversal`: [Separate traversal tests](https://github.com/yt-dlp/yt-dlp/commit/979ce2e786f2ee3fc783b6dc1ef4188d8805c923) ([#9574](https://github.com/yt-dlp/yt-dlp/issues/9574)) by [Grub4K](https://github.com/Grub4K) + ### 2024.03.10 #### Core changes diff --git a/Makefile b/Makefile index 38c6b4f2d..cef4bc6cb 100644 --- a/Makefile +++ b/Makefile @@ -10,9 +10,12 @@ tar: yt-dlp.tar.gz # intended use: when building a source distribution, # make pypi-files && python3 -m build -sn . 
pypi-files: AUTHORS Changelog.md LICENSE README.md README.txt supportedsites \ - completions yt-dlp.1 pyproject.toml setup.cfg devscripts/* test/* + completions yt-dlp.1 pyproject.toml setup.cfg devscripts/* test/* -.PHONY: all clean install test tar pypi-files completions ot offlinetest codetest supportedsites +.PHONY: all clean clean-all clean-test clean-dist clean-cache \ + completions completion-bash completion-fish completion-zsh \ + doc issuetemplates supportedsites ot offlinetest codetest test \ + tar pypi-files lazy-extractors install uninstall clean-test: rm -rf test/testdata/sigs/player-*.js tmp/ *.annotations.xml *.aria2 *.description *.dump *.frag \ diff --git a/README.md b/README.md index 8497f8655..3202e634a 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,7 @@ When using `--update`/`-U`, a release binary will only update to its current cha You may also use `--update-to <repository>` (`<owner>/<repository>`) to update to a channel on a completely different repository. Be careful with what repository you are updating to though, there is no verification done for binaries from different repositories. Example usage: + * `yt-dlp --update-to master` switch to the `master` channel and update to its latest release * `yt-dlp --update-to stable@2023.07.06` upgrade/downgrade to release to `stable` channel tag `2023.07.06` * `yt-dlp --update-to 2023.10.07` upgrade/downgrade to tag `2023.10.07` if it exists on the current channel @@ -201,8 +202,8 @@ While all the other dependencies are optional, `ffmpeg` and `ffprobe` are highly The following provide support for impersonating browser requests. This may be required for some sites that employ TLS fingerprinting. * [**curl_cffi**](https://github.com/yifeikong/curl_cffi) (recommended) - Python binding for [curl-impersonate](https://github.com/lwthiker/curl-impersonate). Provides impersonation targets for Chrome, Edge and Safari. 
Licensed under [MIT](https://github.com/yifeikong/curl_cffi/blob/main/LICENSE) - * Can be installed with the `curl_cffi` group, e.g. `pip install yt-dlp[default,curl_cffi]` - * Only included in `yt-dlp.exe`, `yt-dlp_macos` and `yt-dlp_macos_legacy` builds + * Can be installed with the `curl-cffi` group, e.g. `pip install yt-dlp[default,curl-cffi]` + * Currently only included in `yt-dlp.exe` and `yt-dlp_macos` builds ### Metadata @@ -481,6 +482,9 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git --max-downloads NUMBER Abort after downloading NUMBER files --break-on-existing Stop the download process when encountering a file that is in the archive + --no-break-on-existing Do not stop the download process when + encountering a file that is in the archive + (default) --break-per-input Alters --max-downloads, --break-on-existing, --break-match-filter, and autonumber to reset per input URL @@ -754,6 +758,7 @@ If you fork the project on GitHub, you can run your fork's [build workflow](.git accessible under "progress" key. E.g. 
--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s" + --progress-delta SECONDS Time between progress output (default: 0) -v, --verbose Print various debugging information --dump-pages Print downloaded pages encoded using base64 to debug problems (very verbose) @@ -1472,9 +1477,9 @@ The following numeric meta fields can be used with comparisons `<`, `<=`, `>`, ` - `width`: Width of the video, if known - `height`: Height of the video, if known - `aspect_ratio`: Aspect ratio of the video, if known - - `tbr`: Average bitrate of audio and video in KBit/s - - `abr`: Average audio bitrate in KBit/s - - `vbr`: Average video bitrate in KBit/s + - `tbr`: Average bitrate of audio and video in [kbps](## "1000 bits/sec") + - `abr`: Average audio bitrate in [kbps](## "1000 bits/sec") + - `vbr`: Average video bitrate in [kbps](## "1000 bits/sec") - `asr`: Audio sampling rate in Hertz - `fps`: Frame rate - `audio_channels`: The number of audio channels @@ -1499,7 +1504,7 @@ Any string comparison may be prefixed with negation `!` in order to produce an o **Note**: None of the aforementioned meta fields are guaranteed to be present since this solely depends on the metadata obtained by particular extractor, i.e. the metadata offered by the website. Any other field made available by the extractor can also be used for filtering. -Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 KBit/s. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. +Formats for which the value is not known are excluded unless you put a question mark (`?`) after the operator. 
You can combine format filters, so `-f "bv[height<=?720][tbr>500]"` selects up to 720p videos (or videos where the height is not known) with a bitrate of at least 500 kbps. You can also use the filters with `all` to download all formats that satisfy the filter, e.g. `-f "all[vcodec=none]"` selects all audio-only formats. Format selectors can also be grouped using parentheses; e.g. `-f "(mp4,webm)[height<480]"` will download the best pre-merged mp4 and webm formats with a height lower than 480. @@ -1531,10 +1536,10 @@ The available fields are: - `fps`: Framerate of video - `hdr`: The dynamic range of the video (`DV` > `HDR12` > `HDR10+` > `HDR10` > `HLG` > `SDR`) - `channels`: The number of audio channels - - `tbr`: Total average bitrate in KBit/s - - `vbr`: Average video bitrate in KBit/s - - `abr`: Average audio bitrate in KBit/s - - `br`: Average bitrate in KBit/s, `tbr`/`vbr`/`abr` + - `tbr`: Total average bitrate in [kbps](## "1000 bits/sec") + - `vbr`: Average video bitrate in [kbps](## "1000 bits/sec") + - `abr`: Average audio bitrate in [kbps](## "1000 bits/sec") + - `br`: Average bitrate in [kbps](## "1000 bits/sec"), `tbr`/`vbr`/`abr` - `asr`: Audio sample rate in Hz **Deprecation warning**: Many of these fields have (currently undocumented) aliases, that may be removed in a future version. It is recommended to use only the documented field names. @@ -1781,8 +1786,7 @@ The following extractors use this feature: * `version`: The video version to extract - `uncut` or `simulcast` #### crunchyrollbeta (Crunchyroll) -* `format`: Which stream type(s) to extract (default: `adaptive_hls`). Potentially useful values include `adaptive_hls`, `adaptive_dash`, `vo_adaptive_hls`, `vo_adaptive_dash`, `download_hls`, `download_dash`, `multitrack_adaptive_hls_v2` -* `hardsub`: Preference order for which hardsub versions to extract, or `all` (default: `None` = no hardsubs), e.g. 
`crunchyrollbeta:hardsub=en-US,None` +* `hardsub`: One or more hardsub versions to extract (in order of preference), or `all` (default: `None` = no hardsubs will be extracted), e.g. `crunchyrollbeta:hardsub=en-US,de-DE` #### vikichannel * `video_types`: Types of videos to download - one or more of `episodes`, `movies`, `clips`, `trailers` @@ -1805,9 +1809,12 @@ The following extractors use this feature: * `max_comments`: Maximum number of comments to extract - default is `120` #### tiktok -* `api_hostname`: Hostname to use for mobile API requests, e.g. `api-h2.tiktokv.com` -* `app_version`: App version to call mobile APIs with - should be set along with `manifest_app_version`, e.g. `20.2.1` -* `manifest_app_version`: Numeric app version to call mobile APIs with, e.g. `221` +* `api_hostname`: Hostname to use for mobile API calls, e.g. `api22-normal-c-alisg.tiktokv.com` +* `app_name`: Default app name to use with mobile API calls, e.g. `trill` +* `app_version`: Default app version to use with mobile API calls - should be set along with `manifest_app_version`, e.g. `34.1.2` +* `manifest_app_version`: Default numeric app version to use with mobile API calls, e.g. `2023401020` +* `aid`: Default app ID to use with API calls, e.g. `1180` +* `app_info`: One or more app info strings in the format of `<iid>/[app_name]/[app_version]/[manifest_app_version]/[aid]`, where `iid` is the unique app install ID. `iid` is the only required value; all other values and their `/` separators can be omitted, e.g. `tiktok:app_info=1234567890123456789` or `tiktok:app_info=123,456/trill///1180,789//34.0.1/340001` #### rokfinchannel * `tab`: Which tab to download - one of `new`, `top`, `videos`, `podcasts`, `streams`, `stacks` @@ -1830,6 +1837,9 @@ The following extractors use this feature: #### jiosaavn * `bitrate`: Audio bitrates to request. One or more of `16`, `32`, `64`, `128`, `320`. 
Default is `128,320` +#### afreecatvlive +* `cdn`: One or more CDN IDs to use with the API call for stream URLs, e.g. `gcp_cdn`, `gs_cdn_pc_app`, `gs_cdn_mobile_web`, `gs_cdn_pc_web` + **Note**: These options may be changed/removed in the future without concern for backward compatibility @@ -1887,6 +1897,7 @@ Plugins can be installed using various methods and locations. `.zip`, `.egg` and `.whl` archives containing a `yt_dlp_plugins` namespace folder in their root are also supported as plugin packages. + * e.g. `${XDG_CONFIG_HOME}/yt-dlp/plugins/mypluginpkg.zip` where `mypluginpkg.zip` contains `yt_dlp_plugins/<type>/myplugin.py` Run yt-dlp with `--verbose` to check if the plugin has been loaded. diff --git a/devscripts/changelog_override.json b/devscripts/changelog_override.json index 2a34ad071..046060cb2 100644 --- a/devscripts/changelog_override.json +++ b/devscripts/changelog_override.json @@ -126,5 +126,26 @@ "when": "4ce57d3b873c2887814cbec03d029533e82f7db5", "short": "[ie] Support multi-period MPD streams (#6654)", "authors": ["alard", "pukkandan"] + }, + { + "action": "change", + "when": "aa7e9ae4f48276bd5d0173966c77db9484f65a0a", + "short": "[ie/xvideos] Support new URL format (#9502)", + "authors": ["sta1us"] + }, + { + "action": "remove", + "when": "22e4dfacb61f62dfbb3eb41b31c7b69ba1059b80" + }, + { + "action": "change", + "when": "e3a3ed8a981d9395c4859b6ef56cd02bc3148db2", + "short": "[cleanup:ie] No `from` stdlib imports in extractors", + "authors": ["pukkandan"] + }, + { + "action": "add", + "when": "9590cc6b4768e190183d7d071a6c78170889116a", + "short": "[priority] Security: [[CVE-2024-22423](https://cve.mitre.org/cgi-bin/cvename.cgi?name=CVE-2024-22423)] [Prevent RCE when using `--exec` with `%q` on Windows](https://github.com/yt-dlp/yt-dlp/security/advisories/GHSA-hjq6-52gw-2g7p)\n - The shell escape function now properly escapes `%`, `\\` and `\\n`.\n - `utils.Popen` has been patched accordingly." 
} ] diff --git a/devscripts/install_deps.py b/devscripts/install_deps.py index 889d9abeb..d33fc637c 100755 --- a/devscripts/install_deps.py +++ b/devscripts/install_deps.py @@ -10,6 +10,8 @@ import argparse import re import subprocess +from pathlib import Path + from devscripts.tomlparse import parse_toml from devscripts.utils import read_file @@ -17,17 +19,23 @@ from devscripts.utils import read_file def parse_args(): parser = argparse.ArgumentParser(description='Install dependencies for yt-dlp') parser.add_argument( - 'input', nargs='?', metavar='TOMLFILE', default='pyproject.toml', help='Input file (default: %(default)s)') + 'input', nargs='?', metavar='TOMLFILE', default=Path(__file__).parent.parent / 'pyproject.toml', + help='input file (default: %(default)s)') parser.add_argument( - '-e', '--exclude', metavar='DEPENDENCY', action='append', help='Exclude a dependency') + '-e', '--exclude', metavar='DEPENDENCY', action='append', + help='exclude a dependency') parser.add_argument( - '-i', '--include', metavar='GROUP', action='append', help='Include an optional dependency group') + '-i', '--include', metavar='GROUP', action='append', + help='include an optional dependency group') parser.add_argument( - '-o', '--only-optional', action='store_true', help='Only install optional dependencies') + '-o', '--only-optional', action='store_true', + help='only install optional dependencies') parser.add_argument( - '-p', '--print', action='store_true', help='Only print a requirements.txt to stdout') + '-p', '--print', action='store_true', + help='only print requirements to stdout') parser.add_argument( - '-u', '--user', action='store_true', help='Install with pip as --user') + '-u', '--user', action='store_true', + help='install with pip as --user') return parser.parse_args() @@ -37,24 +45,16 @@ def main(): optional_groups = project_table['optional-dependencies'] excludes = args.exclude or [] - deps = [] + targets = [] if not args.only_optional: # `-o` should exclude 
'dependencies' and the 'default' group - deps.extend(project_table['dependencies']) + targets.extend(project_table['dependencies']) if 'default' not in excludes: # `--exclude default` should exclude entire 'default' group - deps.extend(optional_groups['default']) - - def name(dependency): - return re.match(r'[\w-]+', dependency)[0].lower() - - target_map = {name(dep): dep for dep in deps} + targets.extend(optional_groups['default']) for include in filter(None, map(optional_groups.get, args.include or [])): - target_map.update(zip(map(name, include), include)) + targets.extend(include) - for exclude in map(name, excludes): - target_map.pop(exclude, None) - - targets = list(target_map.values()) + targets = [t for t in targets if re.match(r'[\w-]+', t).group(0).lower() not in excludes] if args.print: for target in targets: diff --git a/devscripts/prepare_manpage.py b/devscripts/prepare_manpage.py index 009e7bba1..47188e992 100644 --- a/devscripts/prepare_manpage.py +++ b/devscripts/prepare_manpage.py @@ -43,6 +43,27 @@ def filter_excluded_sections(readme): '', readme) +def _convert_code_blocks(readme): + current_code_block = None + + for line in readme.splitlines(True): + if current_code_block: + if line == current_code_block: + current_code_block = None + yield '\n' + else: + yield f' {line}' + elif line.startswith('```'): + current_code_block = line.count('`') * '`' + '\n' + yield '\n' + else: + yield line + + +def convert_code_blocks(readme): + return ''.join(_convert_code_blocks(readme)) + + def move_sections(readme): MOVE_TAG_TEMPLATE = '' sections = re.findall(r'(?m)^%s$' % ( @@ -65,8 +86,10 @@ def move_sections(readme): def filter_options(readme): section = re.search(r'(?sm)^# USAGE AND OPTIONS\n.+?(?=^# )', readme).group(0) + section_new = section.replace('*', R'\*') + options = '# OPTIONS\n' - for line in section.split('\n')[1:]: + for line in section_new.split('\n')[1:]: mobj = re.fullmatch(r'''(?x) \s{4}(?P-(?:,\s|[^\s])+) (?:\s(?P(?:[^\s]|\s(?!\s))+))? 
@@ -86,7 +109,7 @@ def filter_options(readme): return readme.replace(section, options, 1) -TRANSFORM = compose_functions(filter_excluded_sections, move_sections, filter_options) +TRANSFORM = compose_functions(filter_excluded_sections, convert_code_blocks, move_sections, filter_options) def main(): diff --git a/devscripts/tomlparse.py b/devscripts/tomlparse.py index 85ac4eef7..ac9ea3170 100755 --- a/devscripts/tomlparse.py +++ b/devscripts/tomlparse.py @@ -11,7 +11,7 @@ IMPORTANT: INVALID FILES OR MULTILINE STRINGS ARE NOT SUPPORTED! from __future__ import annotations -import datetime +import datetime as dt import json import re @@ -115,9 +115,9 @@ def parse_value(data: str, index: int): for func in [ int, float, - datetime.time.fromisoformat, - datetime.date.fromisoformat, - datetime.datetime.fromisoformat, + dt.time.fromisoformat, + dt.date.fromisoformat, + dt.datetime.fromisoformat, {'true': True, 'false': False}.get, ]: try: @@ -179,7 +179,7 @@ def main(): data = file.read() def default(obj): - if isinstance(obj, (datetime.date, datetime.time, datetime.datetime)): + if isinstance(obj, (dt.date, dt.time, dt.datetime)): return obj.isoformat() print(json.dumps(parse_toml(data), default=default)) diff --git a/devscripts/update-version.py b/devscripts/update-version.py index da54a6a25..07a071745 100644 --- a/devscripts/update-version.py +++ b/devscripts/update-version.py @@ -9,15 +9,15 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import argparse import contextlib +import datetime as dt import sys -from datetime import datetime, timezone from devscripts.utils import read_version, run_process, write_file def get_new_version(version, revision): if not version: - version = datetime.now(timezone.utc).strftime('%Y.%m.%d') + version = dt.datetime.now(dt.timezone.utc).strftime('%Y.%m.%d') if revision: assert revision.isdecimal(), 'Revision must be a number' diff --git a/pyproject.toml b/pyproject.toml index c57cac757..5fadd1449 100644 
--- a/pyproject.toml +++ b/pyproject.toml @@ -53,7 +53,7 @@ dependencies = [ [project.optional-dependencies] default = [] -curl_cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] +curl-cffi = ["curl-cffi==0.5.10; implementation_name=='cpython'"] secretstorage = [ "cffi", "secretstorage", @@ -69,8 +69,10 @@ dev = [ "isort", "pytest", ] -pyinstaller = ["pyinstaller>=6.3"] -pyinstaller_macos = ["pyinstaller==5.13.2"] # needed for curl_cffi builds +pyinstaller = [ + "pyinstaller>=6.3; sys_platform!='darwin'", + "pyinstaller==5.13.2; sys_platform=='darwin'", # needed for curl_cffi +] py2exe = ["py2exe>=0.12"] [project.urls] diff --git a/supportedsites.md b/supportedsites.md index a4b2d5799..ba77c0feb 100644 --- a/supportedsites.md +++ b/supportedsites.md @@ -47,7 +47,7 @@ - **aenetworks:show** - **AeonCo** - **afreecatv**: [*afreecatv*](## "netrc machine") afreecatv.com - - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com + - **afreecatv:live**: [*afreecatv*](## "netrc machine") afreecatv.com livestreams - **afreecatv:user** - **AirTV** - **AitubeKZVideo** @@ -105,6 +105,7 @@ - **ArteTVPlaylist** - **asobichannel**: ASOBI CHANNEL - **asobichannel:tag**: ASOBI CHANNEL + - **AsobiStage**: ASOBISTAGE (アソビステージ) - **AtresPlayer**: [*atresplayer*](## "netrc machine") - **AtScaleConfEvent** - **ATVAt** @@ -436,6 +437,7 @@ - **FacebookPluginsVideo** - **fancode:live**: [*fancode*](## "netrc machine") (**Currently broken**) - **fancode:vod**: [*fancode*](## "netrc machine") (**Currently broken**) + - **Fathom** - **faz.net** - **fc2**: [*fc2*](## "netrc machine") - **fc2:embed** @@ -633,8 +635,9 @@ - **Jamendo** - **JamendoAlbum** - **JeuxVideo**: (**Currently broken**) - - **JioSaavnAlbum** - - **JioSaavnSong** + - **jiosaavn:album** + - **jiosaavn:playlist** + - **jiosaavn:song** - **Joj** - **JoqrAg**: 超!A&G+ 文化放送 (f.k.a. AGQR) Nippon Cultural Broadcasting, Inc. 
(JOQR) - **Jove** @@ -716,6 +719,8 @@ - **Lnk** - **LnkGo** - **loc**: Library of Congress + - **loom** + - **loom:folder** - **LoveHomePorn** - **LRTStream** - **LRTVOD** @@ -1136,6 +1141,7 @@ - **Radiko** - **RadikoRadio** - **radio.de**: (**Currently broken**) + - **Radio1Be** - **radiocanada** - **radiocanada:audiovideo** - **RadioComercial** @@ -1288,6 +1294,7 @@ - **SeznamZpravyArticle** - **Shahid**: [*shahid*](## "netrc machine") - **ShahidShow** + - **SharePoint** - **ShareVideosEmbed** - **ShemarooMe** - **ShowRoomLive** diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py index b7dee496a..c633ce3e4 100644 --- a/test/test_InfoExtractor.py +++ b/test/test_InfoExtractor.py @@ -1906,6 +1906,15 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/ expected_status=TEAPOT_RESPONSE_STATUS) self.assertEqual(content, TEAPOT_RESPONSE_BODY) + def test_search_nextjs_data(self): + data = '' + self.assertEqual(self.ie._search_nextjs_data(data, None), {'props': {}}) + self.assertEqual(self.ie._search_nextjs_data('', None, fatal=False), {}) + self.assertEqual(self.ie._search_nextjs_data('', None, default=None), None) + self.assertEqual(self.ie._search_nextjs_data('', None, default={}), {}) + with self.assertRaises(DeprecationWarning): + self.assertEqual(self.ie._search_nextjs_data('', None, default='{}'), {}) + if __name__ == '__main__': unittest.main() diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 6be47af97..5242cf88f 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -183,7 +183,7 @@ class TestFormatSelection(unittest.TestCase): ] info_dict = _make_result(formats) - ydl = YDL({'format': 'best'}) + ydl = YDL({'format': 'best', 'format_sort': ['abr', 'ext']}) ydl.sort_formats(info_dict) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] @@ -195,7 +195,7 @@ class TestFormatSelection(unittest.TestCase): downloaded = ydl.downloaded_info_dicts[0] 
self.assertEqual(downloaded['format_id'], 'mp3-64') - ydl = YDL({'prefer_free_formats': True}) + ydl = YDL({'prefer_free_formats': True, 'format_sort': ['abr', 'ext']}) ydl.sort_formats(info_dict) ydl.process_ie_result(copy.deepcopy(info_dict)) downloaded = ydl.downloaded_info_dicts[0] diff --git a/test/test_cookies.py b/test/test_cookies.py index 5282ef621..bd61f30a6 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -1,5 +1,5 @@ +import datetime as dt import unittest -from datetime import datetime, timezone from yt_dlp import cookies from yt_dlp.cookies import ( @@ -138,7 +138,7 @@ class TestCookies(unittest.TestCase): self.assertEqual(cookie.name, 'foo') self.assertEqual(cookie.value, 'test%20%3Bcookie') self.assertFalse(cookie.secure) - expected_expiration = datetime(2021, 6, 18, 21, 39, 19, tzinfo=timezone.utc) + expected_expiration = dt.datetime(2021, 6, 18, 21, 39, 19, tzinfo=dt.timezone.utc) self.assertEqual(cookie.expires, int(expected_expiration.timestamp())) def test_pbkdf2_sha1(self): diff --git a/test/test_traversal.py b/test/test_traversal.py new file mode 100644 index 000000000..9b2a27b08 --- /dev/null +++ b/test/test_traversal.py @@ -0,0 +1,444 @@ +import http.cookies +import re +import xml.etree.ElementTree + +import pytest + +from yt_dlp.utils import dict_get, int_or_none, str_or_none +from yt_dlp.utils.traversal import traverse_obj + +_TEST_DATA = { + 100: 100, + 1.2: 1.2, + 'str': 'str', + 'None': None, + '...': ..., + 'urls': [ + {'index': 0, 'url': 'https://www.example.com/0'}, + {'index': 1, 'url': 'https://www.example.com/1'}, + ], + 'data': ( + {'index': 2}, + {'index': 3}, + ), + 'dict': {}, +} + + +class TestTraversal: + def test_traversal_base(self): + assert traverse_obj(_TEST_DATA, ('str',)) == 'str', \ + 'allow tuple path' + assert traverse_obj(_TEST_DATA, ['str']) == 'str', \ + 'allow list path' + assert traverse_obj(_TEST_DATA, (value for value in ("str",))) == 'str', \ + 'allow iterable path' + assert 
traverse_obj(_TEST_DATA, 'str') == 'str', \ + 'single items should be treated as a path' + assert traverse_obj(_TEST_DATA, 100) == 100, \ + 'allow int path' + assert traverse_obj(_TEST_DATA, 1.2) == 1.2, \ + 'allow float path' + assert traverse_obj(_TEST_DATA, None) == _TEST_DATA, \ + '`None` should not perform any modification' + + def test_traversal_ellipsis(self): + assert traverse_obj(_TEST_DATA, ...) == [x for x in _TEST_DATA.values() if x not in (None, {})], \ + '`...` should give all non discarded values' + assert traverse_obj(_TEST_DATA, ('urls', 0, ...)) == list(_TEST_DATA['urls'][0].values()), \ + '`...` selection for dicts should select all values' + assert traverse_obj(_TEST_DATA, (..., ..., 'url')) == ['https://www.example.com/0', 'https://www.example.com/1'], \ + 'nested `...` queries should work' + assert traverse_obj(_TEST_DATA, (..., ..., 'index')) == list(range(4)), \ + '`...` query result should be flattened' + assert traverse_obj(iter(range(4)), ...) == list(range(4)), \ + '`...` should accept iterables' + + def test_traversal_function(self): + filter_func = lambda x, y: x == 'urls' and isinstance(y, list) + assert traverse_obj(_TEST_DATA, filter_func) == [_TEST_DATA['urls']], \ + 'function as query key should perform a filter based on (key, value)' + assert traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)) == ['str'], \ + 'exceptions in the query function should be catched' + assert traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0) == [0, 2], \ + 'function key should accept iterables' + # Wrong function signature should raise (debug mode) + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, lambda a: ...) + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, lambda a, b, c: ...) 
+ + def test_traversal_set(self): + # transformation/type, like `expected_type` + assert traverse_obj(_TEST_DATA, (..., {str.upper}, )) == ['STR'], \ + 'Function in set should be a transformation' + assert traverse_obj(_TEST_DATA, (..., {str})) == ['str'], \ + 'Type in set should be a type filter' + assert traverse_obj(_TEST_DATA, (..., {str, int})) == [100, 'str'], \ + 'Multiple types in set should be a type filter' + assert traverse_obj(_TEST_DATA, {dict}) == _TEST_DATA, \ + 'A single set should be wrapped into a path' + assert traverse_obj(_TEST_DATA, (..., {str.upper})) == ['STR'], \ + 'Transformation function should not raise' + expected = [x for x in map(str_or_none, _TEST_DATA.values()) if x is not None] + assert traverse_obj(_TEST_DATA, (..., {str_or_none})) == expected, \ + 'Function in set should be a transformation' + assert traverse_obj(_TEST_DATA, ('fail', {lambda _: 'const'})) == 'const', \ + 'Function in set should always be called' + # Sets with length < 1 or > 1 not including only types should raise + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, set()) + with pytest.raises(Exception): + traverse_obj(_TEST_DATA, {str.upper, str}) + + def test_traversal_slice(self): + _SLICE_DATA = [0, 1, 2, 3, 4] + + assert traverse_obj(_TEST_DATA, ('dict', slice(1))) is None, \ + 'slice on a dictionary should not throw' + assert traverse_obj(_SLICE_DATA, slice(1)) == _SLICE_DATA[:1], \ + 'slice key should apply slice to sequence' + assert traverse_obj(_SLICE_DATA, slice(1, 2)) == _SLICE_DATA[1:2], \ + 'slice key should apply slice to sequence' + assert traverse_obj(_SLICE_DATA, slice(1, 4, 2)) == _SLICE_DATA[1:4:2], \ + 'slice key should apply slice to sequence' + + def test_traversal_alternatives(self): + assert traverse_obj(_TEST_DATA, 'fail', 'str') == 'str', \ + 'multiple `paths` should be treated as alternative paths' + assert traverse_obj(_TEST_DATA, 'str', 100) == 'str', \ + 'alternatives should exit early' + assert traverse_obj(_TEST_DATA, 
'fail', 'fail') is None, \ + 'alternatives should return `default` if exhausted' + assert traverse_obj(_TEST_DATA, (..., 'fail'), 100) == 100, \ + 'alternatives should track their own branching return' + assert traverse_obj(_TEST_DATA, ('dict', ...), ('data', ...)) == list(_TEST_DATA['data']), \ + 'alternatives on empty objects should search further' + + def test_traversal_branching_nesting(self): + assert traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')) == ['https://www.example.com/0'], \ + 'tuple as key should be treated as branches' + assert traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')) == ['https://www.example.com/0'], \ + 'list as key should be treated as branches' + assert traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))) == ['https://www.example.com/0'], \ + 'double nesting in path should be treated as paths' + assert traverse_obj(['0', [1, 2]], [(0, 1), 0]) == [1], \ + 'do not fail early on branching' + expected = ['https://www.example.com/0', 'https://www.example.com/1'] + assert traverse_obj(_TEST_DATA, ('urls', ((0, ('fail', 'url')), (1, 'url')))) == expected, \ + 'tripple nesting in path should be treated as branches' + assert traverse_obj(_TEST_DATA, ('urls', ('fail', (..., 'url')))) == expected, \ + 'ellipsis as branch path start gets flattened' + + def test_traversal_dict(self): + assert traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}) == {0: 100, 1: 1.2}, \ + 'dict key should result in a dict with the same keys' + expected = {0: 'https://www.example.com/0'} + assert traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}) == expected, \ + 'dict key should allow paths' + expected = {0: ['https://www.example.com/0']} + assert traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}) == expected, \ + 'tuple in dict path should be treated as branches' + assert traverse_obj(_TEST_DATA, {0: ('urls', ((1, 'fail'), (0, 'url')))}) == expected, \ + 'double nesting in dict path should be treated as paths' + expected = {0: ['https://www.example.com/1', 
'https://www.example.com/0']} + assert traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}) == expected, \ + 'tripple nesting in dict path should be treated as branches' + assert traverse_obj(_TEST_DATA, {0: 'fail'}) == {}, \ + 'remove `None` values when top level dict key fails' + assert traverse_obj(_TEST_DATA, {0: 'fail'}, default=...) == {0: ...}, \ + 'use `default` if key fails and `default`' + assert traverse_obj(_TEST_DATA, {0: 'dict'}) == {}, \ + 'remove empty values when dict key' + assert traverse_obj(_TEST_DATA, {0: 'dict'}, default=...) == {0: ...}, \ + 'use `default` when dict key and `default`' + assert traverse_obj(_TEST_DATA, {0: {0: 'fail'}}) == {}, \ + 'remove empty values when nested dict key fails' + assert traverse_obj(None, {0: 'fail'}) == {}, \ + 'default to dict if pruned' + assert traverse_obj(None, {0: 'fail'}, default=...) == {0: ...}, \ + 'default to dict if pruned and default is given' + assert traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=...) == {0: {0: ...}}, \ + 'use nested `default` when nested dict key fails and `default`' + assert traverse_obj(_TEST_DATA, {0: ('dict', ...)}) == {}, \ + 'remove key if branch in dict key not successful' + + def test_traversal_default(self): + _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} + + assert traverse_obj(_DEFAULT_DATA, 'fail') is None, \ + 'default value should be `None`' + assert traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=...) 
== ..., \ + 'chained fails should result in default' + assert traverse_obj(_DEFAULT_DATA, 'None', 'int') == 0, \ + 'should not short cirquit on `None`' + assert traverse_obj(_DEFAULT_DATA, 'fail', default=1) == 1, \ + 'invalid dict key should result in `default`' + assert traverse_obj(_DEFAULT_DATA, 'None', default=1) == 1, \ + '`None` is a deliberate sentinel and should become `default`' + assert traverse_obj(_DEFAULT_DATA, ('list', 10)) is None, \ + '`IndexError` should result in `default`' + assert traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=1) == 1, \ + 'if branched but not successful return `default` if defined, not `[]`' + assert traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=None) is None, \ + 'if branched but not successful return `default` even if `default` is `None`' + assert traverse_obj(_DEFAULT_DATA, (..., 'fail')) == [], \ + 'if branched but not successful return `[]`, not `default`' + assert traverse_obj(_DEFAULT_DATA, ('list', ...)) == [], \ + 'if branched but object is empty return `[]`, not `default`' + assert traverse_obj(None, ...) 
== [], \ + 'if branched but object is `None` return `[]`, not `default`' + assert traverse_obj({0: None}, (0, ...)) == [], \ + 'if branched but state is `None` return `[]`, not `default`' + + @pytest.mark.parametrize('path', [ + ('fail', ...), + (..., 'fail'), + 100 * ('fail',) + (...,), + (...,) + 100 * ('fail',), + ]) + def test_traversal_branching(self, path): + assert traverse_obj({}, path) == [], \ + 'if branched but state is `None`, return `[]` (not `default`)' + assert traverse_obj({}, 'fail', path) == [], \ + 'if branching in last alternative and previous did not match, return `[]` (not `default`)' + assert traverse_obj({0: 'x'}, 0, path) == 'x', \ + 'if branching in last alternative and previous did match, return single value' + assert traverse_obj({0: 'x'}, path, 0) == 'x', \ + 'if branching in first alternative and non-branching path does match, return single value' + assert traverse_obj({}, path, 'fail') is None, \ + 'if branching in first alternative and non-branching path does not match, return `default`' + + def test_traversal_expected_type(self): + _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} + + assert traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str) == 'str', \ + 'accept matching `expected_type` type' + assert traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int) is None, \ + 'reject non matching `expected_type` type' + assert traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)) == '0', \ + 'transform type using type function' + assert traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0) is None, \ + 'wrap expected_type fuction in try_call' + assert traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str) == ['str'], \ + 'eliminate items that expected_type fails on' + assert traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int) == {0: 100}, \ + 'type as expected_type should filter dict values' + assert traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none) 
== {0: '100', 1: '1.2'}, \ + 'function as expected_type should transform dict values' + assert traverse_obj(_TEST_DATA, ({0: 1.2}, 0, {int_or_none}), expected_type=int) == 1, \ + 'expected_type should not filter non final dict values' + assert traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int) == {0: {0: 100}}, \ + 'expected_type should transform deep dict values' + assert traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(...)) == [{0: ...}, {0: ...}], \ + 'expected_type should transform branched dict values' + assert traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int) == [4], \ + 'expected_type regression for type matching in tuple branching' + assert traverse_obj(_TEST_DATA, ['data', ...], expected_type=int) == [], \ + 'expected_type regression for type matching in dict result' + + def test_traversal_get_all(self): + _GET_ALL_DATA = {'key': [0, 1, 2]} + + assert traverse_obj(_GET_ALL_DATA, ('key', ...), get_all=False) == 0, \ + 'if not `get_all`, return only first matching value' + assert traverse_obj(_GET_ALL_DATA, ..., get_all=False) == [0, 1, 2], \ + 'do not overflatten if not `get_all`' + + def test_traversal_casesense(self): + _CASESENSE_DATA = { + 'KeY': 'value0', + 0: { + 'KeY': 'value1', + 0: {'KeY': 'value2'}, + }, + } + + assert traverse_obj(_CASESENSE_DATA, 'key') is None, \ + 'dict keys should be case sensitive unless `casesense`' + assert traverse_obj(_CASESENSE_DATA, 'keY', casesense=False) == 'value0', \ + 'allow non matching key case if `casesense`' + assert traverse_obj(_CASESENSE_DATA, [0, ('keY',)], casesense=False) == ['value1'], \ + 'allow non matching key case in branch if `casesense`' + assert traverse_obj(_CASESENSE_DATA, [0, ([0, 'keY'],)], casesense=False) == ['value2'], \ + 'allow non matching key case in branch path if `casesense`' + + def test_traversal_traverse_string(self): + _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2} + + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)) is 
None, \ + 'do not traverse into string if not `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0), traverse_string=True) == 's', \ + 'traverse into string if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1), traverse_string=True) == '.', \ + 'traverse into converted data if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', ...), traverse_string=True) == 'str', \ + '`...` should result in string (same value) if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)), traverse_string=True) == 'sr', \ + '`slice` should result in string if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', lambda i, v: i or v == "s"), traverse_string=True) == 'str', \ + 'function should result in string if `traverse_string`' + assert traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), traverse_string=True) == ['s', 'r'], \ + 'branching should result in list if `traverse_string`' + assert traverse_obj({}, (0, ...), traverse_string=True) == [], \ + 'branching should result in list if `traverse_string`' + assert traverse_obj({}, (0, lambda x, y: True), traverse_string=True) == [], \ + 'branching should result in list if `traverse_string`' + assert traverse_obj({}, (0, slice(1)), traverse_string=True) == [], \ + 'branching should result in list if `traverse_string`' + + def test_traversal_re(self): + mobj = re.fullmatch(r'0(12)(?P3)(4)?', '0123') + assert traverse_obj(mobj, ...) 
== [x for x in mobj.groups() if x is not None], \ + '`...` on a `re.Match` should give its `groups()`' + assert traverse_obj(mobj, lambda k, _: k in (0, 2)) == ['0123', '3'], \ + 'function on a `re.Match` should give groupno, value starting at 0' + assert traverse_obj(mobj, 'group') == '3', \ + 'str key on a `re.Match` should give group with that name' + assert traverse_obj(mobj, 2) == '3', \ + 'int key on a `re.Match` should give group with that name' + assert traverse_obj(mobj, 'gRoUp', casesense=False) == '3', \ + 'str key on a `re.Match` should respect casesense' + assert traverse_obj(mobj, 'fail') is None, \ + 'failing str key on a `re.Match` should return `default`' + assert traverse_obj(mobj, 'gRoUpS', casesense=False) is None, \ + 'failing str key on a `re.Match` should return `default`' + assert traverse_obj(mobj, 8) is None, \ + 'failing int key on a `re.Match` should return `default`' + assert traverse_obj(mobj, lambda k, _: k in (0, 'group')) == ['0123', '3'], \ + 'function on a `re.Match` should give group name as well' + + def test_traversal_xml_etree(self): + etree = xml.etree.ElementTree.fromstring(''' + + + 1 + 2008 + 141100 + + + + + 4 + 2011 + 59900 + + + + 68 + 2011 + 13600 + + + + ''') + assert traverse_obj(etree, '') == etree, \ + 'empty str key should return the element itself' + assert traverse_obj(etree, 'country') == list(etree), \ + 'str key should lead all children with that tag name' + assert traverse_obj(etree, ...) 
== list(etree), \ + '`...` as key should return all children' + assert traverse_obj(etree, lambda _, x: x[0].text == '4') == [etree[1]], \ + 'function as key should get element as value' + assert traverse_obj(etree, lambda i, _: i == 1) == [etree[1]], \ + 'function as key should get index as key' + assert traverse_obj(etree, 0) == etree[0], \ + 'int key should return the nth child' + expected = ['Austria', 'Switzerland', 'Malaysia', 'Costa Rica', 'Colombia'] + assert traverse_obj(etree, './/neighbor/@name') == expected, \ + '`@` at end of path should give that attribute' + assert traverse_obj(etree, '//neighbor/@fail') == [None, None, None, None, None], \ + '`@` at end of path should give `None`' + assert traverse_obj(etree, ('//neighbor/@', 2)) == {'name': 'Malaysia', 'direction': 'N'}, \ + '`@` should give the full attribute dict' + assert traverse_obj(etree, '//year/text()') == ['2008', '2011', '2011'], \ + '`text()` at end of path should give the inner text' + assert traverse_obj(etree, '//*[@direction]/@direction') == ['E', 'W', 'N', 'W', 'E'], \ + 'full Python xpath features should be supported' + assert traverse_obj(etree, (0, '@name')) == 'Liechtenstein', \ + 'special transformations should act on current element' + assert traverse_obj(etree, ('country', 0, ..., 'text()', {int_or_none})) == [1, 2008, 141100], \ + 'special transformations should act on current element' + + def test_traversal_unbranching(self): + assert traverse_obj(_TEST_DATA, [(100, 1.2), all]) == [100, 1.2], \ + '`all` should give all results as list' + assert traverse_obj(_TEST_DATA, [(100, 1.2), any]) == 100, \ + '`any` should give the first result' + assert traverse_obj(_TEST_DATA, [100, all]) == [100], \ + '`all` should give list if non branching' + assert traverse_obj(_TEST_DATA, [100, any]) == 100, \ + '`any` should give single item if non branching' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100), all]) == [100], \ + '`all` should filter `None` and empty dict' + assert 
traverse_obj(_TEST_DATA, [('dict', 'None', 100), any]) == 100, \ + '`any` should filter `None` and empty dict' + assert traverse_obj(_TEST_DATA, [{ + 'all': [('dict', 'None', 100, 1.2), all], + 'any': [('dict', 'None', 100, 1.2), any], + }]) == {'all': [100, 1.2], 'any': 100}, \ + '`all`/`any` should apply to each dict path separately' + assert traverse_obj(_TEST_DATA, [{ + 'all': [('dict', 'None', 100, 1.2), all], + 'any': [('dict', 'None', 100, 1.2), any], + }], get_all=False) == {'all': [100, 1.2], 'any': 100}, \ + '`all`/`any` should apply to dict regardless of `get_all`' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100, 1.2), all, {float}]) is None, \ + '`all` should reset branching status' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100, 1.2), any, {float}]) is None, \ + '`any` should reset branching status' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 100, 1.2), all, ..., {float}]) == [1.2], \ + '`all` should allow further branching' + assert traverse_obj(_TEST_DATA, [('dict', 'None', 'urls', 'data'), any, ..., 'index']) == [0, 1], \ + '`any` should allow further branching' + + def test_traversal_morsel(self): + values = { + 'expires': 'a', + 'path': 'b', + 'comment': 'c', + 'domain': 'd', + 'max-age': 'e', + 'secure': 'f', + 'httponly': 'g', + 'version': 'h', + 'samesite': 'i', + } + morsel = http.cookies.Morsel() + morsel.set('item_key', 'item_value', 'coded_value') + morsel.update(values) + values['key'] = 'item_key' + values['value'] = 'item_value' + + for key, value in values.items(): + assert traverse_obj(morsel, key) == value, \ + 'Morsel should provide access to all values' + assert traverse_obj(morsel, ...) 
== list(values.values()), \ + '`...` should yield all values' + assert traverse_obj(morsel, lambda k, v: True) == list(values.values()), \ + 'function key should yield all values' + assert traverse_obj(morsel, [(None,), any]) == morsel, \ + 'Morsel should not be implicitly changed to dict on usage' + + +class TestDictGet: + def test_dict_get(self): + FALSE_VALUES = { + 'none': None, + 'false': False, + 'zero': 0, + 'empty_string': '', + 'empty_list': [], + } + d = {**FALSE_VALUES, 'a': 42} + assert dict_get(d, 'a') == 42 + assert dict_get(d, 'b') is None + assert dict_get(d, 'b', 42) == 42 + assert dict_get(d, ('a',)) == 42 + assert dict_get(d, ('b', 'a')) == 42 + assert dict_get(d, ('b', 'c', 'a', 'd')) == 42 + assert dict_get(d, ('b', 'c')) is None + assert dict_get(d, ('b', 'c'), 42) == 42 + for key, false_value in FALSE_VALUES.items(): + assert dict_get(d, ('b', 'c', key)) is None + assert dict_get(d, ('b', 'c', key), skip_false_values=False) == false_value diff --git a/test/test_utils.py b/test/test_utils.py index a3073f0e0..ddf0a7c24 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -2,7 +2,6 @@ # Allow direct execution import os -import re import sys import unittest import warnings @@ -45,7 +44,6 @@ from yt_dlp.utils import ( determine_ext, determine_file_encoding, dfxp2srt, - dict_get, encode_base_n, encode_compat_str, encodeFilename, @@ -106,13 +104,11 @@ from yt_dlp.utils import ( sanitize_url, shell_quote, smuggle_url, - str_or_none, str_to_int, strip_jsonp, strip_or_none, subtitles_filename, timeconvert, - traverse_obj, try_call, unescapeHTML, unified_strdate, @@ -755,28 +751,6 @@ class TestUtil(unittest.TestCase): self.assertRaises( ValueError, multipart_encode, {b'field': b'value'}, boundary='value') - def test_dict_get(self): - FALSE_VALUES = { - 'none': None, - 'false': False, - 'zero': 0, - 'empty_string': '', - 'empty_list': [], - } - d = FALSE_VALUES.copy() - d['a'] = 42 - self.assertEqual(dict_get(d, 'a'), 42) - 
self.assertEqual(dict_get(d, 'b'), None) - self.assertEqual(dict_get(d, 'b', 42), 42) - self.assertEqual(dict_get(d, ('a', )), 42) - self.assertEqual(dict_get(d, ('b', 'a', )), 42) - self.assertEqual(dict_get(d, ('b', 'c', 'a', 'd', )), 42) - self.assertEqual(dict_get(d, ('b', 'c', )), None) - self.assertEqual(dict_get(d, ('b', 'c', ), 42), 42) - for key, false_value in FALSE_VALUES.items(): - self.assertEqual(dict_get(d, ('b', 'c', key, )), None) - self.assertEqual(dict_get(d, ('b', 'c', key, ), skip_false_values=False), false_value) - def test_merge_dicts(self): self.assertEqual(merge_dicts({'a': 1}, {'b': 2}), {'a': 1, 'b': 2}) self.assertEqual(merge_dicts({'a': 1}, {'a': 2}), {'a': 1}) @@ -2039,359 +2013,6 @@ Line 1 warnings.simplefilter('ignore') self.assertEqual(variadic('spam', allowed_types=[dict]), 'spam') - def test_traverse_obj(self): - _TEST_DATA = { - 100: 100, - 1.2: 1.2, - 'str': 'str', - 'None': None, - '...': ..., - 'urls': [ - {'index': 0, 'url': 'https://www.example.com/0'}, - {'index': 1, 'url': 'https://www.example.com/1'}, - ], - 'data': ( - {'index': 2}, - {'index': 3}, - ), - 'dict': {}, - } - - # Test base functionality - self.assertEqual(traverse_obj(_TEST_DATA, ('str',)), 'str', - msg='allow tuple path') - self.assertEqual(traverse_obj(_TEST_DATA, ['str']), 'str', - msg='allow list path') - self.assertEqual(traverse_obj(_TEST_DATA, (value for value in ("str",))), 'str', - msg='allow iterable path') - self.assertEqual(traverse_obj(_TEST_DATA, 'str'), 'str', - msg='single items should be treated as a path') - self.assertEqual(traverse_obj(_TEST_DATA, None), _TEST_DATA) - self.assertEqual(traverse_obj(_TEST_DATA, 100), 100) - self.assertEqual(traverse_obj(_TEST_DATA, 1.2), 1.2) - - # Test Ellipsis behavior - self.assertCountEqual(traverse_obj(_TEST_DATA, ...), - (item for item in _TEST_DATA.values() if item not in (None, {})), - msg='`...` should give all non discarded values') - self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', 0, 
...)), _TEST_DATA['urls'][0].values(), - msg='`...` selection for dicts should select all values') - self.assertEqual(traverse_obj(_TEST_DATA, (..., ..., 'url')), - ['https://www.example.com/0', 'https://www.example.com/1'], - msg='nested `...` queries should work') - self.assertCountEqual(traverse_obj(_TEST_DATA, (..., ..., 'index')), range(4), - msg='`...` query result should be flattened') - self.assertEqual(traverse_obj(iter(range(4)), ...), list(range(4)), - msg='`...` should accept iterables') - - # Test function as key - self.assertEqual(traverse_obj(_TEST_DATA, lambda x, y: x == 'urls' and isinstance(y, list)), - [_TEST_DATA['urls']], - msg='function as query key should perform a filter based on (key, value)') - self.assertCountEqual(traverse_obj(_TEST_DATA, lambda _, x: isinstance(x[0], str)), {'str'}, - msg='exceptions in the query function should be catched') - self.assertEqual(traverse_obj(iter(range(4)), lambda _, x: x % 2 == 0), [0, 2], - msg='function key should accept iterables') - if __debug__: - with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): - traverse_obj(_TEST_DATA, lambda a: ...) - with self.assertRaises(Exception, msg='Wrong function signature should raise in debug'): - traverse_obj(_TEST_DATA, lambda a, b, c: ...) 
- - # Test set as key (transformation/type, like `expected_type`) - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str.upper}, )), ['STR'], - msg='Function in set should be a transformation') - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str})), ['str'], - msg='Type in set should be a type filter') - self.assertEqual(traverse_obj(_TEST_DATA, {dict}), _TEST_DATA, - msg='A single set should be wrapped into a path') - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str.upper})), ['STR'], - msg='Transformation function should not raise') - self.assertEqual(traverse_obj(_TEST_DATA, (..., {str_or_none})), - [item for item in map(str_or_none, _TEST_DATA.values()) if item is not None], - msg='Function in set should be a transformation') - self.assertEqual(traverse_obj(_TEST_DATA, ('fail', {lambda _: 'const'})), 'const', - msg='Function in set should always be called') - if __debug__: - with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): - traverse_obj(_TEST_DATA, set()) - with self.assertRaises(Exception, msg='Sets with length != 1 should raise in debug'): - traverse_obj(_TEST_DATA, {str.upper, str}) - - # Test `slice` as a key - _SLICE_DATA = [0, 1, 2, 3, 4] - self.assertEqual(traverse_obj(_TEST_DATA, ('dict', slice(1))), None, - msg='slice on a dictionary should not throw') - self.assertEqual(traverse_obj(_SLICE_DATA, slice(1)), _SLICE_DATA[:1], - msg='slice key should apply slice to sequence') - self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 2)), _SLICE_DATA[1:2], - msg='slice key should apply slice to sequence') - self.assertEqual(traverse_obj(_SLICE_DATA, slice(1, 4, 2)), _SLICE_DATA[1:4:2], - msg='slice key should apply slice to sequence') - - # Test alternative paths - self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'str'), 'str', - msg='multiple `paths` should be treated as alternative paths') - self.assertEqual(traverse_obj(_TEST_DATA, 'str', 100), 'str', - msg='alternatives should exit early') - 
self.assertEqual(traverse_obj(_TEST_DATA, 'fail', 'fail'), None, - msg='alternatives should return `default` if exhausted') - self.assertEqual(traverse_obj(_TEST_DATA, (..., 'fail'), 100), 100, - msg='alternatives should track their own branching return') - self.assertEqual(traverse_obj(_TEST_DATA, ('dict', ...), ('data', ...)), list(_TEST_DATA['data']), - msg='alternatives on empty objects should search further') - - # Test branch and path nesting - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', (3, 0), 'url')), ['https://www.example.com/0'], - msg='tuple as key should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', [3, 0], 'url')), ['https://www.example.com/0'], - msg='list as key should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ((1, 'fail'), (0, 'url')))), ['https://www.example.com/0'], - msg='double nesting in path should be treated as paths') - self.assertEqual(traverse_obj(['0', [1, 2]], [(0, 1), 0]), [1], - msg='do not fail early on branching') - self.assertCountEqual(traverse_obj(_TEST_DATA, ('urls', ((1, ('fail', 'url')), (0, 'url')))), - ['https://www.example.com/0', 'https://www.example.com/1'], - msg='tripple nesting in path should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, ('urls', ('fail', (..., 'url')))), - ['https://www.example.com/0', 'https://www.example.com/1'], - msg='ellipsis as branch path start gets flattened') - - # Test dictionary as key - self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}), {0: 100, 1: 1.2}, - msg='dict key should result in a dict with the same keys') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', 0, 'url')}), - {0: 'https://www.example.com/0'}, - msg='dict key should allow paths') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', (3, 0), 'url')}), - {0: ['https://www.example.com/0']}, - msg='tuple in dict path should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, 
'fail'), (0, 'url')))}), - {0: ['https://www.example.com/0']}, - msg='double nesting in dict path should be treated as paths') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('urls', ((1, ('fail', 'url')), (0, 'url')))}), - {0: ['https://www.example.com/1', 'https://www.example.com/0']}, - msg='tripple nesting in dict path should be treated as branches') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}), {}, - msg='remove `None` values when top level dict key fails') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'fail'}, default=...), {0: ...}, - msg='use `default` if key fails and `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}), {}, - msg='remove empty values when dict key') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 'dict'}, default=...), {0: ...}, - msg='use `default` when dict key and `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}), {}, - msg='remove empty values when nested dict key fails') - self.assertEqual(traverse_obj(None, {0: 'fail'}), {}, - msg='default to dict if pruned') - self.assertEqual(traverse_obj(None, {0: 'fail'}, default=...), {0: ...}, - msg='default to dict if pruned and default is given') - self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 'fail'}}, default=...), {0: {0: ...}}, - msg='use nested `default` when nested dict key fails and `default`') - self.assertEqual(traverse_obj(_TEST_DATA, {0: ('dict', ...)}), {}, - msg='remove key if branch in dict key not successful') - - # Testing default parameter behavior - _DEFAULT_DATA = {'None': None, 'int': 0, 'list': []} - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail'), None, - msg='default value should be `None`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', 'fail', default=...), ..., - msg='chained fails should result in default') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', 'int'), 0, - msg='should not short cirquit on `None`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'fail', default=1), 1, - 
msg='invalid dict key should result in `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, 'None', default=1), 1, - msg='`None` is a deliberate sentinel and should become `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', 10)), None, - msg='`IndexError` should result in `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=1), 1, - msg='if branched but not successful return `default` if defined, not `[]`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail'), default=None), None, - msg='if branched but not successful return `default` even if `default` is `None`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, (..., 'fail')), [], - msg='if branched but not successful return `[]`, not `default`') - self.assertEqual(traverse_obj(_DEFAULT_DATA, ('list', ...)), [], - msg='if branched but object is empty return `[]`, not `default`') - self.assertEqual(traverse_obj(None, ...), [], - msg='if branched but object is `None` return `[]`, not `default`') - self.assertEqual(traverse_obj({0: None}, (0, ...)), [], - msg='if branched but state is `None` return `[]`, not `default`') - - branching_paths = [ - ('fail', ...), - (..., 'fail'), - 100 * ('fail',) + (...,), - (...,) + 100 * ('fail',), - ] - for branching_path in branching_paths: - self.assertEqual(traverse_obj({}, branching_path), [], - msg='if branched but state is `None`, return `[]` (not `default`)') - self.assertEqual(traverse_obj({}, 'fail', branching_path), [], - msg='if branching in last alternative and previous did not match, return `[]` (not `default`)') - self.assertEqual(traverse_obj({0: 'x'}, 0, branching_path), 'x', - msg='if branching in last alternative and previous did match, return single value') - self.assertEqual(traverse_obj({0: 'x'}, branching_path, 0), 'x', - msg='if branching in first alternative and non-branching path does match, return single value') - self.assertEqual(traverse_obj({}, branching_path, 'fail'), None, - msg='if branching in 
first alternative and non-branching path does not match, return `default`') - - # Testing expected_type behavior - _EXPECTED_TYPE_DATA = {'str': 'str', 'int': 0} - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=str), - 'str', msg='accept matching `expected_type` type') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=int), - None, msg='reject non matching `expected_type` type') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'int', expected_type=lambda x: str(x)), - '0', msg='transform type using type function') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, 'str', expected_type=lambda _: 1 / 0), - None, msg='wrap expected_type fuction in try_call') - self.assertEqual(traverse_obj(_EXPECTED_TYPE_DATA, ..., expected_type=str), - ['str'], msg='eliminate items that expected_type fails on') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2}, expected_type=int), - {0: 100}, msg='type as expected_type should filter dict values') - self.assertEqual(traverse_obj(_TEST_DATA, {0: 100, 1: 1.2, 2: 'None'}, expected_type=str_or_none), - {0: '100', 1: '1.2'}, msg='function as expected_type should transform dict values') - self.assertEqual(traverse_obj(_TEST_DATA, ({0: 1.2}, 0, {int_or_none}), expected_type=int), - 1, msg='expected_type should not filter non final dict values') - self.assertEqual(traverse_obj(_TEST_DATA, {0: {0: 100, 1: 'str'}}, expected_type=int), - {0: {0: 100}}, msg='expected_type should transform deep dict values') - self.assertEqual(traverse_obj(_TEST_DATA, [({0: '...'}, {0: '...'})], expected_type=type(...)), - [{0: ...}, {0: ...}], msg='expected_type should transform branched dict values') - self.assertEqual(traverse_obj({1: {3: 4}}, [(1, 2), 3], expected_type=int), - [4], msg='expected_type regression for type matching in tuple branching') - self.assertEqual(traverse_obj(_TEST_DATA, ['data', ...], expected_type=int), - [], msg='expected_type regression for type matching in dict 
result') - - # Test get_all behavior - _GET_ALL_DATA = {'key': [0, 1, 2]} - self.assertEqual(traverse_obj(_GET_ALL_DATA, ('key', ...), get_all=False), 0, - msg='if not `get_all`, return only first matching value') - self.assertEqual(traverse_obj(_GET_ALL_DATA, ..., get_all=False), [0, 1, 2], - msg='do not overflatten if not `get_all`') - - # Test casesense behavior - _CASESENSE_DATA = { - 'KeY': 'value0', - 0: { - 'KeY': 'value1', - 0: {'KeY': 'value2'}, - }, - } - self.assertEqual(traverse_obj(_CASESENSE_DATA, 'key'), None, - msg='dict keys should be case sensitive unless `casesense`') - self.assertEqual(traverse_obj(_CASESENSE_DATA, 'keY', - casesense=False), 'value0', - msg='allow non matching key case if `casesense`') - self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ('keY',)), - casesense=False), ['value1'], - msg='allow non matching key case in branch if `casesense`') - self.assertEqual(traverse_obj(_CASESENSE_DATA, (0, ((0, 'keY'),)), - casesense=False), ['value2'], - msg='allow non matching key case in branch path if `casesense`') - - # Test traverse_string behavior - _TRAVERSE_STRING_DATA = {'str': 'str', 1.2: 1.2} - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0)), None, - msg='do not traverse into string if not `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 0), - traverse_string=True), 's', - msg='traverse into string if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, (1.2, 1), - traverse_string=True), '.', - msg='traverse into converted data if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', ...), - traverse_string=True), 'str', - msg='`...` should result in string (same value) if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', slice(0, None, 2)), - traverse_string=True), 'sr', - msg='`slice` should result in string if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', 
lambda i, v: i or v == "s"), - traverse_string=True), 'str', - msg='function should result in string if `traverse_string`') - self.assertEqual(traverse_obj(_TRAVERSE_STRING_DATA, ('str', (0, 2)), - traverse_string=True), ['s', 'r'], - msg='branching should result in list if `traverse_string`') - self.assertEqual(traverse_obj({}, (0, ...), traverse_string=True), [], - msg='branching should result in list if `traverse_string`') - self.assertEqual(traverse_obj({}, (0, lambda x, y: True), traverse_string=True), [], - msg='branching should result in list if `traverse_string`') - self.assertEqual(traverse_obj({}, (0, slice(1)), traverse_string=True), [], - msg='branching should result in list if `traverse_string`') - - # Test re.Match as input obj - mobj = re.fullmatch(r'0(12)(?P<group>3)(4)?', '0123') - self.assertEqual(traverse_obj(mobj, ...), [x for x in mobj.groups() if x is not None], - msg='`...` on a `re.Match` should give its `groups()`') - self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 2)), ['0123', '3'], - msg='function on a `re.Match` should give groupno, value starting at 0') - self.assertEqual(traverse_obj(mobj, 'group'), '3', - msg='str key on a `re.Match` should give group with that name') - self.assertEqual(traverse_obj(mobj, 2), '3', - msg='int key on a `re.Match` should give group with that name') - self.assertEqual(traverse_obj(mobj, 'gRoUp', casesense=False), '3', - msg='str key on a `re.Match` should respect casesense') - self.assertEqual(traverse_obj(mobj, 'fail'), None, - msg='failing str key on a `re.Match` should return `default`') - self.assertEqual(traverse_obj(mobj, 'gRoUpS', casesense=False), None, - msg='failing str key on a `re.Match` should return `default`') - self.assertEqual(traverse_obj(mobj, 8), None, - msg='failing int key on a `re.Match` should return `default`') - self.assertEqual(traverse_obj(mobj, lambda k, _: k in (0, 'group')), ['0123', '3'], - msg='function on a `re.Match` should give group name as well') - - # Test 
xml.etree.ElementTree.Element as input obj - etree = xml.etree.ElementTree.fromstring('''<?xml version="1.0"?> - <data> - <country name="Liechtenstein"> - <rank>1</rank> - <year>2008</year> - <gdppc>141100</gdppc> - <neighbor name="Austria" direction="E"/> - <neighbor name="Switzerland" direction="W"/> - </country> - <country name="Singapore"> - <rank>4</rank> - <year>2011</year> - <gdppc>59900</gdppc> - <neighbor name="Malaysia" direction="N"/> - </country> - <country name="Panama"> - <rank>68</rank> - <year>2011</year> - <gdppc>13600</gdppc> - <neighbor name="Costa Rica" direction="W"/> - <neighbor name="Colombia" direction="E"/> - </country> - </data>''') - self.assertEqual(traverse_obj(etree, ''), etree, - msg='empty str key should return the element itself') - self.assertEqual(traverse_obj(etree, 'country'), list(etree), - msg='str key should lead all children with that tag name') - self.assertEqual(traverse_obj(etree, ...), list(etree), - msg='`...` as key should return all children') - self.assertEqual(traverse_obj(etree, lambda _, x: x[0].text == '4'), [etree[1]], - msg='function as key should get element as value') - self.assertEqual(traverse_obj(etree, lambda i, _: i == 1), [etree[1]], - msg='function as key should get index as key') - self.assertEqual(traverse_obj(etree, 0), etree[0], - msg='int key should return the nth child') - self.assertEqual(traverse_obj(etree, './/neighbor/@name'), - ['Austria', 'Switzerland', 'Malaysia', 'Costa Rica', 'Colombia'], - msg='`@` at end of path should give that attribute') - self.assertEqual(traverse_obj(etree, '//neighbor/@fail'), [None, None, None, None, None], - msg='`@` at end of path should give `None`') - self.assertEqual(traverse_obj(etree, ('//neighbor/@', 2)), {'name': 'Malaysia', 'direction': 'N'}, - msg='`@` should give the full attribute dict') - self.assertEqual(traverse_obj(etree, '//year/text()'), ['2008', '2011', '2011'], - msg='`text()` at end of path should give the inner text') - self.assertEqual(traverse_obj(etree, '//*[@direction]/@direction'), ['E', 'W', 'N', 'W', 'E'], - msg='full Python xpath features should be supported') - self.assertEqual(traverse_obj(etree, (0, '@name')), 'Liechtenstein', - msg='special transformations should act on current element') - self.assertEqual(traverse_obj(etree, ('country', 0, ..., 'text()', {int_or_none})), [1, 2008, 141100], - msg='special transformations should act on current element') - def test_http_header_dict(self): headers = 
HTTPHeaderDict() headers['ytdl-test'] = b'0' @@ -2448,6 +2069,10 @@ Line 1 # Test escaping assert run_shell(['echo', 'test"&']) == '"test""&"\n' + assert run_shell(['echo', '%CMDCMDLINE:~-1%&']) == '"%CMDCMDLINE:~-1%&"\n' + assert run_shell(['echo', 'a\nb']) == '"a"\n"b"\n' + assert run_shell(['echo', '"']) == '""""\n' + assert run_shell(['echo', '\\']) == '\\\n' # Test if delayed expansion is disabled assert run_shell(['echo', '^!']) == '"^!"\n' assert run_shell('echo "^!"') == '"^!"\n' diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index e3d1db376..9f730d038 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1,7 +1,7 @@ import collections import contextlib import copy -import datetime +import datetime as dt import errno import fileinput import http.cookiejar @@ -25,7 +25,7 @@ import unicodedata from .cache import Cache from .compat import functools, urllib # isort: split -from .compat import compat_os_name, compat_shlex_quote, urllib_req_to_req +from .compat import compat_os_name, urllib_req_to_req from .cookies import LenientSimpleCookie, load_cookies from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name from .downloader.rtmp import rtmpdump_version @@ -102,7 +102,6 @@ from .utils import ( UserNotLive, YoutubeDLError, age_restricted, - args_to_str, bug_reports_message, date_from_str, deprecation_warning, @@ -141,11 +140,13 @@ from .utils import ( sanitize_filename, sanitize_path, sanitize_url, + shell_quote, str_or_none, strftime_or_none, subtitles_filename, supports_terminal_sequences, system_identifier, + filesize_from_tbr, timetuple_from_msec, to_high_limit_path, traverse_obj, @@ -480,7 +481,7 @@ class YoutubeDL: nopart, updatetime, buffersize, ratelimit, throttledratelimit, min_filesize, max_filesize, test, noresizebuffer, retries, file_access_retries, fragment_retries, continuedl, xattr_set_filesize, hls_use_mpegts, http_chunk_size, - external_downloader_args, concurrent_fragment_downloads. 
+ external_downloader_args, concurrent_fragment_downloads, progress_delta. The following options are used by the post processors: ffmpeg_location: Location of the ffmpeg/avconv binary; either the path @@ -822,7 +823,7 @@ class YoutubeDL: self.report_warning( 'Long argument string detected. ' 'Use -- to separate parameters and URLs, like this:\n%s' % - args_to_str(correct_argv)) + shell_quote(correct_argv)) def add_info_extractor(self, ie): """Add an InfoExtractor object to the end of the list.""" @@ -1354,7 +1355,7 @@ class YoutubeDL: value, fmt = escapeHTML(str(value)), str_fmt elif fmt[-1] == 'q': # quoted value = map(str, variadic(value) if '#' in flags else [value]) - value, fmt = ' '.join(map(compat_shlex_quote, value)), str_fmt + value, fmt = shell_quote(value, shell=True), str_fmt elif fmt[-1] == 'B': # bytes value = f'%{str_fmt}'.encode() % str(value).encode() value, fmt = value.decode('utf-8', 'ignore'), 's' @@ -2628,7 +2629,7 @@ class YoutubeDL: # Working around out-of-range timestamp values (e.g. 
negative ones on Windows, # see http://bugs.python.org/issue1646728) with contextlib.suppress(ValueError, OverflowError, OSError): - upload_date = datetime.datetime.fromtimestamp(info_dict[ts_key], datetime.timezone.utc) + upload_date = dt.datetime.fromtimestamp(info_dict[ts_key], dt.timezone.utc) info_dict[date_key] = upload_date.strftime('%Y%m%d') if not info_dict.get('release_year'): @@ -2782,7 +2783,7 @@ class YoutubeDL: get_from_start = not info_dict.get('is_live') or bool(self.params.get('live_from_start')) if not get_from_start: - info_dict['title'] += ' ' + datetime.datetime.now().strftime('%Y-%m-%d %H:%M') + info_dict['title'] += ' ' + dt.datetime.now().strftime('%Y-%m-%d %H:%M') if info_dict.get('is_live') and formats: formats = [f for f in formats if bool(f.get('is_from_start')) == get_from_start] if get_from_start and not formats: @@ -2813,6 +2814,9 @@ class YoutubeDL: format['url'] = sanitize_url(format['url']) if format.get('ext') is None: format['ext'] = determine_ext(format['url']).lower() + if format['ext'] in ('aac', 'opus', 'mp3', 'flac', 'vorbis'): + if format.get('acodec') is None: + format['acodec'] = format['ext'] if format.get('protocol') is None: format['protocol'] = determine_protocol(format) if format.get('resolution') is None: @@ -2823,9 +2827,8 @@ class YoutubeDL: format['aspect_ratio'] = try_call(lambda: round(format['width'] / format['height'], 2)) # For fragmented formats, "tbr" is often max bitrate and not average if (('manifest-filesize-approx' in self.params['compat_opts'] or not format.get('manifest_url')) - and info_dict.get('duration') and format.get('tbr') and not format.get('filesize') and not format.get('filesize_approx')): - format['filesize_approx'] = int(info_dict['duration'] * format['tbr'] * (1024 / 8)) + format['filesize_approx'] = filesize_from_tbr(format.get('tbr'), info_dict.get('duration')) format['http_headers'] = self._calc_headers(collections.ChainMap(format, info_dict), load_cookies=True) # Safeguard against 
old/insecure infojson when using --load-info-json @@ -3875,8 +3878,8 @@ class YoutubeDL: delim, ( format_field(f, 'filesize', ' \t%s', func=format_bytes) or format_field(f, 'filesize_approx', '≈\t%s', func=format_bytes) - or format_field(try_call(lambda: format_bytes(int(info_dict['duration'] * f['tbr'] * (1024 / 8)))), - None, self._format_out('~\t%s', self.Styles.SUPPRESS))), + or format_field(filesize_from_tbr(f.get('tbr'), info_dict.get('duration')), None, + self._format_out('~\t%s', self.Styles.SUPPRESS), func=format_bytes)), format_field(f, 'tbr', '\t%dk', func=round), shorten_protocol_name(f.get('protocol', '')), delim, diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 940594faf..3d606bcba 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -836,6 +836,7 @@ def parse_options(argv=None): 'noprogress': opts.quiet if opts.noprogress is None else opts.noprogress, 'progress_with_newline': opts.progress_with_newline, 'progress_template': opts.progress_template, + 'progress_delta': opts.progress_delta, 'playliststart': opts.playliststart, 'playlistend': opts.playlistend, 'playlistreverse': opts.playlist_reverse, diff --git a/yt_dlp/compat/__init__.py b/yt_dlp/compat/__init__.py index 5ad5c70ec..d820adaf1 100644 --- a/yt_dlp/compat/__init__.py +++ b/yt_dlp/compat/__init__.py @@ -27,12 +27,9 @@ def compat_etree_fromstring(text): compat_os_name = os._name if os.name == 'java' else os.name -if compat_os_name == 'nt': - def compat_shlex_quote(s): - import re - return s if re.match(r'^[-_\w./]+$', s) else s.replace('"', '""').join('""') -else: - from shlex import quote as compat_shlex_quote # noqa: F401 +def compat_shlex_quote(s): + from ..utils import shell_quote + return shell_quote(s) def compat_ord(c): diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index 28d174a09..7b8d215f0 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,6 +1,7 @@ import base64 import collections import contextlib +import datetime as dt import glob import 
http.cookiejar import http.cookies @@ -15,7 +16,6 @@ import sys import tempfile import time import urllib.request -from datetime import datetime, timedelta, timezone from enum import Enum, auto from hashlib import pbkdf2_hmac @@ -194,7 +194,11 @@ def _firefox_browser_dirs(): yield os.path.expanduser('~/Library/Application Support/Firefox/Profiles') else: - yield from map(os.path.expanduser, ('~/.mozilla/firefox', '~/snap/firefox/common/.mozilla/firefox')) + yield from map(os.path.expanduser, ( + '~/.mozilla/firefox', + '~/snap/firefox/common/.mozilla/firefox', + '~/.var/app/org.mozilla.firefox/.mozilla/firefox', + )) def _firefox_cookie_dbs(roots): @@ -594,7 +598,7 @@ class DataParser: def _mac_absolute_time_to_posix(timestamp): - return int((datetime(2001, 1, 1, 0, 0, tzinfo=timezone.utc) + timedelta(seconds=timestamp)).timestamp()) + return int((dt.datetime(2001, 1, 1, 0, 0, tzinfo=dt.timezone.utc) + dt.timedelta(seconds=timestamp)).timestamp()) def _parse_safari_cookies_header(data, logger): diff --git a/yt_dlp/downloader/common.py b/yt_dlp/downloader/common.py index b71d7ee8f..65a0d6f23 100644 --- a/yt_dlp/downloader/common.py +++ b/yt_dlp/downloader/common.py @@ -4,6 +4,7 @@ import functools import os import random import re +import threading import time from ..minicurses import ( @@ -63,6 +64,7 @@ class FileDownloader: min_filesize: Skip files smaller than this size max_filesize: Skip files larger than this size xattr_set_filesize: Set ytdl.filesize user xattribute with expected size. + progress_delta: The minimum time between progress output, in seconds external_downloader_args: A dictionary of downloader keys (in lower case) and a list of additional command-line arguments for the executable. 
Use 'default' as the name for arguments to be @@ -88,6 +90,9 @@ class FileDownloader: self.params = params self._prepare_multiline_status() self.add_progress_hook(self.report_progress) + if self.params.get('progress_delta'): + self._progress_delta_lock = threading.Lock() + self._progress_delta_time = time.monotonic() def _set_ydl(self, ydl): self.ydl = ydl @@ -366,6 +371,12 @@ class FileDownloader: if s['status'] != 'downloading': return + if update_delta := self.params.get('progress_delta'): + with self._progress_delta_lock: + if time.monotonic() < self._progress_delta_time: + return + self._progress_delta_time += update_delta + s.update({ '_eta_str': self.format_eta(s.get('eta')).strip(), '_speed_str': self.format_speed(s.get('speed')), diff --git a/yt_dlp/downloader/external.py b/yt_dlp/downloader/external.py index ce5eeb0a9..8b0b94e72 100644 --- a/yt_dlp/downloader/external.py +++ b/yt_dlp/downloader/external.py @@ -491,7 +491,7 @@ class FFmpegFD(ExternalFD): if not self.params.get('verbose'): args += ['-hide_banner'] - args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args'), default=[]) + args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args', ...)) # These exists only for compatibility. 
Extractors should use # info_dict['downloader_options']['ffmpeg_args'] instead @@ -615,6 +615,8 @@ class FFmpegFD(ExternalFD): else: args += ['-f', EXT_TO_OUT_FORMATS.get(ext, ext)] + args += traverse_obj(info_dict, ('downloader_options', 'ffmpeg_args_out', ...)) + args += self._configuration_args(('_o1', '_o', '')) args = [encodeArgument(opt) for opt in args] diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py index 36d0853a0..42034275b 100644 --- a/yt_dlp/extractor/_extractors.py +++ b/yt_dlp/extractor/_extractors.py @@ -150,6 +150,7 @@ from .arte import ( ) from .arnes import ArnesIE from .asobichannel import AsobiChannelIE, AsobiChannelTagURLIE +from .asobistage import AsobiStageIE from .atresplayer import AtresPlayerIE from .atscaleconf import AtScaleConfEventIE from .atvat import ATVAtIE @@ -875,6 +876,7 @@ from .jeuxvideo import JeuxVideoIE from .jiosaavn import ( JioSaavnSongIE, JioSaavnAlbumIE, + JioSaavnPlaylistIE, ) from .jove import JoveIE from .joj import JojIE @@ -2289,6 +2291,7 @@ from .vrt import ( VrtNUIE, KetnetIE, DagelijkseKostIE, + Radio1BeIE, ) from .vtm import VTMIE from .medialaan import MedialaanIE diff --git a/yt_dlp/extractor/afreecatv.py b/yt_dlp/extractor/afreecatv.py index 3d26d9c25..3e5738f6a 100644 --- a/yt_dlp/extractor/afreecatv.py +++ b/yt_dlp/extractor/afreecatv.py @@ -1,25 +1,65 @@ import functools -import re from .common import InfoExtractor from ..utils import ( ExtractorError, OnDemandPagedList, - date_from_str, + UserNotLive, determine_ext, + filter_dict, int_or_none, - qualities, - traverse_obj, - unified_strdate, + orderedSet, unified_timestamp, - update_url_query, url_or_none, urlencode_postdata, - xpath_text, + urljoin, ) +from ..utils.traversal import traverse_obj -class AfreecaTVIE(InfoExtractor): +class AfreecaTVBaseIE(InfoExtractor): + _NETRC_MACHINE = 'afreecatv' + + def _perform_login(self, username, password): + login_form = { + 'szWork': 'login', + 'szType': 'json', + 'szUid': username, 
+ 'szPassword': password, + 'isSaveId': 'false', + 'szScriptVar': 'oLoginRet', + 'szAction': '', + } + + response = self._download_json( + 'https://login.afreecatv.com/app/LoginAction.php', None, + 'Logging in', data=urlencode_postdata(login_form)) + + _ERRORS = { + -4: 'Your account has been suspended due to a violation of our terms and policies.', + -5: 'https://member.afreecatv.com/app/user_delete_progress.php', + -6: 'https://login.afreecatv.com/membership/changeMember.php', + -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", + -9: 'https://member.afreecatv.com/app/pop_login_block.php', + -11: 'https://login.afreecatv.com/afreeca/second_login.php', + -12: 'https://member.afreecatv.com/app/user_security.php', + 0: 'The username does not exist or you have entered the wrong password.', + -1: 'The username does not exist or you have entered the wrong password.', + -3: 'You have entered your username/password incorrectly.', + -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', + -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', + -32008: 'You have failed to log in. 
Please contact our Help Center.', + } + + result = int_or_none(response.get('RESULT')) + if result != 1: + error = _ERRORS.get(result, 'You have failed to log in.') + raise ExtractorError( + 'Unable to login: %s said: %s' % (self.IE_NAME, error), + expected=True) + + +class AfreecaTVIE(AfreecaTVBaseIE): IE_NAME = 'afreecatv' IE_DESC = 'afreecatv.com' _VALID_URL = r'''(?x) @@ -34,7 +74,6 @@ class AfreecaTVIE(InfoExtractor): ) (?P<id>\d+) ''' - _NETRC_MACHINE = 'afreecatv' _TESTS = [{ 'url': 'http://live.afreecatv.com:8079/app/index.cgi?szType=read_ucc_bbs&szBjId=dailyapril&nStationNo=16711924&nBbsNo=18605867&nTitleNo=36164052&szSkin=', 'md5': 'f72c89fe7ecc14c1b5ce506c4996046e', @@ -87,6 +126,7 @@ class AfreecaTVIE(InfoExtractor): 'uploader': '♥이슬이', 'uploader_id': 'dasl8121', 'upload_date': '20170411', + 'timestamp': 1491929865, 'duration': 213, }, 'params': { @@ -120,219 +160,102 @@ class AfreecaTVIE(InfoExtractor): 'uploader_id': 'rlantnghks', 'uploader': '페이즈으', 'duration': 10840, - 'thumbnail': 'http://videoimg.afreecatv.com/php/SnapshotLoad.php?rowKey=20230108_9FF5BEE1_244432674_1_r', + 'thumbnail': r're:https?://videoimg\.afreecatv\.com/.+', 'upload_date': '20230108', + 'timestamp': 1673218805, 'title': '젠지 페이즈', }, 'params': { 'skip_download': True, }, + }, { + # adult content + 'url': 'https://vod.afreecatv.com/player/70395877', + 'only_matching': True, + }, { + # subscribers only + 'url': 'https://vod.afreecatv.com/player/104647403', + 'only_matching': True, + }, { + # private + 'url': 'https://vod.afreecatv.com/player/81669846', + 'only_matching': True, }] - @staticmethod - def parse_video_key(key): - video_key = {} - m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key) - if m: - video_key['upload_date'] = m.group('upload_date') - video_key['part'] = int(m.group('part')) - return video_key - - def _perform_login(self, username, password): - login_form = { - 'szWork': 'login', - 'szType': 'json', - 'szUid': username, - 'szPassword': password, - 'isSaveId': 'false', - 
'szScriptVar': 'oLoginRet', - 'szAction': '', - } - - response = self._download_json( - 'https://login.afreecatv.com/app/LoginAction.php', None, - 'Logging in', data=urlencode_postdata(login_form)) - - _ERRORS = { - -4: 'Your account has been suspended due to a violation of our terms and policies.', - -5: 'https://member.afreecatv.com/app/user_delete_progress.php', - -6: 'https://login.afreecatv.com/membership/changeMember.php', - -8: "Hello! AfreecaTV here.\nThe username you have entered belongs to \n an account that requires a legal guardian's consent. \nIf you wish to use our services without restriction, \nplease make sure to go through the necessary verification process.", - -9: 'https://member.afreecatv.com/app/pop_login_block.php', - -11: 'https://login.afreecatv.com/afreeca/second_login.php', - -12: 'https://member.afreecatv.com/app/user_security.php', - 0: 'The username does not exist or you have entered the wrong password.', - -1: 'The username does not exist or you have entered the wrong password.', - -3: 'You have entered your username/password incorrectly.', - -7: 'You cannot use your Global AfreecaTV account to access Korean AfreecaTV.', - -10: 'Sorry for the inconvenience. \nYour account has been blocked due to an unauthorized access. \nPlease contact our Help Center for assistance.', - -32008: 'You have failed to log in. 
Please contact our Help Center.', - } - - result = int_or_none(response.get('RESULT')) - if result != 1: - error = _ERRORS.get(result, 'You have failed to log in.') - raise ExtractorError( - 'Unable to login: %s said: %s' % (self.IE_NAME, error), - expected=True) - def _real_extract(self, url): video_id = self._match_id(url) - - partial_view = False - adult_view = False - for _ in range(2): - data = self._download_json( - 'https://api.m.afreecatv.com/station/video/a/view', - video_id, headers={'Referer': url}, data=urlencode_postdata({ - 'nTitleNo': video_id, - 'nApiLevel': 10, - }))['data'] - if traverse_obj(data, ('code', {int})) == -6221: - raise ExtractorError('The VOD does not exist', expected=True) - query = { + data = self._download_json( + 'https://api.m.afreecatv.com/station/video/a/view', video_id, + headers={'Referer': url}, data=urlencode_postdata({ 'nTitleNo': video_id, - 'nStationNo': data['station_no'], - 'nBbsNo': data['bbs_no'], - } - if partial_view: - query['partialView'] = 'SKIP_ADULT' - if adult_view: - query['adultView'] = 'ADULT_VIEW' - video_xml = self._download_xml( - 'http://afbbs.afreecatv.com:8080/api/video/get_video_info.php', - video_id, 'Downloading video info XML%s' - % (' (skipping adult)' if partial_view else ''), - video_id, headers={ - 'Referer': url, - }, query=query) + 'nApiLevel': 10, + }))['data'] - flag = xpath_text(video_xml, './track/flag', 'flag', default=None) - if flag and flag == 'SUCCEED': - break - if flag == 'PARTIAL_ADULT': - self.report_warning( - 'In accordance with local laws and regulations, underage users are restricted from watching adult content. ' - 'Only content suitable for all ages will be downloaded. ' - 'Provide account credentials if you wish to download restricted content.') - partial_view = True - continue - elif flag == 'ADULT': - if not adult_view: - adult_view = True - continue - error = 'Only users older than 19 are able to watch this video. Provide account credentials to download this content.' 
- else: - error = flag - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error), expected=True) - else: - raise ExtractorError('Unable to download video info') + error_code = traverse_obj(data, ('code', {int})) + if error_code == -6221: + raise ExtractorError('The VOD does not exist', expected=True) + elif error_code == -6205: + raise ExtractorError('This VOD is private', expected=True) - video_element = video_xml.findall('./track/video')[-1] - if video_element is None or video_element.text is None: - raise ExtractorError( - 'Video %s does not exist' % video_id, expected=True) - - video_url = video_element.text.strip() - - title = xpath_text(video_xml, './track/title', 'title', fatal=True) - - uploader = xpath_text(video_xml, './track/nickname', 'uploader') - uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') - duration = int_or_none(xpath_text( - video_xml, './track/duration', 'duration')) - thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') - - common_entry = { - 'uploader': uploader, - 'uploader_id': uploader_id, - 'thumbnail': thumbnail, - } - - info = common_entry.copy() - info.update({ - 'id': video_id, - 'title': title, - 'duration': duration, + common_info = traverse_obj(data, { + 'title': ('title', {str}), + 'uploader': ('writer_nick', {str}), + 'uploader_id': ('bj_id', {str}), + 'duration': ('total_file_duration', {functools.partial(int_or_none, scale=1000)}), + 'thumbnail': ('thumb', {url_or_none}), }) - if not video_url: - entries = [] - file_elements = video_element.findall('./file') - one = len(file_elements) == 1 - for file_num, file_element in enumerate(file_elements, start=1): - file_url = url_or_none(file_element.text) - if not file_url: - continue - key = file_element.get('key', '') - upload_date = unified_strdate(self._search_regex( - r'^(\d{8})_', key, 'upload date', default=None)) - if upload_date is not None: - # sometimes the upload date isn't included in the file name - # instead, another random ID 
is, which may parse as a valid - # date but be wildly out of a reasonable range - parsed_date = date_from_str(upload_date) - if parsed_date.year < 2000 or parsed_date.year >= 2100: - upload_date = None - file_duration = int_or_none(file_element.get('duration')) - format_id = key if key else '%s_%s' % (video_id, file_num) - if determine_ext(file_url) == 'm3u8': - formats = self._extract_m3u8_formats( - file_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', - note='Downloading part %d m3u8 information' % file_num) - else: - formats = [{ - 'url': file_url, - 'format_id': 'http', - }] - if not formats and not self.get_param('ignore_no_formats'): - continue - file_info = common_entry.copy() - file_info.update({ - 'id': format_id, - 'title': title if one else '%s (part %d)' % (title, file_num), - 'upload_date': upload_date, - 'duration': file_duration, - 'formats': formats, + entries = [] + for file_num, file_element in enumerate( + traverse_obj(data, ('files', lambda _, v: url_or_none(v['file']))), start=1): + file_url = file_element['file'] + if determine_ext(file_url) == 'm3u8': + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', m3u8_id='hls', + note=f'Downloading part {file_num} m3u8 information') + else: + formats = [{ + 'url': file_url, + 'format_id': 'http', + }] + + entries.append({ + **common_info, + 'id': file_element.get('file_info_key') or f'{video_id}_{file_num}', + 'title': f'{common_info.get("title") or "Untitled"} (part {file_num})', + 'formats': formats, + **traverse_obj(file_element, { + 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), + 'timestamp': ('file_start', {unified_timestamp}), }) - entries.append(file_info) - entries_info = info.copy() - entries_info.update({ - '_type': 'multi_video', - 'entries': entries, - }) - return entries_info - - info = { - 'id': video_id, - 'title': title, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'duration': duration, - 'thumbnail': thumbnail, - 
} - - if determine_ext(video_url) == 'm3u8': - info['formats'] = self._extract_m3u8_formats( - video_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls') - else: - app, playpath = video_url.split('mp4:') - info.update({ - 'url': app, - 'ext': 'flv', - 'play_path': 'mp4:' + playpath, - 'rtmp_live': True, # downloading won't end without this }) - return info + if traverse_obj(data, ('adult_status', {str})) == 'notLogin': + if not entries: + self.raise_login_required( + 'Only users older than 19 are able to watch this video', method='password') + self.report_warning( + 'In accordance with local laws and regulations, underage users are ' + 'restricted from watching adult content. Only content suitable for all ' + f'ages will be downloaded. {self._login_hint("password")}') + + if not entries and traverse_obj(data, ('sub_upload_type', {str})): + self.raise_login_required('This VOD is for subscribers only', method='password') + + if len(entries) == 1: + return { + **entries[0], + 'title': common_info.get('title'), + } + + common_info['timestamp'] = traverse_obj(entries, (..., 'timestamp'), get_all=False) + + return self.playlist_result(entries, video_id, multi_video=True, **common_info) -class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE - +class AfreecaTVLiveIE(AfreecaTVBaseIE): IE_NAME = 'afreecatv:live' + IE_DESC = 'afreecatv.com livestreams' _VALID_URL = r'https?://play\.afreeca(?:tv)?\.com/(?P[^/]+)(?:/(?P\d+))?' 
_TESTS = [{ 'url': 'https://play.afreecatv.com/pyh3646/237852185', @@ -347,77 +270,97 @@ class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete IE }, 'skip': 'Livestream has ended', }, { - 'url': 'http://play.afreeca.com/pyh3646/237852185', + 'url': 'https://play.afreecatv.com/pyh3646/237852185', 'only_matching': True, }, { - 'url': 'http://play.afreeca.com/pyh3646', + 'url': 'https://play.afreecatv.com/pyh3646', 'only_matching': True, }] _LIVE_API_URL = 'https://live.afreecatv.com/afreeca/player_live_api.php' + _WORKING_CDNS = [ + 'gcp_cdn', # live-global-cdn-v02.afreecatv.com + 'gs_cdn_pc_app', # pc-app.stream.afreecatv.com + 'gs_cdn_mobile_web', # mobile-web.stream.afreecatv.com + 'gs_cdn_pc_web', # pc-web.stream.afreecatv.com + ] + _BAD_CDNS = [ + 'gs_cdn', # chromecast.afreeca.gscdn.com (cannot resolve) + 'gs_cdn_chromecast', # chromecast.stream.afreecatv.com (HTTP Error 400) + 'azure_cdn', # live-global-cdn-v01.afreecatv.com (cannot resolve) + 'aws_cf', # live-global-cdn-v03.afreecatv.com (cannot resolve) + 'kt_cdn', # kt.stream.afreecatv.com (HTTP Error 400) + ] - _QUALITIES = ('sd', 'hd', 'hd2k', 'original') + def _extract_formats(self, channel_info, broadcast_no, aid): + stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' + + # If user has not passed CDN IDs, try API-provided CDN ID followed by other working CDN IDs + default_cdn_ids = orderedSet([ + *traverse_obj(channel_info, ('CDN', {str}, all, lambda _, v: v not in self._BAD_CDNS)), + *self._WORKING_CDNS, + ]) + cdn_ids = self._configuration_arg('cdn', default_cdn_ids) + + for attempt, cdn_id in enumerate(cdn_ids, start=1): + m3u8_url = traverse_obj(self._download_json( + urljoin(stream_base_url, 'broad_stream_assign.html'), broadcast_no, + f'Downloading {cdn_id} stream info', f'Unable to download {cdn_id} stream info', + fatal=False, query={ + 'return_type': cdn_id, + 'broad_key': f'{broadcast_no}-common-master-hls', + }), ('view_url', 
{url_or_none})) + try: + return self._extract_m3u8_formats( + m3u8_url, broadcast_no, 'mp4', m3u8_id='hls', query={'aid': aid}, + headers={'Referer': 'https://play.afreecatv.com/'}) + except ExtractorError as e: + if attempt == len(cdn_ids): + raise + self.report_warning( + f'{e.cause or e.msg}. Retrying... (attempt {attempt} of {len(cdn_ids)})') def _real_extract(self, url): broadcaster_id, broadcast_no = self._match_valid_url(url).group('id', 'bno') - password = self.get_param('videopassword') + channel_info = traverse_obj(self._download_json( + self._LIVE_API_URL, broadcaster_id, data=urlencode_postdata({'bid': broadcaster_id})), + ('CHANNEL', {dict})) or {} - info = self._download_json(self._LIVE_API_URL, broadcaster_id, fatal=False, - data=urlencode_postdata({'bid': broadcaster_id})) or {} - channel_info = info.get('CHANNEL') or {} broadcaster_id = channel_info.get('BJID') or broadcaster_id broadcast_no = channel_info.get('BNO') or broadcast_no - password_protected = channel_info.get('BPWD') if not broadcast_no: - raise ExtractorError(f'Unable to extract broadcast number ({broadcaster_id} may not be live)', expected=True) - if password_protected == 'Y' and password is None: + raise UserNotLive(video_id=broadcaster_id) + + password = self.get_param('videopassword') + if channel_info.get('BPWD') == 'Y' and password is None: raise ExtractorError( 'This livestream is protected by a password, use the --video-password option', expected=True) - formats = [] - quality_key = qualities(self._QUALITIES) - for quality_str in self._QUALITIES: - params = { + token_info = traverse_obj(self._download_json( + self._LIVE_API_URL, broadcast_no, 'Downloading access token for stream', + 'Unable to download access token for stream', data=urlencode_postdata(filter_dict({ 'bno': broadcast_no, 'stream_type': 'common', 'type': 'aid', - 'quality': quality_str, - } - if password is not None: - params['pwd'] = password - aid_response = self._download_json( - self._LIVE_API_URL, 
broadcast_no, fatal=False, - data=urlencode_postdata(params), - note=f'Downloading access token for {quality_str} stream', - errnote=f'Unable to download access token for {quality_str} stream') - aid = traverse_obj(aid_response, ('CHANNEL', 'AID')) - if not aid: - continue + 'quality': 'master', + 'pwd': password, + }))), ('CHANNEL', {dict})) or {} + aid = token_info.get('AID') + if not aid: + result = token_info.get('RESULT') + if result == 0: + raise ExtractorError('This livestream has ended', expected=True) + elif result == -6: + self.raise_login_required('This livestream is for subscribers only', method='password') + raise ExtractorError('Unable to extract access token') - stream_base_url = channel_info.get('RMD') or 'https://livestream-manager.afreecatv.com' - stream_info = self._download_json( - f'{stream_base_url}/broad_stream_assign.html', broadcast_no, fatal=False, - query={ - 'return_type': channel_info.get('CDN', 'gcp_cdn'), - 'broad_key': f'{broadcast_no}-common-{quality_str}-hls', - }, - note=f'Downloading metadata for {quality_str} stream', - errnote=f'Unable to download metadata for {quality_str} stream') or {} + formats = self._extract_formats(channel_info, broadcast_no, aid) - if stream_info.get('view_url'): - formats.append({ - 'format_id': quality_str, - 'url': update_url_query(stream_info['view_url'], {'aid': aid}), - 'ext': 'mp4', - 'protocol': 'm3u8', - 'quality': quality_key(quality_str), - }) - - station_info = self._download_json( + station_info = traverse_obj(self._download_json( 'https://st.afreecatv.com/api/get_station_status.php', broadcast_no, - query={'szBjId': broadcaster_id}, fatal=False, - note='Downloading channel metadata', errnote='Unable to download channel metadata') or {} + 'Downloading channel metadata', 'Unable to download channel metadata', + query={'szBjId': broadcaster_id}, fatal=False), {dict}) or {} return { 'id': broadcast_no, @@ -427,6 +370,7 @@ class AfreecaTVLiveIE(AfreecaTVIE): # XXX: Do not subclass from concrete 
IE 'timestamp': unified_timestamp(station_info.get('broad_start')), 'formats': formats, 'is_live': True, + 'http_headers': {'Referer': url}, } diff --git a/yt_dlp/extractor/ard.py b/yt_dlp/extractor/ard.py index 46e68d61e..3db59c5ca 100644 --- a/yt_dlp/extractor/ard.py +++ b/yt_dlp/extractor/ard.py @@ -1,5 +1,5 @@ +import functools import re -from functools import partial from .common import InfoExtractor from ..utils import ( @@ -349,7 +349,7 @@ class ARDBetaMediathekIE(InfoExtractor): r'(?P.*)', ] - return traverse_obj(patterns, (..., {partial(re.match, string=title)}, { + return traverse_obj(patterns, (..., {functools.partial(re.match, string=title)}, { 'season_number': ('season_number', {int_or_none}), 'episode_number': ('episode_number', {int_or_none}), 'episode': (( diff --git a/yt_dlp/extractor/asobistage.py b/yt_dlp/extractor/asobistage.py new file mode 100644 index 000000000..8fa8f3edb --- /dev/null +++ b/yt_dlp/extractor/asobistage.py @@ -0,0 +1,154 @@ +import functools + +from .common import InfoExtractor +from ..utils import str_or_none, url_or_none +from ..utils.traversal import traverse_obj + + +class AsobiStageIE(InfoExtractor): + IE_DESC = 'ASOBISTAGE (アソビステージ)' + _VALID_URL = r'https?://asobistage\.asobistore\.jp/event/(?P<id>(?P<event>\w+)/(?P<type>archive|player)/(?P<slug>\w+))(?:[?#]|$)' + _TESTS = [{ + 'url': 'https://asobistage.asobistore.jp/event/315passionhour_2022summer/archive/frame', + 'info_dict': { + 'id': '315passionhour_2022summer/archive/frame', + 'title': '315プロダクションプレゼンツ 315パッションアワー!!!', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': 'edff52f2', + 'ext': 'mp4', + 'title': '315passion_FRAME_only', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }], + }, { + 'url': 'https://asobistage.asobistore.jp/event/idolmaster_idolworld2023_goods/archive/live', + 'info_dict': { + 'id': 'idolmaster_idolworld2023_goods/archive/live', + 'title': 
'md5:378510b6e830129d505885908bd6c576', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + 'playlist_count': 1, + 'playlist': [{ + 'info_dict': { + 'id': '3aef7110', + 'ext': 'mp4', + 'title': 'asobistore_station_1020_serverREC', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }], + }, { + 'url': 'https://asobistage.asobistore.jp/event/sidem_fclive_bpct/archive/premium_hc', + 'playlist_count': 4, + 'info_dict': { + 'id': 'sidem_fclive_bpct/archive/premium_hc', + 'title': '315 Production presents F@NTASTIC COMBINATION LIVE ~BRAINPOWER!!~/~CONNECTIME!!!!~', + 'thumbnail': r're:^https?://[\w.-]+/\w+/\w+', + }, + }, { + 'url': 'https://asobistage.asobistore.jp/event/ijigenfes_utagassen/player/day1', + 'only_matching': True, + }] + + _API_HOST = 'https://asobistage-api.asobistore.jp' + _HEADERS = {} + _is_logged_in = False + + @functools.cached_property + def _owned_tickets(self): + owned_tickets = set() + if not self._is_logged_in: + return owned_tickets + + for path, name in [ + ('api/v1/purchase_history/list', 'ticket purchase history'), + ('api/v1/serialcode/list', 'redemption history'), + ]: + response = self._download_json( + f'{self._API_HOST}/{path}', None, f'Downloading {name}', + f'Unable to download {name}', expected_status=400) + if traverse_obj(response, ('payload', 'error_message'), 'error') == 'notlogin': + self._is_logged_in = False + break + owned_tickets.update( + traverse_obj(response, ('payload', 'value', ..., 'digital_product_id', {str_or_none}))) + + return owned_tickets + + def _get_available_channel_id(self, channel): + channel_id = traverse_obj(channel, ('chennel_vspf_id', {str})) + if not channel_id: + return None + # if rights_type_id == 6, then 'No conditions (no login required - non-members are OK)' + if traverse_obj(channel, ('viewrights', lambda _, v: v['rights_type_id'] == 6)): + return channel_id + available_tickets = traverse_obj(channel, ( + 'viewrights', ..., ('tickets', 'serialcodes'), ..., 'digital_product_id', 
{str_or_none})) + if not self._owned_tickets.intersection(available_tickets): + self.report_warning( + f'You are not a ticketholder for "{channel.get("channel_name") or channel_id}"') + return None + return channel_id + + def _real_initialize(self): + if self._get_cookies(self._API_HOST): + self._is_logged_in = True + token = self._download_json( + f'{self._API_HOST}/api/v1/vspf/token', None, 'Getting token', 'Unable to get token') + self._HEADERS['Authorization'] = f'Bearer {token}' + + def _real_extract(self, url): + video_id, event, type_, slug = self._match_valid_url(url).group('id', 'event', 'type', 'slug') + video_type = {'archive': 'archives', 'player': 'broadcasts'}[type_] + webpage = self._download_webpage(url, video_id) + event_data = traverse_obj( + self._search_nextjs_data(webpage, video_id, default={}), + ('props', 'pageProps', 'eventCMSData', { + 'title': ('event_name', {str}), + 'thumbnail': ('event_thumbnail_image', {url_or_none}), + })) + + available_channels = traverse_obj(self._download_json( + f'https://asobistage.asobistore.jp/cdn/v101/events/{event}/{video_type}.json', + video_id, 'Getting channel list', 'Unable to get channel list'), ( + video_type, lambda _, v: v['broadcast_slug'] == slug, + 'channels', lambda _, v: v['chennel_vspf_id'] != '00000')) + + entries = [] + for channel_id in traverse_obj(available_channels, (..., {self._get_available_channel_id})): + if video_type == 'archives': + channel_json = self._download_json( + f'https://survapi.channel.or.jp/proxy/v1/contents/{channel_id}/get_by_cuid', channel_id, + 'Getting archive channel info', 'Unable to get archive channel info', fatal=False, + headers=self._HEADERS) + channel_data = traverse_obj(channel_json, ('ex_content', { + 'm3u8_url': 'streaming_url', + 'title': 'title', + 'thumbnail': ('thumbnail', 'url'), + })) + else: # video_type == 'broadcasts' + channel_json = self._download_json( + f'https://survapi.channel.or.jp/ex/events/{channel_id}', channel_id, + 'Getting live 
channel info', 'Unable to get live channel info', fatal=False, + headers=self._HEADERS, query={'embed': 'channel'}) + channel_data = traverse_obj(channel_json, ('data', { + 'm3u8_url': ('Channel', 'Custom_live_url'), + 'title': 'Name', + 'thumbnail': 'Poster_url', + })) + + entries.append({ + 'id': channel_id, + 'title': channel_data.get('title'), + 'formats': self._extract_m3u8_formats(channel_data.get('m3u8_url'), channel_id, fatal=False), + 'is_live': video_type == 'broadcasts', + 'thumbnail': url_or_none(channel_data.get('thumbnail')), + }) + + if not self._is_logged_in and not entries: + self.raise_login_required() + + return self.playlist_result(entries, video_id, **event_data) diff --git a/yt_dlp/extractor/atvat.py b/yt_dlp/extractor/atvat.py index d6ed9e495..d60feba31 100644 --- a/yt_dlp/extractor/atvat.py +++ b/yt_dlp/extractor/atvat.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt from .common import InfoExtractor from ..utils import ( @@ -71,9 +71,9 @@ class ATVAtIE(InfoExtractor): content_ids = [{'id': id, 'subclip_start': content['start'], 'subclip_end': content['end']} for id, content in enumerate(contentResource)] - time_of_request = datetime.datetime.now() - not_before = time_of_request - datetime.timedelta(minutes=5) - expire = time_of_request + datetime.timedelta(minutes=5) + time_of_request = dt.datetime.now() + not_before = time_of_request - dt.timedelta(minutes=5) + expire = time_of_request + dt.timedelta(minutes=5) payload = { 'content_ids': { content_id: content_ids, diff --git a/yt_dlp/extractor/aws.py b/yt_dlp/extractor/aws.py index c4741a6a1..4ebef9295 100644 --- a/yt_dlp/extractor/aws.py +++ b/yt_dlp/extractor/aws.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import hashlib import hmac @@ -12,7 +12,7 @@ class AWSIE(InfoExtractor): # XXX: Conventionally, base classes should end with def _aws_execute_api(self, aws_dict, video_id, query=None): query = query or {} - amz_date = 
datetime.datetime.now(datetime.timezone.utc).strftime('%Y%m%dT%H%M%SZ') + amz_date = dt.datetime.now(dt.timezone.utc).strftime('%Y%m%dT%H%M%SZ') date = amz_date[:8] headers = { 'Accept': 'application/json', diff --git a/yt_dlp/extractor/bibeltv.py b/yt_dlp/extractor/bibeltv.py index 34464daa1..666b51c56 100644 --- a/yt_dlp/extractor/bibeltv.py +++ b/yt_dlp/extractor/bibeltv.py @@ -1,4 +1,4 @@ -from functools import partial +import functools from .common import InfoExtractor from ..utils import ( @@ -50,7 +50,7 @@ class BibelTVBaseIE(InfoExtractor): **traverse_obj(data, { 'title': 'title', 'description': 'description', - 'duration': ('duration', {partial(int_or_none, scale=1000)}), + 'duration': ('duration', {functools.partial(int_or_none, scale=1000)}), 'timestamp': ('schedulingStart', {parse_iso8601}), 'season_number': 'seasonNumber', 'episode_number': 'episodeNumber', diff --git a/yt_dlp/extractor/bundestag.py b/yt_dlp/extractor/bundestag.py index 9fd7c7de1..71f772665 100644 --- a/yt_dlp/extractor/bundestag.py +++ b/yt_dlp/extractor/bundestag.py @@ -1,5 +1,5 @@ +import functools import re -from functools import partial from .common import InfoExtractor from ..networking.exceptions import HTTPError @@ -115,9 +115,9 @@ class BundestagIE(InfoExtractor): note='Downloading metadata overlay', fatal=False, ), { 'title': ( - {partial(get_element_text_and_html_by_tag, 'h3')}, 0, - {partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}), - 'description': ({partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), + {functools.partial(get_element_text_and_html_by_tag, 'h3')}, 0, + {functools.partial(re.sub, r'<span[^>]*>[^<]+</span>', '')}, {clean_html}), + 'description': ({functools.partial(get_element_text_and_html_by_tag, 'p')}, 0, {clean_html}), })) return result diff --git a/yt_dlp/extractor/cbc.py b/yt_dlp/extractor/cbc.py index b5beb1ec8..ff320dd68 100644 --- a/yt_dlp/extractor/cbc.py +++ b/yt_dlp/extractor/cbc.py @@ -151,7 +151,7 @@ class 
CBCIE(InfoExtractor): class CBCPlayerIE(InfoExtractor): IE_NAME = 'cbc.ca:player' - _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>\d+)' + _VALID_URL = r'(?:cbcplayer:|https?://(?:www\.)?cbc\.ca/(?:player/play/|i/caffeine/syndicate/\?mediaId=))(?P<id>(?:\d\.)?\d+)' _TESTS = [{ 'url': 'http://www.cbc.ca/player/play/2683190193', 'md5': '64d25f841ddf4ddb28a235338af32e2c', @@ -165,9 +165,52 @@ class CBCPlayerIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, 'skip': 'Geo-restricted to Canada and no longer available', + }, { + 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2657631896', + 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', + 'info_dict': { + 'id': '2657631896', + 'ext': 'mp3', + 'title': 'CBC Montreal is organizing its first ever community hackathon!', + 'description': 'md5:dd3b692f0a139b0369943150bd1c46a9', + 'timestamp': 1425704400, + 'upload_date': '20150307', + 'uploader': 'CBCC-NEW', + 'thumbnail': 'http://thumbnails.cbc.ca/maven_legacy/thumbnails/sonali-karnick-220.jpg', + 'chapters': [], + 'duration': 494.811, + 'categories': ['AudioMobile/All in a Weekend Montreal'], + 'tags': 'count:8', + 'location': 'Quebec', + 'series': 'All in a Weekend Montreal', + 'season': 'Season 2015', + 'season_number': 2015, + 'media_type': 'Excerpt', + }, + }, { + 'url': 'http://www.cbc.ca/i/caffeine/syndicate/?mediaId=2164402062', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', + 'info_dict': { + 'id': '2164402062', + 'ext': 'mp4', + 'title': 'Cancer survivor four times over', + 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', + 'timestamp': 1320410746, + 'upload_date': '20111104', + 'uploader': 'CBCC-NEW', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/277/67/cancer_852x480_2164412612.jpg', + 'chapters': [], + 'duration': 186.867, + 'series': 'CBC News: Windsor at 6:00', + 'categories': ['News/Canada/Windsor'], + 'location': 'Windsor', + 
'tags': ['cancer'], + 'creators': ['Allison Johnson'], + 'media_type': 'Excerpt', + }, }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ - 'url': 'http://www.cbc.ca/player/play/2657631896', + 'url': 'https://www.cbc.ca/player/play/1.2985700', 'md5': 'e5e708c34ae6fca156aafe17c43e8b75', 'info_dict': { 'id': '2657631896', @@ -189,7 +232,7 @@ class CBCPlayerIE(InfoExtractor): 'media_type': 'Excerpt', }, }, { - 'url': 'http://www.cbc.ca/player/play/2164402062', + 'url': 'https://www.cbc.ca/player/play/1.1711287', 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { 'id': '2164402062', @@ -206,38 +249,53 @@ class CBCPlayerIE(InfoExtractor): 'categories': ['News/Canada/Windsor'], 'location': 'Windsor', 'tags': ['cancer'], - 'creator': 'Allison Johnson', + 'creators': ['Allison Johnson'], 'media_type': 'Excerpt', }, }, { # Has subtitles # These broadcasts expire after ~1 month, can find new test URL here: # https://www.cbc.ca/player/news/TV%20Shows/The%20National/Latest%20Broadcast - 'url': 'http://www.cbc.ca/player/play/2284799043667', - 'md5': '9b49f0839e88b6ec0b01d840cf3d42b5', + 'url': 'https://www.cbc.ca/player/play/1.7159484', + 'md5': '6ed6cd0fc2ef568d2297ba68a763d455', 'info_dict': { - 'id': '2284799043667', + 'id': '2324213316001', 'ext': 'mp4', - 'title': 'The National | Hockey coach charged, Green grants, Safer drugs', - 'description': 'md5:84ef46321c94bcf7d0159bb565d26bfa', - 'timestamp': 1700272800, - 'duration': 2718.833, + 'title': 'The National | School boards sue social media giants', + 'description': 'md5:4b4db69322fa32186c3ce426da07402c', + 'timestamp': 1711681200, + 'duration': 2743.400, 'subtitles': {'eng': [{'ext': 'vtt', 'protocol': 'm3u8_native'}]}, - 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/907/171/thumbnail.jpeg', + 'thumbnail': 'https://thumbnails.cbc.ca/maven_legacy/thumbnails/607/559/thumbnail.jpeg', 'uploader': 'CBCC-NEW', 'chapters': 'count:5', - 
'upload_date': '20231118', + 'upload_date': '20240329', 'categories': 'count:4', 'series': 'The National - Full Show', 'tags': 'count:1', - 'creator': 'News', + 'creators': ['News'], 'location': 'Canada', 'media_type': 'Full Program', }, + }, { + 'url': 'cbcplayer:1.7159484', + 'only_matching': True, + }, { + 'url': 'cbcplayer:2164402062', + 'only_matching': True, + }, { + 'url': 'http://www.cbc.ca/player/play/2657631896', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + if '.' in video_id: + webpage = self._download_webpage(f'https://www.cbc.ca/player/play/{video_id}', video_id) + video_id = self._search_json( + r'window\.__INITIAL_STATE__\s*=', webpage, + 'initial state', video_id)['video']['currentClip']['mediaId'] + return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', diff --git a/yt_dlp/extractor/cda.py b/yt_dlp/extractor/cda.py index 1157114b2..90b4d082e 100644 --- a/yt_dlp/extractor/cda.py +++ b/yt_dlp/extractor/cda.py @@ -1,6 +1,6 @@ import base64 import codecs -import datetime +import datetime as dt import hashlib import hmac import json @@ -134,7 +134,7 @@ class CDAIE(InfoExtractor): self._API_HEADERS['User-Agent'] = f'pl.cda 1.0 (version {app_version}; Android {android_version}; {phone_model})' cached_bearer = self.cache.load(self._BEARER_CACHE, username) or {} - if cached_bearer.get('valid_until', 0) > datetime.datetime.now().timestamp() + 5: + if cached_bearer.get('valid_until', 0) > dt.datetime.now().timestamp() + 5: self._API_HEADERS['Authorization'] = f'Bearer {cached_bearer["token"]}' return @@ -154,7 +154,7 @@ class CDAIE(InfoExtractor): }) self.cache.store(self._BEARER_CACHE, username, { 'token': token_res['access_token'], - 'valid_until': token_res['expires_in'] + datetime.datetime.now().timestamp(), + 'valid_until': token_res['expires_in'] + dt.datetime.now().timestamp(), }) self._API_HEADERS['Authorization'] = f'Bearer {token_res["access_token"]}' diff --git a/yt_dlp/extractor/common.py 
b/yt_dlp/extractor/common.py index e776ccae9..bebbc6b43 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -37,6 +37,7 @@ from ..networking.exceptions import ( IncompleteRead, network_exceptions, ) +from ..networking.impersonate import ImpersonateTarget from ..utils import ( IDENTITY, JSON_LD_RE, @@ -170,12 +171,12 @@ class InfoExtractor: Automatically calculated from width and height * dynamic_range The dynamic range of the video. One of: "SDR" (None), "HDR10", "HDR10+, "HDR12", "HLG, "DV" - * tbr Average bitrate of audio and video in KBit/s - * abr Average audio bitrate in KBit/s + * tbr Average bitrate of audio and video in kbps (1000 bits/sec) + * abr Average audio bitrate in kbps (1000 bits/sec) * acodec Name of the audio codec in use * asr Audio sampling rate in Hertz * audio_channels Number of audio channels - * vbr Average video bitrate in KBit/s + * vbr Average video bitrate in kbps (1000 bits/sec) * fps Frame rate * vcodec Name of the video codec in use * container Name of the container format @@ -246,7 +247,8 @@ class InfoExtractor: * downloader_options A dictionary of downloader options (For internal use only) * http_chunk_size Chunk size for HTTP downloads - * ffmpeg_args Extra arguments for ffmpeg downloader + * ffmpeg_args Extra arguments for ffmpeg downloader (input) + * ffmpeg_args_out Extra arguments for ffmpeg downloader (output) * is_dash_periods Whether the format is a result of merging multiple DASH periods. RTMP formats can also have the additional fields: page_url, @@ -817,7 +819,7 @@ class InfoExtractor: else: return err.status in variadic(expected_status) - def _create_request(self, url_or_request, data=None, headers=None, query=None): + def _create_request(self, url_or_request, data=None, headers=None, query=None, extensions=None): if isinstance(url_or_request, urllib.request.Request): self._downloader.deprecation_warning( 'Passing a urllib.request.Request to _create_request() is deprecated. 
' @@ -826,10 +828,11 @@ class InfoExtractor: elif not isinstance(url_or_request, Request): url_or_request = Request(url_or_request) - url_or_request.update(data=data, headers=headers, query=query) + url_or_request.update(data=data, headers=headers, query=query, extensions=extensions) return url_or_request - def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, headers=None, query=None, expected_status=None): + def _request_webpage(self, url_or_request, video_id, note=None, errnote=None, fatal=True, data=None, + headers=None, query=None, expected_status=None, impersonate=None, require_impersonation=False): """ Return the response handle. @@ -860,8 +863,31 @@ class InfoExtractor: headers = (headers or {}).copy() headers.setdefault('X-Forwarded-For', self._x_forwarded_for_ip) + extensions = {} + + if impersonate in (True, ''): + impersonate = ImpersonateTarget() + requested_targets = [ + t if isinstance(t, ImpersonateTarget) else ImpersonateTarget.from_str(t) + for t in variadic(impersonate) + ] if impersonate else [] + + available_target = next(filter(self._downloader._impersonate_target_available, requested_targets), None) + if available_target: + extensions['impersonate'] = available_target + elif requested_targets: + message = 'The extractor is attempting impersonation, but ' + message += ( + 'no impersonate target is available' if not str(impersonate) + else f'none of these impersonate targets are available: "{", ".join(map(str, requested_targets))}"') + info_msg = ('see https://github.com/yt-dlp/yt-dlp#impersonation ' + 'for information on installing the required dependencies') + if require_impersonation: + raise ExtractorError(f'{message}; {info_msg}', expected=True) + self.report_warning(f'{message}; if you encounter errors, then {info_msg}', only_once=True) + try: - return self._downloader.urlopen(self._create_request(url_or_request, data, headers, query)) + return 
self._downloader.urlopen(self._create_request(url_or_request, data, headers, query, extensions)) except network_exceptions as err: if isinstance(err, HTTPError): if self.__can_accept_status_code(err, expected_status): @@ -880,13 +906,14 @@ class InfoExtractor: return False def _download_webpage_handle(self, url_or_request, video_id, note=None, errnote=None, fatal=True, - encoding=None, data=None, headers={}, query={}, expected_status=None): + encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): """ Return a tuple (page content as string, URL handle). Arguments: url_or_request -- plain text URL as a string or - a urllib.request.Request object + a yt_dlp.networking.Request object video_id -- Video/playlist/item identifier (string) Keyword arguments: @@ -911,13 +938,22 @@ class InfoExtractor: returning True if it should be accepted Note that this argument does not affect success status codes (2xx) which are always accepted. + impersonate -- the impersonate target. 
Can be any of the following entities: + - an instance of yt_dlp.networking.impersonate.ImpersonateTarget + - a string in the format of CLIENT[:OS] + - a list or a tuple of CLIENT[:OS] strings or ImpersonateTarget instances + - a boolean value; True means any impersonate target is sufficient + require_impersonation -- flag to toggle whether the request should raise an error + if impersonation is not possible (bool, default: False) """ # Strip hashes from the URL (#1038) if isinstance(url_or_request, str): url_or_request = url_or_request.partition('#')[0] - urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, headers=headers, query=query, expected_status=expected_status) + urlh = self._request_webpage(url_or_request, video_id, note, errnote, fatal, data=data, + headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if urlh is False: assert not fatal return False @@ -1046,17 +1082,20 @@ class InfoExtractor: return getattr(ie, parser)(content, *args, **kwargs) def download_handle(self, url_or_request, video_id, note=note, errnote=errnote, transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): res = self._download_webpage_handle( url_or_request, video_id, note=note, errnote=errnote, fatal=fatal, encoding=encoding, - data=data, headers=headers, query=query, expected_status=expected_status) + data=data, headers=headers, query=query, expected_status=expected_status, + impersonate=impersonate, require_impersonation=require_impersonation) if res is False: return res content, urlh = res return parse(self, content, video_id, transform_source=transform_source, fatal=fatal, errnote=errnote), urlh def download_content(self, url_or_request, video_id, note=note, errnote=errnote, 
transform_source=None, - fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None): + fatal=True, encoding=None, data=None, headers={}, query={}, expected_status=None, + impersonate=None, require_impersonation=False): if self.get_param('load_pages'): url_or_request = self._create_request(url_or_request, data, headers, query) filename = self._request_dump_filename(url_or_request.url, video_id) @@ -1079,6 +1118,8 @@ class InfoExtractor: 'headers': headers, 'query': query, 'expected_status': expected_status, + 'impersonate': impersonate, + 'require_impersonation': require_impersonation, } if parser is None: kwargs.pop('transform_source') @@ -1697,12 +1738,16 @@ class InfoExtractor: traverse_json_ld(json_ld) return filter_dict(info) - def _search_nextjs_data(self, webpage, video_id, *, transform_source=None, fatal=True, **kw): - return self._parse_json( - self._search_regex( - r'(?s)<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>([^<]+)</script>', - webpage, 'next.js data', fatal=fatal, **kw), - video_id, transform_source=transform_source, fatal=fatal) + def _search_nextjs_data(self, webpage, video_id, *, fatal=True, default=NO_DEFAULT, **kw): + if default == '{}': + self._downloader.deprecation_warning('using `default=\'{}\'` is deprecated, use `default={}` instead') + default = {} + if default is not NO_DEFAULT: + fatal = False + + return self._search_json( + r'<script[^>]+id=[\'"]__NEXT_DATA__[\'"][^>]*>', webpage, 'next.js data', + video_id, end_pattern='</script>', fatal=fatal, default=default, **kw) def _search_nuxt_data(self, webpage, video_id, context_name='__NUXT__', *, fatal=True, traverse=('data', 0)): """Parses Nuxt.js metadata. 
This works as long as the function __NUXT__ invokes is a pure function""" diff --git a/yt_dlp/extractor/crunchyroll.py b/yt_dlp/extractor/crunchyroll.py index d35e9995a..385a3c2d3 100644 --- a/yt_dlp/extractor/crunchyroll.py +++ b/yt_dlp/extractor/crunchyroll.py @@ -1,4 +1,5 @@ import base64 +import uuid from .common import InfoExtractor from ..networking.exceptions import HTTPError @@ -7,12 +8,11 @@ from ..utils import ( float_or_none, format_field, int_or_none, - join_nonempty, + jwt_decode_hs256, parse_age_limit, parse_count, parse_iso8601, qualities, - remove_start, time_seconds, traverse_obj, url_or_none, @@ -24,10 +24,15 @@ class CrunchyrollBaseIE(InfoExtractor): _BASE_URL = 'https://www.crunchyroll.com' _API_BASE = 'https://api.crunchyroll.com' _NETRC_MACHINE = 'crunchyroll' + _REFRESH_TOKEN = None _AUTH_HEADERS = None + _AUTH_EXPIRY = None _API_ENDPOINT = None - _BASIC_AUTH = None - _CLIENT_ID = ('cr_web', 'noaihdevm_6iyg0a8l0q') + _BASIC_AUTH = 'Basic ' + base64.b64encode(':'.join(( + 't-kdgp2h8c3jub8fn0fq', + 'yfLDfMfrYvKXh4JXS1LEI2cCqu1v5Wan', + )).encode()).decode() + _IS_PREMIUM = None _LOCALE_LOOKUP = { 'ar': 'ar-SA', 'de': 'de-DE', @@ -42,63 +47,74 @@ class CrunchyrollBaseIE(InfoExtractor): 'hi': 'hi-IN', } - @property - def is_logged_in(self): - return bool(self._get_cookies(self._BASE_URL).get('etp_rt')) + def _set_auth_info(self, response): + CrunchyrollBaseIE._IS_PREMIUM = 'cr_premium' in traverse_obj(response, ('access_token', {jwt_decode_hs256}, 'benefits', ...)) + CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': response['token_type'] + ' ' + response['access_token']} + CrunchyrollBaseIE._AUTH_EXPIRY = time_seconds(seconds=traverse_obj(response, ('expires_in', {float_or_none}), default=300) - 10) + + def _request_token(self, headers, data, note='Requesting token', errnote='Failed to request token'): + try: # TODO: Add impersonation support here + return self._download_json( + f'{self._BASE_URL}/auth/v1/token', None, note=note, 
errnote=errnote, + headers=headers, data=urlencode_postdata(data)) + except ExtractorError as error: + if not isinstance(error.cause, HTTPError) or error.cause.status != 403: + raise + raise ExtractorError( + 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' + 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' + 'and your browser\'s User-Agent (with --user-agent)', expected=True) def _perform_login(self, username, password): - if self.is_logged_in: + if not CrunchyrollBaseIE._REFRESH_TOKEN: + CrunchyrollBaseIE._REFRESH_TOKEN = self.cache.load(self._NETRC_MACHINE, username) + if CrunchyrollBaseIE._REFRESH_TOKEN: return - upsell_response = self._download_json( - f'{self._API_BASE}/get_upsell_data.0.json', None, 'Getting session id', - query={ - 'sess_id': 1, - 'device_id': 'whatvalueshouldbeforweb', - 'device_type': 'com.crunchyroll.static', - 'access_token': 'giKq5eY27ny3cqz', - 'referer': f'{self._BASE_URL}/welcome/login' - }) - if upsell_response['code'] != 'ok': - raise ExtractorError('Could not get session id') - session_id = upsell_response['data']['session_id'] - - login_response = self._download_json( - f'{self._API_BASE}/login.1.json', None, 'Logging in', - data=urlencode_postdata({ - 'account': username, - 'password': password, - 'session_id': session_id - })) - if login_response['code'] != 'ok': - raise ExtractorError('Login failed. 
Server message: %s' % login_response['message'], expected=True) - if not self.is_logged_in: - raise ExtractorError('Login succeeded but did not set etp_rt cookie') - - def _update_auth(self): - if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_REFRESH > time_seconds(): - return - - if not CrunchyrollBaseIE._BASIC_AUTH: - cx_api_param = self._CLIENT_ID[self.is_logged_in] - self.write_debug(f'Using cxApiParam={cx_api_param}') - CrunchyrollBaseIE._BASIC_AUTH = 'Basic ' + base64.b64encode(f'{cx_api_param}:'.encode()).decode() - - grant_type = 'etp_rt_cookie' if self.is_logged_in else 'client_id' try: - auth_response = self._download_json( - f'{self._BASE_URL}/auth/v1/token', None, note=f'Authenticating with grant_type={grant_type}', - headers={'Authorization': CrunchyrollBaseIE._BASIC_AUTH}, data=f'grant_type={grant_type}'.encode()) + login_response = self._request_token( + headers={'Authorization': self._BASIC_AUTH}, data={ + 'username': username, + 'password': password, + 'grant_type': 'password', + 'scope': 'offline_access', + }, note='Logging in', errnote='Failed to log in') except ExtractorError as error: - if isinstance(error.cause, HTTPError) and error.cause.status == 403: - raise ExtractorError( - 'Request blocked by Cloudflare; navigate to Crunchyroll in your browser, ' - 'then pass the fresh cookies (with --cookies-from-browser or --cookies) ' - 'and your browser\'s User-Agent (with --user-agent)', expected=True) + if isinstance(error.cause, HTTPError) and error.cause.status == 401: + raise ExtractorError('Invalid username and/or password', expected=True) raise - CrunchyrollBaseIE._AUTH_HEADERS = {'Authorization': auth_response['token_type'] + ' ' + auth_response['access_token']} - CrunchyrollBaseIE._AUTH_REFRESH = time_seconds(seconds=traverse_obj(auth_response, ('expires_in', {float_or_none}), default=300) - 10) + CrunchyrollBaseIE._REFRESH_TOKEN = login_response['refresh_token'] + self.cache.store(self._NETRC_MACHINE, username, 
CrunchyrollBaseIE._REFRESH_TOKEN) + self._set_auth_info(login_response) + + def _update_auth(self): + if CrunchyrollBaseIE._AUTH_HEADERS and CrunchyrollBaseIE._AUTH_EXPIRY > time_seconds(): + return + + auth_headers = {'Authorization': self._BASIC_AUTH} + if CrunchyrollBaseIE._REFRESH_TOKEN: + data = { + 'refresh_token': CrunchyrollBaseIE._REFRESH_TOKEN, + 'grant_type': 'refresh_token', + 'scope': 'offline_access', + } + else: + data = {'grant_type': 'client_id'} + auth_headers['ETP-Anonymous-ID'] = uuid.uuid4() + try: + auth_response = self._request_token(auth_headers, data) + except ExtractorError as error: + username, password = self._get_login_info() + if not username or not isinstance(error.cause, HTTPError) or error.cause.status != 400: + raise + self.to_screen('Refresh token has expired. Re-logging in') + CrunchyrollBaseIE._REFRESH_TOKEN = None + self.cache.store(self._NETRC_MACHINE, username, None) + self._perform_login(username, password) + return + + self._set_auth_info(auth_response) def _locale_from_language(self, language): config_locale = self._configuration_arg('metadata', ie_key=CrunchyrollBetaIE, casesense=True) @@ -135,62 +151,73 @@ class CrunchyrollBaseIE(InfoExtractor): raise ExtractorError(f'Unexpected response when downloading {note} JSON') return result - def _extract_formats(self, stream_response, display_id=None): - requested_formats = self._configuration_arg('format') or ['vo_adaptive_hls'] - available_formats = {} - for stream_type, streams in traverse_obj( - stream_response, (('streams', ('data', 0)), {dict.items}, ...)): - if stream_type not in requested_formats: + def _extract_chapters(self, internal_id): + # if no skip events are available, a 403 xml error is returned + skip_events = self._download_json( + f'https://static.crunchyroll.com/skip-events/production/{internal_id}.json', + internal_id, note='Downloading chapter info', fatal=False, errnote=False) + if not skip_events: + return None + + chapters = [] + for event in ('recap', 
'intro', 'credits', 'preview'): + start = traverse_obj(skip_events, (event, 'start', {float_or_none})) + end = traverse_obj(skip_events, (event, 'end', {float_or_none})) + # some chapters have no start and/or ending time, they will just be ignored + if start is None or end is None: continue - for stream in traverse_obj(streams, lambda _, v: v['url']): - hardsub_lang = stream.get('hardsub_locale') or '' - format_id = join_nonempty(stream_type, format_field(stream, 'hardsub_locale', 'hardsub-%s')) - available_formats[hardsub_lang] = (stream_type, format_id, hardsub_lang, stream['url']) + chapters.append({'title': event.capitalize(), 'start_time': start, 'end_time': end}) + + return chapters + + def _extract_stream(self, identifier, display_id=None): + if not display_id: + display_id = identifier + + self._update_auth() + stream_response = self._download_json( + f'https://cr-play-service.prd.crunchyrollsvc.com/v1/{identifier}/console/switch/play', + display_id, note='Downloading stream info', errnote='Failed to download stream info', + headers=CrunchyrollBaseIE._AUTH_HEADERS) + + available_formats = {'': ('', '', stream_response['url'])} + for hardsub_lang, stream in traverse_obj(stream_response, ('hardSubs', {dict.items}, lambda _, v: v[1]['url'])): + available_formats[hardsub_lang] = (f'hardsub-{hardsub_lang}', hardsub_lang, stream['url']) requested_hardsubs = [('' if val == 'none' else val) for val in (self._configuration_arg('hardsub') or ['none'])] - if '' in available_formats and 'all' not in requested_hardsubs: + hardsub_langs = [lang for lang in available_formats if lang] + if hardsub_langs and 'all' not in requested_hardsubs: full_format_langs = set(requested_hardsubs) + self.to_screen(f'Available hardsub languages: {", ".join(hardsub_langs)}') self.to_screen( - 'To get all formats of a hardsub language, use ' + 'To extract formats of a hardsub language, use ' '"--extractor-args crunchyrollbeta:hardsub=<language_code or all>". 
' 'See https://github.com/yt-dlp/yt-dlp#crunchyrollbeta-crunchyroll for more info', only_once=True) else: full_format_langs = set(map(str.lower, available_formats)) - audio_locale = traverse_obj(stream_response, ((None, 'meta'), 'audio_locale'), get_all=False) + audio_locale = traverse_obj(stream_response, ('audioLocale', {str})) hardsub_preference = qualities(requested_hardsubs[::-1]) - formats = [] - for stream_type, format_id, hardsub_lang, stream_url in available_formats.values(): - if stream_type.endswith('hls'): - if hardsub_lang.lower() in full_format_langs: - adaptive_formats = self._extract_m3u8_formats( - stream_url, display_id, 'mp4', m3u8_id=format_id, - fatal=False, note=f'Downloading {format_id} HLS manifest') - else: - adaptive_formats = (self._m3u8_meta_format(stream_url, ext='mp4', m3u8_id=format_id),) - elif stream_type.endswith('dash'): - adaptive_formats = self._extract_mpd_formats( - stream_url, display_id, mpd_id=format_id, - fatal=False, note=f'Downloading {format_id} MPD manifest') + formats, subtitles = [], {} + for format_id, hardsub_lang, stream_url in available_formats.values(): + if hardsub_lang.lower() in full_format_langs: + adaptive_formats, dash_subs = self._extract_mpd_formats_and_subtitles( + stream_url, display_id, mpd_id=format_id, headers=CrunchyrollBaseIE._AUTH_HEADERS, + fatal=False, note=f'Downloading {f"{format_id} " if hardsub_lang else ""}MPD manifest') + self._merge_subtitles(dash_subs, target=subtitles) else: - self.report_warning(f'Encountered unknown stream_type: {stream_type!r}', display_id, only_once=True) - continue + continue # XXX: Update this if/when meta mpd formats are working for f in adaptive_formats: if f.get('acodec') != 'none': f['language'] = audio_locale f['quality'] = hardsub_preference(hardsub_lang.lower()) formats.extend(adaptive_formats) - return formats + for locale, subtitle in traverse_obj(stream_response, (('subtitles', 'captions'), {dict.items}, ...)): + subtitles.setdefault(locale, 
[]).append(traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})) - def _extract_subtitles(self, data): - subtitles = {} - - for locale, subtitle in traverse_obj(data, ((None, 'meta'), 'subtitles', {dict.items}, ...)): - subtitles[locale] = [traverse_obj(subtitle, {'url': 'url', 'ext': 'format'})] - - return subtitles + return formats, subtitles class CrunchyrollCmsBaseIE(CrunchyrollBaseIE): @@ -245,7 +272,11 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): 'like_count': int, 'dislike_count': int, }, - 'params': {'skip_download': 'm3u8', 'format': 'all[format_id~=hardsub]'}, + 'params': { + 'skip_download': 'm3u8', + 'extractor_args': {'crunchyrollbeta': {'hardsub': ['de-DE']}}, + 'format': 'bv[format_id~=hardsub]', + }, }, { # Premium only 'url': 'https://www.crunchyroll.com/watch/GYE5WKQGR', @@ -306,6 +337,7 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): 'thumbnail': r're:^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'no longer exists', }, { 'url': 'https://www.crunchyroll.com/watch/G62PEZ2E6', 'info_dict': { @@ -359,31 +391,15 @@ class CrunchyrollBetaIE(CrunchyrollCmsBaseIE): else: raise ExtractorError(f'Unknown object type {object_type}') - # There might be multiple audio languages for one object (`<object>_metadata.versions`), - # so we need to get the id from `streams_link` instead or we dont know which language to choose - streams_link = response.get('streams_link') - if not streams_link and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): + if not self._IS_PREMIUM and traverse_obj(response, (f'{object_type}_metadata', 'is_premium_only')): message = f'This {object_type} is for premium members only' - if self.is_logged_in: + if CrunchyrollBaseIE._REFRESH_TOKEN: raise ExtractorError(message, expected=True) - self.raise_login_required(message) + self.raise_login_required(message, method='password') - # We need go from unsigned to signed api to avoid getting soft banned - 
stream_response = self._call_cms_api_signed(remove_start( - streams_link, '/content/v2/cms/'), internal_id, lang, 'stream info') - result['formats'] = self._extract_formats(stream_response, internal_id) - result['subtitles'] = self._extract_subtitles(stream_response) + result['formats'], result['subtitles'] = self._extract_stream(internal_id) - # if no intro chapter is available, a 403 without usable data is returned - intro_chapter = self._download_json( - f'https://static.crunchyroll.com/datalab-intro-v2/{internal_id}.json', - internal_id, note='Downloading chapter info', fatal=False, errnote=False) - if isinstance(intro_chapter, dict): - result['chapters'] = [{ - 'title': 'Intro', - 'start_time': float_or_none(intro_chapter.get('startTime')), - 'end_time': float_or_none(intro_chapter.get('endTime')), - }] + result['chapters'] = self._extract_chapters(internal_id) def calculate_count(item): return parse_count(''.join((item['displayed'], item.get('unit') or ''))) @@ -512,7 +528,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'egaono-hana', 'title': 'Egaono Hana', 'track': 'Egaono Hana', - 'artist': 'Goose house', + 'artists': ['Goose house'], 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'genres': ['J-Pop'], }, @@ -525,11 +541,12 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'crossing-field', 'title': 'Crossing Field', 'track': 'Crossing Field', - 'artist': 'LiSA', + 'artists': ['LiSA'], 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'genres': ['Anime'], }, 'params': {'skip_download': 'm3u8'}, + 'skip': 'no longer exists', }, { 'url': 'https://www.crunchyroll.com/watch/concert/MC2E2AC135', 'info_dict': { @@ -538,7 +555,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'live-is-smile-always-364joker-at-yokohama-arena', 'title': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', 'track': 'LiVE is Smile Always-364+JOKER- at YOKOHAMA ARENA', - 'artist': 'LiSA', + 
'artists': ['LiSA'], 'thumbnail': r're:(?i)^https://www.crunchyroll.com/imgsrv/.*\.jpeg?$', 'description': 'md5:747444e7e6300907b7a43f0a0503072e', 'genres': ['J-Pop'], @@ -566,16 +583,14 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): if not response: raise ExtractorError(f'No video with id {internal_id} could be found (possibly region locked?)', expected=True) - streams_link = response.get('streams_link') - if not streams_link and response.get('isPremiumOnly'): + if not self._IS_PREMIUM and response.get('isPremiumOnly'): message = f'This {response.get("type") or "media"} is for premium members only' - if self.is_logged_in: + if CrunchyrollBaseIE._REFRESH_TOKEN: raise ExtractorError(message, expected=True) - self.raise_login_required(message) + self.raise_login_required(message, method='password') result = self._transform_music_response(response) - stream_response = self._call_api(streams_link, internal_id, lang, 'stream info') - result['formats'] = self._extract_formats(stream_response, internal_id) + result['formats'], _ = self._extract_stream(f'music/{internal_id}', internal_id) return result @@ -587,7 +602,7 @@ class CrunchyrollMusicIE(CrunchyrollBaseIE): 'display_id': 'slug', 'title': 'title', 'track': 'title', - 'artist': ('artist', 'name'), + 'artists': ('artist', 'name', all), 'description': ('description', {str}, {lambda x: x.replace(r'\r\n', '\n') or None}), 'thumbnails': ('images', ..., ..., { 'url': ('source', {url_or_none}), @@ -611,7 +626,7 @@ class CrunchyrollArtistIE(CrunchyrollBaseIE): 'info_dict': { 'id': 'MA179CB50D', 'title': 'LiSA', - 'genres': ['J-Pop', 'Anime', 'Rock'], + 'genres': ['Anime', 'J-Pop', 'Rock'], 'description': 'md5:16d87de61a55c3f7d6c454b73285938e', }, 'playlist_mincount': 83, diff --git a/yt_dlp/extractor/dropbox.py b/yt_dlp/extractor/dropbox.py index bc2efce12..0246975c1 100644 --- a/yt_dlp/extractor/dropbox.py +++ b/yt_dlp/extractor/dropbox.py @@ -65,12 +65,14 @@ class DropboxIE(InfoExtractor): formats, subtitles, 
has_anonymous_download = [], {}, False for encoded in reversed(re.findall(r'registerStreamedPrefetch\s*\(\s*"[\w/+=]+"\s*,\s*"([\w/+=]+)"', webpage)): decoded = base64.b64decode(encoded).decode('utf-8', 'ignore') + if not has_anonymous_download: + has_anonymous_download = self._search_regex( + r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) transcode_url = self._search_regex( r'\n.(https://[^\x03\x08\x12\n]+\.m3u8)', decoded, 'transcode url', default=None) if not transcode_url: continue formats, subtitles = self._extract_m3u8_formats_and_subtitles(transcode_url, video_id, 'mp4') - has_anonymous_download = self._search_regex(r'(anonymous:\tanonymous)', decoded, 'anonymous', default=False) break # downloads enabled we can get the original file diff --git a/yt_dlp/extractor/dtube.py b/yt_dlp/extractor/dtube.py index bb06c42be..5ea014cf0 100644 --- a/yt_dlp/extractor/dtube.py +++ b/yt_dlp/extractor/dtube.py @@ -1,5 +1,5 @@ import json -from socket import timeout +import socket from .common import InfoExtractor from ..utils import ( @@ -56,7 +56,7 @@ class DTubeIE(InfoExtractor): try: self.to_screen('%s: Checking %s video format URL' % (video_id, format_id)) self._downloader._opener.open(video_url, timeout=5).close() - except timeout: + except socket.timeout: self.to_screen( '%s: %s URL is invalid, skipping' % (video_id, format_id)) continue diff --git a/yt_dlp/extractor/facebook.py b/yt_dlp/extractor/facebook.py index 834b1df18..b76407a5c 100644 --- a/yt_dlp/extractor/facebook.py +++ b/yt_dlp/extractor/facebook.py @@ -560,7 +560,7 @@ class FacebookIE(InfoExtractor): js_data, lambda x: x['jsmods']['instances'], list) or []) def extract_dash_manifest(video, formats): - dash_manifest = video.get('dash_manifest') + dash_manifest = traverse_obj(video, 'dash_manifest', 'playlist', expected_type=str) if dash_manifest: formats.extend(self._parse_mpd_formats( compat_etree_fromstring(urllib.parse.unquote_plus(dash_manifest)), diff --git 
a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 9d8251582..2cfed0fd0 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -2104,22 +2104,6 @@ class GenericIE(InfoExtractor): 'age_limit': 0, }, }, - { - 'note': 'JW Player embed with unicode-escape sequences in URL', - 'url': 'https://www.medici.tv/en/concerts/lahav-shani-mozart-mahler-israel-philharmonic-abu-dhabi-classics', - 'info_dict': { - 'id': 'm', - 'ext': 'mp4', - 'title': 'Lahav Shani conducts the Israel Philharmonic\'s first-ever concert in Abu Dhabi', - 'description': 'Mahler\'s ', - 'uploader': 'www.medici.tv', - 'age_limit': 0, - 'thumbnail': r're:^https?://.+\.jpg', - }, - 'params': { - 'skip_download': True, - }, - }, { 'url': 'https://shooshtime.com/videos/284002/just-out-of-the-shower-joi/', 'md5': 'e2f0a4c329f7986280b7328e24036d60', diff --git a/yt_dlp/extractor/goplay.py b/yt_dlp/extractor/goplay.py index 74aad1192..7a98e0f31 100644 --- a/yt_dlp/extractor/goplay.py +++ b/yt_dlp/extractor/goplay.py @@ -1,6 +1,6 @@ import base64 import binascii -import datetime +import datetime as dt import hashlib import hmac import json @@ -422,7 +422,7 @@ class AwsIdp: months = [None, 'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] days = ['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'] - time_now = datetime.datetime.now(datetime.timezone.utc) + time_now = dt.datetime.now(dt.timezone.utc) format_string = "{} {} {} %H:%M:%S UTC %Y".format(days[time_now.weekday()], months[time_now.month], time_now.day) time_string = time_now.strftime(format_string) return time_string diff --git a/yt_dlp/extractor/jiosaavn.py b/yt_dlp/extractor/jiosaavn.py index a59209835..35fb3fd6b 100644 --- a/yt_dlp/extractor/jiosaavn.py +++ b/yt_dlp/extractor/jiosaavn.py @@ -1,89 +1,143 @@ +import functools +import math +import re + from .common import InfoExtractor from ..utils import ( + InAdvancePagedList, + clean_html, int_or_none, - js_to_json, + 
make_archive_id, + smuggle_url, + unsmuggle_url, + url_basename, url_or_none, urlencode_postdata, - urljoin, ) from ..utils.traversal import traverse_obj class JioSaavnBaseIE(InfoExtractor): - def _extract_initial_data(self, url, audio_id): - webpage = self._download_webpage(url, audio_id) - return self._search_json( - r'window\.__INITIAL_DATA__\s*=', webpage, - 'init json', audio_id, transform_source=js_to_json) + _API_URL = 'https://www.jiosaavn.com/api.php' + _VALID_BITRATES = {'16', '32', '64', '128', '320'} - -class JioSaavnSongIE(JioSaavnBaseIE): - _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)' - _TESTS = [{ - 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', - 'md5': '3b84396d15ed9e083c3106f1fa589c04', - 'info_dict': { - 'id': 'OQsEfQFVUXk', - 'ext': 'mp4', - 'title': 'Leja Re', - 'album': 'Leja Re', - 'thumbnail': 'https://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', - 'duration': 205, - 'view_count': int, - 'release_year': 2018, - }, - }, { - 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', - 'only_matching': True, - }] - - _VALID_BITRATES = ('16', '32', '64', '128', '320') - - def _real_extract(self, url): - audio_id = self._match_id(url) - extract_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn') - if invalid_bitrates := [br for br in extract_bitrates if br not in self._VALID_BITRATES]: + @functools.cached_property + def requested_bitrates(self): + requested_bitrates = self._configuration_arg('bitrate', ['128', '320'], ie_key='JioSaavn') + if invalid_bitrates := set(requested_bitrates) - self._VALID_BITRATES: raise ValueError( f'Invalid bitrate(s): {", ".join(invalid_bitrates)}. 
' - + f'Valid bitrates are: {", ".join(self._VALID_BITRATES)}') + + f'Valid bitrates are: {", ".join(sorted(self._VALID_BITRATES, key=int))}') + return requested_bitrates - song_data = self._extract_initial_data(url, audio_id)['song']['song'] - formats = [] - for bitrate in extract_bitrates: + def _extract_formats(self, song_data): + for bitrate in self.requested_bitrates: media_data = self._download_json( - 'https://www.jiosaavn.com/api.php', audio_id, f'Downloading format info for {bitrate}', + self._API_URL, song_data['id'], + f'Downloading format info for {bitrate}', fatal=False, data=urlencode_postdata({ '__call': 'song.generateAuthToken', '_format': 'json', 'bitrate': bitrate, 'url': song_data['encrypted_media_url'], })) - if not media_data.get('auth_url'): + if not traverse_obj(media_data, ('auth_url', {url_or_none})): self.report_warning(f'Unable to extract format info for {bitrate}') continue - formats.append({ + ext = media_data.get('type') + yield { 'url': media_data['auth_url'], - 'ext': media_data.get('type'), + 'ext': 'm4a' if ext == 'mp4' else ext, 'format_id': bitrate, 'abr': int(bitrate), 'vcodec': 'none', + } + + def _extract_song(self, song_data, url=None): + info = traverse_obj(song_data, { + 'id': ('id', {str}), + 'title': ('song', {clean_html}), + 'album': ('album', {clean_html}), + 'thumbnail': ('image', {url_or_none}, {lambda x: re.sub(r'-\d+x\d+\.', '-500x500.', x)}), + 'duration': ('duration', {int_or_none}), + 'view_count': ('play_count', {int_or_none}), + 'release_year': ('year', {int_or_none}), + 'artists': ('primary_artists', {lambda x: x.split(', ') if x else None}), + 'webpage_url': ('perma_url', {url_or_none}), + }) + if webpage_url := info.get('webpage_url') or url: + info['display_id'] = url_basename(webpage_url) + info['_old_archive_ids'] = [make_archive_id(JioSaavnSongIE, info['display_id'])] + + return info + + def _call_api(self, type_, token, note='API', params={}): + return self._download_json( + self._API_URL, token, 
f'Downloading {note} JSON', f'Unable to download {note} JSON', + query={ + '__call': 'webapi.get', + '_format': 'json', + '_marker': '0', + 'ctx': 'web6dot0', + 'token': token, + 'type': type_, + **params, }) - return { - 'id': audio_id, - 'formats': formats, - **traverse_obj(song_data, { - 'title': ('title', 'text'), - 'album': ('album', 'text'), - 'thumbnail': ('image', 0, {url_or_none}), - 'duration': ('duration', {int_or_none}), - 'view_count': ('play_count', {int_or_none}), - 'release_year': ('year', {int_or_none}), - }), - } + def _yield_songs(self, playlist_data): + for song_data in traverse_obj(playlist_data, ('songs', lambda _, v: v['id'] and v['perma_url'])): + song_info = self._extract_song(song_data) + url = smuggle_url(song_info['webpage_url'], { + 'id': song_data['id'], + 'encrypted_media_url': song_data['encrypted_media_url'], + }) + yield self.url_result(url, JioSaavnSongIE, url_transparent=True, **song_info) + + +class JioSaavnSongIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:song' + _VALID_URL = r'https?://(?:www\.)?(?:jiosaavn\.com/song/[^/?#]+/|saavn\.com/s/song/(?:[^/?#]+/){3})(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/song/leja-re/OQsEfQFVUXk', + 'md5': '3b84396d15ed9e083c3106f1fa589c04', + 'info_dict': { + 'id': 'IcoLuefJ', + 'display_id': 'OQsEfQFVUXk', + 'ext': 'm4a', + 'title': 'Leja Re', + 'album': 'Leja Re', + 'thumbnail': r're:https?://c.saavncdn.com/258/Leja-Re-Hindi-2018-20181124024539-500x500.jpg', + 'duration': 205, + 'view_count': int, + 'release_year': 2018, + 'artists': ['Sandesh Shandilya', 'Dhvani Bhanushali', 'Tanishk Bagchi'], + '_old_archive_ids': ['jiosaavnsong OQsEfQFVUXk'], + }, + }, { + 'url': 'https://www.saavn.com/s/song/hindi/Saathiya/O-Humdum-Suniyo-Re/KAMiazoCblU', + 'only_matching': True, + }] + + def _real_extract(self, url): + url, smuggled_data = unsmuggle_url(url) + song_data = traverse_obj(smuggled_data, ({ + 'id': ('id', {str}), + 'encrypted_media_url': ('encrypted_media_url', {str}), + })) 
+ + if 'id' in song_data and 'encrypted_media_url' in song_data: + result = {'id': song_data['id']} + else: + # only extract metadata if this is not a url_transparent result + song_data = self._call_api('song', self._match_id(url))['songs'][0] + result = self._extract_song(song_data, url) + + result['formats'] = list(self._extract_formats(song_data)) + return result class JioSaavnAlbumIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:album' _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/album/[^/?#]+/(?P<id>[^/?#]+)' _TESTS = [{ 'url': 'https://www.jiosaavn.com/album/96/buIOjYZDrNA_', @@ -95,11 +149,46 @@ class JioSaavnAlbumIE(JioSaavnBaseIE): }] def _real_extract(self, url): - album_id = self._match_id(url) - album_view = self._extract_initial_data(url, album_id)['albumView'] + display_id = self._match_id(url) + album_data = self._call_api('album', display_id) - return self.playlist_from_matches( - traverse_obj(album_view, ( - 'modules', lambda _, x: x['key'] == 'list', 'data', ..., 'title', 'action', {str})), - album_id, traverse_obj(album_view, ('album', 'title', 'text', {str})), ie=JioSaavnSongIE, - getter=lambda x: urljoin('https://www.jiosaavn.com/', x)) + return self.playlist_result( + self._yield_songs(album_data), display_id, traverse_obj(album_data, ('title', {str}))) + + +class JioSaavnPlaylistIE(JioSaavnBaseIE): + IE_NAME = 'jiosaavn:playlist' + _VALID_URL = r'https?://(?:www\.)?(?:jio)?saavn\.com/s/playlist/(?:[^/?#]+/){2}(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-english/LlJ8ZWT1ibN5084vKHRj2Q__', + 'info_dict': { + 'id': 'LlJ8ZWT1ibN5084vKHRj2Q__', + 'title': 'Mood English', + }, + 'playlist_mincount': 301, + }, { + 'url': 'https://www.jiosaavn.com/s/playlist/2279fbe391defa793ad7076929a2f5c9/mood-hindi/DVR,pFUOwyXqIp77B1JF,A__', + 'info_dict': { + 'id': 'DVR,pFUOwyXqIp77B1JF,A__', + 'title': 'Mood Hindi', + }, + 'playlist_mincount': 801, + }] + _PAGE_SIZE = 50 + + def 
_fetch_page(self, token, page): + return self._call_api( + 'playlist', token, f'playlist page {page}', {'p': page, 'n': self._PAGE_SIZE}) + + def _entries(self, token, first_page_data, page): + page_data = first_page_data if not page else self._fetch_page(token, page + 1) + yield from self._yield_songs(page_data) + + def _real_extract(self, url): + display_id = self._match_id(url) + playlist_data = self._fetch_page(display_id, 1) + total_pages = math.ceil(int(playlist_data['list_count']) / self._PAGE_SIZE) + + return self.playlist_result(InAdvancePagedList( + functools.partial(self._entries, display_id, playlist_data), + total_pages, self._PAGE_SIZE), display_id, traverse_obj(playlist_data, ('listname', {str}))) diff --git a/yt_dlp/extractor/joqrag.py b/yt_dlp/extractor/joqrag.py index 3bb28af94..7a91d4a23 100644 --- a/yt_dlp/extractor/joqrag.py +++ b/yt_dlp/extractor/joqrag.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import urllib.parse from .common import InfoExtractor @@ -50,8 +50,8 @@ class JoqrAgIE(InfoExtractor): def _extract_start_timestamp(self, video_id, is_live): def extract_start_time_from(date_str): - dt = datetime_from_str(date_str) + datetime.timedelta(hours=9) - date = dt.strftime('%Y%m%d') + dt_ = datetime_from_str(date_str) + dt.timedelta(hours=9) + date = dt_.strftime('%Y%m%d') start_time = self._search_regex( r'<h3[^>]+\bclass="dailyProgram-itemHeaderTime"[^>]*>[\s\d:]+–\s*(\d{1,2}:\d{1,2})', self._download_webpage( @@ -60,7 +60,7 @@ class JoqrAgIE(InfoExtractor): errnote=f'Failed to download program list of {date}') or '', 'start time', default=None) if start_time: - return unified_timestamp(f'{dt.strftime("%Y/%m/%d")} {start_time} +09:00') + return unified_timestamp(f'{dt_.strftime("%Y/%m/%d")} {start_time} +09:00') return None start_timestamp = extract_start_time_from('today') @@ -80,14 +80,14 @@ class JoqrAgIE(InfoExtractor): note='Downloading metadata', errnote='Failed to download metadata') title = 
self._extract_metadata('Program_name', metadata) - if title == '放送休止': + if not title or title == '放送休止': formats = [] live_status = 'is_upcoming' release_timestamp = self._extract_start_timestamp(video_id, False) msg = 'This stream is not currently live' if release_timestamp: msg += (' and will start at ' - + datetime.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) + + dt.datetime.fromtimestamp(release_timestamp).strftime('%Y-%m-%d %H:%M:%S')) self.raise_no_formats(msg, expected=True) else: m3u8_path = self._search_regex( diff --git a/yt_dlp/extractor/kick.py b/yt_dlp/extractor/kick.py index d12437242..889548f52 100644 --- a/yt_dlp/extractor/kick.py +++ b/yt_dlp/extractor/kick.py @@ -13,7 +13,8 @@ from ..utils import ( class KickBaseIE(InfoExtractor): def _real_initialize(self): - self._request_webpage(HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False) + self._request_webpage( + HEADRequest('https://kick.com/'), None, 'Setting up session', fatal=False, impersonate=True) xsrf_token = self._get_cookies('https://kick.com/').get('XSRF-TOKEN') if not xsrf_token: self.write_debug('kick.com did not set XSRF-TOKEN cookie') @@ -25,7 +26,7 @@ class KickBaseIE(InfoExtractor): def _call_api(self, path, display_id, note='Downloading API JSON', headers={}, **kwargs): return self._download_json( f'https://kick.com/api/v1/{path}', display_id, note=note, - headers=merge_dicts(headers, self._API_HEADERS), **kwargs) + headers=merge_dicts(headers, self._API_HEADERS), impersonate=True, **kwargs) class KickIE(KickBaseIE): @@ -82,26 +83,27 @@ class KickIE(KickBaseIE): class KickVODIE(KickBaseIE): _VALID_URL = r'https?://(?:www\.)?kick\.com/video/(?P<id>[\da-f]{8}-(?:[\da-f]{4}-){3}[\da-f]{12})' _TESTS = [{ - 'url': 'https://kick.com/video/54244b5e-050a-4df4-a013-b2433dafbe35', - 'md5': '73691206a6a49db25c5aa1588e6538fc', + 'url': 'https://kick.com/video/58bac65b-e641-4476-a7ba-3707a35e60e3', + 'md5': '3870f94153e40e7121a6e46c068b70cb', 
'info_dict': { - 'id': '54244b5e-050a-4df4-a013-b2433dafbe35', + 'id': '58bac65b-e641-4476-a7ba-3707a35e60e3', 'ext': 'mp4', - 'title': 'Making 710-carBoosting. Kinda No Pixel inspired. !guilded - !links', - 'description': 'md5:a0d3546bf7955d0a8252ffe0fd6f518f', - 'channel': 'kmack710', - 'channel_id': '16278', - 'uploader': 'Kmack710', - 'uploader_id': '16412', - 'upload_date': '20221206', - 'timestamp': 1670318289, - 'duration': 40104.0, + 'title': '🤠REBIRTH IS BACK!!!!🤠!stake CODE JAREDFPS 🤠', + 'description': 'md5:02b0c46f9b4197fb545ab09dddb85b1d', + 'channel': 'jaredfps', + 'channel_id': '26608', + 'uploader': 'JaredFPS', + 'uploader_id': '26799', + 'upload_date': '20240402', + 'timestamp': 1712097108, + 'duration': 33859.0, 'thumbnail': r're:^https?://.*\.jpg', - 'categories': ['Grand Theft Auto V'], + 'categories': ['Call of Duty: Warzone'], }, 'params': { 'skip_download': 'm3u8', }, + 'expected_warnings': [r'impersonation'], }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/leeco.py b/yt_dlp/extractor/leeco.py index 85033b8f8..5d61a607f 100644 --- a/yt_dlp/extractor/leeco.py +++ b/yt_dlp/extractor/leeco.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import hashlib import re import time @@ -185,7 +185,7 @@ class LeIE(InfoExtractor): publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), - delimiter=' ', timezone=datetime.timedelta(hours=8)) + delimiter=' ', timezone=dt.timedelta(hours=8)) description = self._html_search_meta('description', page, fatal=False) return { diff --git a/yt_dlp/extractor/linkedin.py b/yt_dlp/extractor/linkedin.py index ad41c0e20..e12f467ef 100644 --- a/yt_dlp/extractor/linkedin.py +++ b/yt_dlp/extractor/linkedin.py @@ -1,4 +1,4 @@ -from itertools import zip_longest +import itertools import re from .common import InfoExtractor @@ -156,7 +156,7 @@ class LinkedInLearningIE(LinkedInLearningBaseIE): def json2srt(self, transcript_lines, duration=None): 
srt_data = '' - for line, (line_dict, next_dict) in enumerate(zip_longest(transcript_lines, transcript_lines[1:])): + for line, (line_dict, next_dict) in enumerate(itertools.zip_longest(transcript_lines, transcript_lines[1:])): start_time, caption = line_dict['transcriptStartAt'] / 1000, line_dict['caption'] end_time = next_dict['transcriptStartAt'] / 1000 if next_dict else duration or start_time + 1 srt_data += '%d\n%s --> %s\n%s\n\n' % (line + 1, srt_subtitles_timecode(start_time), diff --git a/yt_dlp/extractor/masters.py b/yt_dlp/extractor/masters.py index 716f1c961..c3c58d7d0 100644 --- a/yt_dlp/extractor/masters.py +++ b/yt_dlp/extractor/masters.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( traverse_obj, diff --git a/yt_dlp/extractor/medici.py b/yt_dlp/extractor/medici.py index 328ccd2c9..b6235b64d 100644 --- a/yt_dlp/extractor/medici.py +++ b/yt_dlp/extractor/medici.py @@ -1,67 +1,153 @@ +import urllib.parse + from .common import InfoExtractor from ..utils import ( - unified_strdate, - update_url_query, - urlencode_postdata, + filter_dict, + parse_iso8601, + traverse_obj, + try_call, + url_or_none, ) class MediciIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)' - _TEST = { - 'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp', - 'md5': '004c21bb0a57248085b6ff3fec72719d', + _VALID_URL = r'https?://(?:(?P<sub>www|edu)\.)?medici\.tv/[a-z]{2}/[\w.-]+/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.medici.tv/en/operas/thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris', + 'md5': 'd483f74e7a7a9eac0dbe152ab189050d', 'info_dict': { - 'id': '3059', - 'ext': 'flv', - 'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson', - 'description': 'md5:322a1e952bafb725174fd8c1a8212f58', - 'thumbnail': r're:^https?://.*\.jpg$', - 'upload_date': '20170408', + 'id': 
'8032', + 'ext': 'mp4', + 'title': 'Thomas Adès\'s The Exterminating Angel', + 'description': 'md5:708ae6350dadc604225b4a6e32482bab', + 'thumbnail': r're:https://.+/.+\.jpg', + 'upload_date': '20240304', + 'timestamp': 1709561766, + 'display_id': 'thomas-ades-the-exterminating-angel-calixto-bieito-opera-bastille-paris', }, - } + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://edu.medici.tv/en/operas/wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum', + 'md5': '4ef3f4079a6e1c617584463a9eb84f99', + 'info_dict': { + 'id': '7900', + 'ext': 'mp4', + 'title': 'Wagner\'s Lohengrin', + 'description': 'md5:a384a62937866101f86902f21752cd89', + 'thumbnail': r're:https://.+/.+\.jpg', + 'upload_date': '20231017', + 'timestamp': 1697554771, + 'display_id': 'wagner-lohengrin-paris-opera-kirill-serebrennikov-piotr-beczala-kwangchul-youn-johanni-van-oostrum', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://www.medici.tv/en/concerts/sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello', + 'md5': '9dd757e53b22b2511e85ea9ea60e4815', + 'info_dict': { + 'id': '5712', + 'ext': 'mp4', + 'title': 'Sergey Smbatyan conducts Tigran Mansurian — With Chouchane Siranossian and Mario Brunello', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:9411fe44c874bb10e9af288c65816e41', + 'upload_date': '20200323', + 'timestamp': 1584975600, + 'display_id': 'sergey-smbatyan-conducts-mansurian-chouchane-siranossian-mario-brunello', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://www.medici.tv/en/ballets/carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma', + 'md5': '40f5e76cb701a97a6d7ba23b62c49990', + 'info_dict': { + 'id': '7857', + 'ext': 'mp4', + 'title': 'Carmen by Jiří Bubeníček after Roland Petit, music by Bizet, de Falla, Castelnuovo-Tedesco, and Bonolis', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:0f15a15611ed748020c769873e10a8bb', 
+ 'upload_date': '20240223', + 'timestamp': 1708707600, + 'display_id': 'carmen-ballet-choregraphie-de-jiri-bubenicek-teatro-dellopera-di-roma', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://www.medici.tv/en/documentaries/la-sonnambula-liege-2023-documentaire', + 'md5': '87ff198018ce79a34757ab0dd6f21080', + 'info_dict': { + 'id': '7513', + 'ext': 'mp4', + 'title': 'La Sonnambula', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:0caf9109a860fd50cd018df062a67f34', + 'upload_date': '20231103', + 'timestamp': 1699010830, + 'display_id': 'la-sonnambula-liege-2023-documentaire', + }, + 'expected_warnings': [r'preview'], + }, { + 'url': 'https://edu.medici.tv/en/masterclasses/yvonne-loriod-olivier-messiaen', + 'md5': 'fb5dcec46d76ad20fbdbaabb01da191d', + 'info_dict': { + 'id': '3024', + 'ext': 'mp4', + 'title': 'Olivier Messiaen and Yvonne Loriod, pianists and teachers', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:aab948e2f7690214b5c28896c83f1fc1', + 'upload_date': '20150223', + 'timestamp': 1424706608, + 'display_id': 'yvonne-loriod-olivier-messiaen', + }, + 'skip': 'Requires authentication; preview starts in the middle', + }, { + 'url': 'https://www.medici.tv/en/jazz/makaya-mccraven-la-rochelle', + 'md5': '4cc279a8b06609782747c8f50beea2b3', + 'info_dict': { + 'id': '7922', + 'ext': 'mp4', + 'title': 'NEW: Makaya McCraven in La Rochelle', + 'thumbnail': r're:https://.+/.+\.jpg', + 'description': 'md5:b5a8aaeb6993d8ccb18bde8abb8aa8d2', + 'upload_date': '20231228', + 'timestamp': 1703754863, + 'display_id': 'makaya-mccraven-la-rochelle', + }, + 'expected_warnings': [r'preview'], + }] def _real_extract(self, url): - video_id = self._match_id(url) + display_id, subdomain = self._match_valid_url(url).group('id', 'sub') + self._request_webpage(url, display_id, 'Requesting CSRF token cookie') - # Sets csrftoken cookie - self._download_webpage(url, video_id) - - MEDICI_URL = 'http://www.medici.tv/' + subdomain = 'edu-' if 
subdomain == 'edu' else '' + origin = f'https://{urllib.parse.urlparse(url).hostname}' data = self._download_json( - MEDICI_URL, video_id, - data=urlencode_postdata({ - 'json': 'true', - 'page': '/%s' % video_id, - 'timezone_offset': -420, - }), headers={ - 'X-CSRFToken': self._get_cookies(url)['csrftoken'].value, - 'X-Requested-With': 'XMLHttpRequest', - 'Referer': MEDICI_URL, - 'Content-Type': 'application/x-www-form-urlencoded', - }) + f'https://api.medici.tv/{subdomain}satie/edito/movie-file/{display_id}/', display_id, + headers=filter_dict({ + 'Authorization': try_call( + lambda: urllib.parse.unquote(self._get_cookies(url)['auth._token.mAuth'].value)), + 'Device-Type': 'web', + 'Origin': origin, + 'Referer': f'{origin}/', + 'Accept': 'application/json, text/plain, */*', + })) - video = data['video']['videos']['video1'] + if not traverse_obj(data, ('video', 'is_full_video')) and traverse_obj( + data, ('video', 'is_limited_by_user_access')): + self.report_warning( + 'The full video is for subscribers only. Only previews will be downloaded. 
If you ' + 'have used the --cookies-from-browser option, try using the --cookies option instead') - title = video.get('nom') or data['title'] - - video_id = video.get('id') or video_id - formats = self._extract_f4m_formats( - update_url_query(video['url_akamai'], { - 'hdcore': '3.1.0', - 'plugin=aasp': '3.1.0.43.124', - }), video_id, f4m_id='hds') - - description = data.get('meta_description') - thumbnail = video.get('url_thumbnail') or data.get('main_image') - upload_date = unified_strdate(data['video'].get('date')) + formats, subtitles = self._extract_m3u8_formats_and_subtitles( + data['video']['video_url'], display_id, 'mp4') return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'upload_date': upload_date, + 'id': str(data['id']), + 'display_id': display_id, 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('subtitle', {str}), + 'thumbnail': ('picture', {url_or_none}), + 'timestamp': ('date_publish', {parse_iso8601}), + }), } diff --git a/yt_dlp/extractor/microsoftstream.py b/yt_dlp/extractor/microsoftstream.py index 9b50996b7..5f5f16087 100644 --- a/yt_dlp/extractor/microsoftstream.py +++ b/yt_dlp/extractor/microsoftstream.py @@ -1,4 +1,4 @@ -from base64 import b64decode +import base64 from .common import InfoExtractor from ..utils import ( @@ -81,7 +81,7 @@ class MicrosoftStreamIE(InfoExtractor): 'url': thumbnail_url, } thumb_name = url_basename(thumbnail_url) - thumb_name = str(b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) + thumb_name = str(base64.b64decode(thumb_name + '=' * (-len(thumb_name) % 4))) thumb.update(parse_resolution(thumb_name)) thumbnails.append(thumb) diff --git a/yt_dlp/extractor/mixch.py b/yt_dlp/extractor/mixch.py index 4be694728..b980fd01a 100644 --- a/yt_dlp/extractor/mixch.py +++ b/yt_dlp/extractor/mixch.py @@ -1,5 +1,7 @@ from .common import InfoExtractor -from ..utils import UserNotLive, traverse_obj +from 
..networking.exceptions import HTTPError +from ..utils import ExtractorError, UserNotLive, int_or_none, url_or_none +from ..utils.traversal import traverse_obj class MixchIE(InfoExtractor): @@ -25,25 +27,23 @@ class MixchIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(f'https://mixch.tv/u/{video_id}/live', video_id) - - initial_js_state = self._parse_json(self._search_regex( - r'(?m)^\s*window\.__INITIAL_JS_STATE__\s*=\s*(\{.+?\});\s*$', webpage, 'initial JS state'), video_id) - if not initial_js_state.get('liveInfo'): + data = self._download_json(f'https://mixch.tv/api-web/users/{video_id}/live', video_id) + if not traverse_obj(data, ('liveInfo', {dict})): raise UserNotLive(video_id=video_id) return { 'id': video_id, - 'title': traverse_obj(initial_js_state, ('liveInfo', 'title')), - 'comment_count': traverse_obj(initial_js_state, ('liveInfo', 'comments')), - 'view_count': traverse_obj(initial_js_state, ('liveInfo', 'visitor')), - 'timestamp': traverse_obj(initial_js_state, ('liveInfo', 'created')), - 'uploader': traverse_obj(initial_js_state, ('broadcasterInfo', 'name')), 'uploader_id': video_id, + **traverse_obj(data, { + 'title': ('liveInfo', 'title', {str}), + 'comment_count': ('liveInfo', 'comments', {int_or_none}), + 'view_count': ('liveInfo', 'visitor', {int_or_none}), + 'timestamp': ('liveInfo', 'created', {int_or_none}), + 'uploader': ('broadcasterInfo', 'name', {str}), + }), 'formats': [{ 'format_id': 'hls', - 'url': (traverse_obj(initial_js_state, ('liveInfo', 'hls')) - or f'https://d1hd0ww6piyb43.cloudfront.net/hls/torte_{video_id}.m3u8'), + 'url': data['liveInfo']['hls'], 'ext': 'mp4', 'protocol': 'm3u8', }], @@ -60,22 +60,38 @@ class MixchArchiveIE(InfoExtractor): 'skip': 'paid video, no DRM. expires at Jan 23', 'info_dict': { 'id': '421', + 'ext': 'mp4', 'title': '96NEKO SHOW TIME', } + }, { + 'url': 'https://mixch.tv/archive/1213', + 'skip': 'paid video, no DRM. 
expires at Dec 31, 2023', + 'info_dict': { + 'id': '1213', + 'ext': 'mp4', + 'title': '【特別トーク番組アーカイブス】Merm4id×燐舞曲 2nd LIVE「VERSUS」', + 'release_date': '20231201', + 'thumbnail': str, + } + }, { + 'url': 'https://mixch.tv/archive/1214', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - html5_videos = self._parse_html5_media_entries( - url, webpage.replace('video-js', 'video'), video_id, 'hls') - if not html5_videos: - self.raise_login_required(method='cookies') - infodict = html5_videos[0] - infodict.update({ + try: + info_json = self._download_json( + f'https://mixch.tv/api-web/archive/{video_id}', video_id)['archive'] + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 401: + self.raise_login_required() + raise + + return { 'id': video_id, - 'title': self._html_search_regex(r'class="archive-title">(.+?)</', webpage, 'title') - }) - - return infodict + 'title': traverse_obj(info_json, ('title', {str})), + 'formats': self._extract_m3u8_formats(info_json['archiveURL'], video_id), + 'thumbnail': traverse_obj(info_json, ('thumbnailURL', {url_or_none})), + } diff --git a/yt_dlp/extractor/motherless.py b/yt_dlp/extractor/motherless.py index 160150a7b..b6c18fe5b 100644 --- a/yt_dlp/extractor/motherless.py +++ b/yt_dlp/extractor/motherless.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import re import urllib.parse @@ -151,7 +151,7 @@ class MotherlessIE(InfoExtractor): 'd': 'days', } kwargs = {_AGO_UNITS.get(uploaded_ago[-1]): delta} - upload_date = (datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(**kwargs)).strftime('%Y%m%d') + upload_date = (dt.datetime.now(dt.timezone.utc) - dt.timedelta(**kwargs)).strftime('%Y%m%d') comment_count = len(re.findall(r'''class\s*=\s*['"]media-comment-contents\b''', webpage)) uploader_id = self._html_search_regex( diff --git a/yt_dlp/extractor/naver.py b/yt_dlp/extractor/naver.py index 
806b79082..885557e91 100644 --- a/yt_dlp/extractor/naver.py +++ b/yt_dlp/extractor/naver.py @@ -4,8 +4,8 @@ import hmac import itertools import json import re +import urllib.parse import time -from urllib.parse import parse_qs, urlparse from .common import InfoExtractor from ..utils import ( @@ -388,7 +388,7 @@ class NaverNowIE(NaverBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - qs = parse_qs(urlparse(url).query) + qs = urllib.parse.parse_qs(urllib.parse.urlparse(url).query) if not self._yes_playlist(show_id, qs.get('shareHightlight')): return self._extract_highlight(show_id, qs['shareHightlight'][0]) diff --git a/yt_dlp/extractor/neteasemusic.py b/yt_dlp/extractor/neteasemusic.py index d332b840c..73b33a9f9 100644 --- a/yt_dlp/extractor/neteasemusic.py +++ b/yt_dlp/extractor/neteasemusic.py @@ -1,9 +1,9 @@ +import hashlib import itertools import json +import random import re import time -from hashlib import md5 -from random import randint from .common import InfoExtractor from ..aes import aes_ecb_encrypt, pkcs7_padding @@ -34,7 +34,7 @@ class NetEaseMusicBaseIE(InfoExtractor): request_text = json.dumps({**query_body, 'header': cookies}, separators=(',', ':')) message = f'nobody{api_path}use{request_text}md5forencrypt'.encode('latin1') - msg_digest = md5(message).hexdigest() + msg_digest = hashlib.md5(message).hexdigest() data = pkcs7_padding(list(str.encode( f'{api_path}-36cd479b6b5-{request_text}-36cd479b6b5-{msg_digest}'))) @@ -53,7 +53,7 @@ class NetEaseMusicBaseIE(InfoExtractor): '__csrf': '', 'os': 'pc', 'channel': 'undefined', - 'requestId': f'{int(time.time() * 1000)}_{randint(0, 1000):04}', + 'requestId': f'{int(time.time() * 1000)}_{random.randint(0, 1000):04}', **traverse_obj(self._get_cookies(self._API_BASE), { 'MUSIC_U': ('MUSIC_U', {lambda i: i.value}), }) diff --git a/yt_dlp/extractor/nhk.py b/yt_dlp/extractor/nhk.py index 7cf5b246b..8bb017a73 100644 --- a/yt_dlp/extractor/nhk.py +++ b/yt_dlp/extractor/nhk.py @@ -8,6 +8,7 @@ 
from ..utils import ( int_or_none, join_nonempty, parse_duration, + remove_end, traverse_obj, try_call, unescapeHTML, @@ -19,8 +20,7 @@ from ..utils import ( class NhkBaseIE(InfoExtractor): _API_URL_TEMPLATE = 'https://nwapi.nhk.jp/nhkworld/%sod%slist/v7b/%s/%s/%s/all%s.json' - _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/ondemand' - _TYPE_REGEX = r'/(?P<type>video|audio)/' + _BASE_URL_REGEX = r'https?://www3\.nhk\.or\.jp/nhkworld/(?P<lang>[a-z]{2})/' def _call_api(self, m_id, lang, is_video, is_episode, is_clip): return self._download_json( @@ -83,7 +83,7 @@ class NhkBaseIE(InfoExtractor): def _extract_episode_info(self, url, episode=None): fetch_episode = episode is None lang, m_type, episode_id = NhkVodIE._match_valid_url(url).group('lang', 'type', 'id') - is_video = m_type == 'video' + is_video = m_type != 'audio' if is_video: episode_id = episode_id[:4] + '-' + episode_id[4:] @@ -138,9 +138,10 @@ class NhkBaseIE(InfoExtractor): else: if fetch_episode: - audio_path = episode['audio']['audio'] + # From https://www3.nhk.or.jp/nhkworld/common/player/radio/inline/rod.html + audio_path = remove_end(episode['audio']['audio'], '.m4a') info['formats'] = self._extract_m3u8_formats( - 'https://nhkworld-vh.akamaihd.net/i%s/master.m3u8' % audio_path, + f'{urljoin("https://vod-stream.nhk.jp", audio_path)}/index.m3u8', episode_id, 'm4a', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) for f in info['formats']: @@ -155,9 +156,11 @@ class NhkBaseIE(InfoExtractor): class NhkVodIE(NhkBaseIE): - # the 7-character IDs can have alphabetic chars too: assume [a-z] rather than just [a-f], eg - _VALID_URL = [rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>video)/(?P<id>[0-9a-z]+)', - rf'{NhkBaseIE._BASE_URL_REGEX}/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[0-9a-z]+)'] + _VALID_URL = [ + rf'{NhkBaseIE._BASE_URL_REGEX}shows/(?:(?P<type>video)/)?(?P<id>\d{{4}}[\da-z]\d+)/?(?:$|[?#])', + 
rf'{NhkBaseIE._BASE_URL_REGEX}(?:ondemand|shows)/(?P<type>audio)/(?P<id>[^/?#]+?-\d{{8}}-[\da-z]+)', + rf'{NhkBaseIE._BASE_URL_REGEX}ondemand/(?P<type>video)/(?P<id>\d{{4}}[\da-z]\d+)', # deprecated + ] # Content available only for a limited period of time. Visit # https://www3.nhk.or.jp/nhkworld/en/ondemand/ for working samples. _TESTS = [{ @@ -167,17 +170,16 @@ class NhkVodIE(NhkBaseIE): 'ext': 'mp4', 'title': 'Japan Railway Journal - The Tohoku Shinkansen: Full Speed Ahead', 'description': 'md5:49f7c5b206e03868a2fdf0d0814b92f6', - 'thumbnail': 'md5:51bcef4a21936e7fea1ff4e06353f463', + 'thumbnail': r're:https://.+/.+\.jpg', 'episode': 'The Tohoku Shinkansen: Full Speed Ahead', 'series': 'Japan Railway Journal', - 'modified_timestamp': 1694243656, + 'modified_timestamp': 1707217907, 'timestamp': 1681428600, 'release_timestamp': 1693883728, 'duration': 1679, 'upload_date': '20230413', - 'modified_date': '20230909', + 'modified_date': '20240206', 'release_date': '20230905', - }, }, { # video clip @@ -188,15 +190,15 @@ class NhkVodIE(NhkBaseIE): 'ext': 'mp4', 'title': 'Dining with the Chef - Chef Saito\'s Family recipe: MENCHI-KATSU', 'description': 'md5:5aee4a9f9d81c26281862382103b0ea5', - 'thumbnail': 'md5:d6a4d9b6e9be90aaadda0bcce89631ed', + 'thumbnail': r're:https://.+/.+\.jpg', 'series': 'Dining with the Chef', 'episode': 'Chef Saito\'s Family recipe: MENCHI-KATSU', 'duration': 148, 'upload_date': '20190816', 'release_date': '20230902', 'release_timestamp': 1693619292, - 'modified_timestamp': 1694168033, - 'modified_date': '20230908', + 'modified_timestamp': 1707217907, + 'modified_date': '20240206', 'timestamp': 1565997540, }, }, { @@ -208,7 +210,7 @@ class NhkVodIE(NhkBaseIE): 'title': 'Living in Japan - Tips for Travelers to Japan / Ramen Vending Machines', 'series': 'Living in Japan', 'description': 'md5:0a0e2077d8f07a03071e990a6f51bfab', - 'thumbnail': 'md5:960622fb6e06054a4a1a0c97ea752545', + 'thumbnail': r're:https://.+/.+\.jpg', 'episode': 'Tips for 
Travelers to Japan / Ramen Vending Machines' }, }, { @@ -245,7 +247,7 @@ class NhkVodIE(NhkBaseIE): 'title': 'おはよう日本(7時台) - 10月8日放送', 'series': 'おはよう日本(7時台)', 'episode': '10月8日放送', - 'thumbnail': 'md5:d733b1c8e965ab68fb02b2d347d0e9b4', + 'thumbnail': r're:https://.+/.+\.jpg', 'description': 'md5:9c1d6cbeadb827b955b20e99ab920ff0', }, 'skip': 'expires 2023-10-15', @@ -255,17 +257,100 @@ class NhkVodIE(NhkBaseIE): 'info_dict': { 'id': 'nw_vod_v_en_3004_952_20230723091000_01_1690074552', 'ext': 'mp4', - 'title': 'Barakan Discovers AMAMI OSHIMA: Isson\'s Treasure Island', + 'title': 'Barakan Discovers - AMAMI OSHIMA: Isson\'s Treasure Isla', 'description': 'md5:5db620c46a0698451cc59add8816b797', - 'thumbnail': 'md5:67d9ff28009ba379bfa85ad1aaa0e2bd', + 'thumbnail': r're:https://.+/.+\.jpg', 'release_date': '20230905', 'timestamp': 1690103400, 'duration': 2939, 'release_timestamp': 1693898699, - 'modified_timestamp': 1698057495, - 'modified_date': '20231023', 'upload_date': '20230723', + 'modified_timestamp': 1707217907, + 'modified_date': '20240206', + 'episode': 'AMAMI OSHIMA: Isson\'s Treasure Isla', + 'series': 'Barakan Discovers', }, + }, { + # /ondemand/video/ url with alphabetical character in 5th position of id + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999a07/', + 'info_dict': { + 'id': 'nw_c_en_9999-a07', + 'ext': 'mp4', + 'episode': 'Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]', + 'series': 'Mini-Dramas on SDGs', + 'modified_date': '20240206', + 'title': 'Mini-Dramas on SDGs - Mini-Dramas on SDGs: Ep 1 Close the Gender Gap [Director\'s Cut]', + 'description': 'md5:3f9dcb4db22fceb675d90448a040d3f6', + 'timestamp': 1621962360, + 'duration': 189, + 'release_date': '20230903', + 'modified_timestamp': 1707217907, + 'upload_date': '20210525', + 'thumbnail': r're:https://.+/.+\.jpg', + 'release_timestamp': 1693713487, + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/video/9999d17/', + 'info_dict': { + 'id': 
'nw_c_en_9999-d17', + 'ext': 'mp4', + 'title': 'Flowers of snow blossom - The 72 Pentads of Yamato', + 'description': 'Today’s focus: Snow', + 'release_timestamp': 1693792402, + 'release_date': '20230904', + 'upload_date': '20220128', + 'timestamp': 1643370960, + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 136, + 'series': '', + 'modified_date': '20240206', + 'modified_timestamp': 1707217907, + }, + }, { + # new /shows/ url format + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/2032307/', + 'info_dict': { + 'id': 'nw_vod_v_en_2032_307_20240321113000_01_1710990282', + 'ext': 'mp4', + 'title': 'Japanology Plus - 20th Anniversary Special Part 1', + 'description': 'md5:817d41fc8e54339ad2a916161ea24faf', + 'episode': '20th Anniversary Special Part 1', + 'series': 'Japanology Plus', + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 1680, + 'timestamp': 1711020600, + 'upload_date': '20240321', + 'release_timestamp': 1711022683, + 'release_date': '20240321', + 'modified_timestamp': 1711031012, + 'modified_date': '20240321', + }, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/3020025/', + 'info_dict': { + 'id': 'nw_vod_v_en_3020_025_20230325144000_01_1679723944', + 'ext': 'mp4', + 'title': '100 Ideas to Save the World - Working Styles Evolve', + 'description': 'md5:9e6c7778eaaf4f7b4af83569649f84d9', + 'episode': 'Working Styles Evolve', + 'series': '100 Ideas to Save the World', + 'thumbnail': r're:https://.+/.+\.jpg', + 'duration': 899, + 'upload_date': '20230325', + 'timestamp': 1679755200, + 'release_date': '20230905', + 'release_timestamp': 1693880540, + 'modified_date': '20240206', + 'modified_timestamp': 1707217907, + }, + }, { + # new /shows/audio/ url format + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/livinginjapan-20231001-1/', + 'only_matching': True, + }, { + # valid url even if can't be found in wild; support needed for clip entries extraction + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/9999o80/', + 
'only_matching': True, }] def _real_extract(self, url): @@ -273,18 +358,21 @@ class NhkVodIE(NhkBaseIE): class NhkVodProgramIE(NhkBaseIE): - _VALID_URL = rf'{NhkBaseIE._BASE_URL_REGEX}/program{NhkBaseIE._TYPE_REGEX}(?P<id>\w+)(?:.+?\btype=(?P<episode_type>clip|(?:radio|tv)Episode))?' + _VALID_URL = rf'''(?x) + {NhkBaseIE._BASE_URL_REGEX}(?:shows|tv)/ + (?:(?P<type>audio)/programs/)?(?P<id>\w+)/? + (?:\?(?:[^#]+&)?type=(?P<episode_type>clip|(?:radio|tv)Episode))?''' _TESTS = [{ # video program episodes - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/sumo', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/sumo/', 'info_dict': { 'id': 'sumo', 'title': 'GRAND SUMO Highlights', 'description': 'md5:fc20d02dc6ce85e4b72e0273aa52fdbf', }, - 'playlist_mincount': 0, + 'playlist_mincount': 1, }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', @@ -293,40 +381,68 @@ class NhkVodProgramIE(NhkBaseIE): 'playlist_mincount': 12, }, { # video program clips - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/japanrailway/?type=clip', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/japanrailway/?type=clip', 'info_dict': { 'id': 'japanrailway', 'title': 'Japan Railway Journal', 'description': 'md5:ea39d93af7d05835baadf10d1aae0e3f', }, - 'playlist_mincount': 5, - }, { - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/video/10yearshayaomiyazaki/', - 'only_matching': True, + 'playlist_mincount': 12, }, { # audio program - 'url': 'https://www3.nhk.or.jp/nhkworld/en/ondemand/program/audio/listener/', + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/audio/programs/livinginjapan/', + 'info_dict': { + 'id': 'livinginjapan', + 'title': 'Living in Japan', + 'description': 'md5:665bb36ec2a12c5a7f598ee713fc2b54', + }, + 'playlist_mincount': 12, + }, { + # /tv/ program url + 
'url': 'https://www3.nhk.or.jp/nhkworld/en/tv/designtalksplus/', + 'info_dict': { + 'id': 'designtalksplus', + 'title': 'DESIGN TALKS plus', + 'description': 'md5:47b3b3a9f10d4ac7b33b53b70a7d2837', + }, + 'playlist_mincount': 20, + }, { + 'url': 'https://www3.nhk.or.jp/nhkworld/en/shows/10yearshayaomiyazaki/', 'only_matching': True, }] + @classmethod + def suitable(cls, url): + return False if NhkVodIE.suitable(url) else super().suitable(url) + + def _extract_meta_from_class_elements(self, class_values, html): + for class_value in class_values: + if value := clean_html(get_element_by_class(class_value, html)): + return value + def _real_extract(self, url): lang, m_type, program_id, episode_type = self._match_valid_url(url).group('lang', 'type', 'id', 'episode_type') episodes = self._call_api( - program_id, lang, m_type == 'video', False, episode_type == 'clip') + program_id, lang, m_type != 'audio', False, episode_type == 'clip') - entries = [] - for episode in episodes: - episode_path = episode.get('url') - if not episode_path: - continue - entries.append(self._extract_episode_info( - urljoin(url, episode_path), episode)) + def entries(): + for episode in episodes: + if episode_path := episode.get('url'): + yield self._extract_episode_info(urljoin(url, episode_path), episode) html = self._download_webpage(url, program_id) - program_title = clean_html(get_element_by_class('p-programDetail__title', html)) - program_description = clean_html(get_element_by_class('p-programDetail__text', html)) + program_title = self._extract_meta_from_class_elements([ + 'p-programDetail__title', # /ondemand/program/ + 'pProgramHero__logoText', # /shows/ + 'tAudioProgramMain__title', # /shows/audio/programs/ + 'p-program-name'], html) # /tv/ + program_description = self._extract_meta_from_class_elements([ + 'p-programDetail__text', # /ondemand/program/ + 'pProgramHero__description', # /shows/ + 'tAudioProgramMain__info', # /shows/audio/programs/ + 'p-program-description'], html) # /tv/ 
- return self.playlist_result(entries, program_id, program_title, program_description) + return self.playlist_result(entries(), program_id, program_title, program_description) class NhkForSchoolBangumiIE(InfoExtractor): diff --git a/yt_dlp/extractor/niconico.py b/yt_dlp/extractor/niconico.py index 6a4624602..b04ce9615 100644 --- a/yt_dlp/extractor/niconico.py +++ b/yt_dlp/extractor/niconico.py @@ -1,11 +1,10 @@ -import datetime +import datetime as dt import functools import itertools import json import re import time - -from urllib.parse import urlparse +import urllib.parse from .common import InfoExtractor, SearchInfoExtractor from ..networking import Request @@ -820,12 +819,12 @@ class NicovideoSearchDateIE(NicovideoSearchBaseIE, SearchInfoExtractor): 'playlist_mincount': 1610, }] - _START_DATE = datetime.date(2007, 1, 1) + _START_DATE = dt.date(2007, 1, 1) _RESULTS_PER_PAGE = 32 _MAX_PAGES = 50 def _entries(self, url, item_id, start_date=None, end_date=None): - start_date, end_date = start_date or self._START_DATE, end_date or datetime.datetime.now().date() + start_date, end_date = start_date or self._START_DATE, end_date or dt.datetime.now().date() # If the last page has a full page of videos, we need to break down the query interval further last_page_len = len(list(self._get_entries_for_date( @@ -957,7 +956,7 @@ class NiconicoLiveIE(InfoExtractor): 'frontend_id': traverse_obj(embedded_data, ('site', 'frontendId')) or '9', }) - hostname = remove_start(urlparse(urlh.url).hostname, 'sp.') + hostname = remove_start(urllib.parse.urlparse(urlh.url).hostname, 'sp.') latency = try_get(self._configuration_arg('latency'), lambda x: x[0]) if latency not in self._KNOWN_LATENCY: latency = 'high' diff --git a/yt_dlp/extractor/panopto.py b/yt_dlp/extractor/panopto.py index ddea32d70..63c5fd68f 100644 --- a/yt_dlp/extractor/panopto.py +++ b/yt_dlp/extractor/panopto.py @@ -1,8 +1,8 @@ import calendar -import json +import datetime as dt import functools -from datetime import 
datetime, timezone -from random import random +import json +import random from .common import InfoExtractor from ..compat import ( @@ -243,7 +243,7 @@ class PanoptoIE(PanoptoBaseIE): invocation_id = delivery_info.get('InvocationId') stream_id = traverse_obj(delivery_info, ('Delivery', 'Streams', ..., 'PublicID'), get_all=False, expected_type=str) if invocation_id and stream_id and duration: - timestamp_str = f'/Date({calendar.timegm(datetime.now(timezone.utc).timetuple())}000)/' + timestamp_str = f'/Date({calendar.timegm(dt.datetime.now(dt.timezone.utc).timetuple())}000)/' data = { 'streamRequests': [ { @@ -415,7 +415,7 @@ class PanoptoIE(PanoptoBaseIE): 'cast': traverse_obj(delivery, ('Contributors', ..., 'DisplayName'), expected_type=lambda x: x or None), 'timestamp': session_start_time - 11640000000 if session_start_time else None, 'duration': delivery.get('Duration'), - 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random()}', + 'thumbnail': base_url + f'/Services/FrameGrabber.svc/FrameRedirect?objectId={video_id}&mode=Delivery&random={random.random()}', 'average_rating': delivery.get('AverageRating'), 'chapters': self._extract_chapters(timestamps), 'uploader': delivery.get('OwnerDisplayName') or None, diff --git a/yt_dlp/extractor/patreon.py b/yt_dlp/extractor/patreon.py index d2ddb72cd..9381c7eab 100644 --- a/yt_dlp/extractor/patreon.py +++ b/yt_dlp/extractor/patreon.py @@ -1,8 +1,8 @@ import itertools +import urllib.parse from .common import InfoExtractor from .vimeo import VimeoIE -from ..compat import compat_urllib_parse_unquote from ..networking.exceptions import HTTPError from ..utils import ( KNOWN_EXTENSIONS, @@ -14,7 +14,6 @@ from ..utils import ( parse_iso8601, str_or_none, traverse_obj, - try_get, url_or_none, urljoin, ) @@ -92,7 +91,7 @@ class PatreonIE(PatreonBaseIE): 'thumbnail': 're:^https?://.*$', 'upload_date': '20150211', 'description': 'md5:8af6425f50bd46fbf29f3db0fc3a8364', - 
'uploader_id': 'TraciJHines', + 'uploader_id': '@TraciHinesMusic', 'categories': ['Entertainment'], 'duration': 282, 'view_count': int, @@ -106,8 +105,10 @@ class PatreonIE(PatreonBaseIE): 'availability': 'public', 'channel_follower_count': int, 'playable_in_embed': True, - 'uploader_url': 'http://www.youtube.com/user/TraciJHines', + 'uploader_url': 'https://www.youtube.com/@TraciHinesMusic', 'comment_count': int, + 'channel_is_verified': True, + 'chapters': 'count:4', }, 'params': { 'noplaylist': True, @@ -176,6 +177,48 @@ class PatreonIE(PatreonBaseIE): 'uploader_url': 'https://www.patreon.com/thenormies', }, 'skip': 'Patron-only content', + }, { + # dead vimeo and embed URLs, need to extract post_file + 'url': 'https://www.patreon.com/posts/hunter-x-hunter-34007913', + 'info_dict': { + 'id': '34007913', + 'ext': 'mp4', + 'title': 'Hunter x Hunter | Kurapika DESTROYS Uvogin!!!', + 'like_count': int, + 'uploader': 'YaBoyRoshi', + 'timestamp': 1581636833, + 'channel_url': 'https://www.patreon.com/yaboyroshi', + 'thumbnail': r're:^https?://.*$', + 'tags': ['Hunter x Hunter'], + 'uploader_id': '14264111', + 'comment_count': int, + 'channel_follower_count': int, + 'description': 'Kurapika is a walking cheat code!', + 'upload_date': '20200213', + 'channel_id': '2147162', + 'uploader_url': 'https://www.patreon.com/yaboyroshi', + }, + }, { + # NSFW vimeo embed URL + 'url': 'https://www.patreon.com/posts/4k-spiderman-4k-96414599', + 'info_dict': { + 'id': '902250943', + 'ext': 'mp4', + 'title': '❤️(4K) Spiderman Girl Yeonhwa’s Gift ❤️(4K) 스파이더맨걸 연화의 선물', + 'description': '❤️(4K) Spiderman Girl Yeonhwa’s Gift \n❤️(4K) 스파이더맨걸 연화의 선물', + 'uploader': 'Npickyeonhwa', + 'uploader_id': '90574422', + 'uploader_url': 'https://www.patreon.com/Yeonhwa726', + 'channel_id': '10237902', + 'channel_url': 'https://www.patreon.com/Yeonhwa726', + 'duration': 70, + 'timestamp': 1705150153, + 'upload_date': '20240113', + 'comment_count': int, + 'like_count': int, + 'thumbnail': 
r're:^https?://.+', + }, + 'params': {'skip_download': 'm3u8'}, }] def _real_extract(self, url): @@ -245,25 +288,21 @@ class PatreonIE(PatreonBaseIE): }) # handle Vimeo embeds - if try_get(attributes, lambda x: x['embed']['provider']) == 'Vimeo': - embed_html = try_get(attributes, lambda x: x['embed']['html']) - v_url = url_or_none(compat_urllib_parse_unquote( - self._search_regex(r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', embed_html, 'vimeo url', fatal=False))) - if v_url: - return { - **info, - '_type': 'url_transparent', - 'url': VimeoIE._smuggle_referrer(v_url, 'https://patreon.com'), - 'ie_key': 'Vimeo', - } + if traverse_obj(attributes, ('embed', 'provider')) == 'Vimeo': + v_url = urllib.parse.unquote(self._html_search_regex( + r'(https(?:%3A%2F%2F|://)player\.vimeo\.com.+app_id(?:=|%3D)+\d+)', + traverse_obj(attributes, ('embed', 'html', {str})), 'vimeo url', fatal=False) or '') + if url_or_none(v_url) and self._request_webpage( + v_url, video_id, 'Checking Vimeo embed URL', + headers={'Referer': 'https://patreon.com/'}, + fatal=False, errnote=False): + return self.url_result( + VimeoIE._smuggle_referrer(v_url, 'https://patreon.com/'), + VimeoIE, url_transparent=True, **info) - embed_url = try_get(attributes, lambda x: x['embed']['url']) - if embed_url: - return { - **info, - '_type': 'url', - 'url': embed_url, - } + embed_url = traverse_obj(attributes, ('embed', 'url', {url_or_none})) + if embed_url and self._request_webpage(embed_url, video_id, 'Checking embed URL', fatal=False, errnote=False): + return self.url_result(embed_url, **info) post_file = traverse_obj(attributes, 'post_file') if post_file: diff --git a/yt_dlp/extractor/polsatgo.py b/yt_dlp/extractor/polsatgo.py index 1524a1fb9..1cebb365e 100644 --- a/yt_dlp/extractor/polsatgo.py +++ b/yt_dlp/extractor/polsatgo.py @@ -1,5 +1,5 @@ -from uuid import uuid4 import json +import uuid from .common import InfoExtractor from ..utils import ( @@ -51,7 +51,7 @@ class 
PolsatGoIE(InfoExtractor): } def _call_api(self, endpoint, media_id, method, params): - rand_uuid = str(uuid4()) + rand_uuid = str(uuid.uuid4()) res = self._download_json( f'https://b2c-mobile.redefine.pl/rpc/{endpoint}/', media_id, note=f'Downloading {method} JSON metadata', diff --git a/yt_dlp/extractor/pr0gramm.py b/yt_dlp/extractor/pr0gramm.py index 66f8a5f44..3e0ccba17 100644 --- a/yt_dlp/extractor/pr0gramm.py +++ b/yt_dlp/extractor/pr0gramm.py @@ -1,5 +1,6 @@ +import datetime as dt import json -from urllib.parse import unquote +import urllib.parse from .common import InfoExtractor from ..compat import functools @@ -114,7 +115,7 @@ class Pr0grammIE(InfoExtractor): cookies = self._get_cookies(self.BASE_URL) if 'me' not in cookies: self._download_webpage(self.BASE_URL, None, 'Refreshing verification information') - if traverse_obj(cookies, ('me', {lambda x: x.value}, {unquote}, {json.loads}, 'verified')): + if traverse_obj(cookies, ('me', {lambda x: x.value}, {urllib.parse.unquote}, {json.loads}, 'verified')): flags |= 0b00110 return flags @@ -196,6 +197,7 @@ class Pr0grammIE(InfoExtractor): 'like_count': ('up', {int}), 'dislike_count': ('down', {int}), 'timestamp': ('created', {int}), + 'upload_date': ('created', {int}, {dt.date.fromtimestamp}, {lambda x: x.strftime('%Y%m%d')}), 'thumbnail': ('thumb', {lambda x: urljoin('https://thumb.pr0gramm.com', x)}) }), } diff --git a/yt_dlp/extractor/prosiebensat1.py b/yt_dlp/extractor/prosiebensat1.py index 46e2e8a8f..4c33baec5 100644 --- a/yt_dlp/extractor/prosiebensat1.py +++ b/yt_dlp/extractor/prosiebensat1.py @@ -1,6 +1,6 @@ +import hashlib import re -from hashlib import sha1 from .common import InfoExtractor from ..compat import compat_str from ..utils import ( @@ -42,7 +42,7 @@ class ProSiebenSat1BaseIE(InfoExtractor): 'Downloading protocols JSON', headers=self.geo_verification_headers(), query={ 'access_id': self._ACCESS_ID, - 'client_token': sha1((raw_ct).encode()).hexdigest(), + 'client_token': 
hashlib.sha1((raw_ct).encode()).hexdigest(), 'video_id': clip_id, }, fatal=False, expected_status=(403,)) or {} error = protocols.get('error') or {} @@ -53,7 +53,7 @@ class ProSiebenSat1BaseIE(InfoExtractor): urls = (self._download_json( self._V4_BASE_URL + 'urls', clip_id, 'Downloading urls JSON', query={ 'access_id': self._ACCESS_ID, - 'client_token': sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), + 'client_token': hashlib.sha1((raw_ct + server_token + self._SUPPORTED_PROTOCOLS).encode()).hexdigest(), 'protocols': self._SUPPORTED_PROTOCOLS, 'server_token': server_token, 'video_id': clip_id, @@ -77,7 +77,7 @@ class ProSiebenSat1BaseIE(InfoExtractor): if not formats: source_ids = [compat_str(source['id']) for source in video['sources']] - client_id = self._SALT[:2] + sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + client_id = self._SALT[:2] + hashlib.sha1(''.join([clip_id, self._SALT, self._TOKEN, client_location, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() sources = self._download_json( 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, @@ -96,7 +96,7 @@ class ProSiebenSat1BaseIE(InfoExtractor): return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate for source_id in source_ids: - client_id = self._SALT[:2] + sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() + client_id = self._SALT[:2] + hashlib.sha1(''.join([self._SALT, clip_id, self._TOKEN, server_id, client_location, source_id, self._SALT, self._CLIENT_NAME]).encode('utf-8')).hexdigest() urls = self._download_json( 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, clip_id, 'Downloading urls JSON', fatal=False, query={ diff --git a/yt_dlp/extractor/radiokapital.py b/yt_dlp/extractor/radiokapital.py index 8f9737ac3..5d7d3ddeb 100644 --- 
a/yt_dlp/extractor/radiokapital.py +++ b/yt_dlp/extractor/radiokapital.py @@ -1,18 +1,14 @@ -from .common import InfoExtractor -from ..utils import ( - clean_html, - traverse_obj, - unescapeHTML, -) - import itertools -from urllib.parse import urlencode +import urllib.parse + +from .common import InfoExtractor +from ..utils import clean_html, traverse_obj, unescapeHTML class RadioKapitalBaseIE(InfoExtractor): def _call_api(self, resource, video_id, note='Downloading JSON metadata', qs={}): return self._download_json( - f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urlencode(qs)}', + f'https://www.radiokapital.pl/wp-json/kapital/v1/{resource}?{urllib.parse.urlencode(qs)}', video_id, note=note) def _parse_episode(self, data): diff --git a/yt_dlp/extractor/rokfin.py b/yt_dlp/extractor/rokfin.py index 5099f3ae4..3bc5f3cab 100644 --- a/yt_dlp/extractor/rokfin.py +++ b/yt_dlp/extractor/rokfin.py @@ -1,8 +1,8 @@ +import datetime as dt import itertools import json import re import urllib.parse -from datetime import datetime from .common import InfoExtractor, SearchInfoExtractor from ..utils import ( @@ -156,7 +156,7 @@ class RokfinIE(InfoExtractor): self.raise_login_required('This video is only available to premium users', True, method='cookies') elif scheduled: self.raise_no_formats( - f'Stream is offline; scheduled for {datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}', + f'Stream is offline; scheduled for {dt.datetime.fromtimestamp(scheduled).strftime("%Y-%m-%d %H:%M:%S")}', video_id=video_id, expected=True) uploader = traverse_obj(metadata, ('createdBy', 'username'), ('creator', 'username')) diff --git a/yt_dlp/extractor/sejmpl.py b/yt_dlp/extractor/sejmpl.py index 29cb0152a..eb433d2ac 100644 --- a/yt_dlp/extractor/sejmpl.py +++ b/yt_dlp/extractor/sejmpl.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt from .common import InfoExtractor from .redge import RedCDNLivxIE @@ -13,16 +13,16 @@ from ..utils.traversal import traverse_obj 
def is_dst(date): - last_march = datetime.datetime(date.year, 3, 31) - last_october = datetime.datetime(date.year, 10, 31) - last_sunday_march = last_march - datetime.timedelta(days=last_march.isoweekday() % 7) - last_sunday_october = last_october - datetime.timedelta(days=last_october.isoweekday() % 7) + last_march = dt.datetime(date.year, 3, 31) + last_october = dt.datetime(date.year, 10, 31) + last_sunday_march = last_march - dt.timedelta(days=last_march.isoweekday() % 7) + last_sunday_october = last_october - dt.timedelta(days=last_october.isoweekday() % 7) return last_sunday_march.replace(hour=2) <= date <= last_sunday_october.replace(hour=3) def rfc3339_to_atende(date): - date = datetime.datetime.fromisoformat(date) - date = date + datetime.timedelta(hours=1 if is_dst(date) else 0) + date = dt.datetime.fromisoformat(date) + date = date + dt.timedelta(hours=1 if is_dst(date) else 0) return int((date.timestamp() - 978307200) * 1000) diff --git a/yt_dlp/extractor/sonyliv.py b/yt_dlp/extractor/sonyliv.py index a6da44525..7c914acbe 100644 --- a/yt_dlp/extractor/sonyliv.py +++ b/yt_dlp/extractor/sonyliv.py @@ -1,4 +1,4 @@ -import datetime +import datetime as dt import itertools import json import math @@ -94,7 +94,7 @@ class SonyLIVIE(InfoExtractor): 'mobileNumber': username, 'channelPartnerID': 'MSMIND', 'country': 'IN', - 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), + 'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), 'otpSize': 6, 'loginType': 'REGISTERORSIGNIN', 'isMobileMandatory': True, @@ -111,7 +111,7 @@ class SonyLIVIE(InfoExtractor): 'otp': self._get_tfa_info('OTP'), 'dmaId': 'IN', 'ageConfirmation': True, - 'timestamp': datetime.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), + 'timestamp': dt.datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%MZ'), 'isMobileMandatory': True, }).encode()) if otp_verify_json['resultCode'] == 'KO': diff --git a/yt_dlp/extractor/soundcloud.py b/yt_dlp/extractor/soundcloud.py index 
a7c2afd49..c9ed645eb 100644 --- a/yt_dlp/extractor/soundcloud.py +++ b/yt_dlp/extractor/soundcloud.py @@ -1,30 +1,27 @@ import itertools -import re import json -# import random +import re -from .common import ( - InfoExtractor, - SearchInfoExtractor -) +from .common import InfoExtractor, SearchInfoExtractor from ..compat import compat_str -from ..networking import HEADRequest, Request +from ..networking import HEADRequest from ..networking.exceptions import HTTPError from ..utils import ( - error_to_compat_str, + KNOWN_EXTENSIONS, ExtractorError, + error_to_compat_str, float_or_none, int_or_none, - KNOWN_EXTENSIONS, mimetype2ext, parse_qs, str_or_none, - try_get, + try_call, unified_timestamp, update_url_query, url_or_none, urlhandle_detect_ext, ) +from ..utils.traversal import traverse_obj class SoundcloudEmbedIE(InfoExtractor): @@ -54,7 +51,6 @@ class SoundcloudBaseIE(InfoExtractor): _API_AUTH_QUERY_TEMPLATE = '?client_id=%s' _API_AUTH_URL_PW = 'https://api-auth.soundcloud.com/web-auth/sign-in/password%s' _API_VERIFY_AUTH_TOKEN = 'https://api-auth.soundcloud.com/connect/session%s' - _access_token = None _HEADERS = {} _IMAGE_REPL_RE = r'-([0-9a-z]+)\.jpg' @@ -112,21 +108,31 @@ class SoundcloudBaseIE(InfoExtractor): def _initialize_pre_login(self): self._CLIENT_ID = self.cache.load('soundcloud', 'client_id') or 'a3e059563d7fd3372b49b37f00a00bcf' - def _perform_login(self, username, password): - if username != 'oauth': - self.report_warning( - 'Login using username and password is not currently supported. 
' - 'Use "--username oauth --password <oauth_token>" to login using an oauth token') - self._access_token = password - query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID - payload = {'session': {'access_token': self._access_token}} - token_verification = Request(self._API_VERIFY_AUTH_TOKEN % query, json.dumps(payload).encode('utf-8')) - response = self._download_json(token_verification, None, note='Verifying login token...', fatal=False) - if response is not False: - self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + def _verify_oauth_token(self, token): + if self._request_webpage( + self._API_VERIFY_AUTH_TOKEN % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID), + None, note='Verifying login token...', fatal=False, + data=json.dumps({'session': {'access_token': token}}).encode()): + self._HEADERS['Authorization'] = f'OAuth {token}' self.report_login() else: - self.report_warning('Provided authorization token seems to be invalid. Continue as guest') + self.report_warning('Provided authorization token is invalid. Continuing as guest') + + def _real_initialize(self): + if self._HEADERS: + return + if token := try_call(lambda: self._get_cookies(self._BASE_URL)['oauth_token'].value): + self._verify_oauth_token(token) + + def _perform_login(self, username, password): + if username != 'oauth': + raise ExtractorError( + 'Login using username and password is not currently supported. 
' + 'Use "--username oauth --password <oauth_token>" to login using an oauth token, ' + f'or else {self._login_hint(method="cookies")}', expected=True) + if self._HEADERS: + return + self._verify_oauth_token(password) r''' def genDevId(): @@ -147,14 +153,17 @@ class SoundcloudBaseIE(InfoExtractor): 'user_agent': self._USER_AGENT } - query = self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID - login = sanitized_Request(self._API_AUTH_URL_PW % query, json.dumps(payload).encode('utf-8')) - response = self._download_json(login, None) - self._access_token = response.get('session').get('access_token') - if not self._access_token: - self.report_warning('Unable to get access token, login may has failed') - else: - self._HEADERS = {'Authorization': 'OAuth ' + self._access_token} + response = self._download_json( + self._API_AUTH_URL_PW % (self._API_AUTH_QUERY_TEMPLATE % self._CLIENT_ID), + None, note='Verifying login token...', fatal=False, + data=json.dumps(payload).encode()) + + if token := traverse_obj(response, ('session', 'access_token', {str})): + self._HEADERS['Authorization'] = f'OAuth {token}' + self.report_login() + return + + raise ExtractorError('Unable to get access token, login may have failed', expected=True) ''' # signature generation @@ -217,6 +226,7 @@ class SoundcloudBaseIE(InfoExtractor): 'filesize': int_or_none(urlh.headers.get('Content-Length')), 'url': format_url, 'quality': 10, + 'format_note': 'Original', }) def invalid_url(url): @@ -233,9 +243,13 @@ class SoundcloudBaseIE(InfoExtractor): format_id_list.append(protocol) ext = f.get('ext') if ext == 'aac': - f['abr'] = '256' + f.update({ + 'abr': 256, + 'quality': 5, + 'format_note': 'Premium', + }) for k in ('ext', 'abr'): - v = f.get(k) + v = str_or_none(f.get(k)) if v: format_id_list.append(v) preview = is_preview or re.search(r'/(?:preview|playlist)/0/30/', f['url']) @@ -256,16 +270,25 @@ class SoundcloudBaseIE(InfoExtractor): formats.append(f) # New API - transcodings = try_get( - info, lambda x: 
x['media']['transcodings'], list) or [] - for t in transcodings: - if not isinstance(t, dict): - continue - format_url = url_or_none(t.get('url')) - if not format_url: - continue - stream = None if extract_flat else self._download_json( - format_url, track_id, query=query, fatal=False, headers=self._HEADERS) + for t in traverse_obj(info, ('media', 'transcodings', lambda _, v: url_or_none(v['url']))): + if extract_flat: + break + format_url = t['url'] + stream = None + + for retry in self.RetryManager(fatal=False): + try: + stream = self._download_json(format_url, track_id, query=query, headers=self._HEADERS) + except ExtractorError as e: + if isinstance(e.cause, HTTPError) and e.cause.status == 429: + self.report_warning( + 'You have reached the API rate limit, which is ~600 requests per ' + '10 minutes. Use the --extractor-retries and --retry-sleep options ' + 'to configure an appropriate retry count and wait time', only_once=True) + retry.error = e.cause + else: + self.report_warning(e.msg) + if not isinstance(stream, dict): continue stream_url = url_or_none(stream.get('url')) diff --git a/yt_dlp/extractor/stacommu.py b/yt_dlp/extractor/stacommu.py index 1308c595d..d2f207fcc 100644 --- a/yt_dlp/extractor/stacommu.py +++ b/yt_dlp/extractor/stacommu.py @@ -174,7 +174,7 @@ class TheaterComplexTownBaseIE(StacommuBaseIE): class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): - _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?videos/episodes/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?videos/episodes/(?P<id>\w+)' IE_NAME = 'theatercomplextown:vod' _TESTS = [{ 'url': 'https://www.theater-complex.town/videos/episodes/hoxqidYNoAn7bP92DN6p78', @@ -195,6 +195,9 @@ class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): }, { 'url': 'https://www.theater-complex.town/en/videos/episodes/6QT7XYwM9dJz5Gf9VB6K5y', 'only_matching': True, + }, { + 'url': 
'https://www.theater-complex.town/ja/videos/episodes/hoxqidYNoAn7bP92DN6p78', + 'only_matching': True, }] _API_PATH = 'videoEpisodes' @@ -204,7 +207,7 @@ class TheaterComplexTownVODIE(TheaterComplexTownBaseIE): class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE): - _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:en/)?ppv/(?P<id>\w+)' + _VALID_URL = r'https?://(?:www\.)?theater-complex\.town/(?:(?:en|ja)/)?ppv/(?P<id>\w+)' IE_NAME = 'theatercomplextown:ppv' _TESTS = [{ 'url': 'https://www.theater-complex.town/ppv/wytW3X7khrjJBUpKuV3jen', @@ -223,6 +226,9 @@ class TheaterComplexTownPPVIE(TheaterComplexTownBaseIE): }, { 'url': 'https://www.theater-complex.town/en/ppv/wytW3X7khrjJBUpKuV3jen', 'only_matching': True, + }, { + 'url': 'https://www.theater-complex.town/ja/ppv/qwUVmLmGEiZ3ZW6it9uGys', + 'only_matching': True, }] _API_PATH = 'events' diff --git a/yt_dlp/extractor/stv.py b/yt_dlp/extractor/stv.py index 8b3e63538..0ab780100 100644 --- a/yt_dlp/extractor/stv.py +++ b/yt_dlp/extractor/stv.py @@ -41,7 +41,7 @@ class STVPlayerIE(InfoExtractor): ptype, video_id = self._match_valid_url(url).groups() webpage = self._download_webpage(url, video_id, fatal=False) or '' - props = self._search_nextjs_data(webpage, video_id, default='{}').get('props') or {} + props = self._search_nextjs_data(webpage, video_id, default={}).get('props') or {} player_api_cache = try_get( props, lambda x: x['initialReduxState']['playerApiCache']) or {} diff --git a/yt_dlp/extractor/telewebion.py b/yt_dlp/extractor/telewebion.py index 9378ed021..5fdcddd8b 100644 --- a/yt_dlp/extractor/telewebion.py +++ b/yt_dlp/extractor/telewebion.py @@ -1,8 +1,7 @@ from __future__ import annotations - +import functools import json -from functools import partial -from textwrap import dedent +import textwrap from .common import InfoExtractor from ..utils import ExtractorError, format_field, int_or_none, parse_iso8601 @@ -10,7 +9,7 @@ from ..utils.traversal import traverse_obj def _fmt_url(url): - 
return partial(format_field, template=url, default=None) + return functools.partial(format_field, template=url, default=None) class TelewebionIE(InfoExtractor): @@ -88,7 +87,7 @@ class TelewebionIE(InfoExtractor): if not video_id.startswith('0x'): video_id = hex(int(video_id)) - episode_data = self._call_graphql_api('getEpisodeDetail', video_id, dedent(''' + episode_data = self._call_graphql_api('getEpisodeDetail', video_id, textwrap.dedent(''' queryEpisode(filter: {EpisodeID: $EpisodeId}, first: 1) { title program { @@ -127,7 +126,7 @@ class TelewebionIE(InfoExtractor): 'formats': ( 'channel', 'descriptor', {str}, {_fmt_url(f'https://cdna.telewebion.com/%s/episode/{video_id}/playlist.m3u8')}, - {partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}), + {functools.partial(self._extract_m3u8_formats, video_id=video_id, ext='mp4', m3u8_id='hls')}), })) info_dict['id'] = video_id return info_dict diff --git a/yt_dlp/extractor/tenplay.py b/yt_dlp/extractor/tenplay.py index a98275d86..11cc5705e 100644 --- a/yt_dlp/extractor/tenplay.py +++ b/yt_dlp/extractor/tenplay.py @@ -1,7 +1,7 @@ import base64 +import datetime as dt import functools import itertools -from datetime import datetime from .common import InfoExtractor from ..networking import HEADRequest @@ -70,7 +70,7 @@ class TenPlayIE(InfoExtractor): username, password = self._get_login_info() if username is None or password is None: self.raise_login_required('Your 10play account\'s details must be provided with --username and --password.') - _timestamp = datetime.now().strftime('%Y%m%d000000') + _timestamp = dt.datetime.now().strftime('%Y%m%d000000') _auth_header = base64.b64encode(_timestamp.encode('ascii')).decode('ascii') data = self._download_json('https://10play.com.au/api/user/auth', video_id, 'Getting bearer token', headers={ 'X-Network-Ten-Auth': _auth_header, diff --git a/yt_dlp/extractor/thisoldhouse.py b/yt_dlp/extractor/thisoldhouse.py index 15f8380d3..fbc12d55d 100644 --- 
a/yt_dlp/extractor/thisoldhouse.py +++ b/yt_dlp/extractor/thisoldhouse.py @@ -1,5 +1,6 @@ import json +from .brightcove import BrightcoveNewIE from .common import InfoExtractor from .zype import ZypeIE from ..networking import HEADRequest @@ -8,6 +9,7 @@ from ..utils import ( ExtractorError, filter_dict, parse_qs, + smuggle_url, try_call, urlencode_postdata, ) @@ -17,23 +19,43 @@ class ThisOldHouseIE(InfoExtractor): _NETRC_MACHINE = 'thisoldhouse' _VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode|(?:[^/?#]+/)?\d+)/(?P<id>[^/?#]+)' _TESTS = [{ + # Unresolved Brightcove URL embed (formerly Zype), free 'url': 'https://www.thisoldhouse.com/furniture/21017078/how-to-build-a-storage-bench', 'info_dict': { - 'id': '5dcdddf673c3f956ef5db202', + 'id': '6325298523112', 'ext': 'mp4', 'title': 'How to Build a Storage Bench', 'description': 'In the workshop, Tom Silva and Kevin O\'Connor build a storage bench for an entryway.', - 'timestamp': 1442548800, - 'upload_date': '20150918', - 'duration': 674, - 'view_count': int, - 'average_rating': 0, - 'thumbnail': r're:^https?://.*\.jpg\?\d+$', - 'display_id': 'how-to-build-a-storage-bench', + 'timestamp': 1681793639, + 'upload_date': '20230418', + 'duration': 674.54, + 'tags': 'count:11', + 'uploader_id': '6314471934001', + 'thumbnail': r're:^https?://.*\.jpg', }, 'params': { 'skip_download': True, }, + }, { + # Brightcove embed, authwalled + 'url': 'https://www.thisoldhouse.com/glen-ridge-generational/99537/s45-e17-multi-generational', + 'info_dict': { + 'id': '6349675446112', + 'ext': 'mp4', + 'title': 'E17 | Glen Ridge Generational | Multi-Generational', + 'description': 'md5:53c6bc2e8031f3033d693d9a3563222c', + 'timestamp': 1711382202, + 'upload_date': '20240325', + 'duration': 1422.229, + 'tags': 'count:13', + 'uploader_id': '6314471934001', + 'thumbnail': r're:^https?://.*\.jpg', + }, + 'expected_warnings': ['Login with password is not supported for this website'], + 'params': { + 'skip_download': 
True, + }, + 'skip': 'Requires subscription', }, { # Page no longer has video 'url': 'https://www.thisoldhouse.com/watch/arlington-arts-crafts-arts-and-crafts-class-begins', @@ -98,7 +120,15 @@ class ThisOldHouseIE(InfoExtractor): video_url, video_id = self._search_regex( r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?thisoldhouse\.(?:chorus\.build|com)/videos/zype/([0-9a-f]{24})[^\'"]*)[\'"]', - webpage, 'video url', group=(1, 2)) - video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url + webpage, 'zype url', group=(1, 2), default=(None, None)) + if video_url: + video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Zype URL').url + return self.url_result(video_url, ZypeIE, video_id) - return self.url_result(video_url, ZypeIE, video_id) + video_url, video_id = self._search_regex([ + r'<iframe[^>]+src=[\'"]((?:https?:)?//players\.brightcove\.net/\d+/\w+/index\.html\?videoId=(\d+))', + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)thisoldhouse\.com/videos/brightcove/(\d+))'], + webpage, 'iframe url', group=(1, 2)) + if not parse_qs(video_url).get('videoId'): + video_url = self._request_webpage(HEADRequest(video_url), video_id, 'Resolving Brightcove URL').url + return self.url_result(smuggle_url(video_url, {'referrer': url}), BrightcoveNewIE, video_id) diff --git a/yt_dlp/extractor/tiktok.py b/yt_dlp/extractor/tiktok.py index 02545bc79..3d965dd45 100644 --- a/yt_dlp/extractor/tiktok.py +++ b/yt_dlp/extractor/tiktok.py @@ -4,6 +4,7 @@ import random import re import string import time +import uuid from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse @@ -30,19 +31,65 @@ from ..utils import ( class TikTokBaseIE(InfoExtractor): - _APP_VERSIONS = [('26.1.3', '260103'), ('26.1.2', '260102'), ('26.1.1', '260101'), ('25.6.2', '250602')] - _WORKING_APP_VERSION = None - _APP_NAME = 'trill' - _AID = 1180 _UPLOADER_URL_FORMAT = 'https://www.tiktok.com/@%s' _WEBPAGE_HOST = 
'https://www.tiktok.com/' QUALITIES = ('360p', '540p', '720p', '1080p') + _APP_INFO_DEFAULTS = { + # unique "install id" + 'iid': None, + # TikTok (KR/PH/TW/TH/VN) = trill, TikTok (rest of world) = musical_ly, Douyin = aweme + 'app_name': 'musical_ly', + 'app_version': '34.1.2', + 'manifest_app_version': '2023401020', + # "app id": aweme = 1128, trill = 1180, musical_ly = 1233, universal = 0 + 'aid': '0', + } + _KNOWN_APP_INFO = [ + '7351144126450059040', + '7351149742343391009', + '7351153174894626592', + ] + _APP_INFO_POOL = None + _APP_INFO = None + _APP_USER_AGENT = None + @property def _API_HOSTNAME(self): return self._configuration_arg( 'api_hostname', ['api22-normal-c-useast2a.tiktokv.com'], ie_key=TikTokIE)[0] + def _get_next_app_info(self): + if self._APP_INFO_POOL is None: + defaults = { + key: self._configuration_arg(key, [default], ie_key=TikTokIE)[0] + for key, default in self._APP_INFO_DEFAULTS.items() + if key != 'iid' + } + app_info_list = ( + self._configuration_arg('app_info', ie_key=TikTokIE) + or random.sample(self._KNOWN_APP_INFO, len(self._KNOWN_APP_INFO))) + self._APP_INFO_POOL = [ + {**defaults, **dict( + (k, v) for k, v in zip(self._APP_INFO_DEFAULTS, app_info.split('/')) if v + )} for app_info in app_info_list + ] + + if not self._APP_INFO_POOL: + return False + + self._APP_INFO = self._APP_INFO_POOL.pop(0) + + app_name = self._APP_INFO['app_name'] + version = self._APP_INFO['manifest_app_version'] + if app_name == 'musical_ly': + package = f'com.zhiliaoapp.musically/{version}' + else: # trill, aweme + package = f'com.ss.android.ugc.{app_name}/{version}' + self._APP_USER_AGENT = f'{package} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)' + + return True + @staticmethod def _create_url(user_id, video_id): return f'https://www.tiktok.com/@{user_id or "_"}/video/{video_id}' @@ -58,7 +105,7 @@ class TikTokBaseIE(InfoExtractor): 'universal data', display_id, end_pattern=r'</script>', default={}), 
('__DEFAULT_SCOPE__', {dict})) or {} - def _call_api_impl(self, ep, query, manifest_app_version, video_id, fatal=True, + def _call_api_impl(self, ep, query, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): self._set_cookie(self._API_HOSTNAME, 'odin_tt', ''.join(random.choices('0123456789abcdef', k=160))) webpage_cookies = self._get_cookies(self._WEBPAGE_HOST) @@ -67,80 +114,85 @@ class TikTokBaseIE(InfoExtractor): return self._download_json( 'https://%s/aweme/v1/%s/' % (self._API_HOSTNAME, ep), video_id=video_id, fatal=fatal, note=note, errnote=errnote, headers={ - 'User-Agent': f'com.ss.android.ugc.{self._APP_NAME}/{manifest_app_version} (Linux; U; Android 13; en_US; Pixel 7; Build/TD1A.220804.031; Cronet/58.0.2991.0)', + 'User-Agent': self._APP_USER_AGENT, 'Accept': 'application/json', }, query=query) - def _build_api_query(self, query, app_version, manifest_app_version): + def _build_api_query(self, query): return { **query, - 'version_name': app_version, - 'version_code': manifest_app_version, - 'build_number': app_version, - 'manifest_version_code': manifest_app_version, - 'update_version_code': manifest_app_version, - 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), - 'uuid': ''.join(random.choices(string.digits, k=16)), - '_rticket': int(time.time() * 1000), - 'ts': int(time.time()), - 'device_brand': 'Google', - 'device_type': 'Pixel 7', 'device_platform': 'android', + 'os': 'android', + 'ssmix': 'a', + '_rticket': int(time.time() * 1000), + 'cdid': str(uuid.uuid4()), + 'channel': 'googleplay', + 'aid': self._APP_INFO['aid'], + 'app_name': self._APP_INFO['app_name'], + 'version_code': ''.join((f'{int(v):02d}' for v in self._APP_INFO['app_version'].split('.'))), + 'version_name': self._APP_INFO['app_version'], + 'manifest_version_code': self._APP_INFO['manifest_app_version'], + 'update_version_code': self._APP_INFO['manifest_app_version'], + 'ab_version': self._APP_INFO['app_version'], 'resolution': 
'1080*2400', 'dpi': 420, - 'os_version': '13', - 'os_api': '29', - 'carrier_region': 'US', - 'sys_region': 'US', - 'region': 'US', - 'app_name': self._APP_NAME, - 'app_language': 'en', + 'device_type': 'Pixel 7', + 'device_brand': 'Google', 'language': 'en', - 'timezone_name': 'America/New_York', - 'timezone_offset': '-14400', - 'channel': 'googleplay', + 'os_api': '29', + 'os_version': '13', 'ac': 'wifi', - 'mcc_mnc': '310260', - 'is_my_cn': 0, - 'aid': self._AID, - 'ssmix': 'a', - 'as': 'a1qwert123', - 'cp': 'cbfhckdckkde1', + 'is_pad': '0', + 'current_region': 'US', + 'app_type': 'normal', + 'sys_region': 'US', + 'last_install_time': int(time.time()) - random.randint(86400, 1123200), + 'timezone_name': 'America/New_York', + 'residence': 'US', + 'app_language': 'en', + 'timezone_offset': '-14400', + 'host_abi': 'armeabi-v7a', + 'locale': 'en', + 'ac2': 'wifi5g', + 'uoo': '1', + 'carrier_region': 'US', + 'op_region': 'US', + 'build_number': self._APP_INFO['app_version'], + 'region': 'US', + 'ts': int(time.time()), + 'iid': self._APP_INFO['iid'], + 'device_id': random.randint(7250000000000000000, 7351147085025500000), + 'openudid': ''.join(random.choices('0123456789abcdef', k=16)), } def _call_api(self, ep, query, video_id, fatal=True, note='Downloading API JSON', errnote='Unable to download API page'): - if not self._WORKING_APP_VERSION: - app_version = self._configuration_arg('app_version', [''], ie_key=TikTokIE.ie_key())[0] - manifest_app_version = self._configuration_arg('manifest_app_version', [''], ie_key=TikTokIE.ie_key())[0] - if app_version and manifest_app_version: - self._WORKING_APP_VERSION = (app_version, manifest_app_version) - self.write_debug('Imported app version combo from extractor arguments') - elif app_version or manifest_app_version: - self.report_warning('Only one of the two required version params are passed as extractor arguments', only_once=True) + if not self._APP_INFO and not self._get_next_app_info(): + message = 'No working app info is 
available' + if fatal: + raise ExtractorError(message, expected=True) + else: + self.report_warning(message) + return - if self._WORKING_APP_VERSION: - app_version, manifest_app_version = self._WORKING_APP_VERSION - real_query = self._build_api_query(query, app_version, manifest_app_version) - return self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote) - - for count, (app_version, manifest_app_version) in enumerate(self._APP_VERSIONS, start=1): - real_query = self._build_api_query(query, app_version, manifest_app_version) + max_tries = len(self._APP_INFO_POOL) + 1 # _APP_INFO_POOL + _APP_INFO + for count in itertools.count(1): + self.write_debug(str(self._APP_INFO)) + real_query = self._build_api_query(query) try: - res = self._call_api_impl(ep, real_query, manifest_app_version, video_id, fatal, note, errnote) - self._WORKING_APP_VERSION = (app_version, manifest_app_version) - return res + return self._call_api_impl(ep, real_query, video_id, fatal, note, errnote) except ExtractorError as e: if isinstance(e.cause, json.JSONDecodeError) and e.cause.pos == 0: - if count == len(self._APP_VERSIONS): + message = str(e.cause or e.msg) + if not self._get_next_app_info(): if fatal: - raise e + raise else: - self.report_warning(str(e.cause or e.msg)) + self.report_warning(message) return - self.report_warning('%s. Retrying... (attempt %s of %s)' % (str(e.cause or e.msg), count, len(self._APP_VERSIONS))) + self.report_warning(f'{message}. Retrying... 
(attempt {count} of {max_tries})') continue - raise e + raise def _extract_aweme_app(self, aweme_id): feed_list = self._call_api( @@ -223,6 +275,7 @@ class TikTokBaseIE(InfoExtractor): def extract_addr(addr, add_meta={}): parsed_meta, res = parse_url_key(addr.get('url_key', '')) + is_bytevc2 = parsed_meta.get('vcodec') == 'bytevc2' if res: known_resolutions.setdefault(res, {}).setdefault('height', int_or_none(addr.get('height'))) known_resolutions[res].setdefault('width', int_or_none(addr.get('width'))) @@ -235,8 +288,11 @@ class TikTokBaseIE(InfoExtractor): 'acodec': 'aac', 'source_preference': -2 if 'aweme/v1' in url else -1, # Downloads from API might get blocked **add_meta, **parsed_meta, + # bytevc2 is bytedance's proprietary (unplayable) video codec + 'preference': -100 if is_bytevc2 else -1, 'format_note': join_nonempty( - add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, delim=' '), + add_meta.get('format_note'), '(API)' if 'aweme/v1' in url else None, + '(UNPLAYABLE)' if is_bytevc2 else None, delim=' '), **audio_meta(url), } for url in addr.get('url_list') or []] @@ -720,7 +776,7 @@ class TikTokIE(TikTokBaseIE): status = traverse_obj(sigi_data, ('VideoPage', 'statusCode', {int})) or 0 video_data = traverse_obj(sigi_data, ('ItemModule', video_id, {dict})) - elif next_data := self._search_nextjs_data(webpage, video_id, default='{}'): + elif next_data := self._search_nextjs_data(webpage, video_id, default={}): self.write_debug('Found next.js data') status = traverse_obj(next_data, ('props', 'pageProps', 'statusCode', {int})) or 0 video_data = traverse_obj(next_data, ('props', 'pageProps', 'itemInfo', 'itemStruct', {dict})) diff --git a/yt_dlp/extractor/twitch.py b/yt_dlp/extractor/twitch.py index c55786a0d..80cba0915 100644 --- a/yt_dlp/extractor/twitch.py +++ b/yt_dlp/extractor/twitch.py @@ -191,17 +191,25 @@ class TwitchBaseIE(InfoExtractor): }] if thumbnail else None def _extract_twitch_m3u8_formats(self, path, video_id, token, 
signature): - return self._extract_m3u8_formats( + formats = self._extract_m3u8_formats( f'{self._USHER_BASE}/{path}/{video_id}.m3u8', video_id, 'mp4', query={ 'allow_source': 'true', 'allow_audio_only': 'true', 'allow_spectre': 'true', 'p': random.randint(1000000, 10000000), + 'platform': 'web', 'player': 'twitchweb', + 'supported_codecs': 'av1,h265,h264', 'playlist_include_framerate': 'true', 'sig': signature, 'token': token, }) + for fmt in formats: + if fmt.get('vcodec') and fmt['vcodec'].startswith('av01'): + # mpegts does not yet have proper support for av1 + fmt['downloader_options'] = {'ffmpeg_args_out': ['-f', 'mp4']} + + return formats class TwitchVodIE(TwitchBaseIE): diff --git a/yt_dlp/extractor/vk.py b/yt_dlp/extractor/vk.py index e4a78c297..7e3a3a9a9 100644 --- a/yt_dlp/extractor/vk.py +++ b/yt_dlp/extractor/vk.py @@ -707,6 +707,7 @@ class VKWallPostIE(VKBaseIE): class VKPlayBaseIE(InfoExtractor): + _BASE_URL_RE = r'https?://(?:vkplay\.live|live\.vkplay\.ru)/' _RESOLUTIONS = { 'tiny': '256x144', 'lowest': '426x240', @@ -765,7 +766,7 @@ class VKPlayBaseIE(InfoExtractor): class VKPlayIE(VKPlayBaseIE): - _VALID_URL = r'https?://vkplay\.live/(?P<username>[^/#?]+)/record/(?P<id>[a-f0-9-]+)' + _VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P<username>[^/#?]+)/record/(?P<id>[\da-f-]+)' _TESTS = [{ 'url': 'https://vkplay.live/zitsmann/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da', 'info_dict': { @@ -776,13 +777,16 @@ class VKPlayIE(VKPlayBaseIE): 'uploader_id': '13159830', 'release_timestamp': 1683461378, 'release_date': '20230507', - 'thumbnail': r're:https://images.vkplay.live/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview\?change_time=\d+', + 'thumbnail': r're:https://[^/]+/public_video_stream/record/f5e6e3b5-dc52-4d14-965d-0680dd2882da/preview', 'duration': 10608, 'view_count': int, 'like_count': int, 'categories': ['Atomic Heart'], }, 'params': {'skip_download': 'm3u8'}, + }, { + 'url': 
'https://live.vkplay.ru/lebwa/record/33a4e4ce-e3ef-49db-bb14-f006cc6fabc9/records', + 'only_matching': True, }] def _real_extract(self, url): @@ -802,7 +806,7 @@ class VKPlayIE(VKPlayBaseIE): class VKPlayLiveIE(VKPlayBaseIE): - _VALID_URL = r'https?://vkplay\.live/(?P<id>[^/#?]+)/?(?:[#?]|$)' + _VALID_URL = rf'{VKPlayBaseIE._BASE_URL_RE}(?P<id>[^/#?]+)/?(?:[#?]|$)' _TESTS = [{ 'url': 'https://vkplay.live/bayda', 'info_dict': { @@ -813,7 +817,7 @@ class VKPlayLiveIE(VKPlayBaseIE): 'uploader_id': '12279401', 'release_timestamp': 1687209962, 'release_date': '20230619', - 'thumbnail': r're:https://images.vkplay.live/public_video_stream/12279401/preview\?change_time=\d+', + 'thumbnail': r're:https://[^/]+/public_video_stream/12279401/preview', 'view_count': int, 'concurrent_view_count': int, 'like_count': int, @@ -822,6 +826,9 @@ class VKPlayLiveIE(VKPlayBaseIE): }, 'skip': 'livestream', 'params': {'skip_download': True}, + }, { + 'url': 'https://live.vkplay.ru/lebwa', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/yt_dlp/extractor/vrt.py b/yt_dlp/extractor/vrt.py index 497233d95..3d26549a4 100644 --- a/yt_dlp/extractor/vrt.py +++ b/yt_dlp/extractor/vrt.py @@ -16,6 +16,7 @@ from ..utils import ( join_nonempty, jwt_encode_hs256, make_archive_id, + merge_dicts, parse_age_limit, parse_iso8601, str_or_none, @@ -425,3 +426,64 @@ class DagelijkseKostIE(VRTBaseIE): ['description', 'twitter:description', 'og:description'], webpage), '_old_archive_ids': [make_archive_id('Canvas', video_id)], } + + +class Radio1BeIE(VRTBaseIE): + _VALID_URL = r'https?://radio1\.be/(?:lees|luister/select)/(?P<id>[\w/-]+)' + _TESTS = [{ + 'url': 'https://radio1.be/luister/select/de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie', + 'info_dict': { + 'id': 'eb6c22e9-544f-44f4-af39-cf8cccd29e22', + 'title': 'Komt N-VA volgend jaar op in Wallonië?', + 'display_id': 'de-ochtend/komt-n-va-volgend-jaar-op-in-wallonie', + 'description': 'md5:b374ea1c9302f38362df9dea1931468e', + 
'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+' + }, + 'playlist_mincount': 1 + }, { + 'url': 'https://radio1.be/lees/europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza?view=web', + 'info_dict': { + 'id': '5d47f102-dbdb-4fa0-832b-26c1870311f2', + 'title': 'Europese Unie wil "onmiddellijke humanitaire pauze" en "duurzaam staakt-het-vuren" in Gaza', + 'description': 'md5:1aad1fae7d39edeffde5d3e67d276b64', + 'thumbnail': r're:https?://cds\.vrt\.radio/[^/#\?&]+', + 'display_id': 'europese-unie-wil-onmiddellijke-humanitaire-pauze-en-duurzaam-staakt-het-vuren-in-gaza' + }, + 'playlist_mincount': 1 + }] + + def _extract_video_entries(self, next_js_data, display_id): + video_data = traverse_obj( + next_js_data, ((None, ('paragraphs', ...)), {lambda x: x if x['mediaReference'] else None})) + for data in video_data: + media_reference = data['mediaReference'] + formats, subtitles = self._extract_formats_and_subtitles( + self._call_api(media_reference), display_id) + + yield { + 'id': media_reference, + 'formats': formats, + 'subtitles': subtitles, + **traverse_obj(data, { + 'title': ('title', {str}), + 'description': ('body', {clean_html}) + }), + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + next_js_data = self._search_nextjs_data(webpage, display_id)['props']['pageProps']['item'] + + return self.playlist_result( + self._extract_video_entries(next_js_data, display_id), **merge_dicts(traverse_obj( + next_js_data, ({ + 'id': ('id', {str}), + 'title': ('title', {str}), + 'description': (('description', 'content'), {clean_html}), + }), get_all=False), { + 'display_id': display_id, + 'title': self._html_search_meta(['name', 'og:title', 'twitter:title'], webpage), + 'description': self._html_search_meta(['description', 'og:description', 'twitter:description'], webpage), + 'thumbnail': self._html_search_meta(['og:image', 'twitter:image'], webpage), + })) diff 
--git a/yt_dlp/extractor/wistia.py b/yt_dlp/extractor/wistia.py index bce5e8326..f2256fdc6 100644 --- a/yt_dlp/extractor/wistia.py +++ b/yt_dlp/extractor/wistia.py @@ -1,6 +1,6 @@ +import base64 import re import urllib.parse -from base64 import b64decode from .common import InfoExtractor from ..networking import HEADRequest @@ -371,7 +371,7 @@ class WistiaChannelIE(WistiaBaseIE): webpage = self._download_webpage(f'https://fast.wistia.net/embed/channel/{channel_id}', channel_id) data = self._parse_json( self._search_regex(r'wchanneljsonp-%s\'\]\s*=[^\"]*\"([A-Za-z0-9=/]*)' % channel_id, webpage, 'jsonp', channel_id), - channel_id, transform_source=lambda x: urllib.parse.unquote_plus(b64decode(x).decode('utf-8'))) + channel_id, transform_source=lambda x: urllib.parse.unquote_plus(base64.b64decode(x).decode('utf-8'))) # XXX: can there be more than one series? series = traverse_obj(data, ('series', 0), default={}) diff --git a/yt_dlp/extractor/wrestleuniverse.py b/yt_dlp/extractor/wrestleuniverse.py index 145246a14..880ee519b 100644 --- a/yt_dlp/extractor/wrestleuniverse.py +++ b/yt_dlp/extractor/wrestleuniverse.py @@ -147,7 +147,7 @@ class WrestleUniverseBaseIE(InfoExtractor): metadata = self._call_api(video_id, msg='metadata', query={'al': lang or 'ja'}, auth=False, fatal=False) if not metadata: webpage = self._download_webpage(url, video_id) - nextjs_data = self._search_nextjs_data(webpage, video_id) + nextjs_data = self._search_nextjs_data(webpage, video_id, fatal=False) metadata = traverse_obj(nextjs_data, ( 'props', 'pageProps', *variadic(props_keys, (str, bytes, dict, set)), {dict})) or {} return metadata diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index 29b600d53..05f716ee1 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2,7 +2,7 @@ import base64 import calendar import collections import copy -import datetime +import datetime as dt import enum import hashlib import itertools @@ -33,6 +33,7 @@ from 
..utils import ( clean_html, datetime_from_str, dict_get, + filesize_from_tbr, filter_dict, float_or_none, format_field, @@ -55,6 +56,7 @@ from ..utils import ( str_to_int, strftime_or_none, traverse_obj, + try_call, try_get, unescapeHTML, unified_strdate, @@ -922,10 +924,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): def _parse_time_text(self, text): if not text: return - dt = self.extract_relative_time(text) + dt_ = self.extract_relative_time(text) timestamp = None - if isinstance(dt, datetime.datetime): - timestamp = calendar.timegm(dt.timetuple()) + if isinstance(dt_, dt.datetime): + timestamp = calendar.timegm(dt_.timetuple()) if timestamp is None: timestamp = ( @@ -3833,16 +3835,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id=video_id, only_once=True) throttled = True - tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1024) + tbr = float_or_none(fmt.get('averageBitrate') or fmt.get('bitrate'), 1000) language_preference = ( 10 if audio_track.get('audioIsDefault') and 10 else -10 if 'descriptive' in (audio_track.get('displayName') or '').lower() and -10 else -1) + format_duration = traverse_obj(fmt, ('approxDurationMs', {lambda x: float_or_none(x, 1000)})) # Some formats may have much smaller duration than others (possibly damaged during encoding) # E.g. 2-nOtRESiUc Ref: https://github.com/yt-dlp/yt-dlp/issues/2823 # Make sure to avoid false positives with small duration differences. # E.g. __2ABJjxzNo, ySuUZEjARPY - is_damaged = try_get(fmt, lambda x: float(x['approxDurationMs']) / duration < 500) + is_damaged = try_call(lambda: format_duration < duration // 2) if is_damaged: self.report_warning( f'{video_id}: Some formats are possibly damaged. 
They will be deprioritized', only_once=True) @@ -3872,6 +3875,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'quality': q(quality) - bool(fmt.get('isDrc')) / 2, 'has_drm': bool(fmt.get('drmFamilies')), 'tbr': tbr, + 'filesize_approx': filesize_from_tbr(tbr, format_duration), 'url': fmt_url, 'width': int_or_none(fmt.get('width')), 'language': join_nonempty(audio_track.get('id', '').split('.')[0], @@ -4563,7 +4567,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if upload_date and live_status not in ('is_live', 'post_live', 'is_upcoming'): # Newly uploaded videos' HLS formats are potentially problematic and need to be checked - upload_datetime = datetime_from_str(upload_date).replace(tzinfo=datetime.timezone.utc) + upload_datetime = datetime_from_str(upload_date).replace(tzinfo=dt.timezone.utc) if upload_datetime >= datetime_from_str('today-2days'): for fmt in info['formats']: if fmt.get('protocol') == 'm3u8_native': diff --git a/yt_dlp/extractor/zattoo.py b/yt_dlp/extractor/zattoo.py index 6bd9ea064..5cc9c5f7a 100644 --- a/yt_dlp/extractor/zattoo.py +++ b/yt_dlp/extractor/zattoo.py @@ -1,5 +1,5 @@ import re -from uuid import uuid4 +import uuid from .common import InfoExtractor from ..compat import compat_str @@ -53,7 +53,7 @@ class ZattooPlatformBaseIE(InfoExtractor): self._request_webpage( '%s/zapi/v3/session/hello' % self._host_url(), None, 'Opening session', data=urlencode_postdata({ - 'uuid': compat_str(uuid4()), + 'uuid': compat_str(uuid.uuid4()), 'lang': 'en', 'app_version': '1.8.2', 'format': 'json', diff --git a/yt_dlp/networking/_helper.py b/yt_dlp/networking/_helper.py index d79dd7953..8e678b26a 100644 --- a/yt_dlp/networking/_helper.py +++ b/yt_dlp/networking/_helper.py @@ -2,6 +2,7 @@ from __future__ import annotations import contextlib import functools +import os import socket import ssl import sys @@ -121,6 +122,9 @@ def make_ssl_context( context = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT) context.check_hostname = verify context.verify_mode = 
ssl.CERT_REQUIRED if verify else ssl.CERT_NONE + # OpenSSL 1.1.1+ Python 3.8+ keylog file + if hasattr(context, 'keylog_filename'): + context.keylog_filename = os.environ.get('SSLKEYLOGFILE') or None # Some servers may reject requests if ALPN extension is not sent. See: # https://github.com/python/cpython/issues/85140 diff --git a/yt_dlp/networking/common.py b/yt_dlp/networking/common.py index b8c6a62c0..4c66ba66a 100644 --- a/yt_dlp/networking/common.py +++ b/yt_dlp/networking/common.py @@ -463,9 +463,10 @@ class Request: else: raise TypeError('headers must be a mapping') - def update(self, url=None, data=None, headers=None, query=None): + def update(self, url=None, data=None, headers=None, query=None, extensions=None): self.data = data if data is not None else self.data self.headers.update(headers or {}) + self.extensions.update(extensions or {}) self.url = update_url_query(url or self.url, query or {}) def copy(self): diff --git a/yt_dlp/options.py b/yt_dlp/options.py index dac56dc1f..faa1ee563 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -691,6 +691,10 @@ def create_parser(): '--break-on-existing', action='store_true', dest='break_on_existing', default=False, help='Stop the download process when encountering a file that is in the archive') + selection.add_option( + '--no-break-on-existing', + action='store_false', dest='break_on_existing', + help='Do not stop the download process when encountering a file that is in the archive (default)') selection.add_option( '--break-on-reject', action='store_true', dest='break_on_reject', default=False, @@ -1254,6 +1258,10 @@ def create_parser(): 'the progress attributes are accessible under "progress" key. E.g. 
' # TODO: Document the fields inside "progress" '--console-title --progress-template "download-title:%(info.id)s-%(progress.eta)s"')) + verbosity.add_option( + '--progress-delta', + metavar='SECONDS', action='store', dest='progress_delta', type=float, default=0, + help='Time between progress output (default: 0)') verbosity.add_option( '-v', '--verbose', action='store_true', dest='verbose', default=False, diff --git a/yt_dlp/update.py b/yt_dlp/update.py index db50cfa6b..f47cbc5b2 100644 --- a/yt_dlp/update.py +++ b/yt_dlp/update.py @@ -114,7 +114,7 @@ _NON_UPDATEABLE_REASONS = { **{variant: f'Auto-update is not supported for unpackaged {name} executable; Re-download the latest release' for variant, name in {'win32_dir': 'Windows', 'darwin_dir': 'MacOS', 'linux_dir': 'Linux'}.items()}, 'source': 'You cannot update when running from source code; Use git to pull the latest changes', - 'unknown': 'You installed yt-dlp with a package manager or setup.py; Use that to update', + 'unknown': 'You installed yt-dlp from a manual build or with a package manager; Use that to update', 'other': 'You are using an unofficial build of yt-dlp; Build the executable again', } diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index 9efeb6a1c..e3e80f3d3 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -5,7 +5,7 @@ import codecs import collections import collections.abc import contextlib -import datetime +import datetime as dt import email.header import email.utils import errno @@ -50,7 +50,6 @@ from ..compat import ( compat_expanduser, compat_HTMLParseError, compat_os_name, - compat_shlex_quote, ) from ..dependencies import xattr @@ -836,9 +835,11 @@ class Popen(subprocess.Popen): if shell and compat_os_name == 'nt' and kwargs.get('executable') is None: if not isinstance(args, str): - args = ' '.join(compat_shlex_quote(a) for a in args) + args = shell_quote(args, shell=True) shell = False - args = f'{self.__comspec()} /Q /S /D /V:OFF /C "{args}"' + # Set 
variable for `cmd.exe` newline escaping (see `utils.shell_quote`) + env['='] = '"^\n\n"' + args = f'{self.__comspec()} /Q /S /D /V:OFF /E:ON /C "{args}"' super().__init__(args, *remaining, env=env, shell=shell, **kwargs, startupinfo=self._startupinfo) @@ -1150,14 +1151,14 @@ def extract_timezone(date_str): timezone = TIMEZONE_NAMES.get(m and m.group('tz').strip()) if timezone is not None: date_str = date_str[:-len(m.group('tz'))] - timezone = datetime.timedelta(hours=timezone or 0) + timezone = dt.timedelta(hours=timezone or 0) else: date_str = date_str[:-len(m.group('tz'))] if not m.group('sign'): - timezone = datetime.timedelta() + timezone = dt.timedelta() else: sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( + timezone = dt.timedelta( hours=sign * int(m.group('hours')), minutes=sign * int(m.group('minutes'))) return timezone, date_str @@ -1176,8 +1177,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): with contextlib.suppress(ValueError): date_format = f'%Y-%m-%d{delimiter}%H:%M:%S' - dt = datetime.datetime.strptime(date_str, date_format) - timezone - return calendar.timegm(dt.timetuple()) + dt_ = dt.datetime.strptime(date_str, date_format) - timezone + return calendar.timegm(dt_.timetuple()) def date_formats(day_first=True): @@ -1198,12 +1199,12 @@ def unified_strdate(date_str, day_first=True): for expression in date_formats(day_first): with contextlib.suppress(ValueError): - upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') + upload_date = dt.datetime.strptime(date_str, expression).strftime('%Y%m%d') if upload_date is None: timetuple = email.utils.parsedate_tz(date_str) if timetuple: with contextlib.suppress(ValueError): - upload_date = datetime.datetime(*timetuple[:6]).strftime('%Y%m%d') + upload_date = dt.datetime(*timetuple[:6]).strftime('%Y%m%d') if upload_date is not None: return str(upload_date) @@ -1233,8 +1234,8 @@ def unified_timestamp(date_str, day_first=True): for expression 
in date_formats(day_first): with contextlib.suppress(ValueError): - dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) - return calendar.timegm(dt.timetuple()) + dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta) + return calendar.timegm(dt_.timetuple()) timetuple = email.utils.parsedate_tz(date_str) if timetuple: @@ -1272,11 +1273,11 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if precision == 'auto': auto_precision = True precision = 'microsecond' - today = datetime_round(datetime.datetime.now(datetime.timezone.utc), precision) + today = datetime_round(dt.datetime.now(dt.timezone.utc), precision) if date_str in ('now', 'today'): return today if date_str == 'yesterday': - return today - datetime.timedelta(days=1) + return today - dt.timedelta(days=1) match = re.match( r'(?P<start>.+)(?P<sign>[+-])(?P<time>\d+)(?P<unit>microsecond|second|minute|hour|day|week|month|year)s?', date_str) @@ -1291,13 +1292,13 @@ def datetime_from_str(date_str, precision='auto', format='%Y%m%d'): if unit == 'week': unit = 'day' time *= 7 - delta = datetime.timedelta(**{unit + 's': time}) + delta = dt.timedelta(**{unit + 's': time}) new_date = start_time + delta if auto_precision: return datetime_round(new_date, unit) return new_date - return datetime_round(datetime.datetime.strptime(date_str, format), precision) + return datetime_round(dt.datetime.strptime(date_str, format), precision) def date_from_str(date_str, format='%Y%m%d', strict=False): @@ -1312,21 +1313,21 @@ def date_from_str(date_str, format='%Y%m%d', strict=False): return datetime_from_str(date_str, precision='microsecond', format=format).date() -def datetime_add_months(dt, months): +def datetime_add_months(dt_, months): """Increment/Decrement a datetime object by months.""" - month = dt.month + months - 1 - year = dt.year + month // 12 + month = dt_.month + months - 1 + year = dt_.year + month // 12 month = 
month % 12 + 1 - day = min(dt.day, calendar.monthrange(year, month)[1]) - return dt.replace(year, month, day) + day = min(dt_.day, calendar.monthrange(year, month)[1]) + return dt_.replace(year, month, day) -def datetime_round(dt, precision='day'): +def datetime_round(dt_, precision='day'): """ Round a datetime object's time to a specific precision """ if precision == 'microsecond': - return dt + return dt_ unit_seconds = { 'day': 86400, @@ -1335,8 +1336,8 @@ def datetime_round(dt, precision='day'): 'second': 1, } roundto = lambda x, n: ((x + n / 2) // n) * n - timestamp = roundto(calendar.timegm(dt.timetuple()), unit_seconds[precision]) - return datetime.datetime.fromtimestamp(timestamp, datetime.timezone.utc) + timestamp = roundto(calendar.timegm(dt_.timetuple()), unit_seconds[precision]) + return dt.datetime.fromtimestamp(timestamp, dt.timezone.utc) def hyphenate_date(date_str): @@ -1357,11 +1358,11 @@ class DateRange: if start is not None: self.start = date_from_str(start, strict=True) else: - self.start = datetime.datetime.min.date() + self.start = dt.datetime.min.date() if end is not None: self.end = date_from_str(end, strict=True) else: - self.end = datetime.datetime.max.date() + self.end = dt.datetime.max.date() if self.start > self.end: raise ValueError('Date range: "%s" , the start date must be before the end date' % self) @@ -1372,7 +1373,7 @@ class DateRange: def __contains__(self, date): """Check if the date is in the range""" - if not isinstance(date, datetime.date): + if not isinstance(date, dt.date): date = date_from_str(date) return self.start <= date <= self.end @@ -1637,15 +1638,38 @@ def get_filesystem_encoding(): return encoding if encoding is not None else 'utf-8' -def shell_quote(args): - quoted_args = [] - encoding = get_filesystem_encoding() - for a in args: - if isinstance(a, bytes): - # We may get a filename encoded with 'encodeFilename' - a = a.decode(encoding) - quoted_args.append(compat_shlex_quote(a)) - return ' '.join(quoted_args) 
+_WINDOWS_QUOTE_TRANS = str.maketrans({'"': '\\"', '\\': '\\\\'}) +_CMD_QUOTE_TRANS = str.maketrans({ + # Keep quotes balanced by replacing them with `""` instead of `\\"` + '"': '""', + # Requires a variable `=` containing `"^\n\n"` (set in `utils.Popen`) + # `=` should be unique since variables containing `=` cannot be set using cmd + '\n': '%=%', + # While we are only required to escape backslashes immediately before quotes, + # we instead escape all of 'em anyways to be consistent + '\\': '\\\\', + # Use zero length variable replacement so `%` doesn't get expanded + # `cd` is always set as long as extensions are enabled (`/E:ON` in `utils.Popen`) + '%': '%%cd:~,%', +}) + + +def shell_quote(args, *, shell=False): + args = list(variadic(args)) + if any(isinstance(item, bytes) for item in args): + deprecation_warning('Passing bytes to utils.shell_quote is deprecated') + encoding = get_filesystem_encoding() + for index, item in enumerate(args): + if isinstance(item, bytes): + args[index] = item.decode(encoding) + + if compat_os_name != 'nt': + return shlex.join(args) + + trans = _CMD_QUOTE_TRANS if shell else _WINDOWS_QUOTE_TRANS + return ' '.join( + s if re.fullmatch(r'[\w#$*\-+./:?@\\]+', s, re.ASCII) else s.translate(trans).join('""') + for s in args) def smuggle_url(url, data): @@ -1996,12 +2020,12 @@ def strftime_or_none(timestamp, date_format='%Y%m%d', default=None): if isinstance(timestamp, (int, float)): # unix timestamp # Using naive datetime here can break timestamp() in Windows # Ref: https://github.com/yt-dlp/yt-dlp/issues/5185, https://github.com/python/cpython/issues/94414 - # Also, datetime.datetime.fromtimestamp breaks for negative timestamps + # Also, dt.datetime.fromtimestamp breaks for negative timestamps # Ref: https://github.com/yt-dlp/yt-dlp/issues/6706#issuecomment-1496842642 - datetime_object = (datetime.datetime.fromtimestamp(0, datetime.timezone.utc) - + datetime.timedelta(seconds=timestamp)) + datetime_object = 
(dt.datetime.fromtimestamp(0, dt.timezone.utc) + + dt.timedelta(seconds=timestamp)) elif isinstance(timestamp, str): # assume YYYYMMDD - datetime_object = datetime.datetime.strptime(timestamp, '%Y%m%d') + datetime_object = dt.datetime.strptime(timestamp, '%Y%m%d') date_format = re.sub( # Support %s on windows r'(?<!%)(%%)*%s', rf'\g<1>{int(datetime_object.timestamp())}', date_format) return datetime_object.strftime(date_format) @@ -2849,7 +2873,7 @@ def ytdl_is_updateable(): def args_to_str(args): # Get a short string representation for a subprocess command - return ' '.join(compat_shlex_quote(a) for a in args) + return shell_quote(args) def error_to_str(err): @@ -4490,10 +4514,10 @@ def write_xattr(path, key, value): def random_birthday(year_field, month_field, day_field): - start_date = datetime.date(1950, 1, 1) - end_date = datetime.date(1995, 12, 31) + start_date = dt.date(1950, 1, 1) + end_date = dt.date(1995, 12, 31) offset = random.randint(0, (end_date - start_date).days) - random_date = start_date + datetime.timedelta(offset) + random_date = start_date + dt.timedelta(offset) return { year_field: str(random_date.year), month_field: str(random_date.month), @@ -4672,7 +4696,7 @@ def time_seconds(**kwargs): """ Returns TZ-aware time in seconds since the epoch (1970-01-01T00:00:00Z) """ - return time.time() + datetime.timedelta(**kwargs).total_seconds() + return time.time() + dt.timedelta(**kwargs).total_seconds() # create a JSON Web Signature (jws) with HS256 algorithm @@ -5415,6 +5439,17 @@ class FormatSorter: return tuple(self._calculate_field_preference(format, field) for field in self._order) +def filesize_from_tbr(tbr, duration): + """ + @param tbr: Total bitrate in kbps (1000 bits/sec) + @param duration: Duration in seconds + @returns Filesize in bytes + """ + if tbr is None or duration is None: + return None + return int(duration * tbr * (1000 / 8)) + + # XXX: Temporary class _YDLLogger: def __init__(self, ydl=None): diff --git 
a/yt_dlp/utils/traversal.py b/yt_dlp/utils/traversal.py index 8938f4c78..96eb2eddf 100644 --- a/yt_dlp/utils/traversal.py +++ b/yt_dlp/utils/traversal.py @@ -1,5 +1,6 @@ import collections.abc import contextlib +import http.cookies import inspect import itertools import re @@ -28,7 +29,8 @@ def traverse_obj( Each of the provided `paths` is tested and the first producing a valid result will be returned. The next path will also be tested if the path branched but no results could be found. - Supported values for traversal are `Mapping`, `Iterable` and `re.Match`. + Supported values for traversal are `Mapping`, `Iterable`, `re.Match`, + `xml.etree.ElementTree` (xpath) and `http.cookies.Morsel`. Unhelpful values (`{}`, `None`) are treated as the absence of a value and discarded. The paths will be wrapped in `variadic`, so that `'key'` is conveniently the same as `('key', )`. @@ -36,8 +38,8 @@ def traverse_obj( The keys in the path can be one of: - `None`: Return the current object. - `set`: Requires the only item in the set to be a type or function, - like `{type}`/`{func}`. If a `type`, returns only values - of this type. If a function, returns `func(obj)`. + like `{type}`/`{type, type, ...}/`{func}`. If a `type`, return only + values of this type. If a function, returns `func(obj)`. - `str`/`int`: Return `obj[key]`. For `re.Match`, return `obj.group(key)`. - `slice`: Branch out and return all values in `obj[key]`. - `Ellipsis`: Branch out and return a list of all values. @@ -48,8 +50,10 @@ def traverse_obj( For `Iterable`s, `key` is the index of the value. For `re.Match`es, `key` is the group number (0 = full match) as well as additionally any group names, if given. - - `dict` Transform the current object and return a matching dict. + - `dict`: Transform the current object and return a matching dict. Read as: `{key: traverse_obj(obj, path) for key, path in dct.items()}`. + - `any`-builtin: Take the first matching object and return it, resetting branching. 
+ - `all`-builtin: Take all matching objects and return them as a list, resetting branching. `tuple`, `list`, and `dict` all support nested paths and branches. @@ -102,10 +106,10 @@ def traverse_obj( result = obj elif isinstance(key, set): - assert len(key) == 1, 'Set should only be used to wrap a single item' item = next(iter(key)) - if isinstance(item, type): - if isinstance(obj, item): + if len(key) > 1 or isinstance(item, type): + assert all(isinstance(item, type) for item in key) + if isinstance(obj, tuple(key)): result = obj else: result = try_call(item, args=(obj,)) @@ -117,6 +121,8 @@ def traverse_obj( elif key is ...: branching = True + if isinstance(obj, http.cookies.Morsel): + obj = dict(obj, key=obj.key, value=obj.value) if isinstance(obj, collections.abc.Mapping): result = obj.values() elif is_iterable_like(obj) or isinstance(obj, xml.etree.ElementTree.Element): @@ -131,6 +137,8 @@ def traverse_obj( elif callable(key): branching = True + if isinstance(obj, http.cookies.Morsel): + obj = dict(obj, key=obj.key, value=obj.value) if isinstance(obj, collections.abc.Mapping): iter_obj = obj.items() elif is_iterable_like(obj) or isinstance(obj, xml.etree.ElementTree.Element): @@ -157,6 +165,8 @@ def traverse_obj( } or None elif isinstance(obj, collections.abc.Mapping): + if isinstance(obj, http.cookies.Morsel): + obj = dict(obj, key=obj.key, value=obj.value) result = (try_call(obj.get, args=(key,)) if casesense or try_call(obj.__contains__, args=(key,)) else next((v for k, v in obj.items() if casefold(k) == key), None)) @@ -179,7 +189,7 @@ def traverse_obj( elif isinstance(obj, xml.etree.ElementTree.Element) and isinstance(key, str): xpath, _, special = key.rpartition('/') - if not special.startswith('@') and special != 'text()': + if not special.startswith('@') and not special.endswith('()'): xpath = key special = None @@ -198,7 +208,7 @@ def traverse_obj( return try_call(element.attrib.get, args=(special[1:],)) if special == 'text()': return element.text - 
assert False, f'apply_specials is missing case for {special!r}' + raise SyntaxError(f'apply_specials is missing case for {special!r}') if xpath: result = list(map(apply_specials, obj.iterfind(xpath))) @@ -228,6 +238,15 @@ def traverse_obj( if not casesense and isinstance(key, str): key = key.casefold() + if key in (any, all): + has_branched = False + filtered_objs = (obj for obj in objs if obj not in (None, {})) + if key is any: + objs = (next(filtered_objs, None),) + else: + objs = (list(filtered_objs),) + continue + if __debug__ and callable(key): # Verify function signature inspect.signature(key).bind(None, None) diff --git a/yt_dlp/version.py b/yt_dlp/version.py index 68c3f00e8..22c2c048d 100644 --- a/yt_dlp/version.py +++ b/yt_dlp/version.py @@ -1,8 +1,8 @@ # Autogenerated by devscripts/update-version.py -__version__ = '2024.03.10' +__version__ = '2024.04.09' -RELEASE_GIT_HEAD = '615a84447e8322720be77a0e64298d7f42848693' +RELEASE_GIT_HEAD = 'ff07792676f404ffff6ee61b5638c9dc1a33a37a' VARIANT = None @@ -12,4 +12,4 @@ CHANNEL = 'stable' ORIGIN = 'yt-dlp/yt-dlp' -_pkg_version = '2024.03.10' +_pkg_version = '2024.04.09'