import os import re import urllib.parse import xml.etree.ElementTree from .common import InfoExtractor # isort: split from .brightcove import BrightcoveLegacyIE, BrightcoveNewIE from .commonprotocols import RtmpIE from .youtube import YoutubeIE from ..compat import compat_etree_fromstring from ..utils import ( KNOWN_EXTENSIONS, ExtractorError, UnsupportedError, determine_ext, dict_get, format_field, int_or_none, is_html, js_to_json, merge_dicts, mimetype2ext, orderedSet, parse_duration, parse_resolution, smuggle_url, str_or_none, traverse_obj, try_call, unescapeHTML, unified_timestamp, unsmuggle_url, url_or_none, xpath_attr, xpath_text, xpath_with_ns, ) class GenericIE(InfoExtractor): IE_DESC = 'Generic downloader that works on some sites' _VALID_URL = r'.*' IE_NAME = 'generic' _NETRC_MACHINE = False # Suppress username warning _TESTS = [ # Direct link to a video { 'url': 'http://media.w3.org/2010/05/sintel/trailer.mp4', 'md5': '67d406c2bcb6af27fa886f31aa934bbe', 'info_dict': { 'id': 'trailer', 'ext': 'mp4', 'title': 'trailer', 'upload_date': '20100513', } }, # Direct link to media delivered compressed (until Accept-Encoding is *) { 'url': 'http://calimero.tk/muzik/FictionJunction-Parallel_Hearts.flac', 'md5': '128c42e68b13950268b648275386fc74', 'info_dict': { 'id': 'FictionJunction-Parallel_Hearts', 'ext': 'flac', 'title': 'FictionJunction-Parallel_Hearts', 'upload_date': '20140522', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' ], 'skip': 'URL invalid', }, # Direct download with broken HEAD { 'url': 'http://ai-radio.org:8000/radio.opus', 'info_dict': { 'id': 'radio', 'ext': 'opus', 'title': 'radio', }, 'params': { 'skip_download': True, # infinite live stream }, 'expected_warnings': [ r'501.*Not Implemented', r'400.*Bad Request', ], }, # Direct link with incorrect MIME type { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'md5': '4ccbebe5f36706d85221f204d7eb5913', 'info_dict': { 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', 'id': '5_Lennart_Poettering_-_Systemd', 'ext': 'webm', 'title': '5_Lennart_Poettering_-_Systemd', 'upload_date': '20141120', }, 'expected_warnings': [ 'URL could be a direct video link, returning it as such.' ] }, # RSS feed { 'url': 'http://phihag.de/2014/youtube-dl/rss2.xml', 'info_dict': { 'id': 'https://phihag.de/2014/youtube-dl/rss2.xml', 'title': 'Zero Punctuation', 'description': 're:.*groundbreaking video review series.*' }, 'playlist_mincount': 11, }, # RSS feed with enclosure { 'url': 'http://podcastfeeds.nbcnews.com/audio/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'info_dict': { 'id': 'http://podcastfeeds.nbcnews.com/nbcnews/video/podcast/MSNBC-MADDOW-NETCAST-M4V.xml', 'title': 'MSNBC Rachel Maddow (video)', 'description': 're:.*her unique approach to storytelling.*', }, 'playlist': [{ 'info_dict': { 'ext': 'mov', 'id': 'pdv_maddow_netcast_mov-12-03-2020-223726', 'title': 'MSNBC Rachel Maddow (video) - 12-03-2020-223726', 'description': 're:.*her unique approach to storytelling.*', 'upload_date': '20201204', }, }], }, # RSS feed with item with description and thumbnails { 'url': 'https://anchor.fm/s/dd00e14/podcast/rss', 'info_dict': { 'id': 'https://anchor.fm/s/dd00e14/podcast/rss', 'title': 're:.*100% Hydrogen.*', 'description': 're:.*In this episode.*', }, 'playlist': [{ 'info_dict': { 'ext': 'm4a', 'id': 'c1c879525ce2cb640b344507e682c36d', 'title': 're:Hydrogen!', 'description': 're:.*In this episode we are going.*', 'timestamp': 1567977776, 'upload_date': '20190908', 'duration': 459, 'thumbnail': r're:^https?://.*\.jpg$', 'episode_number': 1, 'season_number': 1, 'age_limit': 0, 'season': 'Season 1', 'direct': True, 'episode': 'Episode 1', }, }], 'params': { 'skip_download': True, }, }, # RSS feed with enclosures and unsupported link URLs { 'url': 'http://www.hellointernet.fm/podcast?format=rss', 'info_dict': { 'id': 'http://www.hellointernet.fm/podcast?format=rss', 'description': 'CGP Grey and Brady Haran talk about YouTube, life, work, whatever.', 'title': 'Hello Internet', }, 'playlist_mincount': 100, }, # RSS feed with guid { 'url': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'info_dict': { 'id': 'https://www.omnycontent.com/d/playlist/a7b4f8fe-59d9-4afc-a79a-a90101378abf/bf2c1d80-3656-4449-9d00-a903004e8f84/efbff746-e7c1-463a-9d80-a903004e8f8f/podcast.rss', 'description': 'md5:be809a44b63b0c56fb485caf68685520', 'title': 'The Little Red Podcast', }, 'playlist_mincount': 76, }, # SMIL from http://videolectures.net/promogram_igor_mekjavic_eng { 'url': 'http://videolectures.net/promogram_igor_mekjavic_eng/video/1/smil.xml', 'info_dict': { 'id': 'smil', 'ext': 'mp4', 'title': 'Automatics, robotics and biocybernetics', 'description': 'md5:815fc1deb6b3a2bff99de2d5325be482', 'upload_date': '20130627', 'formats': 'mincount:16', 'subtitles': 'mincount:1', }, 'params': { 'force_generic_extractor': True, 'skip_download': True, }, }, # SMIL from http://www1.wdr.de/mediathek/video/livestream/index.html { 'url': 'http://metafilegenerator.de/WDR/WDR_FS/hds/hds.smil', 'info_dict': { 'id': 'hds', 'ext': 'flv', 'title': 'hds', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from https://www.restudy.dk/video/play/id/1637 { 'url': 'https://www.restudy.dk/awsmedia/SmilDirectory/video_1637.xml', 'info_dict': { 'id': 'video_1637', 'ext': 'flv', 'title': 'video_1637', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # SMIL from http://adventure.howstuffworks.com/5266-cool-jobs-iditarod-musher-video.htm { 'url': 'http://services.media.howstuffworks.com/videos/450221/smil-service.smil', 'info_dict': { 'id': 'smil-service', 'ext': 'flv', 'title': 'smil-service', 'formats': 'mincount:1', }, 'params': { 'skip_download': True, }, }, # SMIL from http://new.livestream.com/CoheedandCambria/WebsterHall/videos/4719370 { 'url': 'http://api.new.livestream.com/accounts/1570303/events/1585861/videos/4719370.smil', 'info_dict': { 'id': '4719370', 'ext': 'mp4', 'title': '571de1fd-47bc-48db-abf9-238872a58d1f', 'formats': 'mincount:3', }, 'params': { 'skip_download': True, }, }, # XSPF playlist from http://www.telegraaf.nl/tv/nieuws/binnenland/24353229/__Tikibad_ontruimd_wegens_brand__.html { 'url': 'http://www.telegraaf.nl/xml/playlist/2015/8/7/mZlp2ctYIUEB.xspf', 'info_dict': { 'id': 'mZlp2ctYIUEB', 'ext': 'mp4', 'title': 'Tikibad ontruimd wegens brand', 'description': 'md5:05ca046ff47b931f9b04855015e163a4', 'thumbnail': r're:^https?://.*\.jpg$', 'duration': 33, }, 'params': { 'skip_download': True, }, }, # MPD from http://dash-mse-test.appspot.com/media.html { 'url': 'http://yt-dash-mse-test.commondatastorage.googleapis.com/media/car-20120827-manifest.mpd', 'md5': '4b57baab2e30d6eb3a6a09f0ba57ef53', 'info_dict': { 'id': 'car-20120827-manifest', 'ext': 'mp4', 'title': 'car-20120827-manifest', 'formats': 'mincount:9', 'upload_date': '20130904', }, }, # m3u8 served with Content-Type: audio/x-mpegURL; charset=utf-8 { 'url': 'http://once.unicornmedia.com/now/master/playlist/bb0b18ba-64f5-4b1b-a29f-0ac252f06b68/77a785f3-5188-4806-b788-0893a61634ed/93677179-2d99-4ef4-9e17-fe70d49abfbf/content.m3u8', 'info_dict': { 'id': 'content', 'ext': 'mp4', 'title': 'content', 'formats': 'mincount:8', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': 'video gone', }, # m3u8 served with Content-Type: text/plain { 'url': 'http://www.nacentapps.com/m3u8/index.m3u8', 'info_dict': { 'id': 'index', 'ext': 'mp4', 'title': 'index', 'upload_date': '20140720', 'formats': 'mincount:11', }, 'params': { # m3u8 downloads 'skip_download': True, }, 'skip': 'video gone', }, # google redirect { 'url': 'http://www.google.com/url?sa=t&rct=j&q=&esrc=s&source=web&cd=1&cad=rja&ved=0CCUQtwIwAA&url=http%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DcmQHVoWB5FY&ei=F-sNU-LLCaXk4QT52ICQBQ&usg=AFQjCNEw4hL29zgOohLXvpJ-Bdh2bils1Q&bvm=bv.61965928,d.bGE', 'info_dict': { 'id': 'cmQHVoWB5FY', 'ext': 'mp4', 'upload_date': '20130224', 'uploader_id': 'TheVerge', 'description': r're:^Chris Ziegler takes a look at the\.*', 'uploader': 'The Verge', 'title': 'First Firefox OS phones side-by-side', }, 'params': { 'skip_download': False, } }, { # redirect in Refresh HTTP header 'url': 'https://www.facebook.com/l.php?u=https%3A%2F%2Fwww.youtube.com%2Fwatch%3Fv%3DpO8h3EaFRdo&h=TAQHsoToz&enc=AZN16h-b6o4Zq9pZkCCdOLNKMN96BbGMNtcFwHSaazus4JHT_MFYkAA-WARTX2kvsCIdlAIyHZjl6d33ILIJU7Jzwk_K3mcenAXoAzBNoZDI_Q7EXGDJnIhrGkLXo_LJ_pAa2Jzbx17UHMd3jAs--6j2zaeto5w9RTn8T_1kKg3fdC5WPX9Dbb18vzH7YFX0eSJmoa6SP114rvlkw6pkS1-T&s=1', 'info_dict': { 'id': 'pO8h3EaFRdo', 'ext': 'mp4', 'title': 'Tripeo Boiler Room x Dekmantel Festival DJ Set', 'description': 'md5:6294cc1af09c4049e0652b51a2df10d5', 'upload_date': '20150917', 'uploader_id': 'brtvofficial', 'uploader': 'Boiler Room', }, 'params': { 'skip_download': False, }, }, { 'url': 'http://www.hodiho.fr/2013/02/regis-plante-sa-jeep.html', 'md5': '85b90ccc9d73b4acd9138d3af4c27f89', 'info_dict': { 'id': '13601338388002', 'ext': 'mp4', 'uploader': 'www.hodiho.fr', 'title': 'R\u00e9gis plante sa Jeep', } }, # bandcamp page with custom domain { 'add_ie': ['Bandcamp'], 'url': 'http://bronyrock.com/track/the-pony-mash', 'info_dict': { 'id': '3235767654', 'ext': 'mp3', 'title': 'The Pony Mash', 'uploader': 'M_Pallante', }, 'skip': 'There is a limit of 200 free downloads / month for the test song', }, { # embedded brightcove video # it also tests brightcove videos that need to set the 'Referer' # in the http requests 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.bfmtv.com/video/bfmbusiness/cours-bourse/cours-bourse-l-analyse-technique-154522/', 'info_dict': { 'id': '2765128793001', 'ext': 'mp4', 'title': 'Le cours de bourse : l’analyse technique', 'description': 'md5:7e9ad046e968cb2d1114004aba466fd9', 'uploader': 'BFM BUSINESS', }, 'params': { 'skip_download': True, }, }, { # embedded with itemprop embedURL and video id spelled as `idVideo` 'add_id': ['BrightcoveLegacy'], 'url': 'http://bfmbusiness.bfmtv.com/mediaplayer/chroniques/olivier-delamarche/', 'info_dict': { 'id': '5255628253001', 'ext': 'mp4', 'title': 'md5:37c519b1128915607601e75a87995fc0', 'description': 'md5:37f7f888b434bb8f8cc8dbd4f7a4cf26', 'uploader': 'BFM BUSINESS', 'uploader_id': '876450612001', 'timestamp': 1482255315, 'upload_date': '20161220', }, 'params': { 'skip_download': True, }, }, { # https://github.com/ytdl-org/youtube-dl/issues/2253 'url': 'http://bcove.me/i6nfkrc3', 'md5': '0ba9446db037002366bab3b3eb30c88c', 'info_dict': { 'id': '3101154703001', 'ext': 'mp4', 'title': 'Still no power', 'uploader': 'thestar.com', 'description': 'Mississauga resident David Farmer is still out of power as a result of the ice storm a month ago. To keep the house warm, Farmer cuts wood from his property for a wood burning stove downstairs.', }, 'add_ie': ['BrightcoveLegacy'], 'skip': 'video gone', }, { 'url': 'http://www.championat.com/video/football/v/87/87499.html', 'md5': 'fb973ecf6e4a78a67453647444222983', 'info_dict': { 'id': '3414141473001', 'ext': 'mp4', 'title': 'Видео. Удаление Дзагоева (ЦСКА)', 'description': 'Онлайн-трансляция матча ЦСКА - "Волга"', 'uploader': 'Championat', }, }, { # https://github.com/ytdl-org/youtube-dl/issues/3541 'add_ie': ['BrightcoveLegacy'], 'url': 'http://www.kijk.nl/sbs6/leermijvrouwenkennen/videos/jqMiXKAYan2S/aflevering-1', 'info_dict': { 'id': '3866516442001', 'ext': 'mp4', 'title': 'Leer mij vrouwen kennen: Aflevering 1', 'description': 'Leer mij vrouwen kennen: Aflevering 1', 'uploader': 'SBS Broadcasting', }, 'skip': 'Restricted to Netherlands', 'params': { 'skip_download': True, # m3u8 download }, }, { # Brightcove video in