diff --git a/test/test_utils.py b/test/test_utils.py index 824864577..b60b2b2a1 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -411,10 +411,15 @@ class TestUtil(unittest.TestCase): self.assertEqual(unified_timestamp('Sep 11, 2013 | 5:49 AM'), 1378878540) self.assertEqual(unified_timestamp('December 15, 2017 at 7:49 am'), 1513324140) self.assertEqual(unified_timestamp('2018-03-14T08:32:43.1493874+00:00'), 1521016363) + self.assertEqual(unified_timestamp('2022-10-13T02:37:47.831Z'), 1665628667) self.assertEqual(unified_timestamp('December 31 1969 20:00:01 EDT'), 1) self.assertEqual(unified_timestamp('Wednesday 31 December 1969 18:01:26 MDT'), 86) self.assertEqual(unified_timestamp('12/31/1969 20:01:18 EDT', False), 78) + self.assertEqual(unified_timestamp('2023-03-09T18:01:33.646Z', with_milliseconds=True), 1678384893.646) + # ISO8601 spec says that if no timezone is specified, we should use local timezone; + # but yt-dlp uses UTC to keep things consistent + self.assertEqual(unified_timestamp('2023-03-11T06:48:34.008'), 1678517314) def test_determine_ext(self): self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4') diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 9f730d038..b21bdb81b 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -27,7 +27,12 @@ from .cache import Cache from .compat import functools, urllib # isort: split from .compat import compat_os_name, urllib_req_to_req from .cookies import LenientSimpleCookie, load_cookies -from .downloader import FFmpegFD, get_suitable_downloader, shorten_protocol_name +from .downloader import ( + DashSegmentsFD, + FFmpegFD, + get_suitable_downloader, + shorten_protocol_name, +) from .downloader.rtmp import rtmpdump_version from .extractor import gen_extractor_classes, get_info_extractor from .extractor.common import UnsupportedURLIE @@ -3342,7 +3347,7 @@ class YoutubeDL: fd, success = None, True if info_dict.get('protocol') or info_dict.get('url'): fd = 
get_suitable_downloader(info_dict, self.params, to_stdout=temp_filename == '-') - if fd != FFmpegFD and 'no-direct-merge' not in self.params['compat_opts'] and ( + if fd not in [FFmpegFD, DashSegmentsFD] and 'no-direct-merge' not in self.params['compat_opts'] and ( info_dict.get('section_start') or info_dict.get('section_end')): msg = ('This format cannot be partially downloaded' if FFmpegFD.available() else 'You have requested downloading the video partially, but ffmpeg is not installed') diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index 3d606bcba..ca39982e8 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -12,6 +12,7 @@ import itertools import optparse import os import re +import time import traceback from .compat import compat_os_name, compat_shlex_quote @@ -331,12 +332,13 @@ def validate_options(opts): (?P-?)(?P[^-]+) )?''' + current_time = time.time() chapters, ranges, from_url = [], [], False for regex in value or []: if advanced and regex == '*from-url': from_url = True continue - elif not regex.startswith('*'): + elif not regex.startswith('*') and not regex.startswith('#'): try: chapters.append(re.compile(regex)) except re.error as err: @@ -353,11 +355,16 @@ def validate_options(opts): err = 'Must be of the form "*start-end"' elif not advanced and any(signs): err = 'Negative timestamps are not allowed' - else: + elif regex.startswith('*'): dur[0] *= -1 if signs[0] else 1 dur[1] *= -1 if signs[1] else 1 if dur[1] == float('-inf'): err = '"-inf" is not a valid end' + elif regex.startswith('#'): + dur[0] = dur[0] * (-1 if signs[0] else 1) + current_time + dur[1] = dur[1] * (-1 if signs[1] else 1) + current_time + if dur[1] == float('-inf'): + err = '"-inf" is not a valid end' if err: raise ValueError(f'invalid {name} time range "{regex}". 
{err}') ranges.append(dur) diff --git a/yt_dlp/downloader/dash.py b/yt_dlp/downloader/dash.py index afc79b6ca..2c1fbecb5 100644 --- a/yt_dlp/downloader/dash.py +++ b/yt_dlp/downloader/dash.py @@ -36,6 +36,8 @@ class DashSegmentsFD(FragmentFD): 'filename': fmt.get('filepath') or filename, 'live': 'is_from_start' if fmt.get('is_from_start') else fmt.get('is_live'), 'total_frags': fragment_count, + 'section_start': info_dict.get('section_start'), + 'section_end': info_dict.get('section_end'), } if real_downloader: diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index bebbc6b43..6a3cb3aa0 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -2695,7 +2695,7 @@ class InfoExtractor: r = int(s.get('r', 0)) ms_info['total_number'] += 1 + r ms_info['s'].append({ - 't': int(s.get('t', 0)), + 't': int_or_none(s.get('t')), # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) 'd': int(s.attrib['d']), 'r': r, @@ -2737,8 +2737,14 @@ class InfoExtractor: return ms_info mpd_duration = parse_duration(mpd_doc.get('mediaPresentationDuration')) + availability_start_time = unified_timestamp( + mpd_doc.get('availabilityStartTime'), with_milliseconds=True) or 0 stream_numbers = collections.defaultdict(int) for period_idx, period in enumerate(mpd_doc.findall(_add_ns('Period'))): + # segmentIngestTime is completely out of spec, but YouTube livestreams set it on the Period + segment_ingest_time = period.get('{http://youtube.com/yt/2012/10/10}segmentIngestTime') + if segment_ingest_time: + availability_start_time = unified_timestamp(segment_ingest_time, with_milliseconds=True) period_entry = { 'id': period.get('id', f'period-{period_idx}'), 'formats': [], @@ -2917,13 +2923,17 @@ class InfoExtractor: 'Bandwidth': bandwidth, 'Number': segment_number, } + duration = float_or_none(segment_d, representation_ms_info['timescale']) + start = float_or_none(segment_time, representation_ms_info['timescale']) representation_ms_info['fragments'].append({ media_location_key: 
segment_url, - 'duration': float_or_none(segment_d, representation_ms_info['timescale']), + 'duration': duration, + 'start': availability_start_time + start, + 'end': availability_start_time + start + duration, }) for num, s in enumerate(representation_ms_info['s']): - segment_time = s.get('t') or segment_time + segment_time = s['t'] if s.get('t') is not None else segment_time segment_d = s['d'] add_segment_url() segment_number += 1 @@ -2939,6 +2949,7 @@ class InfoExtractor: fragments = [] segment_index = 0 timescale = representation_ms_info['timescale'] + start = 0 for s in representation_ms_info['s']: duration = float_or_none(s['d'], timescale) for r in range(s.get('r', 0) + 1): @@ -2946,8 +2957,11 @@ class InfoExtractor: fragments.append({ location_key(segment_uri): segment_uri, 'duration': duration, + 'start': availability_start_time + start, + 'end': availability_start_time + start + duration, }) segment_index += 1 + start += duration representation_ms_info['fragments'] = fragments elif 'segment_urls' in representation_ms_info: # Segment URLs with no SegmentTimeline diff --git a/yt_dlp/extractor/youtube.py b/yt_dlp/extractor/youtube.py index e553fff9f..b58768b31 100644 --- a/yt_dlp/extractor/youtube.py +++ b/yt_dlp/extractor/youtube.py @@ -2791,17 +2791,17 @@ class YoutubeIE(YoutubeBaseInfoExtractor): microformats = traverse_obj( prs, (..., 'microformat', 'playerMicroformatRenderer'), expected_type=dict) - _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) - is_live = live_status == 'is_live' - start_time = time.time() + with lock: + _, live_status, _, formats, _ = self._list_formats(video_id, microformats, video_details, prs, player_url) + is_live = live_status == 'is_live' + start_time = time.time() def mpd_feed(format_id, delay): """ @returns (manifest_url, manifest_stream_number, is_live) or None """ for retry in self.RetryManager(fatal=False): - with lock: - refetch_manifest(format_id, delay) + 
refetch_manifest(format_id, delay) f = next((f for f in formats if f['format_id'] == format_id), None) if not f: @@ -2832,6 +2832,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): begin_index = 0 download_start_time = ctx.get('start') or time.time() + section_start = ctx.get('section_start') or 0 + section_end = ctx.get('section_end') or math.inf + + self.write_debug(f'Selected section: {section_start} -> {section_end}') + lack_early_segments = download_start_time - (live_start_time or download_start_time) > MAX_DURATION if lack_early_segments: self.report_warning(bug_reports_message( @@ -2852,9 +2857,10 @@ class YoutubeIE(YoutubeBaseInfoExtractor): or (mpd_url, stream_number, False)) if not refresh_sequence: if expire_fast and not is_live: - return False, last_seq + return False elif old_mpd_url == mpd_url: - return True, last_seq + return True + if manifestless_orig_fmt: fmt_info = manifestless_orig_fmt else: @@ -2865,14 +2871,13 @@ class YoutubeIE(YoutubeBaseInfoExtractor): fmts = None if not fmts: no_fragment_score += 2 - return False, last_seq + return False fmt_info = next(x for x in fmts if x['manifest_stream_number'] == stream_number) fragments = fmt_info['fragments'] fragment_base_url = fmt_info['fragment_base_url'] assert fragment_base_url - _last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) - return True, _last_seq + return True self.write_debug(f'[{video_id}] Generating fragments for format {format_id}') while is_live: @@ -2892,11 +2897,19 @@ class YoutubeIE(YoutubeBaseInfoExtractor): last_segment_url = None continue else: - should_continue, last_seq = _extract_sequence_from_mpd(True, no_fragment_score > 15) + should_continue = _extract_sequence_from_mpd(True, no_fragment_score > 15) no_fragment_score += 2 if not should_continue: continue + last_fragment = fragments[-1] + last_seq = int(re.search(r'(?:/|^)sq/(\d+)', fragments[-1]['path']).group(1)) + + known_fragment = next( + (fragment for fragment in fragments if 
f'sq/{known_idx}' in fragment['path']), None) + if known_fragment and known_fragment['end'] > section_end: + break + if known_idx > last_seq: last_segment_url = None continue @@ -2906,20 +2919,36 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if begin_index < 0 and known_idx < 0: # skip from the start when it's negative value known_idx = last_seq + begin_index + if lack_early_segments: - known_idx = max(known_idx, last_seq - int(MAX_DURATION // fragments[-1]['duration'])) + known_idx = max(known_idx, last_seq - int(MAX_DURATION // last_fragment['duration'])) + + fragment_count = last_seq - known_idx if section_end == math.inf else int( + (section_end - section_start) // last_fragment['duration']) + try: for idx in range(known_idx, last_seq): # do not update sequence here or you'll get skipped some part of it - should_continue, _ = _extract_sequence_from_mpd(False, False) + should_continue = _extract_sequence_from_mpd(False, False) if not should_continue: known_idx = idx - 1 raise ExtractorError('breaking out of outer loop') - last_segment_url = urljoin(fragment_base_url, 'sq/%d' % idx) - yield { - 'url': last_segment_url, - 'fragment_count': last_seq, - } + + frag_duration = last_fragment['duration'] + frag_start = last_fragment['start'] - (last_seq - idx) * frag_duration + frag_end = frag_start + frag_duration + + if frag_start >= section_start and frag_end <= section_end: + last_segment_url = urljoin(fragment_base_url, f'sq/{idx}') + + yield { + 'url': last_segment_url, + 'fragment_count': fragment_count, + 'duration': frag_duration, + 'start': frag_start, + 'end': frag_end, + } + if known_idx == last_seq: no_fragment_score += 5 else: @@ -3908,6 +3937,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): dct['downloader_options'] = {'http_chunk_size': CHUNK_SIZE} yield dct + if live_status == 'is_live' and self.get_param('download_ranges') and not self.get_param('live_from_start'): + self.report_warning('For YT livestreams, --download-sections is only supported with 
--live-from-start') + needs_live_processing = self._needs_live_processing(live_status, duration) skip_bad_formats = 'incomplete' not in format_types if self._configuration_arg('include_incomplete_formats'): diff --git a/yt_dlp/options.py b/yt_dlp/options.py index faa1ee563..3aecae170 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -419,7 +419,14 @@ def create_parser(): general.add_option( '--live-from-start', action='store_true', dest='live_from_start', - help='Download livestreams from the start. Currently only supported for YouTube (Experimental)') + help=('Download livestreams from the start. Currently only supported for YouTube (Experimental). ' + 'Time ranges can be specified using --download-sections to download only a part of the stream. ' + 'Negative values are allowed for specifying a relative previous time, using the # syntax ' + 'e.g. --download-sections "#-24hours - 0" (download last 24 hours), ' + 'e.g. --download-sections "#-1h - 30m" (download from 1 hour ago until the next 30 minutes), ' + 'e.g. --download-sections "#-3days - -2days" (download from 3 days ago until 2 days ago). ' + 'It is also possible to specify an exact unix timestamp range, using the * syntax, ' + 'e.g. 
--download-sections "*1672531200 - 1672549200" (download between those two timestamps)')) general.add_option( '--no-live-from-start', action='store_false', dest='live_from_start', diff --git a/yt_dlp/utils/_utils.py b/yt_dlp/utils/_utils.py index b63766912..01d54b846 100644 --- a/yt_dlp/utils/_utils.py +++ b/yt_dlp/utils/_utils.py @@ -1209,7 +1209,7 @@ def unified_strdate(date_str, day_first=True): return str(upload_date) -def unified_timestamp(date_str, day_first=True): +def unified_timestamp(date_str, day_first=True, with_milliseconds=False): if not isinstance(date_str, str): return None @@ -1235,7 +1235,7 @@ def unified_timestamp(date_str, day_first=True): for expression in date_formats(day_first): with contextlib.suppress(ValueError): dt_ = dt.datetime.strptime(date_str, expression) - timezone + dt.timedelta(hours=pm_delta) - return calendar.timegm(dt_.timetuple()) + return calendar.timegm(dt_.timetuple()) + (dt_.microsecond / 1e6 if with_milliseconds else 0) timetuple = email.utils.parsedate_tz(date_str) if timetuple: @@ -2035,16 +2035,19 @@ def parse_duration(s): days, hours, mins, secs, ms = [None] * 5 m = re.match(r'''(?x) + (?P[+-])? (?P (?:(?:(?P[0-9]+):)?(?P[0-9]+):)?(?P[0-9]+):)? (?P(?(before_secs)[0-9]{1,2}|[0-9]+)) (?P[.:][0-9]+)?Z?$ ''', s) if m: - days, hours, mins, secs, ms = m.group('days', 'hours', 'mins', 'secs', 'ms') + sign, days, hours, mins, secs, ms = m.group('sign', 'days', 'hours', 'mins', 'secs', 'ms') else: m = re.match( - r'''(?ix)(?:P? + r'''(?ix)(?: + (?P[+-])? + P? (?: [0-9]+\s*y(?:ears?)?,?\s* )? 
@@ -2068,17 +2071,19 @@ def parse_duration(s): (?P[0-9]+)(?P\.[0-9]+)?\s*s(?:ec(?:ond)?s?)?\s* )?Z?$''', s) if m: - days, hours, mins, secs, ms = m.groups() + sign, days, hours, mins, secs, ms = m.groups() else: - m = re.match(r'(?i)(?:(?P[0-9.]+)\s*(?:hours?)|(?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) + m = re.match(r'(?i)(?P[+-])?(?:(?P[0-9.]+)\s*(?:days?)|(?P[0-9.]+)\s*(?:hours?)|(?P[0-9.]+)\s*(?:mins?\.?|minutes?)\s*)Z?$', s) if m: - hours, mins = m.groups() + sign, days, hours, mins = m.groups() else: return None + sign = -1 if sign == '-' else 1 + if ms: ms = ms.replace(':', '.') - return sum(float(part or 0) * mult for part, mult in ( + return sign * sum(float(part or 0) * mult for part, mult in ( (days, 86400), (hours, 3600), (mins, 60), (secs, 1), (ms, 1)))