From 09b49e1f688831c3ad7181decf38c90f8451e6c4 Mon Sep 17 00:00:00 2001 From: pukkandan Date: Tue, 22 Feb 2022 17:13:30 +0530 Subject: [PATCH] Add pre-processor stage `after_filter` * Move `_match_entry` and `post_extract` to `process_video_result`. It is also left in `process_info` for API compat * `--list-...` options and `--force-write-archive` now obey filtering options * Move `SponsorBlockPP` to `after_filter`. Closes https://github.com/yt-dlp/yt-dlp/issues/2536 * Reverts 4ec82a72bbf7ff0066edb50dcad20aa77ac2fe09 since this commit addresses the issue it was solving --- README.md | 20 +++++++++++--------- test/test_YoutubeDL.py | 20 ++------------------ yt_dlp/YoutubeDL.py | 43 +++++++++++++++++++----------------------- yt_dlp/__init__.py | 4 ++-- yt_dlp/options.py | 8 ++++---- yt_dlp/utils.py | 2 +- 6 files changed, 39 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index 88ddb2f3b..70b2e202f 100644 --- a/README.md +++ b/README.md @@ -982,15 +982,17 @@ ## Post-Processing Options: semicolon ";" delimited list of NAME=VALUE. The "when" argument determines when the postprocessor is invoked. It can be one of - "pre_process" (after extraction), - "before_dl" (before video download), - "post_process" (after video download; - default), "after_move" (after moving file - to their final locations), "after_video" - (after downloading and processing all - formats of a video), or "playlist" (end of - playlist). This option can be used multiple - times to add different postprocessors + "pre_process" (after video extraction), + "after_filter" (after video passes filter), + "before_dl" (before each video download), + "post_process" (after each video download; + default), "after_move" (after moving video + file to its final location), + "after_video" (after downloading and + processing all formats of a video), or + "playlist" (at end of playlist). 
This + option can be used multiple times to add + different postprocessors ## SponsorBlock Options: Make chapter entries for, or remove various segments (sponsor, diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py index 34ed814b4..7637297be 100644 --- a/test/test_YoutubeDL.py +++ b/test/test_YoutubeDL.py @@ -30,9 +30,7 @@ def __init__(self, *args, **kwargs): self.msgs = [] def process_info(self, info_dict): - info_dict = info_dict.copy() - info_dict.pop('__original_infodict', None) - self.downloaded_info_dicts.append(info_dict) + self.downloaded_info_dicts.append(info_dict.copy()) def to_screen(self, msg): self.msgs.append(msg) @@ -898,20 +896,6 @@ def run(self, info): os.unlink(filename) def test_match_filter(self): - class FilterYDL(YDL): - def __init__(self, *args, **kwargs): - super(FilterYDL, self).__init__(*args, **kwargs) - self.params['simulate'] = True - - def process_info(self, info_dict): - super(YDL, self).process_info(info_dict) - - def _match_entry(self, info_dict, incomplete=False): - res = super(FilterYDL, self)._match_entry(info_dict, incomplete) - if res is None: - self.downloaded_info_dicts.append(info_dict.copy()) - return res - first = { 'id': '1', 'url': TEST_URL, @@ -939,7 +923,7 @@ def _match_entry(self, info_dict, incomplete=False): videos = [first, second] def get_videos(filter_=None): - ydl = FilterYDL({'match_filter': filter_}) + ydl = YDL({'match_filter': filter_, 'simulate': True}) for v in videos: ydl.process_ie_result(v, download=True) return [v['id'] for v in ydl.downloaded_info_dicts] diff --git a/yt_dlp/YoutubeDL.py b/yt_dlp/YoutubeDL.py index 36b2b37c0..d9a3c0bce 100644 --- a/yt_dlp/YoutubeDL.py +++ b/yt_dlp/YoutubeDL.py @@ -1037,8 +1037,7 @@ def validate_outtmpl(cls, outtmpl): @staticmethod def _copy_infodict(info_dict): info_dict = dict(info_dict) - for key in ('__original_infodict', '__postprocessors'): - info_dict.pop(key, None) + info_dict.pop('__postprocessors', None) return info_dict def prepare_outtmpl(self, 
outtmpl, info_dict, sanitize=False): @@ -2512,8 +2511,6 @@ def is_wellformed(f): if '__x_forwarded_for_ip' in info_dict: del info_dict['__x_forwarded_for_ip'] - # TODO Central sorting goes here - if self.params.get('check_formats') is True: formats = LazyList(self._check_formats(formats[::-1]), reverse=True) @@ -2526,6 +2523,12 @@ def is_wellformed(f): info_dict, _ = self.pre_process(info_dict) + if self._match_entry(info_dict) is not None: + return info_dict + + self.post_extract(info_dict) + info_dict, _ = self.pre_process(info_dict, 'after_filter') + # The pre-processors may have modified the formats formats = info_dict.get('formats', [info_dict]) @@ -2610,15 +2613,12 @@ def is_wellformed(f): + ', '.join([f['format_id'] for f in formats_to_download])) max_downloads_reached = False for i, fmt in enumerate(formats_to_download): - formats_to_download[i] = new_info = dict(info_dict) - # Save a reference to the original info_dict so that it can be modified in process_info if needed + formats_to_download[i] = new_info = self._copy_infodict(info_dict) new_info.update(fmt) - new_info['__original_infodict'] = info_dict try: self.process_info(new_info) except MaxDownloadsReached: max_downloads_reached = True - new_info.pop('__original_infodict') # Remove copied info for key, val in tuple(new_info.items()): if info_dict.get(key) == val: @@ -2826,7 +2826,7 @@ def existing_file(self, filepaths, *, default_overwrite=True): return None def process_info(self, info_dict): - """Process a single resolved IE result. (Modified it in-place)""" + """Process a single resolved IE result. 
(Modifies it in-place)""" assert info_dict.get('_type', 'video') == 'video' original_infodict = info_dict @@ -2834,18 +2834,22 @@ def process_info(self, info_dict): if 'format' not in info_dict and 'ext' in info_dict: info_dict['format'] = info_dict['ext'] + # This is mostly just for backward compatibility of process_info + # As a side-effect, this allows for format-specific filters if self._match_entry(info_dict) is not None: info_dict['__write_download_archive'] = 'ignore' return + # Does nothing under normal operation - for backward compatibility of process_info self.post_extract(info_dict) - self._num_downloads += 1 # info_dict['_filename'] needs to be set for backward compatibility info_dict['_filename'] = full_filename = self.prepare_filename(info_dict, warn=True) temp_filename = self.prepare_filename(info_dict, 'temp') files_to_move = {} + self._num_downloads += 1 + # Forced printings self.__forced_printings(info_dict, full_filename, incomplete=('format' not in info_dict)) @@ -3259,17 +3263,14 @@ def sanitize_info(info_dict, remove_private_keys=False): return info_dict info_dict.setdefault('epoch', int(time.time())) info_dict.setdefault('_type', 'video') - remove_keys = {'__original_infodict'} # Always remove this since this may contain a copy of the entire dict - keep_keys = ['_type'] # Always keep this to facilitate load-info-json + if remove_private_keys: - remove_keys |= { + reject = lambda k, v: v is None or (k.startswith('_') and k != '_type') or k in { 'requested_downloads', 'requested_formats', 'requested_subtitles', 'requested_entries', 'entries', 'filepath', 'infojson_filename', 'original_url', 'playlist_autonumber', } - reject = lambda k, v: k not in keep_keys and ( - k.startswith('_') or k in remove_keys or v is None) else: - reject = lambda k, v: k in remove_keys + reject = lambda k, v: False def filter_fn(obj): if isinstance(obj, dict): @@ -3296,14 +3297,8 @@ def actual_post_extract(info_dict): actual_post_extract(video_dict or {}) return - 
post_extractor = info_dict.get('__post_extractor') or (lambda: {}) - extra = post_extractor().items() - info_dict.update(extra) - info_dict.pop('__post_extractor', None) - - original_infodict = info_dict.get('__original_infodict') or {} - original_infodict.update(extra) - original_infodict.pop('__post_extractor', None) + post_extractor = info_dict.pop('__post_extractor', None) or (lambda: {}) + info_dict.update(post_extractor()) actual_post_extract(info_dict or {}) diff --git a/yt_dlp/__init__.py b/yt_dlp/__init__.py index b93f47ecc..c87c5b6df 100644 --- a/yt_dlp/__init__.py +++ b/yt_dlp/__init__.py @@ -474,8 +474,8 @@ def report_unplayable_conflict(opt_name, arg, default=False, allowed=None): 'key': 'SponsorBlock', 'categories': sponsorblock_query, 'api': opts.sponsorblock_api, - # Run this immediately after extraction is complete - 'when': 'pre_process' + # Run this after filtering videos + 'when': 'after_filter' }) if opts.parse_metadata: postprocessors.append({ diff --git a/yt_dlp/options.py b/yt_dlp/options.py index 2ba7d2601..6fcef98cd 100644 --- a/yt_dlp/options.py +++ b/yt_dlp/options.py @@ -1550,11 +1550,11 @@ def _dict_from_options_callback( 'and (optionally) arguments to be passed to it, separated by a colon ":". ' 'ARGS are a semicolon ";" delimited list of NAME=VALUE. ' 'The "when" argument determines when the postprocessor is invoked. ' - 'It can be one of "pre_process" (after extraction), ' - '"before_dl" (before video download), "post_process" (after video download; default), ' - '"after_move" (after moving file to their final locations), ' + 'It can be one of "pre_process" (after video extraction), "after_filter" (after video passes filter), ' + '"before_dl" (before each video download), "post_process" (after each video download; default), ' + '"after_move" (after moving video file to it\'s final locations), ' '"after_video" (after downloading and processing all formats of a video), ' - 'or "playlist" (end of playlist). 
' + 'or "playlist" (at end of playlist). ' 'This option can be used multiple times to add different postprocessors')) sponsorblock = optparse.OptionGroup(parser, 'SponsorBlock Options', description=( diff --git a/yt_dlp/utils.py b/yt_dlp/utils.py index f5cad0e54..8b0d95efa 100644 --- a/yt_dlp/utils.py +++ b/yt_dlp/utils.py @@ -3166,7 +3166,7 @@ def q(qid): return q -POSTPROCESS_WHEN = {'pre_process', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'} +POSTPROCESS_WHEN = {'pre_process', 'after_filter', 'before_dl', 'after_move', 'post_process', 'after_video', 'playlist'} DEFAULT_OUTTMPL = {