From c2e0fc40a73dd85ab3920f977f579d475e66ef59 Mon Sep 17 00:00:00 2001 From: bashonly <88596187+bashonly@users.noreply.github.com> Date: Tue, 21 Mar 2023 18:12:17 -0500 Subject: [PATCH] [extractor/generic] Add extractor-args `hls_key`, `variant_query` (#6567) Authored by: bashonly --- README.md | 2 ++ yt_dlp/extractor/generic.py | 32 +++++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index de83e421f..9ce85d631 100644 --- a/README.md +++ b/README.md @@ -1798,6 +1798,8 @@ #### youtubetab (YouTube playlists, channels, feeds, etc.) #### generic * `fragment_query`: Passthrough any query in mpd/m3u8 manifest URLs to their fragments. Does not apply to ffmpeg +* `variant_query`: Passthrough the master m3u8 URL query to its variant playlist URLs +* `hls_key`: An HLS AES-128 key URI *or* key (as hex), and optionally the IV (as hex), in the form of `(URI|KEY)[,IV]`; e.g. `generic:hls_key=ABCDEF1234567980,0xFEDCBA0987654321`. Passing any of these values will force usage of the native HLS downloader and override the corresponding values found in the m3u8 playlist #### funimation * `language`: Audio languages to extract, e.g. `funimation:language=english,japanese` diff --git a/yt_dlp/extractor/generic.py b/yt_dlp/extractor/generic.py index 49aa5a1f5..075bb36de 100644 --- a/yt_dlp/extractor/generic.py +++ b/yt_dlp/extractor/generic.py @@ -24,6 +24,7 @@ mimetype2ext, orderedSet, parse_duration, + parse_qs, parse_resolution, smuggle_url, str_or_none, @@ -32,6 +33,7 @@ unescapeHTML, unified_timestamp, unsmuggle_url, + update_url_query, url_or_none, urljoin, variadic, @@ -2184,12 +2186,21 @@ def report_detected(self, name, num=1, note=None): self._downloader.write_debug(f'Identified {num} {name}{format_field(note, None, "; %s")}') - def _fragment_query(self, url): + def _extra_manifest_info(self, info, manifest_url): if self._configuration_arg('fragment_query'): - query_string = urllib.parse.urlparse(url).query + query_string = urllib.parse.urlparse(manifest_url).query if query_string: - return {'extra_param_to_segment_url': query_string} - return {} + info['extra_param_to_segment_url'] = query_string + + hex_or_none = lambda x: x if re.fullmatch(r'(0x)?[\da-f]+', x, re.IGNORECASE) else None + info['hls_aes'] = traverse_obj(self._configuration_arg('hls_key'), { + 'uri': (0, {url_or_none}), 'key': (0, {hex_or_none}), 'iv': (1, {hex_or_none}), + }) or None + + if self._configuration_arg('variant_query'): + query = parse_qs(manifest_url) + for fmt in self._downloader._get_formats(info): + fmt['url'] = update_url_query(fmt['url'], query) def _extract_rss(self, url, video_id, doc): NS_MAP = { @@ -2397,10 +2408,8 @@ def _real_extract(self, url): subtitles = {} if format_id.endswith('mpegurl') or ext == 'm3u8': formats, subtitles = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4', headers=headers) - info_dict.update(self._fragment_query(url)) elif format_id.endswith('mpd') or format_id.endswith('dash+xml') or ext == 'mpd': formats, subtitles = self._extract_mpd_formats_and_subtitles(url, video_id, headers=headers) - info_dict.update(self._fragment_query(url)) elif format_id == 'f4m' or ext == 'f4m': formats = self._extract_f4m_formats(url, video_id, headers=headers) else: @@ -2415,6 +2424,7 @@ def _real_extract(self, url): 'subtitles': subtitles, 'http_headers': headers or None, }) + self._extra_manifest_info(info_dict, url) return info_dict if not self.get_param('test', False) and not is_intentional: @@ -2427,7 +2437,7 @@ def _real_extract(self, url): if first_bytes.startswith(b'#EXTM3U'): self.report_detected('M3U playlist') info_dict['formats'], info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(url, video_id, 'mp4') - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) return info_dict # Maybe it's a direct link to a video? @@ -2478,7 +2488,7 @@ def _real_extract(self, url): doc, mpd_base_url=full_response.geturl().rpartition('/')[0], mpd_url=url) - info_dict.update(self._fragment_query(url)) + self._extra_manifest_info(info_dict, url) self.report_detected('DASH manifest') return info_dict elif re.match(r'^{http://ns\.adobe\.com/f4m/[12]\.0}manifest$', doc.tag): @@ -2592,7 +2602,7 @@ def _extract_embeds(self, url, webpage, *, urlh=None, info_dict={}): formats.extend(fmts) self._merge_subtitles(subs, target=subtitles) for fmt in formats: - fmt.update(self._fragment_query(src)) + self._extra_manifest_info(fmt, src) if not formats: formats.append({ @@ -2795,10 +2805,10 @@ def filter_video(urls): return [self._extract_xspf_playlist(video_url, video_id)] elif ext == 'm3u8': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_m3u8_formats_and_subtitles(video_url, video_id, ext='mp4', headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'mpd': entry_info_dict['formats'], entry_info_dict['subtitles'] = self._extract_mpd_formats_and_subtitles(video_url, video_id, headers=headers) - entry_info_dict.update(self._fragment_query(video_url)) + self._extra_manifest_info(entry_info_dict, video_url) elif ext == 'f4m': entry_info_dict['formats'] = self._extract_f4m_formats(video_url, video_id, headers=headers) elif re.search(r'(?i)\.(?:ism|smil)/manifest', video_url) and video_url != url: