From 8f423cf8051fbfeedd57cca00d106012e6e86a97 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 29 Feb 2024 23:49:25 +0100 Subject: [PATCH] [ie/rai] Fix m3u8 formats extraction (#9291) Closes #887 Authored by: nixxo --- yt_dlp/extractor/rai.py | 60 ++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 18 deletions(-) diff --git a/yt_dlp/extractor/rai.py b/yt_dlp/extractor/rai.py index f6219c2db..c1fc65c81 100644 --- a/yt_dlp/extractor/rai.py +++ b/yt_dlp/extractor/rai.py @@ -28,6 +28,29 @@ class RaiBaseIE(InfoExtractor): _GEO_COUNTRIES = ['IT'] _GEO_BYPASS = False + def _fix_m3u8_formats(self, media_url, video_id): + fmts = self._extract_m3u8_formats( + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False) + + # Fix malformed m3u8 manifests by setting audio-only/video-only formats + for f in fmts: + if not f.get('acodec'): + f['acodec'] = 'mp4a' + if not f.get('vcodec'): + f['vcodec'] = 'avc1' + man_url = f['url'] + if re.search(r'chunklist(?:_b\d+)*_ao[_.]', man_url): # audio only + f['vcodec'] = 'none' + elif re.search(r'chunklist(?:_b\d+)*_vo[_.]', man_url): # video only + f['acodec'] = 'none' + else: # video+audio + if f['acodec'] == 'none': + f['acodec'] = 'mp4a' + if f['vcodec'] == 'none': + f['vcodec'] = 'avc1' + + return fmts + def _extract_relinker_info(self, relinker_url, video_id, audio_only=False): def fix_cdata(s): # remove \r\n\t before and after to avoid @@ -69,8 +92,7 @@ class RaiBaseIE(InfoExtractor): 'format_id': 'https-mp3', }) elif ext == 'm3u8' or 'format=m3u8' in media_url: - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + formats.extend(self._fix_m3u8_formats(media_url, video_id)) elif ext == 'f4m': # very likely no longer needed. Cannot find any url that uses it. manifest_url = update_url_query( @@ -153,10 +175,10 @@ class RaiBaseIE(InfoExtractor): 'format_id': f'https-{tbr}', 'width': format_copy.get('width'), 'height': format_copy.get('height'), - 'tbr': format_copy.get('tbr'), - 'vcodec': format_copy.get('vcodec'), - 'acodec': format_copy.get('acodec'), - 'fps': format_copy.get('fps'), + 'tbr': format_copy.get('tbr') or tbr, + 'vcodec': format_copy.get('vcodec') or 'avc1', + 'acodec': format_copy.get('acodec') or 'mp4a', + 'fps': format_copy.get('fps') or 25, } if format_copy else { 'format_id': f'https-{tbr}', 'width': _QUALITY[tbr][0], @@ -245,7 +267,7 @@ class RaiPlayIE(RaiBaseIE): 'series': 'Report', 'season': '2013/14', 'subtitles': {'it': 'count:4'}, - 'release_year': 2022, + 'release_year': 2024, 'episode': 'Espresso nel caffè - 07/04/2014', 'timestamp': 1396919880, 'upload_date': '20140408', @@ -253,7 +275,7 @@ class RaiPlayIE(RaiBaseIE): }, 'params': {'skip_download': True}, }, { - # 1080p direct mp4 url + # 1080p 'url': 'https://www.raiplay.it/video/2021/11/Blanca-S1E1-Senza-occhi-b1255a4a-8e72-4a2f-b9f3-fc1308e00736.html', 'md5': 'aeda7243115380b2dd5e881fd42d949a', 'info_dict': { @@ -274,7 +296,7 @@ class RaiPlayIE(RaiBaseIE): 'episode': 'Senza occhi', 'timestamp': 1637318940, 'upload_date': '20211119', - 'formats': 'count:12', + 'formats': 'count:7', }, 'params': {'skip_download': True}, 'expected_warnings': ['Video not available. Likely due to geo-restriction.'] @@ -527,7 +549,7 @@ class RaiPlaySoundPlaylistIE(InfoExtractor): 'info_dict': { 'id': 'ilruggitodelconiglio', 'title': 'Il Ruggito del Coniglio', - 'description': 'md5:48cff6972435964284614d70474132e6', + 'description': 'md5:62a627b3a2d0635d08fa8b6e0a04f27e', }, 'playlist_mincount': 65, }, { @@ -634,19 +656,20 @@ class RaiIE(RaiBaseIE): } -class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE +class RaiNewsIE(RaiBaseIE): _VALID_URL = rf'https?://(www\.)?rainews\.it/(?!articoli)[^?#]+-(?P{RaiBaseIE._UUID_RE})(?:-[^/?#]+)?\.html' _EMBED_REGEX = [rf']+data-src="(?P/iframe/[^?#]+?{RaiBaseIE._UUID_RE}\.html)'] _TESTS = [{ # new rainews player (#3911) - 'url': 'https://www.rainews.it/rubriche/24mm/video/2022/05/24mm-del-29052022-12cf645d-1ffd-4220-b27c-07c226dbdecf.html', + 'url': 'https://www.rainews.it/video/2024/02/membri-della-croce-rossa-evacuano-gli-abitanti-di-un-villaggio-nella-regione-ucraina-di-kharkiv-il-filmato-dallucraina--31e8017c-845c-43f5-9c48-245b43c3a079.html', 'info_dict': { - 'id': '12cf645d-1ffd-4220-b27c-07c226dbdecf', + 'id': '31e8017c-845c-43f5-9c48-245b43c3a079', 'ext': 'mp4', - 'title': 'Puntata del 29/05/2022', - 'duration': 1589, - 'upload_date': '20220529', + 'title': 'md5:1e81364b09de4a149042bac3c7d36f0b', + 'duration': 196, + 'upload_date': '20240225', 'uploader': 'rainews', + 'formats': 'count:2', }, 'params': {'skip_download': True}, }, { @@ -659,7 +682,8 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE 'description': 'I film in uscita questa settimana.', 'thumbnail': r're:^https?://.*\.png$', 'duration': 833, - 'upload_date': '20161103' + 'upload_date': '20161103', + 'formats': 'count:8', }, 'params': {'skip_download': True}, 'expected_warnings': ['unable to extract player_data'], @@ -684,7 +708,7 @@ class RaiNewsIE(RaiIE): # XXX: Do not subclass from concrete IE if not relinker_url: # fallback on old implementation for some old content try: - return self._extract_from_content_id(video_id, url) + return RaiIE._real_extract(self, url) except GeoRestrictedError: raise except ExtractorError as e: