diff --git a/yt_dlp/extractor/_extractors.py b/yt_dlp/extractor/_extractors.py
index c0a330dbe..b6e7e6e27 100644
--- a/yt_dlp/extractor/_extractors.py
+++ b/yt_dlp/extractor/_extractors.py
@@ -101,6 +101,7 @@ from .americastestkitchen import (
     AmericasTestKitchenIE,
     AmericasTestKitchenSeasonIE,
 )
+from .amvnews import AMVNewsIE
 from .anchorfm import AnchorFMEpisodeIE
 from .angel import AngelIE
 from .anvato import AnvatoIE
diff --git a/yt_dlp/extractor/amvnews.py b/yt_dlp/extractor/amvnews.py
new file mode 100644
index 000000000..c0ad7dddc
--- /dev/null
+++ b/yt_dlp/extractor/amvnews.py
@@ -0,0 +1,153 @@
+import re
+from collections import defaultdict
+
+from .common import InfoExtractor
+from ..utils import (
+    clean_html,
+    float_or_none,
+    int_or_none,
+    parse_duration,
+    remove_start,
+    unescapeHTML,
+    urljoin,
+)
+
+
+class AMVNewsIE(InfoExtractor):
+    """Extract a single video page from amvnews.ru.
+
+    Every "Download ..." link on the page becomes either a video format or
+    a subtitle track. When no format can be parsed, the default download
+    URL for the video id is returned instead.
+    """
+
+    _VALID_URL = r'https?://(?:www\.)?amvnews\.ru/(?:index\.php)?\?go=Files&in=view&id=(?P<id>[0-9]+)'
+    _BASE_URL = 'https://amvnews.ru/'
+    # NOTE(review): the reviewed copy of this patch had everything between
+    # angle brackets stripped from its string literals. The group names
+    # below follow from how the groups are used; the HTML selectors (the
+    # "<a ... href" prefix and both itemprop patterns) are reconstructions
+    # and must be confirmed against a live page before merging.
+    _DOWNLOAD_LINK_RE = (
+        r'<a\s[^>]*?href="(?P<link>[^"]+)"'
+        r'[^>]*?(?:overlib\(\'(?P<info>[^\']*)\'[^>]*)?>Download *(?P<name>[^<]*)')
+    _DESCRIPTION_RE = r'<div[^>]+itemprop="description"[^>]*>(.*?)</div>'
+    _CREATOR_RE = r'<span[^>]+itemprop="author"[^>]*>(.*?)</span>'
+    _TESTS = [{
+        'url': 'https://amvnews.ru/index.php?go=Files&in=view&id=12345',
+        'info_dict': {
+            'id': '12345',
+            'ext': 'mp4',
+            'description': 'md5:3c1391ce952f2125ce615b43081de1d0',
+            'title': 'Jadeite | Music: Jai Wolf - Lost',
+            'duration': 113,
+            'creator': 'Leafa',
+            'formats': [
+                {
+                    'url': 'https://amvnews.ru/index.php?go=Files&file=down&id=12345&alt=4',
+                    'ext': 'mp4',
+                    'vcodec': 'h264',
+                    'acodec': 'aac',
+                    'width': 640,
+                    'height': 360,
+                    'fps': 23.98,
+                },
+                {
+                    'url': 'https://amvnews.ru/index.php?go=Files&file=down&id=12345',
+                    'ext': 'mp4',
+                    'vcodec': 'h264',
+                    'acodec': 'aac',
+                    'width': 1920,
+                    'height': 1080,
+                    'fps': 23.98,
+                },
+                {
+                    'url': 'https://amvnews.ru/index.php?go=Files&file=down&id=12345&alt=1',
+                    'ext': 'mp4',
+                    'vcodec': 'h264',
+                    'acodec': 'aac',
+                    'width': 3840,
+                    'height': 2160,
+                    'fps': 23.98,
+                },
+            ],
+        },
+    }]
+
+    def _tooltip_field(self, pattern, info, name):
+        """Return group 1 of *pattern* in the tooltip text, or None if absent."""
+        return self._search_regex(pattern, info, name, fatal=False, flags=re.IGNORECASE)
+
+    def _extract_downloads(self, webpage):
+        """Sort the page's download links into formats and subtitles.
+
+        Returns a ``(formats, subtitles, duration)`` tuple: a list of format
+        dicts, a mapping of language code to subtitle dicts, and the first
+        duration parsed from a format tooltip (None when there is none).
+        """
+        formats = []
+        subtitles = defaultdict(list)
+        duration = None
+
+        for link, info, name in re.findall(self._DOWNLOAD_LINK_RE, webpage, flags=re.IGNORECASE):
+            url = urljoin(self._BASE_URL, unescapeHTML(link))
+            label = clean_html(name)
+            # "info" is the overlib tooltip text; it is empty when the link
+            # has no tooltip, in which case only subtitles can be recognised.
+            tooltip = info.lower()
+
+            if 'subtitle' in label.lower():
+                # The first two letters of the label serve as the language
+                # code; usually only English and Russian are offered.
+                subtitles[label.lower()[:2]].append({
+                    'url': url,
+                    'ext': self._search_regex(r'type: (\w+)', tooltip, 'ext', default='srt'),
+                    'name': label,
+                })
+            elif 'resolution: ' in tooltip:
+                formats.append({
+                    'url': url,
+                    'ext': 'mp4',
+                    'format_note': label,
+                    'vcodec': self._tooltip_field(r'Codecs: (\w+)', info, 'vcodec'),
+                    'acodec': self._tooltip_field(
+                        r'Codecs: \w+(?:\s*\([^\)]*\))*\/(\w+)', info, 'acodec'),
+                    'width': int_or_none(self._tooltip_field(r'Resolution: (\d+)', info, 'width')),
+                    'height': int_or_none(self._tooltip_field(r'Resolution: \d+x(\d+)', info, 'height')),
+                    'fps': float_or_none(self._tooltip_field(r'Resolution: \d+x\d+\@([\d\.]+)', info, 'fps')),
+                })
+                # Duration describes the video, not one encoding, so it is
+                # reported once at the top level instead of on each format.
+                if duration is None:
+                    duration = parse_duration(self._tooltip_field(r'Duration: ([ \w]+)', info, 'duration'))
+
+        return formats, subtitles, duration
+
+    def _real_extract(self, html_url):
+        """Download the video page at *html_url* and build its info dict."""
+        video_id = self._match_id(html_url)
+        webpage = self._download_webpage(html_url, video_id)
+        formats, subtitles, duration = self._extract_downloads(webpage)
+
+        info = {
+            'id': video_id,
+            'title': remove_start(self._html_extract_title(webpage), 'AMV | Videos | '),
+            'description': self._html_search_regex(
+                self._DESCRIPTION_RE, webpage, 'description',
+                fatal=False, flags=re.DOTALL | re.IGNORECASE),
+            'creator': self._html_search_regex(
+                self._CREATOR_RE, webpage, 'creator', fatal=False, flags=re.IGNORECASE),
+            'duration': duration,
+            'subtitles': subtitles,
+        }
+        if formats:
+            info['formats'] = formats
+        else:
+            # No download link could be parsed: fall back to the default
+            # download endpoint. Its URL carries no media extension, so the
+            # container is stated explicitly, as for the parsed formats.
+            info.update({
+                'url': f'{self._BASE_URL}index.php?go=Files&file=down&id={video_id}',
+                'ext': 'mp4',
+            })
+        return info