From 9c1f99402fa25a5a691944c133432741af19829b Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Wed, 23 Sep 2020 23:09:00 +0200 Subject: [PATCH 1/7] [bandcamp] fix regexp for JSON matching on bandcamp --- youtube_dl/extractor/bandcamp.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f14b407dc..ad1812320 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -91,10 +91,11 @@ def _real_extract(self, url): duration = None formats = [] - track_info = self._parse_json( - self._search_regex( - r'trackinfo\s*:\s*\[\s*({.+?})\s*\]\s*,\s*?\n', - webpage, 'track info', default='{}'), title) + trackinfo_block = self._search_regex( + r'trackinfo":\[\s*({.+?})\s*\],"', + webpage, 'track info', default='{}') + quoted_json = trackinfo_block.replace('"', '"') + track_info = self._parse_json(quoted_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -117,7 +118,7 @@ def _real_extract(self, url): def extract(key): return self._search_regex( - r'\b%s\s*["\']?\s*:\s*(["\'])(?P(?:(?!\1).)+)\1' % key, + r',"%s":(")(?P(?:(?!").)+)"' % key, webpage, key, default=None, group='value') artist = extract('artist') From 14194392a813a12b3a1477ec75bcd0c8626ef3bb Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sat, 26 Sep 2020 17:34:35 +0200 Subject: [PATCH 2/7] [bandcamp] use unescapeHTML instead of a simple replace of quotes --- youtube_dl/extractor/bandcamp.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index ad1812320..55d110e28 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -92,10 +92,10 @@ def _real_extract(self, url): formats = [] trackinfo_block = self._search_regex( - r'trackinfo":\[\s*({.+?})\s*\],"', + r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track info', default='{}') - quoted_json = trackinfo_block.replace('"', '"') - track_info = self._parse_json(quoted_json, title) + unescaped_json = unescapeHTML(trackinfo_block) + track_info = self._parse_json(unescaped_json, title) if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -118,7 +118,7 @@ def _real_extract(self, url): def extract(key): return self._search_regex( - r',"%s":(")(?P(?:(?!").)+)"' % key, + r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, webpage, key, default=None, group='value') artist = extract('artist') From f43a856334b633e3d2f778b455fb08a4a06fbf51 Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sun, 27 Sep 2020 14:51:42 +0200 Subject: [PATCH 3/7] [bandcamp] match album titles inside the new JSON data block, and unescape the title properly --- youtube_dl/extractor/bandcamp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 55d110e28..f036a89eb 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -316,10 +316,10 @@ def _real_extract(self, url): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*:\s*"((?:\\.|[^"\\])+?)"', + r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', webpage, 'title', fatal=False) if title: - title = title.replace(r'\"', '"') + title = unescapeHTML(title) return { '_type': 'playlist', 'uploader_id': uploader_id, From 9385ec4b1c797ffab66b945f23fd4248c0c8a32e Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sun, 27 Sep 2020 15:11:08 +0200 Subject: [PATCH 4/7] [bandcamp] fix the freeDownloadPage JSON lookup, and use the id from the URL to match the tracks --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index f036a89eb..eccb867a0 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -128,12 +128,12 @@ def extract(key): release_date = unified_strdate(extract('album_release_date')) download_link = self._search_regex( - r'freeDownloadPage\s*:\s*(["\'])(?P(?:(?!\1).)+)\1', webpage, + r'freeDownloadPage(?:["\']|"):\s*(["\']|")(?P(?:(?!\1).)+)\1', webpage, 'download link', default=None, group='url') if download_link: track_id = self._search_regex( - r'(?ms)var TralbumData = .*?[{,]\s*id: (?P\d+),?$', - webpage, 'track id') + r'\?id=(?P\d+)&', + download_link, 'track id') download_webpage = self._download_webpage( download_link, track_id, 'Downloading free downloads page') From 37f625598cb9b02cb06b3f12033cc29699d70818 Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Sun, 27 Sep 2020 15:52:55 +0200 Subject: [PATCH 5/7] [bandcamp] update youtuble dl test song information to match title as artist - track, and add missing keys from info_dict --- youtube_dl/extractor/bandcamp.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index eccb867a0..3d32b1e0f 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,8 +33,11 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, + 'uploader': 'youtube-dl \\', + 'timestamp': 1354224127, + 'upload_date': '20121129', }, '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { From 75a83afe3b8fd9dfe242ca2de428c313a2bd3e0e Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Mon, 28 Sep 2020 19:42:56 +0200 Subject: [PATCH 6/7] [bandcamp] fix test song uploader name, cleanup remanings " and \ in data, including album titles --- youtube_dl/extractor/bandcamp.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3d32b1e0f..3405b570a 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,9 +33,9 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \\ - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, - 'uploader': 'youtube-dl \\', + 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, 'upload_date': '20121129', }, @@ -43,7 +43,7 @@ class BandcampIE(InfoExtractor): }, { # free download 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '853e35bf34aa1d6fe2615ae612564b36', + 'md5': '5d92af55811e47f38962a54c30b07ef0', 'info_dict': { 'id': '2650410135', 'ext': 'aiff', @@ -94,11 +94,12 @@ def _real_extract(self, url): duration = None formats = [] - trackinfo_block = self._search_regex( + trackinfo_block = self._html_search_regex( r'trackinfo(?:["\']|"):\[\s*({.+?})\s*\],(?:["\']|")', webpage, 'track info', default='{}') - unescaped_json = unescapeHTML(trackinfo_block) - track_info = self._parse_json(unescaped_json, title) + + track_info = self._parse_json(trackinfo_block, title) + if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -120,9 +121,10 @@ def _real_extract(self, url): duration = float_or_none(track_info.get('duration')) def extract(key): - return self._search_regex( - r',(["\']|")%s\1:\1(?P(?:(?!\1).)+)\1' % key, + data = self._html_search_regex( + r',(["\']|")%s\1:\1(?P(?:\\\1|((?!\1).))+)\1' % key, webpage, key, default=None, group='value') + return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data artist = extract('artist') album = extract('album_title') @@ -319,10 +321,12 @@ def _real_extract(self, url): if self._html_search_meta('duration', elem_content, default=None)] title = self._html_search_regex( - r'album_title\s*(?:"|["\']):\s*(?:"|["\'])((?:\\.|[^"\\])+?)(?:"|["\'])', - webpage, 'title', fatal=False) + r'album_title\s*(?:"|["\']):\s*("|["\'])(?P(?:\\\1|((?!\1).))+)\1', + webpage, 'title', fatal=False, group='album') + if title: - title = unescapeHTML(title) + title = title.replace(r'\"', '"') + return { '_type': 'playlist', 'uploader_id': uploader_id, From 03edd545a9e14b0fbcb36574248d8cf0e7a224d6 Mon Sep 17 00:00:00 2001 From: Gilles Pietri Date: Tue, 29 Sep 2020 12:09:55 +0200 Subject: [PATCH 7/7] [bandcamp] Revert test song title, and extract title generally (which may fail, as the other title json values might come up), instead of out of trackinfo, as bandcamp prefixes it with artist - --- youtube_dl/extractor/bandcamp.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 3405b570a..04b8aa80f 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -33,7 +33,7 @@ class BandcampIE(InfoExtractor): 'info_dict': { 'id': '1812978515', 'ext': 'mp3', - 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", + 'title': "youtube-dl \"'/\\\u00e4\u21ad - youtube-dl test song \"'/\\\u00e4\u21ad", 'duration': 9.8485, 'uploader': "youtube-dl \"'/\\\u00e4\u21ad", 'timestamp': 1354224127, @@ -99,7 +99,6 @@ def _real_extract(self, url): webpage, 'track info', default='{}') track_info = self._parse_json(trackinfo_block, title) - if track_info: file_ = track_info.get('file') if isinstance(file_, dict): @@ -115,7 +114,7 @@ def _real_extract(self, url): 'acodec': ext, 'abr': int_or_none(abr_str), }) - track = track_info.get('title') + track_id = str_or_none(track_info.get('track_id') or track_info.get('id')) track_number = int_or_none(track_info.get('track_num')) duration = float_or_none(track_info.get('duration')) @@ -126,6 +125,7 @@ def extract(key): webpage, key, default=None, group='value') return data.replace(r'\"', '"').replace('\\\\', '\\') if data else data + track = extract('title') artist = extract('artist') album = extract('album_title') timestamp = unified_timestamp(