From dcbb45803f9b70041ec0ef9c3c6547340bd1ef7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 13 Nov 2013 16:21:24 +0100 Subject: [PATCH 01/29] [youtube:playlist] Don't use the gdata api (closes #1508) Parse the playlist pages instead --- test/test_youtube_lists.py | 14 +++++------ youtube_dl/extractor/youtube.py | 43 ++++++++++----------------------- 2 files changed, 20 insertions(+), 37 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 4b7a7847b..50ad52695 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -27,7 +27,7 @@ def assertIsPlaylist(self, info): def test_youtube_playlist(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re')[0] + result = ie.extract('https://www.youtube.com/playlist?list=PLwiyx1dc3P2JR9N8gQaQN_BCvlSlap7re') self.assertIsPlaylist(result) self.assertEqual(result['title'], 'ytdl test PL') ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] @@ -44,13 +44,13 @@ def test_youtube_playlist_noplaylist(self): def test_issue_673(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('PLBB231211A4F62143')[0] + result = ie.extract('PLBB231211A4F62143') self.assertTrue(len(result['entries']) > 25) def test_youtube_playlist_long(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q')[0] + result = ie.extract('https://www.youtube.com/playlist?list=UUBABnxM4Ar9ten8Mdjj1j0Q') self.assertIsPlaylist(result) self.assertTrue(len(result['entries']) >= 799) @@ -58,7 +58,7 @@ def test_youtube_playlist_with_deleted(self): #651 dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC')[0] + result = ie.extract('https://www.youtube.com/playlist?list=PLwP_SiAcdui0KVebT0mU9Apz359a4ubsC') ytie_results = [YoutubeIE()._extract_id(url['url']) for url in result['entries']] self.assertFalse('pElCt5oNDuI' in ytie_results) self.assertFalse('KdPEApIVdWM' in ytie_results) @@ -66,7 +66,7 @@ def test_youtube_playlist_with_deleted(self): def test_youtube_playlist_empty(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx')[0] + result = ie.extract('https://www.youtube.com/playlist?list=PLtPgu7CB4gbZDA7i_euNxn75ISqxwZPYx') self.assertIsPlaylist(result) self.assertEqual(len(result['entries']), 0) @@ -74,7 +74,7 @@ def test_youtube_course(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) # TODO find a > 100 (paginating?) videos course - result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8')[0] + result = ie.extract('https://www.youtube.com/course?list=ECUl4u3cNGP61MdtwGTqZA0MreSaDybji8') entries = result['entries'] self.assertEqual(YoutubeIE()._extract_id(entries[0]['url']), 'j9WZyLZCBzs') self.assertEqual(len(entries), 25) @@ -99,7 +99,7 @@ def test_youtube_user(self): def test_youtube_safe_search(self): dl = FakeYDL() ie = YoutubePlaylistIE(dl) - result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl')[0] + result = ie.extract('PLtPgu7CB4gbY9oDN3drwC3cMbJggS7dKl') self.assertEqual(len(result['entries']), 2) def test_youtube_show(self): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c992cba97..d97ea8c83 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1506,8 +1506,9 @@ class YoutubePlaylistIE(InfoExtractor): | ((?:PL|EC|UU|FL)[0-9A-Za-z-_]{10,}) )""" - _TEMPLATE_URL = 'https://gdata.youtube.com/feeds/api/playlists/%s?max-results=%i&start-index=%i&v=2&alt=json&safeSearch=none' - _MAX_RESULTS = 50 + _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&page=%s' + _MORE_PAGES_INDICATOR = r'data-link-type="next"' + _VIDEO_RE = r'href="/watch\?v=([0-9A-Za-z_-]{11})&' IE_NAME = u'youtube:playlist' @classmethod @@ -1532,41 +1533,23 @@ def _real_extract(self, url): else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) - # Download playlist videos from API - videos = [] + # Extract the video ids from the playlist pages + ids = [] for page_num in itertools.count(1): - start_index = self._MAX_RESULTS * (page_num - 1) + 1 - if start_index >= 1000: - self._downloader.report_warning(u'Max number of results reached') - break - url = self._TEMPLATE_URL % (playlist_id, self._MAX_RESULTS, start_index) + url = self._TEMPLATE_URL % (playlist_id, page_num) page = self._download_webpage(url, playlist_id, u'Downloading page #%s' % page_num) + # The ids are duplicated + new_ids = orderedSet(re.findall(self._VIDEO_RE, page)) + ids.extend(new_ids) - try: - response = json.loads(page) - except ValueError as err: - raise ExtractorError(u'Invalid JSON in API response: ' + compat_str(err)) - - if 'feed' not in response: - raise ExtractorError(u'Got a malformed response from YouTube API') - playlist_title = response['feed']['title']['$t'] - if 'entry' not in response['feed']: - # Number of videos is a multiple of self._MAX_RESULTS + if re.search(self._MORE_PAGES_INDICATOR, page) is None: break - for entry in response['feed']['entry']: - index = entry['yt$position']['$t'] - if 'media$group' in entry and 'yt$videoid' in entry['media$group']: - videos.append(( - index, - 'https://www.youtube.com/watch?v=' + entry['media$group']['yt$videoid']['$t'] - )) + playlist_title = self._og_search_title(page) - videos = [v[1] for v in sorted(videos)] - - url_results = [self.url_result(vurl, 'Youtube') for vurl in videos] - return [self.playlist_result(url_results, playlist_id, playlist_title)] + url_results = [self.url_result(vid, 'Youtube') for vid in ids] + return self.playlist_result(url_results, playlist_id, playlist_title) class YoutubeChannelIE(InfoExtractor): From 880e1c529de1d0f7f0a065afc4148320894a25b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Wed, 13 Nov 2013 16:39:11 +0100 Subject: [PATCH 02/29] [youtube:playlist] Login into youtube if requested (fixes #1757) Allows to download private playlists --- youtube_dl/extractor/youtube.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index d97ea8c83..c48c0e24f 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1490,7 +1490,7 @@ def _real_extract(self, url): }) return results -class YoutubePlaylistIE(InfoExtractor): +class YoutubePlaylistIE(YoutubeBaseInfoExtractor): IE_DESC = u'YouTube.com playlists' _VALID_URL = r"""(?: (?:https?://)? @@ -1516,6 +1516,9 @@ def suitable(cls, url): """Receives a URL and returns True if suitable for this IE.""" return re.match(cls._VALID_URL, url, re.VERBOSE) is not None + def _real_initialize(self): + self._login() + def _real_extract(self, url): # Extract playlist id mobj = re.match(self._VALID_URL, url, re.VERBOSE) From 00ea0f11eb76e7a67648790524a50f7254b9578f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 22 Nov 2013 20:00:35 +0100 Subject: [PATCH 03/29] Print full title in --get-title output (#1806) --- youtube_dl/YoutubeDL.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index a2e3df1f9..2700051cf 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -639,7 +639,7 @@ def process_info(self, info_dict): # Forced printings if self.params.get('forcetitle', False): - compat_print(info_dict['title']) + compat_print(info_dict['fulltitle']) if self.params.get('forceid', False): compat_print(info_dict['id']) if self.params.get('forceurl', False): From 50123be4211e2c16aa5d2fc9ebadbaf72a9becce Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister Date: Fri, 22 Nov 2013 20:23:55 +0100 Subject: [PATCH 04/29] release 2013.11.22.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index c1f581cd6..770b046a5 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22' +__version__ = '2013.11.22.1' From d35dc6d3b57781e5f1c0a5df308e3c08f66371a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= Date: Fri, 22 Nov 2013 21:19:31 +0100 Subject: [PATCH 05/29] [bandcamp] move the album test to the album extractor and return a single track instead of a playlist --- youtube_dl/extractor/bandcamp.py | 52 ++++++++++++++++---------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 359d4174b..1aa9dbefd 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -20,28 +20,6 @@ class BandcampIE(InfoExtractor): u"title": u"youtube-dl test song \"'/\\\u00e4\u21ad" }, u'skip': u'There is a limit of 200 free downloads / month for the test song' - }, { - u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', - u'playlist': [ - { - u'file': u'1353101989.mp3', - u'md5': u'39bc1eded3476e927c724321ddf116cf', - u'info_dict': { - u'title': u'Intro', - } - }, - { - u'file': u'38097443.mp3', - u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', - u'info_dict': { - u'title': u'Kero One - Keep It Alive (Blazo remix)', - } - }, - ], - u'params': { - u'playlistend': 2 - }, - u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' }] def _real_extract(self, url): @@ -63,13 +41,11 @@ def _real_extract(self, url): 'url': format_url, 'ext': format_id.partition('-')[0] } for format_id, format_url in sorted(d['file'].items())] - entries.append({ + return { 'id': compat_str(d['id']), 'title': d['title'], 'formats': formats, - }) - - return self.playlist_result(entries, title, title) + } else: raise ExtractorError(u'No free songs found') @@ -112,6 +88,30 @@ class BandcampAlbumIE(InfoExtractor): IE_NAME = u'Bandcamp:album' _VALID_URL = r'http://.*?\.bandcamp\.com/album/(?P.*)' + _TEST = { + u'url': u'http://blazo.bandcamp.com/album/jazz-format-mixtape-vol-1', + u'playlist': [ + { + u'file': u'1353101989.mp3', + u'md5': u'39bc1eded3476e927c724321ddf116cf', + u'info_dict': { + u'title': u'Intro', + } + }, + { + u'file': u'38097443.mp3', + u'md5': u'1a2c32e2691474643e912cc6cd4bffaa', + u'info_dict': { + u'title': u'Kero One - Keep It Alive (Blazo remix)', + } + }, + ], + u'params': { + u'playlistend': 2 + }, + u'skip': u'Bancamp imposes download limits. See test_playlists:test_bandcamp_album for the playlist test' + } + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') From 9f79463803f40a15a6350dc693af75ec215147f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 21:25:12 +0100 Subject: [PATCH 06/29] [howcast] update test's checksum --- youtube_dl/extractor/howcast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/howcast.py b/youtube_dl/extractor/howcast.py index 46954337f..bafc5826f 100644 --- a/youtube_dl/extractor/howcast.py +++ b/youtube_dl/extractor/howcast.py @@ -8,7 +8,7 @@ class HowcastIE(InfoExtractor): _TEST = { u'url': u'http://www.howcast.com/videos/390161-How-to-Tie-a-Square-Knot-Properly', u'file': u'390161.mp4', - u'md5': u'1d7ba54e2c9d7dc6935ef39e00529138', + u'md5': u'8b743df908c42f60cf6496586c7f12c3', u'info_dict': { u"description": u"The square knot, also known as the reef knot, is one of the oldest, most basic knots to tie, and can be used in many different ways. Here's the proper way to tie a square knot.", u"title": u"How to Tie a Square Knot Properly" From d3b30148edb6795aadc96b3a464c492b239a2242 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Fri, 22 Nov 2013 21:26:31 +0100 Subject: [PATCH 07/29] [bambuser:channel] Update test --- test/test_playlists.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_playlists.py b/test/test_playlists.py index d83b3bf51..7c67239a4 100644 --- a/test/test_playlists.py +++ b/test/test_playlists.py @@ -102,7 +102,7 @@ def test_bambuser_channel(self): result = ie.extract('http://bambuser.com/channel/pixelversity') self.assertIsPlaylist(result) self.assertEqual(result['title'], u'pixelversity') - self.assertTrue(len(result['entries']) >= 66) + self.assertTrue(len(result['entries']) >= 60) def test_bandcamp_album(self): dl = FakeYDL() From 7012b23c947fc1ed146e314a30d3c70a5fde70e7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 22:46:46 +0100 Subject: [PATCH 08/29] Match --download-archive during playlist processing (Fixes #1745) --- test/test_youtube_lists.py | 6 ++--- youtube_dl/YoutubeDL.py | 43 +++++++++++++++++++++++---------- youtube_dl/extractor/common.py | 4 ++- youtube_dl/extractor/youtube.py | 26 ++++++++++++-------- 4 files changed, 52 insertions(+), 27 deletions(-) diff --git a/test/test_youtube_lists.py b/test/test_youtube_lists.py index 50ad52695..938517a2d 100644 --- a/test/test_youtube_lists.py +++ b/test/test_youtube_lists.py @@ -84,16 +84,16 @@ def test_youtube_channel(self): dl = FakeYDL() ie = YoutubeChannelIE(dl) #test paginated channel - result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w')[0] + result = ie.extract('https://www.youtube.com/channel/UCKfVa3S1e4PHvxWcwyMMg8w') self.assertTrue(len(result['entries']) > 90) #test autogenerated channel - result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos')[0] + result = ie.extract('https://www.youtube.com/channel/HCtnHdj3df7iM/videos') self.assertTrue(len(result['entries']) >= 18) def test_youtube_user(self): dl = FakeYDL() ie = YoutubeUserIE(dl) - result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation')[0] + result = ie.extract('https://www.youtube.com/user/TheLinuxFoundation') self.assertTrue(len(result['entries']) >= 320) def test_youtube_safe_search(self): diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 2700051cf..beb7d0cd1 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -355,15 +355,17 @@ def prepare_filename(self, info_dict): def _match_entry(self, info_dict): """ Returns None iff the file should be downloaded """ - title = info_dict['title'] - matchtitle = self.params.get('matchtitle', False) - if matchtitle: - if not re.search(matchtitle, title, re.IGNORECASE): - return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' - rejecttitle = self.params.get('rejecttitle', False) - if rejecttitle: - if re.search(rejecttitle, title, re.IGNORECASE): - return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' + if 'title' in info_dict: + # This can happen when we're just evaluating the playlist + title = info_dict['title'] + matchtitle = self.params.get('matchtitle', False) + if matchtitle: + if not re.search(matchtitle, title, re.IGNORECASE): + return u'[download] "' + title + '" title did not match pattern "' + matchtitle + '"' + rejecttitle = self.params.get('rejecttitle', False) + if rejecttitle: + if re.search(rejecttitle, title, re.IGNORECASE): + return u'"' + title + '" title matched reject pattern "' + rejecttitle + '"' date = info_dict.get('upload_date', None) if date is not None: dateRange = self.params.get('daterange', DateRange()) @@ -374,8 +376,8 @@ def _match_entry(self, info_dict): if age_limit < info_dict.get('age_limit', 0): return u'Skipping "' + title + '" because it is age restricted' if self.in_download_archive(info_dict): - return (u'%(title)s has already been recorded in archive' - % info_dict) + return (u'%s has already been recorded in archive' + % info_dict.get('title', info_dict.get('id', u'video'))) return None @staticmethod @@ -454,7 +456,7 @@ def process_ie_result(self, ie_result, download=True, extra_info={}): ie_key=ie_result.get('ie_key'), extra_info=extra_info) elif result_type == 'playlist': - self.add_extra_info(ie_result, extra_info) + # We process each entry in the playlist playlist = ie_result.get('title', None) or ie_result.get('id', None) self.to_screen(u'[download] Downloading playlist: %s' % playlist) @@ -484,6 +486,12 @@ def process_ie_result(self, ie_result, download=True, extra_info={}): 'webpage_url': ie_result['webpage_url'], 'extractor_key': ie_result['extractor_key'], } + + reason = self._match_entry(entry) + if reason is not None: + self.to_screen(u'[download] ' + reason) + continue + entry_result = self.process_ie_result(entry, download=download, extra_info=extra) @@ -810,7 +818,16 @@ def in_download_archive(self, info_dict): fn = self.params.get('download_archive') if fn is None: return False - vid_id = info_dict['extractor'] + u' ' + info_dict['id'] + extractor = info_dict.get('extractor_id') + if extractor is None: + if 'id' in info_dict: + extractor = info_dict.get('ie_key') # key in a playlist + if extractor is None: + return False # Incomplete video information + # Future-proof against any change in case + # and backwards compatibility with prior versions + extractor = extractor.lower() + vid_id = extractor + u' ' + info_dict['id'] try: with locked_file(fn, 'r', encoding='utf-8') as archive_file: for line in archive_file: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index eb3435c77..3cebeaf29 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -229,12 +229,14 @@ def report_login(self): self.to_screen(u'Logging in') #Methods for following #608 - def url_result(self, url, ie=None): + def url_result(self, url, ie=None, video_id=None): """Returns a url that points to a page that should be processed""" #TODO: ie should be the class used for getting the info video_info = {'_type': 'url', 'url': url, 'ie_key': ie} + if video_id is not None: + video_info['id'] = video_id return video_info def playlist_result(self, entries, playlist_id=None, playlist_title=None): """Returns a playlist""" diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 9b09793eb..126688652 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1552,7 +1552,7 @@ def _real_extract(self, url): video_id = query_dict['v'][0] if self._downloader.params.get('noplaylist'): self.to_screen(u'Downloading just video %s because of --no-playlist' % video_id) - return self.url_result('https://www.youtube.com/watch?v=' + video_id, 'Youtube') + return self.url_result(video_id, 'Youtube', video_id=video_id) else: self.to_screen(u'Downloading playlist PL%s - add --no-playlist to just download video %s' % (playlist_id, video_id)) @@ -1571,7 +1571,8 @@ def _real_extract(self, url): playlist_title = self._og_search_title(page) - url_results = [self.url_result(vid, 'Youtube') for vid in ids] + url_results = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in ids] return self.playlist_result(url_results, playlist_id, playlist_title) @@ -1626,9 +1627,9 @@ def _real_extract(self, url): self._downloader.to_screen(u'[youtube] Channel %s: Found %i videos' % (channel_id, len(video_ids))) - urls = ['http://www.youtube.com/watch?v=%s' % id for id in video_ids] - url_entries = [self.url_result(eurl, 'Youtube') for eurl in urls] - return [self.playlist_result(url_entries, channel_id)] + url_entries = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_entries, channel_id) class YoutubeUserIE(InfoExtractor): @@ -1692,9 +1693,11 @@ def _real_extract(self, url): if len(ids_in_page) < self._GDATA_PAGE_SIZE: break - urls = ['http://www.youtube.com/watch?v=%s' % video_id for video_id in video_ids] - url_results = [self.url_result(rurl, 'Youtube') for rurl in urls] - return [self.playlist_result(url_results, playlist_title = username)] + url_results = [ + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] + return self.playlist_result(url_results, playlist_title=username) + class YoutubeSearchIE(SearchInfoExtractor): IE_DESC = u'YouTube.com searches' @@ -1735,7 +1738,8 @@ def _get_n_results(self, query, n): if len(video_ids) > n: video_ids = video_ids[:n] - videos = [self.url_result('http://www.youtube.com/watch?v=%s' % id, 'Youtube') for id in video_ids] + videos = [self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in video_ids] return self.playlist_result(videos, query) class YoutubeSearchDateIE(YoutubeSearchIE): @@ -1795,7 +1799,9 @@ def _real_extract(self, url): feed_html = info['feed_html'] m_ids = re.finditer(r'"/watch\?v=(.*?)["&]', feed_html) ids = orderedSet(m.group(1) for m in m_ids) - feed_entries.extend(self.url_result(id, 'Youtube') for id in ids) + feed_entries.extend( + self.url_result(video_id, 'Youtube', video_id=video_id) + for video_id in ids) if info['paging'] is None: break return self.playlist_result(feed_entries, playlist_title=self._PLAYLIST_TITLE) From d7386f6276b7b01ff4254136524d29c8f243721e Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 23:05:56 +0100 Subject: [PATCH 09/29] [update] Check if version from repository is newer before updating Closes #1704 --- youtube_dl/update.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index e5f441707..be7800e8b 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -41,6 +41,7 @@ def b(x): if signature != sha256(message).digest(): return False return True + def update_self(to_screen, verbose): """Update the program file with the latest version from the repository""" @@ -82,6 +83,13 @@ def update_self(to_screen, verbose): return version_id = versions_info['latest'] + + def version_tuple(version_str): + return tuple(map(int, version_str.split('.'))) + if version_tuple(__version__) >= version_tuple(version_str): + to_screen(u'youtube-dl is up to date (%s)' % __version__) + return + to_screen(u'Updating to version ' + version_id + '...') version = versions_info['versions'][version_id] From a87b0615aa311083923e607c3d1a5cdceab818f7 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Fri, 22 Nov 2013 23:08:15 +0100 Subject: [PATCH 10/29] release 2013.11.22.2 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 770b046a5..f6d18f945 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22.1' +__version__ = '2013.11.22.2' From 43afe2858870d140b2a133fda2a0cbbd642a3bfc Mon Sep 17 00:00:00 2001 From: Itay Brandes <Brandes.Itay@gmail.com> Date: Sat, 23 Nov 2013 10:22:18 +0200 Subject: [PATCH 11/29] Log to an external logger (fixes #1810) Sadly applications using youtube-dl's python sources can't directly access it's log stream. It's pretty much limited to stdout and stderr only. It should log to logging.Logger instance passed to YoutubeDL's params dictionary. --- youtube_dl/YoutubeDL.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index beb7d0cd1..6729d53ad 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,6 +97,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. + logger: Log messages to a logging.Logger instance. logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file @@ -192,7 +193,9 @@ def add_post_processor(self, pp): def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - if not self.params.get('quiet', False): + if self.params.get('logger', False): + self.params['logger'].debug(message) + elif not self.params.get('quiet', False): terminator = [u'\n', u''][skip_eol] output = message + terminator write_string(output, self._screen_file) @@ -200,10 +203,13 @@ def to_screen(self, message, skip_eol=False): def to_stderr(self, message): """Print message to stderr.""" assert type(message) == type(u'') - output = message + u'\n' - if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr - output = output.encode(preferredencoding()) - sys.stderr.write(output) + if self.params.get('logger', False): + self.params['logger'].error(message) + else: + output = message + u'\n' + if 'b' in getattr(self._screen_file, 'mode', '') or sys.version_info[0] < 3: # Python 2 lies about the mode of sys.stdout/sys.stderr + output = output.encode(preferredencoding()) + sys.stderr.write(output) def to_console_title(self, message): if not self.params.get('consoletitle', False): From 52ad14aeb0178a187a861e7ce2259b7046702281 Mon Sep 17 00:00:00 2001 From: Takuya Tsuchida <takuya0301@gmail.com> Date: Sat, 23 Nov 2013 18:19:44 +0900 Subject: [PATCH 12/29] Add support for niconico --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/niconico.py | 190 +++++++++++++++++++++++++++++++ 2 files changed, 191 insertions(+) create mode 100644 youtube_dl/extractor/niconico.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 02f9e2546..f443f11f6 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -98,6 +98,7 @@ from .nbc import NBCNewsIE from .newgrounds import NewgroundsIE from .nhl import NHLIE, NHLVideocenterIE +from .niconico import NiconicoIE from .nowvideo import NowVideoIE from .ooyala import OoyalaIE from .orf import ORFIE diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py new file mode 100644 index 000000000..8638a8ee8 --- /dev/null +++ b/youtube_dl/extractor/niconico.py @@ -0,0 +1,190 @@ +# encoding: utf-8 + +import re +import socket +import xml.etree.ElementTree + +from .common import InfoExtractor +from ..utils import ( + compat_http_client, + compat_urllib_error, + compat_urllib_parse, + compat_urllib_request, + compat_urlparse, + compat_str, + + ExtractorError, + unified_strdate, +) + +class NiconicoIE(InfoExtractor): + IE_NAME = u'niconico' + IE_DESC = u'ニコニコ動画' + + _TEST = { + u'url': u'http://www.nicovideo.jp/watch/sm22312215', + u'file': u'sm22312215.mp4', + u'md5': u'd1a75c0823e2f629128c43e1212760f9', + u'info_dict': { + u'title': u'Big Buck Bunny', + u'uploader': u'takuya0301', + u'uploader_id': u'2698420', + u'upload_date': u'20131123', + u'description': u'(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org', + }, + u'params': { + u'username': u'ydl.niconico@gmail.com', + u'password': u'youtube-dl', + }, + } + + _VALID_URL = r'^(?:https?://)?(?:www\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' + _LOGIN_URL = 'https://secure.nicovideo.jp/secure/login' + _NETRC_MACHINE = 'niconico' + # If True it will raise an error if no login info is provided + _LOGIN_REQUIRED = True + + def _real_initialize(self): + self._login() + + def _login(self): + (username, password) = self._get_login_info() + # No authentication to be performed + if username is None: + if self._LOGIN_REQUIRED: + raise ExtractorError(u'No login info available, needed for using %s.' % self.IE_NAME, expected=True) + return False + + # Log in + login_form_strs = { + u'mail': username, + u'password': password, + } + # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode + # chokes on unicode + login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) + login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') + request = compat_urllib_request.Request(self._LOGIN_URL, login_data) + try: + self.report_login() + login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') + if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') + return False + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + return False + return True + + def _real_extract(self, url): + video_id = self._extract_id(url) + + # Get video webpage + self.report_video_webpage_download(video_id) + url = 'http://www.nicovideo.jp/watch/' + video_id + request = compat_urllib_request.Request(url) + try: + video_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) + + # Get video info + self.report_video_info_webpage_download(video_id) + url = 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id + request = compat_urllib_request.Request(url) + try: + video_info_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download video info webpage: %s' % compat_str(err)) + + # Get flv info + self.report_flv_info_webpage_download(video_id) + url = 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id + request = compat_urllib_request.Request(url) + try: + flv_info_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + raise ExtractorError(u'Unable to download flv info webpage: %s' % compat_str(err)) + + # Start extracting information + self.report_information_extraction(video_id) + video_info = xml.etree.ElementTree.fromstring(video_info_webpage) + + # url + video_real_url = compat_urlparse.parse_qs(flv_info_webpage.decode('utf-8'))['url'][0] + + # title + video_title = video_info.find('.//title').text + + # ext + video_extension = video_info.find('.//movie_type').text + + # format + video_format = video_extension.upper() + + # thumbnail + video_thumbnail = video_info.find('.//thumbnail_url').text + + # description + video_description = video_info.find('.//description').text + + # uploader_id + video_uploader_id = video_info.find('.//user_id').text + + # uploader + url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id + request = compat_urllib_request.Request(url) + try: + user_info_webpage = compat_urllib_request.urlopen(request).read() + except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: + self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) + + user_info = xml.etree.ElementTree.fromstring(user_info_webpage) + video_uploader = user_info.find('.//nickname').text + + # uploder_date + video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + + # view_count + video_view_count = video_info.find('.//view_counter').text + + # webpage_url + video_webpage_url = video_info.find('.//watch_url').text + + return { + 'id': video_id, + 'url': video_real_url, + 'title': video_title, + 'ext': video_extension, + 'format': video_format, + 'thumbnail': video_thumbnail, + 'description': video_description, + 'uploader': video_uploader, + 'upload_date': video_upload_date, + 'uploader_id': video_uploader_id, + 'view_count': video_view_count, + 'webpage_url': video_webpage_url, + } + + def _extract_id(self, url): + mobj = re.match(self._VALID_URL, url) + if mobj is None: + raise ExtractorError(u'Invalid URL: %s' % url) + video_id = mobj.group(1) + return video_id + + def report_video_webpage_download(self, video_id): + """Report attempt to download video webpage.""" + self.to_screen(u'%s: Downloading video webpage' % video_id) + + def report_video_info_webpage_download(self, video_id): + """Report attempt to download video info webpage.""" + self.to_screen(u'%s: Downloading video info webpage' % video_id) + + def report_flv_info_webpage_download(self, video_id): + """Report attempt to download flv info webpage.""" + self.to_screen(u'%s: Downloading flv info webpage' % video_id) + + def report_information_extraction(self, video_id): + """Report attempt to extract video information.""" + self.to_screen(u'%s: Extracting video information' % video_id) From e5c146d58682dbdda2b46fc71a16ddc5b1fcc9fc Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sat, 23 Nov 2013 15:57:42 +0100 Subject: [PATCH 13/29] [streamcloud] skip test on travis --- youtube_dl/extractor/streamcloud.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/extractor/streamcloud.py b/youtube_dl/extractor/streamcloud.py index d476693ec..9faf3a5e3 100644 --- a/youtube_dl/extractor/streamcloud.py +++ b/youtube_dl/extractor/streamcloud.py @@ -21,6 +21,7 @@ class StreamcloudIE(InfoExtractor): u'title': u'youtube-dl test video \'/\\ ä ↭', u'duration': 9, }, + u'skip': u'Only available from the EU' } def _real_extract(self, url): From 36de0a0e1a49f6324616f9b508920ff7d06136c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sat, 23 Nov 2013 23:26:06 +0100 Subject: [PATCH 14/29] [brightcove] Set the 'videoPlayer' value to the 'videoId' if it's missing in the parameters (fixes #1815) --- youtube_dl/extractor/brightcove.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index d8c35465a..74a7d13e3 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -75,14 +75,17 @@ def _build_brighcove_url(cls, object_str): params = {'flashID': object_doc.attrib['id'], 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } - playerKey = find_xpath_attr(object_doc, './param', 'name', 'playerKey') + def find_param(name): + return find_xpath_attr(object_doc, './param', 'name', name) + playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: params['playerKey'] = playerKey.attrib['value'] - videoPlayer = find_xpath_attr(object_doc, './param', 'name', '@videoPlayer') + # The three fields hold the id of the video + videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: params['@videoPlayer'] = videoPlayer.attrib['value'] - linkBase = find_xpath_attr(object_doc, './param', 'name', 'linkBaseURL') + linkBase = find_param('linkBaseURL') if linkBase is not None: params['linkBaseURL'] = linkBase.attrib['value'] data = compat_urllib_parse.urlencode(params) From 8bf9319e9c0c02f5516b00509476abff89eb3d41 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:08:11 +0100 Subject: [PATCH 15/29] Simplify logger code(#1811) --- youtube_dl/YoutubeDL.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 6729d53ad..d7e2417ac 100644 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -97,7 +97,7 @@ class YoutubeDL(object): playlistend: Playlist item to end at. matchtitle: Download only matching titles. rejecttitle: Reject downloads for matching titles. - logger: Log messages to a logging.Logger instance. + logger: Log messages to a logging.Logger instance. logtostderr: Log messages to stderr instead of stdout. writedescription: Write the video description to a .description file writeinfojson: Write the video description to a .info.json file @@ -193,7 +193,7 @@ def add_post_processor(self, pp): def to_screen(self, message, skip_eol=False): """Print message to stdout if not in quiet mode.""" - if self.params.get('logger', False): + if self.params.get('logger'): self.params['logger'].debug(message) elif not self.params.get('quiet', False): terminator = [u'\n', u''][skip_eol] @@ -203,7 +203,7 @@ def to_screen(self, message, skip_eol=False): def to_stderr(self, message): """Print message to stderr.""" assert type(message) == type(u'') - if self.params.get('logger', False): + if self.params.get('logger'): self.params['logger'].error(message) else: output = message + u'\n' From 13ebea791fb4293acf939730ad5a9c07e553005f Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:37:14 +0100 Subject: [PATCH 16/29] [niconico] Simplify and make work with old Python versions The website requires SSLv3, otherwise it just times out during SSL negotiation. --- youtube_dl/extractor/niconico.py | 121 ++++++++----------------------- youtube_dl/utils.py | 29 ++++++-- 2 files changed, 55 insertions(+), 95 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 8638a8ee8..22898b5a1 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -17,6 +17,7 @@ unified_strdate, ) + class NiconicoIE(InfoExtractor): IE_NAME = u'niconico' IE_DESC = u'ニコニコ動画' @@ -38,8 +39,7 @@ class NiconicoIE(InfoExtractor): }, } - _VALID_URL = r'^(?:https?://)?(?:www\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' - _LOGIN_URL = 'https://secure.nicovideo.jp/secure/login' + _VALID_URL = r'^https?://(?:www\.|secure\.)?nicovideo\.jp/watch/([a-z][a-z][0-9]+)(?:.*)$' _NETRC_MACHINE = 'niconico' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = True @@ -57,99 +57,63 @@ def _login(self): # Log in login_form_strs = { - u'mail': username, - u'password': password, + u'mail': username, + u'password': password, } # Convert to UTF-8 *before* urlencode because Python 2.x's urlencode # chokes on unicode login_form = dict((k.encode('utf-8'), v.encode('utf-8')) for k,v in login_form_strs.items()) - login_data = compat_urllib_parse.urlencode(login_form).encode('ascii') - request = compat_urllib_request.Request(self._LOGIN_URL, login_data) - try: - self.report_login() - login_results = compat_urllib_request.urlopen(request).read().decode('utf-8') - if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: - self._downloader.report_warning(u'unable to log in: bad username or password') - return False - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - self._downloader.report_warning(u'unable to log in: %s' % compat_str(err)) + login_data = compat_urllib_parse.urlencode(login_form).encode('utf-8') + request = compat_urllib_request.Request( + u'https://secure.nicovideo.jp/secure/login', login_data) + login_results = self._download_webpage( + request, u'', note=u'Logging in', errnote=u'Unable to log in') + if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None: + self._downloader.report_warning(u'unable to log in: bad username or password') return False return True def _real_extract(self, url): - video_id = self._extract_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) # Get video webpage - self.report_video_webpage_download(video_id) - url = 'http://www.nicovideo.jp/watch/' + video_id - request = compat_urllib_request.Request(url) - try: - video_webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video webpage: %s' % compat_str(err)) + video_webpage = self._download_webpage( + 'http://www.nicovideo.jp/watch/' + video_id, video_id) - # Get video info - self.report_video_info_webpage_download(video_id) - url = 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id - request = compat_urllib_request.Request(url) - try: - video_info_webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download video info webpage: %s' % compat_str(err)) + video_info_webpage = self._download_webpage( + 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, + note=u'Downloading video info page') # Get flv info - self.report_flv_info_webpage_download(video_id) - url = 'http://flapi.nicovideo.jp/api/getflv?v=' + video_id - request = compat_urllib_request.Request(url) - try: - flv_info_webpage = compat_urllib_request.urlopen(request).read() - except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: - raise ExtractorError(u'Unable to download flv info webpage: %s' % compat_str(err)) + flv_info_webpage = self._download_webpage( + u'http://flapi.nicovideo.jp/api/getflv?v=' + video_id, + video_id, u'Downloading flv info') + video_real_url = compat_urlparse.parse_qs(flv_info_webpage)['url'][0] # Start extracting information - self.report_information_extraction(video_id) video_info = xml.etree.ElementTree.fromstring(video_info_webpage) - - # url - video_real_url = compat_urlparse.parse_qs(flv_info_webpage.decode('utf-8'))['url'][0] - - # title video_title = video_info.find('.//title').text - - # ext video_extension = video_info.find('.//movie_type').text - - # format video_format = video_extension.upper() - - # thumbnail video_thumbnail = video_info.find('.//thumbnail_url').text - - # description video_description = video_info.find('.//description').text - - # uploader_id video_uploader_id = video_info.find('.//user_id').text + video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) + video_view_count = video_info.find('.//view_counter').text + video_webpage_url = video_info.find('.//watch_url').text # uploader + video_uploader = video_uploader_id url = 'http://seiga.nicovideo.jp/api/user/info?id=' + video_uploader_id - request = compat_urllib_request.Request(url) try: - user_info_webpage = compat_urllib_request.urlopen(request).read() + user_info_webpage = self._download_webpage( + url, video_id, note=u'Downloading user information') except (compat_urllib_error.URLError, compat_http_client.HTTPException, socket.error) as err: self._downloader.report_warning(u'Unable to download user info webpage: %s' % compat_str(err)) - - user_info = xml.etree.ElementTree.fromstring(user_info_webpage) - video_uploader = user_info.find('.//nickname').text - - # uploder_date - video_upload_date = unified_strdate(video_info.find('.//first_retrieve').text.split('+')[0]) - - # view_count - video_view_count = video_info.find('.//view_counter').text - - # webpage_url - video_webpage_url = video_info.find('.//watch_url').text + else: + user_info = xml.etree.ElementTree.fromstring(user_info_webpage) + video_uploader = user_info.find('.//nickname').text return { 'id': video_id, @@ -165,26 +129,3 @@ def _real_extract(self, url): 'view_count': video_view_count, 'webpage_url': video_webpage_url, } - - def _extract_id(self, url): - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError(u'Invalid URL: %s' % url) - video_id = mobj.group(1) - return video_id - - def report_video_webpage_download(self, video_id): - """Report attempt to download video webpage.""" - self.to_screen(u'%s: Downloading video webpage' % video_id) - - def report_video_info_webpage_download(self, video_id): - """Report attempt to download video info webpage.""" - self.to_screen(u'%s: Downloading video info webpage' % video_id) - - def report_flv_info_webpage_download(self, video_id): - """Report attempt to download flv info webpage.""" - self.to_screen(u'%s: Downloading flv info webpage' % video_id) - - def report_information_extraction(self, video_id): - """Report attempt to extract video information.""" - self.to_screen(u'%s: Extracting video information' % video_id) diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 0720fe9eb..34b3d19e0 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -12,6 +12,7 @@ import pipes import platform import re +import ssl import socket import sys import traceback @@ -535,13 +536,31 @@ def formatSeconds(secs): else: return '%d' % secs + def make_HTTPS_handler(opts): - if sys.version_info < (3,2): - # Python's 2.x handler is very simplistic - return compat_urllib_request.HTTPSHandler() + if sys.version_info < (3, 2): + import httplib + + class HTTPSConnectionV3(httplib.HTTPSConnection): + def __init__(self, *args, **kwargs): + httplib.HTTPSConnection.__init__(self, *args, **kwargs) + + def connect(self): + sock = socket.create_connection((self.host, self.port), self.timeout) + if self._tunnel_host: + self.sock = sock + self._tunnel() + try: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv3) + except ssl.SSLError as e: + self.sock = ssl.wrap_socket(sock, self.key_file, self.cert_file, ssl_version=ssl.PROTOCOL_SSLv23) + + class HTTPSHandlerV3(compat_urllib_request.HTTPSHandler): + def https_open(self, req): + return self.do_open(HTTPSConnectionV3, req) + return HTTPSHandlerV3() else: - import ssl - context = ssl.SSLContext(ssl.PROTOCOL_SSLv23) + context = ssl.SSLContext(ssl.PROTOCOL_SSLv3) context.set_default_verify_paths() context.verify_mode = (ssl.CERT_NONE From 38b2db6a666e094896927217aa293750a732e81d Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:39:49 +0100 Subject: [PATCH 17/29] Credit @takuya0301 for niconico --- youtube_dl/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 64ebf4d48..19904dbfd 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -35,6 +35,7 @@ 'Jelle van der Waa', 'Marcin Cieślak', 'Anton Larionov', + 'Takuya Tsuchida', ) __license__ = 'Public Domain' From 2e767313e49b43400b3baae247e0f4c9e9e24992 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:52:21 +0100 Subject: [PATCH 18/29] [update] fix error --- youtube_dl/update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/update.py b/youtube_dl/update.py index be7800e8b..cd9670166 100644 --- a/youtube_dl/update.py +++ b/youtube_dl/update.py @@ -86,7 +86,7 @@ def update_self(to_screen, verbose): def version_tuple(version_str): return tuple(map(int, version_str.split('.'))) - if version_tuple(__version__) >= version_tuple(version_str): + if version_tuple(__version__) >= version_tuple(version_id): to_screen(u'youtube-dl is up to date (%s)' % __version__) return From 23e6d50d73188eab26944e41f164a5a1ab7f547a Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:52:53 +0100 Subject: [PATCH 19/29] [bandcamp] Remove unused variable --- youtube_dl/extractor/bandcamp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 1aa9dbefd..3a32c14c5 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,7 +34,6 @@ def _real_extract(self, url): json_code = m_trackinfo.group(1) data = json.loads(json_code) - entries = [] for d in data: formats = [{ 'format_id': 'format_id', From bd49928f7a0254eeb8d5f918c5649ce4eb78ef36 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:53:50 +0100 Subject: [PATCH 20/29] [niconico] Clarify download --- youtube_dl/extractor/niconico.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py index 22898b5a1..729607ea3 100644 --- a/youtube_dl/extractor/niconico.py +++ b/youtube_dl/extractor/niconico.py @@ -77,9 +77,9 @@ def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group(1) - # Get video webpage - video_webpage = self._download_webpage( - 'http://www.nicovideo.jp/watch/' + video_id, video_id) + # Get video webpage. We are not actually interested in it, but need + # the cookies in order to be able to download the info webpage + self._download_webpage('http://www.nicovideo.jp/watch/' + video_id, video_id) video_info_webpage = self._download_webpage( 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id, From 66ec0192406bbf1bffcb6c4e72fe1529f1e21195 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 06:54:26 +0100 Subject: [PATCH 21/29] [youtube] do not use variable name twice --- youtube_dl/extractor/youtube.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 126688652..07a457f4d 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1571,8 +1571,8 @@ def _real_extract(self, url): playlist_title = self._og_search_title(page) - url_results = [self.url_result(video_id, 'Youtube', video_id=video_id) - for video_id in ids] + url_results = [self.url_result(vid_id, 'Youtube', video_id=vid_id) + for vid_id in ids] return self.playlist_result(url_results, playlist_id, playlist_title) From 382ed50e0ecfb2fa692049030c858b99159c791b Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:30:05 +0100 Subject: [PATCH 22/29] [viki] Add extractor (fixes #1813) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/viki.py | 91 ++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+) create mode 100644 youtube_dl/extractor/viki.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index f443f11f6..867734fa2 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -157,6 +157,7 @@ from .videopremium import VideoPremiumIE from .vimeo import VimeoIE, VimeoChannelIE from .vine import VineIE +from .viki import VikiIE from .vk import VKIE from .wat import WatIE from .websurg import WeBSurgIE diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py new file mode 100644 index 000000000..78d03c079 --- /dev/null +++ b/youtube_dl/extractor/viki.py @@ -0,0 +1,91 @@ +import re + +from ..utils import ( + unified_strdate, +) +from .subtitles import SubtitlesInfoExtractor + + +class VikiIE(SubtitlesInfoExtractor): + IE_NAME = u'viki' + + _VALID_URL = r'^https?://(?:www\.)?viki\.com/videos/(?P<id>[0-9]+v)' + _TEST = { + u'url': u'http://www.viki.com/videos/1023585v-heirs-episode-14', + u'file': u'1023585v.mp4', + u'md5': u'a21454021c2646f5433514177e2caa5f', + u'info_dict': { + u'title': u'Heirs Episode 14', + u'uploader': u'SBS', + u'description': u'md5:c4b17b9626dd4b143dcc4d855ba3474e', + u'upload_date': u'20131121', + u'age_limit': 13, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + webpage = self._download_webpage(url, video_id) + title = self._og_search_title(webpage) + description = self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + uploader = self._html_search_regex( + r'<strong>Broadcast Network: </strong>\s*([^<]*)<', webpage, + u'uploader') + if uploader is not None: + uploader = uploader.strip() + + rating_str = self._html_search_regex( + r'<strong>Rating: </strong>\s*([^<]*)<', webpage, + u'rating information', default='').strip() + RATINGS = { + 'G': 0, + 'PG': 10, + 'PG-13': 13, + 'R': 16, + 'NC': 18, + } + age_limit = RATINGS.get(rating_str) + + info_url = 'http://www.viki.com/player5_fragment/%s?action=show&controller=videos' % video_id + info_webpage = self._download_webpage(info_url, video_id) + video_url = self._html_search_regex( + r'<source[^>]+src="([^"]+)"', info_webpage, u'video URL') + + upload_date_str = self._html_search_regex( + r'"created_at":"([^"]+)"', info_webpage, u'upload date') + upload_date = ( + unified_strdate(upload_date_str) + if upload_date_str is not None + else None + ) + + # subtitles + video_subtitles = self.extract_subtitles(video_id, info_webpage) + if self._downloader.params.get('listsubtitles', False): + self._list_available_subtitles(video_id, info_webpage) + return + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'description': description, + 'thumbnail': thumbnail, + 'age_limit': age_limit, + 'uploader': uploader, + 'subtitles': video_subtitles, + 'upload_date': upload_date, + } + + def _get_available_subtitles(self, video_id, info_webpage): + res = {} + for sturl in re.findall(r'<track src="([^"]+)"/>'): + m = re.search(r'/(?P<lang>[a-z]+)\.vtt', sturl) + if not m: + continue + res[m.group('lang')] = sturl + return res From eaaafc59c2f8ffaee4df06092a57f65eec1b6eaa Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:30:34 +0100 Subject: [PATCH 23/29] release 2013.11.24 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index f6d18f945..68ef46a30 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.22.2' +__version__ = '2013.11.24' From 0c7c19d6bc55a624532f2426d080aea51962cfe0 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:51:44 +0100 Subject: [PATCH 24/29] [clipfish] Add extractor (Fixes #1760) --- youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/clipfish.py | 53 ++++++++++++++++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 youtube_dl/extractor/clipfish.py diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 867734fa2..4c280fa5e 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -20,6 +20,7 @@ from .canalplus import CanalplusIE from .canalc2 import Canalc2IE from .cinemassacre import CinemassacreIE +from .clipfish import ClipfishIE from .cnn import CNNIE from .collegehumor import CollegeHumorIE from .comedycentral import ComedyCentralIE diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py new file mode 100644 index 000000000..95449da3c --- /dev/null +++ b/youtube_dl/extractor/clipfish.py @@ -0,0 +1,53 @@ +import re +import time +import xml.etree.ElementTree + +from .common import InfoExtractor + + +class ClipfishIE(InfoExtractor): + IE_NAME = u'clipfish' + + _VALID_URL = r'^https?://(?:www\.)?clipfish\.de/.*?/video/(?P<id>[0-9]+)/' + _TEST = { + u'url': u'http://www.clipfish.de/special/supertalent/video/4028320/supertalent-2013-ivana-opacak-singt-nobodys-perfect/', + u'file': u'4028320.f4v', + u'md5': u'5e38bda8c329fbfb42be0386a3f5a382', + u'info_dict': { + u'title': u'Supertalent 2013: Ivana Opacak singt Nobody\'s Perfect', + u'duration': 399, + } + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group(1) + + info_url = ('http://www.clipfish.de/devxml/videoinfo/%s?ts=%d' % + (video_id, int(time.time()))) + info_xml = self._download_webpage( + info_url, video_id, note=u'Downloading info page') + doc = xml.etree.ElementTree.fromstring(info_xml) + title = doc.find('title').text + video_url = doc.find('filename').text + thumbnail = doc.find('imageurl').text + duration_str = doc.find('duration').text + m = re.match( + r'^(?P<hours>[0-9]+):(?P<minutes>[0-9]{2}):(?P<seconds>[0-9]{2}):(?P<ms>[0-9]*)$', + duration_str) + if m: + duration = ( + (int(m.group('hours')) * 60 * 60) + + (int(m.group('minutes')) * 60) + + (int(m.group('seconds'))) + ) + else: + duration = None + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + 'thumbnail': thumbnail, + 'duration': duration, + } From 138df537ffaeda182789440c4086f009a739dde3 Mon Sep 17 00:00:00 2001 From: Philipp Hagemeister <phihag@phihag.de> Date: Sun, 24 Nov 2013 07:51:56 +0100 Subject: [PATCH 25/29] release 2013.11.24.1 --- youtube_dl/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/version.py b/youtube_dl/version.py index 68ef46a30..de92411bb 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,2 +1,2 @@ -__version__ = '2013.11.24' +__version__ = '2013.11.24.1' From d214fdb8fe796e92485e28038ee72d28caa3ad10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 11:02:34 +0100 Subject: [PATCH 26/29] [brightcove] Don't use 'or' with the xml nodes, use the 'value' attribute instead --- youtube_dl/extractor/brightcove.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 74a7d13e3..66fe0ac9a 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -76,18 +76,21 @@ def _build_brighcove_url(cls, object_str): 'playerID': find_xpath_attr(object_doc, './param', 'name', 'playerID').attrib['value'], } def find_param(name): - return find_xpath_attr(object_doc, './param', 'name', name) + node = find_xpath_attr(object_doc, './param', 'name', name) + if node is not None: + return node.attrib['value'] + return None playerKey = find_param('playerKey') # Not all pages define this value if playerKey is not None: - params['playerKey'] = playerKey.attrib['value'] + params['playerKey'] = playerKey # The three fields hold the id of the video videoPlayer = find_param('@videoPlayer') or find_param('videoId') or find_param('videoID') if videoPlayer is not None: - params['@videoPlayer'] = videoPlayer.attrib['value'] + params['@videoPlayer'] = videoPlayer linkBase = find_param('linkBaseURL') if linkBase is not None: - params['linkBaseURL'] = linkBase.attrib['value'] + params['linkBaseURL'] = linkBase data = compat_urllib_parse.urlencode(params) return cls._FEDERATED_URL_TEMPLATE % data From dc65dcbb6d709ef6e38f336fe77c14767d6c8f9d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 11:28:44 +0100 Subject: [PATCH 27/29] [mixcloud] The description field may be missing (fixes #1819) --- youtube_dl/extractor/mixcloud.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a200dcd74..e2baf44d7 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -60,7 +60,7 @@ def _real_extract(self, url): 'title': info['name'], 'url': final_song_url, 'ext': 'mp3', - 'description': info['description'], + 'description': info.get('description'), 'thumbnail': info['pictures'].get('extra_large'), 'uploader': info['user']['name'], 'uploader_id': info['user']['username'], From f459d17018812dc896324f8208cdfe2ada04ea50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 14:33:50 +0100 Subject: [PATCH 28/29] [youtube] Add an extractor for downloading the watch history (closes #1821) --- test/test_all_urls.py | 1 + youtube_dl/extractor/__init__.py | 1 + youtube_dl/extractor/youtube.py | 14 ++++++++++++++ 3 files changed, 16 insertions(+) diff --git a/test/test_all_urls.py b/test/test_all_urls.py index 56e5f80e1..42813da1a 100644 --- a/test/test_all_urls.py +++ b/test/test_all_urls.py @@ -100,6 +100,7 @@ def test_no_duplicates(self): def test_keywords(self): self.assertMatch(':ytsubs', ['youtube:subscriptions']) self.assertMatch(':ytsubscriptions', ['youtube:subscriptions']) + self.assertMatch(':ythistory', ['youtube:history']) self.assertMatch(':thedailyshow', ['ComedyCentral']) self.assertMatch(':tds', ['ComedyCentral']) self.assertMatch(':colbertreport', ['ComedyCentral']) diff --git a/youtube_dl/extractor/__init__.py b/youtube_dl/extractor/__init__.py index 4c280fa5e..1fbd10bc5 100644 --- a/youtube_dl/extractor/__init__.py +++ b/youtube_dl/extractor/__init__.py @@ -186,6 +186,7 @@ YoutubeTruncatedURLIE, YoutubeWatchLaterIE, YoutubeFavouritesIE, + YoutubeHistoryIE, ) from .zdf import ZDFIE diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index 07a457f4d..64d4c2445 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -1826,6 +1826,20 @@ class YoutubeWatchLaterIE(YoutubeFeedsInfoExtractor): _PAGING_STEP = 100 _PERSONAL_FEED = True +class YoutubeHistoryIE(YoutubeFeedsInfoExtractor): + IE_DESC = u'Youtube watch history, "ythistory" keyword (requires authentication)' + _VALID_URL = u'https?://www\.youtube\.com/feed/history|:ythistory' + _FEED_NAME = 'history' + _PERSONAL_FEED = True + _PLAYLIST_TITLE = u'Youtube Watch History' + + def _real_extract(self, url): + webpage = self._download_webpage('https://www.youtube.com/feed/history', u'History') + data_paging = self._search_regex(r'data-paging="(\d+)"', webpage, u'data-paging') + # The step is actually a ridiculously big number (like 1374343569725646) + self._PAGING_STEP = int(data_paging) + return super(YoutubeHistoryIE, self)._real_extract(url) + class YoutubeFavouritesIE(YoutubeBaseInfoExtractor): IE_NAME = u'youtube:favorites' IE_DESC = u'YouTube.com favourite videos, "ytfav" keyword (requires authentication)' From 267ed0c5d3547c68f1d34203c2ae4b0d826a29d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jaime=20Marqui=CC=81nez=20Ferra=CC=81ndiz?= <jaime.marquinez.ferrandiz@gmail.com> Date: Sun, 24 Nov 2013 14:59:19 +0100 Subject: [PATCH 29/29] [collegehumor] Encode the xml before calling xml.etree.ElementTree.fromstring (fixes #1822) Uses a new helper method in InfoExtractor: _download_xml --- youtube_dl/extractor/collegehumor.py | 7 ++----- youtube_dl/extractor/common.py | 6 ++++++ 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/collegehumor.py b/youtube_dl/extractor/collegehumor.py index 0c29acfb1..b27c1dfc5 100644 --- a/youtube_dl/extractor/collegehumor.py +++ b/youtube_dl/extractor/collegehumor.py @@ -1,5 +1,4 @@ import re -import xml.etree.ElementTree from .common import InfoExtractor from ..utils import ( @@ -46,11 +45,10 @@ def _real_extract(self, url): self.report_extraction(video_id) xmlUrl = 'http://www.collegehumor.com/moogaloop/video/' + video_id - metaXml = self._download_webpage(xmlUrl, video_id, + mdoc = self._download_xml(xmlUrl, video_id, u'Downloading info XML', u'Unable to download video info XML') - mdoc = xml.etree.ElementTree.fromstring(metaXml) try: videoNode = mdoc.findall('./video')[0] youtubeIdNode = videoNode.find('./youtubeID') @@ -65,11 +63,10 @@ def _real_extract(self, url): if next_url.endswith(u'manifest.f4m'): manifest_url = next_url + '?hdcore=2.10.3' - manifestXml = self._download_webpage(manifest_url, video_id, + adoc = self._download_xml(manifest_url, video_id, u'Downloading XML manifest', u'Unable to download video info XML') - adoc = xml.etree.ElementTree.fromstring(manifestXml) try: video_id = adoc.findall('./{http://ns.adobe.com/f4m/1.0}id')[0].text except IndexError: diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 3cebeaf29..482a231ec 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -4,6 +4,7 @@ import socket import sys import netrc +import xml.etree.ElementTree from ..utils import ( compat_http_client, @@ -208,6 +209,11 @@ def _download_webpage(self, url_or_request, video_id, note=None, errnote=None): """ Returns the data of the page as a string """ return self._download_webpage_handle(url_or_request, video_id, note, errnote)[0] + def _download_xml(self, url_or_request, video_id, note=u'Downloading XML', errnote=u'Unable to downloand XML'): + """Return the xml as an xml.etree.ElementTree.Element""" + xml_string = self._download_webpage(url_or_request, video_id, note, errnote) + return xml.etree.ElementTree.fromstring(xml_string.encode('utf-8')) + def to_screen(self, msg): """Print msg to screen, prefixing it with '[ie_name]'""" self._downloader.to_screen(u'[%s] %s' % (self.IE_NAME, msg))