From 145c5a83a80536b781fd043016bd27c91c760667 Mon Sep 17 00:00:00 2001 From: Evan Spensley <94762716+evansp@users.noreply.github.com> Date: Tue, 14 Jun 2022 09:33:29 -0400 Subject: [PATCH] [extractor/GoogleDrive] Add folder extractor (#4009) Closes #3388 Authored by: evansp, pukkandan --- yt_dlp/extractor/common.py | 2 +- yt_dlp/extractor/extractors.py | 5 ++- yt_dlp/extractor/googledrive.py | 56 +++++++++++++++++++++++++++++++++ 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 7adabf6f9..080d682eb 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -1188,7 +1188,7 @@ class InfoExtractor: self.report_warning('unable to extract %s' % _name + bug_reports_message()) return None - def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='.+', fatal=True, **kwargs): + def _search_json(self, start_pattern, string, name, video_id, *, end_pattern='', contains_pattern='(?s:.+)', fatal=True, **kwargs): """Searches string for the JSON object specified by start_pattern""" # NB: end_pattern is only used to reduce the size of the initial match return self._parse_json( diff --git a/yt_dlp/extractor/extractors.py b/yt_dlp/extractor/extractors.py index 9e1ef4067..c7167a2cf 100644 --- a/yt_dlp/extractor/extractors.py +++ b/yt_dlp/extractor/extractors.py @@ -597,7 +597,10 @@ from .godtube import GodTubeIE from .gofile import GofileIE from .golem import GolemIE from .goodgame import GoodGameIE -from .googledrive import GoogleDriveIE +from .googledrive import ( + GoogleDriveIE, + GoogleDriveFolderIE, +) from .googlepodcasts import ( GooglePodcastsIE, GooglePodcastsFeedIE, diff --git a/yt_dlp/extractor/googledrive.py b/yt_dlp/extractor/googledrive.py index c0905f86a..b4f483298 100644 --- a/yt_dlp/extractor/googledrive.py +++ b/yt_dlp/extractor/googledrive.py @@ -276,3 +276,59 @@ class GoogleDriveIE(InfoExtractor): 'automatic_captions': 
class GoogleDriveFolderIE(InfoExtractor):
    # Playlist extractor for Google Drive folder URLs: every item listed in the
    # folder is handed to GoogleDriveIE as a separate playlist entry.
    IE_NAME = 'GoogleDrive:Folder'
    # The named group `id` captures the folder id (28+ word/dash characters);
    # `_real_extract` obtains it through `self._match_id(url)`.
    _VALID_URL = r'https?://(?:docs|drive)\.google\.com/drive/folders/(?P<id>[\w-]{28,})'
    _TESTS = [{
        'url': 'https://drive.google.com/drive/folders/1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
        'info_dict': {
            'id': '1dQ4sx0-__Nvg65rxTSgQrl7VyW_FZ9QI',
            'title': 'Forrest',
        },
        'playlist_count': 3,
    }]
    # Multipart boundary used both in the request body (`_DATA`) and in the
    # `$ct` query parameter sent by `_call_api`.
    _BOUNDARY = '=====vc17a3rwnndj====='
    # Request line (path + " HTTP/1.1") of the inner files-listing call; it is
    # filled in with `str.format(folder_id=..., page_token=..., key=...)`.
    _REQUEST = "/drive/v2beta/files?openDrive=true&reason=102&syncType=0&errorRecovery=false&q=trashed%20%3D%20false%20and%20'{folder_id}'%20in%20parents&fields=kind%2CnextPageToken%2Citems(kind%2CmodifiedDate%2CmodifiedByMeDate%2ClastViewedByMeDate%2CfileSize%2Cowners(kind%2CpermissionId%2Cid)%2ClastModifyingUser(kind%2CpermissionId%2Cid)%2ChasThumbnail%2CthumbnailVersion%2Ctitle%2Cid%2CresourceKey%2Cshared%2CsharedWithMeDate%2CuserPermission(role)%2CexplicitlyTrashed%2CmimeType%2CquotaBytesUsed%2Ccopyable%2CfileExtension%2CsharingUser(kind%2CpermissionId%2Cid)%2Cspaces%2Cversion%2CteamDriveId%2ChasAugmentedPermissions%2CcreatedDate%2CtrashingUser(kind%2CpermissionId%2Cid)%2CtrashedDate%2Cparents(id)%2CshortcutDetails(targetId%2CtargetMimeType%2CtargetLookupStatus)%2Ccapabilities(canCopy%2CcanDownload%2CcanEdit%2CcanAddChildren%2CcanDelete%2CcanRemoveChildren%2CcanShare%2CcanTrash%2CcanRename%2CcanReadTeamDrive%2CcanMoveTeamDriveItem)%2Clabels(starred%2Ctrashed%2Crestricted%2Cviewed))%2CincompleteSearch&appDataFilter=NO_APP_DATA&spaces=drive&pageToken={page_token}&maxResults=50&supportsTeamDrives=true&includeItemsFromAllDrives=true&corpora=default&orderBy=folder%2Ctitle_natural%20asc&retryCount=0&key={key} HTTP/1.1"
    # Body template wrapping a single inner GET; the `%s` placeholder receives
    # the inner request line via %-formatting.
    _DATA = f'''--{_BOUNDARY}
content-type: application/http
content-transfer-encoding: binary

GET %s

--{_BOUNDARY}
'''

    def _call_api(self, folder_id, key, data, **kwargs):
        """Send `data` to the Drive batch endpoint and return the JSON found in the reply.

        @param folder_id  Folder id, used as the id for progress/error messages
        @param key        Key sent as the `key` query parameter
        @param data       Request body (str); encoded as UTF-8 before sending
        @param kwargs     Forwarded to both `_download_webpage` and
                          `_search_json` (e.g. `fatal=False`)
        @returns          The parsed JSON value, or `{}` when nothing usable
                          was downloaded or parsed
        """
        response = self._download_webpage(
            'https://clients6.google.com/batch/drive/v2beta',
            folder_id, data=data.encode('utf-8'),
            headers={
                'Content-Type': 'text/plain;charset=UTF-8;',
                'Origin': 'https://drive.google.com',
            }, query={
                '$ct': f'multipart/mixed; boundary="{self._BOUNDARY}"',
                'key': key,
            }, **kwargs)
        # A non-fatal download may hand back a non-string result; running the
        # JSON regex search on it would raise instead of degrading gracefully.
        if not isinstance(response, str):
            return {}
        return self._search_json('', response, 'api response', folder_id, **kwargs) or {}

    def _get_folder_items(self, folder_id, key):
        """Yield every entry of the `items` list of each result page for the folder.

        Pages are requested one after another; the loop ends once a page
        carries no `nextPageToken`.
        """
        page_token = ''
        while page_token is not None:
            request = self._REQUEST.format(folder_id=folder_id, page_token=page_token, key=key)
            page = self._call_api(folder_id, key, self._DATA % request)
            yield from page['items']
            page_token = page.get('nextPageToken')

    def _real_extract(self, url):
        """Build a playlist result whose entries are the items of the folder at `url`."""
        folder_id = self._match_id(url)

        webpage = self._download_webpage(url, folder_id)
        # The first quoted 39-character word found in the folder page is used as
        # the `key` for the API calls (presumably the web client's API key).
        key = self._search_regex(r'"(\w{39})"', webpage, 'key')

        # Folder metadata only supplies the playlist title, so a failure here
        # is tolerated (`fatal=False`) and simply leaves the title unset.
        folder_info = self._call_api(folder_id, key, self._DATA % f'/drive/v2beta/files/{folder_id} HTTP/1.1', fatal=False)

        return self.playlist_from_matches(
            self._get_folder_items(folder_id, key), folder_id, folder_info.get('title'),
            ie=GoogleDriveIE, getter=lambda item: f'https://drive.google.com/file/d/{item["id"]}')