From 8817a80d3ac69f2dfd12bdc41657c4a04139807c Mon Sep 17 00:00:00 2001 From: Simon Sawicki <37424085+Grub4K@users.noreply.github.com> Date: Fri, 16 Sep 2022 19:02:00 +0200 Subject: [PATCH] [cookies] Parse cookies leniently (#4780) Closes #4776, #3778 Authored by: Grub4K --- test/test_cookies.py | 146 +++++++++++++++++++++++++++++++++++++ yt_dlp/cookies.py | 96 ++++++++++++++++++++++++ yt_dlp/extractor/common.py | 3 +- 3 files changed, 244 insertions(+), 1 deletion(-) diff --git a/test/test_cookies.py b/test/test_cookies.py index cfeb11b55..61619df29 100644 --- a/test/test_cookies.py +++ b/test/test_cookies.py @@ -3,6 +3,7 @@ from datetime import datetime, timezone from yt_dlp import cookies from yt_dlp.cookies import ( + LenientSimpleCookie, LinuxChromeCookieDecryptor, MacChromeCookieDecryptor, WindowsChromeCookieDecryptor, @@ -137,3 +138,148 @@ class TestCookies(unittest.TestCase): def test_pbkdf2_sha1(self): key = pbkdf2_sha1(b'peanuts', b' ' * 16, 1, 16) self.assertEqual(key, b'g\xe1\x8e\x0fQ\x1c\x9b\xf3\xc9`!\xaa\x90\xd9\xd34') + + +class TestLenientSimpleCookie(unittest.TestCase): + def _run_tests(self, *cases): + for message, raw_cookie, expected in cases: + cookie = LenientSimpleCookie(raw_cookie) + + with self.subTest(message, expected=expected): + self.assertEqual(cookie.keys(), expected.keys(), message) + + for key, expected_value in expected.items(): + morsel = cookie[key] + if isinstance(expected_value, tuple): + expected_value, expected_attributes = expected_value + else: + expected_attributes = {} + + attributes = { + key: value + for key, value in dict(morsel).items() + if value != "" + } + self.assertEqual(attributes, expected_attributes, message) + + self.assertEqual(morsel.value, expected_value, message) + + def test_parsing(self): + self._run_tests( + # Copied from https://github.com/python/cpython/blob/v3.10.7/Lib/test/test_http_cookies.py + ( + "Test basic cookie", + "chips=ahoy; vienna=finger", + {"chips": "ahoy", "vienna": "finger"}, + ), + ( 
+ "Test quoted cookie", + 'keebler="E=mc2; L=\\"Loves\\"; fudge=\\012;"', + {"keebler": 'E=mc2; L="Loves"; fudge=\012;'}, + ), + ( + "Allow '=' in an unquoted value", + "keebler=E=mc2", + {"keebler": "E=mc2"}, + ), + ( + "Allow cookies with ':' in their name", + "key:term=value:term", + {"key:term": "value:term"}, + ), + ( + "Allow '[' and ']' in cookie values", + "a=b; c=[; d=r; f=h", + {"a": "b", "c": "[", "d": "r", "f": "h"}, + ), + ( + "Test basic cookie attributes", + 'Customer="WILE_E_COYOTE"; Version=1; Path=/acme', + {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})}, + ), + ( + "Test flag only cookie attributes", + 'Customer="WILE_E_COYOTE"; HttpOnly; Secure', + {"Customer": ("WILE_E_COYOTE", {"httponly": True, "secure": True})}, + ), + ( + "Test flag only attribute with values", + "eggs=scrambled; httponly=foo; secure=bar; Path=/bacon", + {"eggs": ("scrambled", {"httponly": "foo", "secure": "bar", "path": "/bacon"})}, + ), + ( + "Test special case for 'expires' attribute, 4 digit year", + 'Customer="W"; expires=Wed, 01 Jan 2010 00:00:00 GMT', + {"Customer": ("W", {"expires": "Wed, 01 Jan 2010 00:00:00 GMT"})}, + ), + ( + "Test special case for 'expires' attribute, 2 digit year", + 'Customer="W"; expires=Wed, 01 Jan 98 00:00:00 GMT', + {"Customer": ("W", {"expires": "Wed, 01 Jan 98 00:00:00 GMT"})}, + ), + ( + "Test extra spaces in keys and values", + "eggs = scrambled ; secure ; path = bar ; foo=foo ", + {"eggs": ("scrambled", {"secure": True, "path": "bar"}), "foo": "foo"}, + ), + ( + "Test quoted attributes", + 'Customer="WILE_E_COYOTE"; Version="1"; Path="/acme"', + {"Customer": ("WILE_E_COYOTE", {"version": "1", "path": "/acme"})} + ), + # Our own tests that CPython passes + ( + "Allow ';' in quoted value", + 'chips="a;hoy"; vienna=finger', + {"chips": "a;hoy", "vienna": "finger"}, + ), + ( + "Keep only the last set value", + "a=c; a=b", + {"a": "b"}, + ), + ) + + def test_lenient_parsing(self): + self._run_tests( + ( + "Ignore and try 
to skip invalid cookies", + 'chips={"ahoy;": 1}; vienna="finger;"', + {"vienna": "finger;"}, + ), + ( + "Ignore cookies without a name", + "a=b; unnamed; c=d", + {"a": "b", "c": "d"}, + ), + ( + "Ignore '\"' cookie without name", + 'a=b; "; c=d', + {"a": "b", "c": "d"}, + ), + ( + "Skip all space separated values", + "x a=b c=d x; e=f", + {"a": "b", "c": "d", "e": "f"}, + ), + ( + "Skip all space separated values", + 'x a=b; data={"complex": "json", "with": "key=value"}; x c=d x', + {"a": "b", "c": "d"}, + ), + ( + "Expect quote mending", + 'a=b; invalid="; c=d', + {"a": "b", "c": "d"}, + ), + ( + "Reset morsel after invalid to not capture attributes", + "a=b; invalid; Version=1; c=d", + {"a": "b", "c": "d"}, + ), + ( + "Continue after non-flag attribute without value", + "a=b; path; Version=1; c=d", + {"a": "b", "c": "d"}, + ), + ) diff --git a/yt_dlp/cookies.py b/yt_dlp/cookies.py index c3b14f03b..d502e91da 100644 --- a/yt_dlp/cookies.py +++ b/yt_dlp/cookies.py @@ -1,6 +1,7 @@ import base64 import contextlib import http.cookiejar +import http.cookies import json import os import re @@ -990,3 +991,98 @@ def _parse_browser_specification(browser_name, profile=None, keyring=None, conta if profile is not None and _is_path(profile): profile = os.path.expanduser(profile) return browser_name, profile, keyring, container + + +class LenientSimpleCookie(http.cookies.SimpleCookie): + """More lenient version of http.cookies.SimpleCookie""" + # From https://github.com/python/cpython/blob/v3.10.7/Lib/http/cookies.py + _LEGAL_KEY_CHARS = r"\w\d!#%&'~_`><@,:/\$\*\+\-\.\^\|\)\(\?\}\{\=" + _LEGAL_VALUE_CHARS = _LEGAL_KEY_CHARS + r"\[\]" + + _RESERVED = { + "expires", + "path", + "comment", + "domain", + "max-age", + "secure", + "httponly", + "version", + "samesite", + } + + _FLAGS = {"secure", "httponly"} + + # Added 'bad' group to catch the remaining value + _COOKIE_PATTERN = re.compile(r""" + \s* # Optional whitespace at start of cookie + (?P<key> # Start of group 'key' + [""" + 
_LEGAL_KEY_CHARS + r"""]+?# Any word of at least one letter + ) # End of group 'key' + ( # Optional group: there may not be a value. + \s*=\s* # Equal Sign + ( # Start of potential value + (?P<val> # Start of group 'val' + "(?:[^\\"]|\\.)*" # Any doublequoted string + | # or + \w{3},\s[\w\d\s-]{9,11}\s[\d:]{8}\sGMT # Special case for "expires" attr + | # or + [""" + _LEGAL_VALUE_CHARS + r"""]* # Any word or empty string + ) # End of group 'val' + | # or + (?P<bad>(?:\\;|[^;])*?) # 'bad' group fallback for invalid values + ) # End of potential value + )? # End of optional value group + \s* # Any number of spaces. + (\s+|;|$) # Ending either at space, semicolon, or EOS. + """, re.ASCII | re.VERBOSE) + + def load(self, data): + # Workaround for https://github.com/yt-dlp/yt-dlp/issues/4776 + if not isinstance(data, str): + return super().load(data) + + morsel = None + index = 0 + length = len(data) + + while 0 <= index < length: + match = self._COOKIE_PATTERN.search(data, index) + if not match: + break + + index = match.end(0) + if match.group("bad"): + morsel = None + continue + + key, value = match.group("key", "val") + + if key[0] == "$": + if morsel is not None: + morsel[key[1:]] = True + continue + + lower_key = key.lower() + if lower_key in self._RESERVED: + if morsel is None: + continue + + if value is None: + if lower_key not in self._FLAGS: + morsel = None + continue + value = True + else: + value, _ = self.value_decode(value) + + morsel[key] = value + + elif value is not None: + morsel = self.get(key, http.cookies.Morsel()) + real_value, coded_value = self.value_decode(value) + morsel.set(key, real_value, coded_value) + self[key] = morsel + + else: + morsel = None diff --git a/yt_dlp/extractor/common.py b/yt_dlp/extractor/common.py index 30042d61f..e8fa8fdde 100644 --- a/yt_dlp/extractor/common.py +++ b/yt_dlp/extractor/common.py @@ -22,6 +22,7 @@ import xml.etree.ElementTree from ..compat import functools # isort: split from ..compat import compat_etree_fromstring, 
compat_expanduser, compat_os_name +from ..cookies import LenientSimpleCookie from ..downloader import FileDownloader from ..downloader.f4m import get_base_url, remove_encrypted_media from ..utils import ( @@ -3632,7 +3633,7 @@ class InfoExtractor: def _get_cookies(self, url): """ Return a http.cookies.SimpleCookie with the cookies for the url """ - return http.cookies.SimpleCookie(self._downloader._calc_cookies(url)) + return LenientSimpleCookie(self._downloader._calc_cookies(url)) def _apply_first_set_cookie_header(self, url_handle, cookie): """