From f38f9a0028c4fbd2c89e894cc47a4c9ad6c06819 Mon Sep 17 00:00:00 2001
From: julien <julien@vso-software.fr>
Date: Wed, 10 Nov 2021 13:52:26 +0100
Subject: [PATCH] [YouTube] Unthrottle downloads by responding to the "n"
 parameter challenge cf https://github.com/ytdl-org/youtube-dl/pull/30184

---
 youtube_dl/compat.py            |  21 +++
 youtube_dl/extractor/youtube.py | 308 ++++++++++++++++++++++++++++++--
 2 files changed, 313 insertions(+), 16 deletions(-)

diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 9e45c454b26..29e0d3a0292 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2962,6 +2962,25 @@ def unpack(self, string):
         compat_Struct = struct.Struct
 
 
+# compat_map()/compat_filter(): map()/filter() variants that return an
+# iterator; the fallback chain presumably mirrors that of compat_zip below
+try:
+    from future_builtins import map as compat_map
+except ImportError:
+    try:
+        from itertools import imap as compat_map
+    except ImportError:
+        compat_map = map  # no itertools.imap either: use the built-in as-is
+
+try:
+    from future_builtins import filter as compat_filter
+except ImportError:
+    try:
+        from itertools import ifilter as compat_filter
+    except ImportError:
+        compat_filter = filter  # no itertools.ifilter either: use the built-in as-is
+
+
 try:
     from future_builtins import zip as compat_zip
 except ImportError:  # not 2.6+ or is 3.x
@@ -3015,6 +3034,7 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
     'compat_etree_fromstring',
     'compat_etree_register_namespace',
     'compat_expanduser',
+    'compat_filter',
     'compat_get_terminal_size',
     'compat_getenv',
     'compat_getpass',
@@ -3026,6 +3046,7 @@ def compat_ctypes_WINFUNCTYPE(*args, **kwargs):
     'compat_integer_types',
     'compat_itertools_count',
     'compat_kwargs',
+    'compat_map',
     'compat_numeric_types',
     'compat_ord',
     'compat_os_name',
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index dc4bd4a77f9..ad87ef8c593 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -13,6 +13,7 @@
 from ..compat import (
     compat_chr,
     compat_HTTPError,
+    compat_map as map,
     compat_parse_qs,
     compat_str,
     compat_urllib_parse_unquote_plus,
@@ -1253,6 +1254,17 @@ def _extract_player_info(cls, player_url):
             raise ExtractorError('Cannot identify player %r' % player_url)
         return id_m.group('id')
 
+    def _get_player_code(self, video_id, player_url, player_id=None):
+        """Return the JS player code for player_url, cached per player ID"""
+        player_id = player_id or self._extract_player_info(player_url)
+        cache = self._code_cache
+        if player_id not in cache:
+            cache[player_id] = self._download_webpage(
+                player_url, video_id,
+                note='Downloading player ' + player_id,
+                errnote='Download of %s failed' % player_url)
+        return cache[player_id]
+
     def _extract_signature_function(self, video_id, player_url, example_sig):
         player_id = self._extract_player_info(player_url)
 
@@ -1265,12 +1277,7 @@ def _extract_signature_function(self, video_id, player_url, example_sig):
         if cache_spec is not None:
             return lambda s: ''.join(s[i] for i in cache_spec)
 
-        if player_id not in self._code_cache:
-            self._code_cache[player_id] = self._download_webpage(
-                player_url, video_id,
-                note='Downloading player ' + player_id,
-                errnote='Download of %s failed' % player_url)
-        code = self._code_cache[player_id]
+        code = self._get_player_code(video_id, player_url, player_id)
         res = self._parse_sig_js(code)
 
         test_string = ''.join(map(compat_chr, range(len(example_sig))))
@@ -1349,11 +1356,6 @@ def _decrypt_signature(self, s, video_id, player_url):
         if player_url is None:
             raise ExtractorError('Cannot decrypt signature without player_url')
 
-        if player_url.startswith('//'):
-            player_url = 'https:' + player_url
-        elif not re.match(r'https?://', player_url):
-            player_url = compat_urlparse.urljoin(
-                'https://www.youtube.com', player_url)
         try:
             player_id = (player_url, self._signature_cache_id(s))
             if player_id not in self._player_cache:
@@ -1370,6 +1372,280 @@ def _decrypt_signature(self, s, video_id, player_url):
             raise ExtractorError(
                 'Signature extraction failed: ' + tb, cause=e)
 
+    def _extract_player_url(self, webpage):
+        """Return the absolute player JS URL found in webpage, or None"""
+        found = self._search_regex(
+            r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
+            webpage or '', 'player URL', fatal=False)
+        if not found:
+            return None
+        if found.startswith('//'):  # protocol-relative URL
+            return 'https:' + found
+        if re.match(r'https?://', found):  # already absolute
+            return found
+        return compat_urlparse.urljoin('https://www.youtube.com', found)
+
+    # Based on an equivalent function [1] in the youtube.lua script from VLC
+    # Many thanks to @linkfanel [2]
+    # NB This code could fail if YT should revise the player code and would then have
+    # to be reworked (thankless task previously undertaken at [1])
+    # 1. https://code.videolan.org/videolan/vlc/-/blob/4fb284e5af69aa9ac2100ccbdd3b88debec9987f/share/lua/playlist/youtube.lua#L116
+    # 2. https://github.com/ytdl-org/youtube-dl/issues/30097#issuecomment-950157377
+    def _n_descramble(self, n_param, js):
+        """Compute the response to YT's "n" parameter challenge
+
+        n_param -- challenge string (value of the URL's "n" query parameter)
+        js      -- text of the JS player code containing the response algorithm
+
+        Returns the response string, or None if js is empty or the descrambling
+        code can't be located; NB an aborted chain still returns "n" as-is so far
+        """
+        if not js:
+            return
+
+        # helper functions (part 1)
+        def isiterable(x):  # True if x has a truthy __getitem__ attribute
+            try:
+                return x.__getitem__ and True
+            except AttributeError:
+                return False
+
+        def find_first(pattern, string, flags=0, groups=1):  # first match's group(s), else None(s)
+            pattern = re.compile(pattern, flags)
+            return next(map(lambda m: m.groups() if groups is True else m.group(groups),
+                            pattern.finditer(string)),
+                        (None, ) * pattern.groups if groups is True else None)
+
+        # Look for the descrambler function's name
+        # a.D&&(b=a.get("n"))&&(b=lha(b),a.set("n",b))}};
+        descrambler = find_first(r'[=(,&|](\w+)\(\w+\),\w+\.set\("n",', js)
+        if not descrambler:
+            self.report_warning("Couldn't extract YouTube video throttling parameter descrambling function name")
+            return
+        # Fetch the code of the descrambler function
+        # lha=function(a){var b=a.split(""),c=[310282131,"KLf3",b,null,function(d,e){d.push(e)},-45817231, [data and transformations...] ,1248130556];c[3]=c;c[15]=c;c[18]=c;try{c[40](c[14],c[2]),c[25](c[48]),c[21](c[32],c[23]), [scripted calls...] ,c[25](c[33],c[3])}catch(d){return"enhanced_except_4ZMBnuz-_w8_"+a}return b.join("")};
+        code = find_first(r'(?s)%s=function\([^)]+\)\{(.+?)\};' % (descrambler, ), js)
+        if not code:
+            self.report_warning("Couldn't extract YouTube video throttling parameter descrambling code")
+            return
+        # Split code into two main sections: 1/ data and transformations,
+        # and 2/ a script of calls
+        datac, script = find_first(r'(?s)c=\[(.+)\];.+?;try\{(.+)\}catch\(', code, groups=True)
+        if not datac or not script:
+            self.report_warning("Couldn't extract YouTube video throttling parameter descrambling rules")
+            return
+        # Split "n" parameter into a table as descrambling operates on it
+        # as one of several arrays - in Python just copy it as a list
+        n = list(n_param)
+        # Helper: table_len = function() ... end - in Python just use len
+
+        # Common routine shared by the compound transformations: it combines the
+        # "n" array with an input string, char by char, using a Base64 alphabet.
+        # In the JS below m rises as f falls, so m+f is constant (see charcode).
+        # d.forEach(function(l,m,n){this.push(n[m]=h[(h.indexOf(l)-h.indexOf(this[m])+m-32+f--)%h.length])},e.split(""))
+        def compound(ntab, strg, alphabet, charcode):
+            if ntab != n or type(strg) != compat_str:
+                return True
+            inp = list(strg)
+            llen = len(alphabet)
+            ntab_copy = ntab[:]
+            for i, c in enumerate(ntab_copy):
+                if type(c) != compat_str:
+                    return True
+                pos1 = alphabet.find(c)
+                pos2 = alphabet.find(inp[i])
+                if pos1 < 0 or pos2 < 0:
+                    return True
+                pos = (pos1 - pos2 + charcode - 32) % llen
+                newc = alphabet[pos]
+                ntab[i] = newc
+                inp.append(newc)
+
+        # The data section contains among others function code for a number
+        # of transformations, most of which are basic array operations.
+        # We can match these functions' code to identify them, and emulate
+        # the corresponding transformations.
+
+        # helper fns: each one mutates the list s in place
+        def swap(s, i, j):
+            x = s[i]
+            s[i] = s[j]
+            s[j] = x
+
+        def rotate(s, i):  # moves the last i items of s to its front
+            tmp = s[:]
+            tmp[i:] = s
+            tmp[:i] = tmp[len(s):]
+            s[:] = tmp[:len(s)]
+
+        def remove(s, i):
+            del s[i]
+
+        # Compounding functions use a subfunction, so we need to be
+        # more specific in how much parsed data we consume.
+        cp_skip = r'(?s)^.*?\},e\.split\(""\)\)},\s*(.*)$'
+        def_skip = r"(?s)^.*?\},\s*(.*)$"
+
+        trans = (
+            # fn_name, fn (a truthy result signals failure), fn_detect_pattern, skip_re
+            ('reverse', lambda tab, _: tab.reverse(),
+                        # noqa: E127
+                        # function(d){d.reverse()}
+                        # function(d){for(var e=d.length;e;)d.push(d.splice(--e,1)[0])}
+                        r"^function\(d\)",
+                        def_skip),
+            ('append', lambda tab, val: tab.append(val),
+                       # noqa: E127
+                       # function(d,e){d.push(e)}
+                       r"^function\(d,e\){d\.push\(e\)\},",
+                       def_skip),
+            ('remove', lambda tab, i: remove(tab, i % len(tab)) if type(i) == int else True,
+                       # noqa: E127
+                       # function(d,e){e=(e%d.length+d.length)%d.length;d.splice(e,1)}
+                       r"^[^}]+?;d\.splice\(e,1\)\},",
+                       def_skip),
+            ('swap', lambda tab, i: swap(tab, 0, i % len(tab)) if type(i) == int else True,
+                     # noqa: E127
+                     # function(d,e){e=(e%d.length+d.length)%d.length;var f=d[0];d[0]=d[e];d[e]=f}
+                     # function(d,e){e=(e%d.length+d.length)%d.length;d.splice(0,1,d.splice(e,1,d[0])[0])}
+                     r"^[^}]+?;(?:var\sf=d\[0\];d\[0\]=d\[e\];d\[e\]=f|d\.splice\(0,1,d\.splice\(e,1,d\[0\]\)\[0\]\))\},",
+                     def_skip),
+            ('rotate', lambda tab, shift: rotate(tab, shift % len(tab)) if type(shift) == int else True,
+                       # noqa: E127
+                       # function(d,e){for(e=(e%d.length+d.length)%d.length;e--;)d.unshift(d.pop())}
+                       # function(d,e){e=(e%d.length+d.length)%d.length;d.splice(-e).reverse().forEach(function(f){d.unshift(f)})}
+                       r"^[^}]+?d\.unshift\((?:d\.pop\(\)|f\)\})\)},",
+                       def_skip),
+            # Compound transformations first build a variation of a
+            # Base64 alphabet, then in a common section, compound the
+            # "n" parameter with an input string, character by character.
+            ('compound1', lambda tab, s: compound(tab, s, "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ-_", 96),
+                          # noqa: E127
+                          # function(d,e){for(var f=64,h=[];++f-h.length-32;)switch(f){case 58:f=96;continue;case 91:f=44;break;case 65:f=47;continue;case 46:f=153;case 123:f-=58;default:h.push(String.fromCharCode(f))} [ compound... ] }
+                          r"^[^}]+?case\s58:f=96;",
+                          cp_skip),
+            ('compound2', lambda tab, s: compound(tab, s, "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_", 96),
+                          # noqa: E127
+                          # function(d,e){for(var f=64,h=[];++f-h.length-32;){switch(f){case 58:f-=14;case 91:case 92:case 93:continue;case 123:f=47;case 94:case 95:case 96:continue;case 46:f=95}h.push(String.fromCharCode(f))} [ compound... ] }
+                          # function(d,e){for(var f=64,h=[];++f-h.length-32;)switch(f){case 46:f=95;default:h.push(String.fromCharCode(f));case 94:case 95:case 96:break;case 123:f-=76;case 92:case 93:continue;case 58:f=44;case 91:} [ compound... ] }
+
+                          r"^[^}]+?case\s58:f(?:-=14|=44);",
+                          cp_skip),
+            # Fallback: its None pattern matches any function; it warns and fails
+            ('unid', lambda _, __: self.report_warning("Couldn't apply unidentified YouTube video throttling parameter transformation, aborting descrambling") or True,
+                     # noqa: E127
+                     None,
+                     def_skip),
+        )
+        # The data section actually mixes input data, reference to the
+        # "n" parameter array, and self-reference to its own array, with
+        # transformation functions used to modify itself. We parse it
+        # as such into a table.
+        data = []
+        datac += ","
+        while datac:
+            # Transformation functions
+            if re.match(r"^function\(", datac):
+                name, el, _, skip = next(
+                    itertools.dropwhile(
+                        lambda x: x[2] is not None and not re.match(x[2], datac), trans))
+                datac = find_first(skip, datac)
+            # String input data
+            elif re.match(r'^"[^"]*",', datac):
+                el, datac = find_first(r'(?s)^"([^"]*)",\s*(.*)$', datac, groups=True)
+            # Integer input data
+            elif re.match(r'^-?\d+,', datac):
+                el, datac = find_first(r"(?s)^(.*?),\s*(.*)$", datac, groups=True)
+                el = int(el)
+            # Reference to "n" parameter array
+            elif re.match('^b,', datac):
+                el = n
+                datac = find_first(r"(?s)^b,\s*(.*)$", datac)
+            # Replaced by self-reference to data array after its declaration
+            elif re.match('^null,', datac):
+                el = data
+                datac = find_first(r"(?s)^null,\s*(.*)$", datac)
+            else:
+                self.report_warning("Couldn't parse unidentified YouTube video throttling parameter descrambling data"
+                                    '\nNear: "%s"' % datac[:64])
+                el = False
+                # Lua tables can't contain nil values: Python can, but still use False
+                datac = find_first(r"(?s)^[^,]*?,\s*(.*)$", datac)
+            data.append(el)
+
+        # Debugging helper to print data array elements
+        def prd(el, tab=None):
+            if not el:
+                return "???"
+            elif el == n:
+                return "n"
+            elif el == data:
+                return "data"
+            elif type(el) == compat_str:
+                return '"%s"' % (el, )
+            elif type(el) == int:
+                if isiterable(tab):
+                    return "%d -> %d" % (el, el % len(tab), )
+                return "%d" % (el, )
+            else:
+                for tr in trans:
+                    if el == tr[1]:
+                        return tr[0]
+            return repr(el)
+
+        # The script section contains a series of calls to elements of
+        # the data section array onto other elements of it: calls to
+        # transformations, with a reference to the data array itself or
+        # the "n" parameter array as first argument, and often input data
+        # as a second argument. We parse and emulate those calls to follow
+        # the descrambling script.
+        # c[40](c[14],c[2]),c[25](c[48]),c[21](c[32],c[23]), [...]
+        for ifunc, itab, iarg in map(lambda m: m.groups(),
+                                     re.finditer(r"c\[(\d+)\]\(c\[(\d+)\](?:,\s*c\[(\d+)\]|[^)]*?)\)", script)):
+            func = data[int(ifunc)]
+            tab = data[int(itab)]
+            arg = iarg and data[int(iarg)]
+            # Uncomment to debug transformation chain
+            # nprev = ''.join(n)
+            # dprev = ' '.join(map(prd, data))
+            # print(''.join(('"n" parameter transformation: ', prd(func), "(", prd(tab), (", " + prd(arg, tab)) if arg else '', ") ", ifunc, "(", itab, (", " + iarg) if iarg else "", ")")))
+            if not callable(func) or not isiterable(tab) or func(tab, arg):
+                self.report_warning("Invalid data type encountered during YouTube video throttling parameter descrambling transformation chain, aborting"
+                                    "\nCouldn't descramble YouTube throttling URL parameter: data transfer will be throttled")
+                self.report_warning("Couldn't process youtube video URL, please check for updates to this script")
+                break
+            # Uncomment to debug transformation chain
+            # nnew = ''.join(n)
+            # if nprev != nnew:
+            #     print('from: ' + nprev + "\nto:   " + nnew)
+            # dnew = ' '.join(map(prd, data))
+            # if dprev != dnew:
+            #     print('from: ' + dprev + "\nto:   " + dnew)
+        return ''.join(n)
+
+    def _unthrottle_format_urls(self, video_id, player_url, formats):
+        """Best effort: answer the "n" challenge of each format URL, in place"""
+        if not player_url:
+            return
+        n_cache = {}  # challenge -> response (None when descrambling failed)
+        for fmt in formats:
+            parsed_fmt_url = compat_urlparse.urlparse(fmt['url'])
+            qs = compat_urlparse.parse_qs(parsed_fmt_url.query)
+            n_param = (qs.get('n') or [None])[-1]
+            if not n_param:
+                continue
+            if n_param not in n_cache:
+                try:  # fetch the player lazily; failing here must not abort extraction
+                    code = self._get_player_code(video_id, player_url)
+                except ExtractorError as e:
+                    self.report_warning('Unable to unthrottle formats: %s' % (e, ))
+                    return
+                n_cache[n_param] = self._n_descramble(n_param, code)
+            if n_cache[n_param]:
+                qs['n'] = [n_cache[n_param]]
+                fmt['url'] = compat_urlparse.urlunparse(
+                    parsed_fmt_url._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
     def _mark_watched(self, video_id, player_response):
         playback_url = url_or_none(try_get(
             player_response,
@@ -1631,11 +1907,7 @@ def feed_entry(name):
                 if not (sc and fmt_url and encrypted_sig):
                     continue
                 if not player_url:
-                    if not webpage:
-                        continue
-                    player_url = self._search_regex(
-                        r'"(?:PLAYER_JS_URL|jsUrl)"\s*:\s*"([^"]+)"',
-                        webpage, 'player URL', fatal=False)
+                    player_url = self._extract_player_url(webpage)
                 if not player_url:
                     continue
                 signature = self._decrypt_signature(sc['s'][0], video_id, player_url)
@@ -1781,6 +2053,10 @@ def feed_entry(name):
         is_live = video_details.get('isLive')
         owner_profile_url = microformat.get('ownerProfileUrl')
 
+        if not player_url:
+            player_url = self._extract_player_url(webpage)
+        self._unthrottle_format_urls(video_id, player_url, formats)
+
         info = {
             'id': video_id,
             'title': self._live_title(video_title) if is_live else video_title,