[bot] AutoMerging: merge all upstream's changes:

* https://github.com/ytdl-org/youtube-dl: [youtube:tab] Pass innertube context and x-goog-visitor-id header along with continuation requests (closes ytdl-org#28702) [mtv] Fix Viacom A/B Testing Video Player extraction(closes ytdl-org#28703) [pornhub] Extract DASH and HLS formats from get_media end point (closes ytdl-org#28698)
hellopony · Apr 8, 2021 · 65ffb9c · 65ffb9c
2 parents a7d3382 + 1b0a13f
commit 65ffb9c
Show file tree

Hide file tree

Showing 3 changed files with 42 additions and 27 deletions.
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
@@ -255,7 +255,9 @@ def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
 
     @staticmethod
     def _extract_child_with_type(parent, t):
-        return next(c for c in parent['children'] if c.get('type') == t)
+        for c in parent['children']:
+            if c.get('type') == t:
+                return c
 
     def _extract_mgid(self, webpage):
         try:
@@ -286,7 +288,8 @@ def _extract_mgid(self, webpage):
             data = self._parse_json(self._search_regex(
                 r'__DATA__\s*=\s*({.+?});', webpage, 'data'), None)
             main_container = self._extract_child_with_type(data, 'MainContainer')
-            video_player = self._extract_child_with_type(main_container, 'VideoPlayer')
+            ab_testing = self._extract_child_with_type(main_container, 'ABTesting')
+            video_player = self._extract_child_with_type(ab_testing or main_container, 'VideoPlayer')
             mgid = video_player['props']['media']['video']['config']['uri']
 
         return mgid

diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
@@ -398,6 +398,16 @@ def parse_quality_items(quality_items):
         formats = []
 
         def add_format(format_url, height=None):
+            ext = determine_ext(format_url)
+            if ext == 'mpd':
+                formats.extend(self._extract_mpd_formats(
+                    format_url, video_id, mpd_id='dash', fatal=False))
+                return
+            if ext == 'm3u8':
+                formats.extend(self._extract_m3u8_formats(
+                    format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+                    m3u8_id='hls', fatal=False))
+                return
             tbr = None
             mobj = re.search(r'(?P<height>\d+)[pP]?_(?P<tbr>\d+)[kK]', format_url)
             if mobj:
@@ -417,16 +427,6 @@ def add_format(format_url, height=None):
                     r'/(\d{6}/\d{2})/', video_url, 'upload data', default=None)
                 if upload_date:
                     upload_date = upload_date.replace('/', '')
-            ext = determine_ext(video_url)
-            if ext == 'mpd':
-                formats.extend(self._extract_mpd_formats(
-                    video_url, video_id, mpd_id='dash', fatal=False))
-                continue
-            elif ext == 'm3u8':
-                formats.extend(self._extract_m3u8_formats(
-                    video_url, video_id, 'mp4', entry_protocol='m3u8_native',
-                    m3u8_id='hls', fatal=False))
-                continue
             if '/video/get_media' in video_url:
                 medias = self._download_json(video_url, video_id, fatal=False)
                 if isinstance(medias, list):

diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
@@ -306,7 +306,7 @@ def _extract_ytcfg(self, video_id, webpage):
         return self._parse_json(
             self._search_regex(
                 r'ytcfg\.set\s*\(\s*({.+?})\s*\)\s*;', webpage, 'ytcfg',
-                default='{}'), video_id, fatal=False)
+                default='{}'), video_id, fatal=False) or {}
 
     def _extract_video(self, renderer):
         video_id = renderer['videoId']
@@ -2475,7 +2475,7 @@ def _extract_continuation(cls, renderer):
             ctp = continuation_ep.get('clickTrackingParams')
             return YoutubeTabIE._build_continuation_query(continuation, ctp)
 
-    def _entries(self, tab, identity_token):
+    def _entries(self, tab, item_id, webpage):
         tab_content = try_get(tab, lambda x: x['content'], dict)
         if not tab_content:
             return
@@ -2535,26 +2535,37 @@ def _entries(self, tab, identity_token):
                 yield entry
             continuation = self._extract_continuation(rich_grid_renderer)
 
+        ytcfg = self._extract_ytcfg(item_id, webpage)
+        client_version = try_get(
+            ytcfg, lambda x: x['INNERTUBE_CLIENT_VERSION'], compat_str) or '2.20210407.08.00'
+
         headers = {
             'x-youtube-client-name': '1',
-            'x-youtube-client-version': '2.20201112.04.01',
+            'x-youtube-client-version': client_version,
             'content-type': 'application/json',
         }
+
+        context = try_get(ytcfg, lambda x: x['INNERTUBE_CONTEXT'], dict) or {
+            'client': {
+                'clientName': 'WEB',
+                'clientVersion': client_version,
+            }
+        }
+        visitor_data = try_get(context, lambda x: x['client']['visitorData'], compat_str)
+
+        identity_token = self._extract_identity_token(ytcfg, webpage)
         if identity_token:
             headers['x-youtube-identity-token'] = identity_token
 
         data = {
-            'context': {
-                'client': {
-                    'clientName': 'WEB',
-                    'clientVersion': '2.20201021.03.00',
-                }
-            },
+            'context': context,
         }
 
         for page_num in itertools.count(1):
             if not continuation:
                 break
+            if visitor_data:
+                headers['x-goog-visitor-id'] = visitor_data
             data['continuation'] = continuation['continuation']
             data['clickTracking'] = {
                 'clickTrackingParams': continuation['itct']
@@ -2579,6 +2590,9 @@ def _entries(self, tab, identity_token):
             if not response:
                 break
 
+            visitor_data = try_get(
+                response, lambda x: x['responseContext']['visitorData'], compat_str) or visitor_data
+
             continuation_contents = try_get(
                 response, lambda x: x['continuationContents'], dict)
             if continuation_contents:
@@ -2687,7 +2701,7 @@ def _extract_alert(data):
                 alerts.append(text)
         return '\n'.join(alerts)
 
-    def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
+    def _extract_from_tabs(self, item_id, webpage, data, tabs):
         selected_tab = self._extract_selected_tab(tabs)
         renderer = try_get(
             data, lambda x: x['metadata']['channelMetadataRenderer'], dict)
@@ -2712,7 +2726,7 @@ def _extract_from_tabs(self, item_id, webpage, data, tabs, identity_token):
                 if renderer:
                     title = try_get(renderer, lambda x: x['hashtag']['simpleText'])
         playlist = self.playlist_result(
-            self._entries(selected_tab, identity_token),
+            self._entries(selected_tab, item_id, webpage),
             playlist_id=playlist_id, playlist_title=title,
             playlist_description=description)
         playlist.update(self._extract_uploader(data))
@@ -2736,8 +2750,7 @@ def _extract_from_playlist(self, item_id, url, data, playlist):
             self._playlist_entries(playlist), playlist_id=playlist_id,
             playlist_title=title)
 
-    def _extract_identity_token(self, webpage, item_id):
-        ytcfg = self._extract_ytcfg(item_id, webpage)
+    def _extract_identity_token(self, ytcfg, webpage):
         if ytcfg:
             token = try_get(ytcfg, lambda x: x['ID_TOKEN'], compat_str)
             if token:
@@ -2760,12 +2773,11 @@ def _real_extract(self, url):
                 return self.url_result(video_id, ie=YoutubeIE.ie_key(), video_id=video_id)
             self.to_screen('Downloading playlist %s - add --no-playlist to just download video %s' % (playlist_id, video_id))
         webpage = self._download_webpage(url, item_id)
-        identity_token = self._extract_identity_token(webpage, item_id)
         data = self._extract_yt_initial_data(item_id, webpage)
         tabs = try_get(
             data, lambda x: x['contents']['twoColumnBrowseResultsRenderer']['tabs'], list)
         if tabs:
-            return self._extract_from_tabs(item_id, webpage, data, tabs, identity_token)
+            return self._extract_from_tabs(item_id, webpage, data, tabs)
         playlist = try_get(
             data, lambda x: x['contents']['twoColumnWatchNextResults']['playlist']['playlist'], dict)
         if playlist: