From 04196b3b68bfbe5160eaa051910bfadcfb33d3dd Mon Sep 17 00:00:00 2001 From: Ahmed Date: Sun, 21 Feb 2021 16:28:29 +0400 Subject: [PATCH 001/394] [StreamwoIE] Add new extractor --- youtube_dl/extractor/extractors.py | 1 + youtube_dl/extractor/streamwo.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 youtube_dl/extractor/streamwo.py diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 1a39c25c52d..88c3dcb9ecc 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1159,6 +1159,7 @@ from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE +from .streamwo import StreamwoIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE from .stv import STVPlayerIE diff --git a/youtube_dl/extractor/streamwo.py b/youtube_dl/extractor/streamwo.py new file mode 100644 index 00000000000..74ed3025261 --- /dev/null +++ b/youtube_dl/extractor/streamwo.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class StreamwoIE(InfoExtractor): + _VALID_URL = r'https?://streamwo\.com/(?P\w+)' + _TEST = { + "url": 'https://streamwo.com/4529aff4', + 'md5': 'eaf1f163635c868ecbba95d23ba83448', + 'info_dict': { + 'id': '4529aff4', + 'ext': 'mp4', + 'title': 'Goal 2021 13-04-50' + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + self.report_extraction(video_id) + title = self._html_search_regex(r'(.+?)', webpage, 'title') + video_url = self._html_search_regex(r'', webpage, u'video URL') + return { + 'id': video_id, + 'url': video_url, + 'ext': 'mp4', + "title": title + } From dd640efb4f45a72c46212b9ecf539eb18ebc070a Mon Sep 17 00:00:00 2001 From: Ahmed Alsuwaidi Date: Sun, 21 Feb 2021 18:17:28 +0400 Subject: [PATCH 002/394] Update streamwo.py --- youtube_dl/extractor/streamwo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamwo.py b/youtube_dl/extractor/streamwo.py index 74ed3025261..db622901a55 100644 --- a/youtube_dl/extractor/streamwo.py +++ b/youtube_dl/extractor/streamwo.py @@ -21,7 +21,7 @@ def _real_extract(self, url): webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_url = self._html_search_regex(r'', webpage, u'video URL') + video_url = self._html_search_regex(r'', webpage, 'video URL') return { 'id': video_id, 'url': video_url, From 7a0e03ef28501a5a49a34d0a1dde23f2e11f6788 Mon Sep 17 00:00:00 2001 From: Ahmed Alsuwaidi Date: Mon, 22 Feb 2021 21:02:06 +0400 Subject: [PATCH 003/394] Replaced test video in streamwo.py This test video should play fine in browser. --- youtube_dl/extractor/streamwo.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/youtube_dl/extractor/streamwo.py b/youtube_dl/extractor/streamwo.py index db622901a55..cf1d37d2517 100644 --- a/youtube_dl/extractor/streamwo.py +++ b/youtube_dl/extractor/streamwo.py @@ -7,12 +7,12 @@ class StreamwoIE(InfoExtractor): _VALID_URL = r'https?://streamwo\.com/(?P\w+)' _TEST = { - "url": 'https://streamwo.com/4529aff4', - 'md5': 'eaf1f163635c868ecbba95d23ba83448', + "url": 'https://streamwo.com/c11cf208', + 'md5': '64a3e444e10f90051725cc1776682b06', 'info_dict': { - 'id': '4529aff4', + 'id': 'c11cf208', 'ext': 'mp4', - 'title': 'Goal 2021 13-04-50' + 'title': 'pexels-rodnae-productions-6192787 } } From 236fc0d61b89f7dad8e35009739dd8d838720922 Mon Sep 17 00:00:00 2001 From: Ahmed Alsuwaidi Date: Mon, 22 Feb 2021 22:04:27 +0400 Subject: [PATCH 004/394] Update streamwo.py Added missing ' --- youtube_dl/extractor/streamwo.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/youtube_dl/extractor/streamwo.py b/youtube_dl/extractor/streamwo.py index cf1d37d2517..ec5ecd750d9 100644 --- a/youtube_dl/extractor/streamwo.py +++ b/youtube_dl/extractor/streamwo.py @@ -12,7 +12,7 @@ class StreamwoIE(InfoExtractor): 'info_dict': { 'id': 'c11cf208', 'ext': 'mp4', - 'title': 'pexels-rodnae-productions-6192787 + 'title': 'pexels-rodnae-productions-6192787' } } From 250ee17e01062291c71f6c6ea3fda42a651bff3e Mon Sep 17 00:00:00 2001 From: Ahmed Date: Tue, 23 Feb 2021 10:50:29 +0400 Subject: [PATCH 005/394] updated Streamwo to update _real_extract method to return self.url_result of the embedded video --- youtube_dl/extractor/streamwo.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/youtube_dl/extractor/streamwo.py b/youtube_dl/extractor/streamwo.py index 74ed3025261..3edc296ad76 100644 --- a/youtube_dl/extractor/streamwo.py +++ b/youtube_dl/extractor/streamwo.py @@ -7,12 +7,12 @@ class StreamwoIE(InfoExtractor): _VALID_URL = r'https?://streamwo\.com/(?P\w+)' _TEST = { - "url": 'https://streamwo.com/4529aff4', - 'md5': 'eaf1f163635c868ecbba95d23ba83448', + "url": 'https://streamwo.com/c11cf208', + 'md5': '64a3e444e10f90051725cc1776682b06', 'info_dict': { - 'id': '4529aff4', + 'id': 'zrxKAY8', 'ext': 'mp4', - 'title': 'Goal 2021 13-04-50' + 'title': "Imgur" } } @@ -20,11 +20,7 @@ def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) self.report_extraction(video_id) - title = self._html_search_regex(r'(.+?)', webpage, 'title') - video_url = self._html_search_regex(r'', webpage, u'video URL') - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - "title": title - } + video_url = self._html_search_regex(r'', webpage, 'video URL') + if video_url: + return self.url_result(video_url, ie="Generic") + return self.url_result(url, ie='Generic') From 3c75b8dc156073f407ab102dca45659609cb32e9 Mon Sep 17 00:00:00 2001 From: Ahmed Date: Fri, 26 Feb 2021 14:09:36 +0400 Subject: [PATCH 006/394] Implemented get_referrer_url method in utils.py along with relevant tests in test_utils.py --- test/test_utils.py | 31 ++++++++++++++++++ youtube_dl/extractor/common.py | 16 ++++++++- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/generic.py | 10 ++++++ youtube_dl/extractor/streamwo.py | 26 --------------- youtube_dl/utils.py | 52 ++++++++++++++++++++++++++++++ 6 files changed, 108 insertions(+), 28 deletions(-) delete mode 100644 youtube_dl/extractor/streamwo.py diff --git a/test/test_utils.py b/test/test_utils.py index 259c4763e1e..c867d72bba6 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -80,6 +80,7 @@ subtitles_filename, timeconvert, unescapeHTML, + get_referrer_url, unified_strdate, unified_timestamp, unsmuggle_url, @@ -306,6 +307,36 @@ def test_unescape_html(self): # HTML5 entities self.assertEqual(unescapeHTML('.''), '.\'') + def test_get_referrer_url(self): + # No-Referrer + self.assertEqual(get_referrer_url("https://example.com/page", "https://mozilla.org", "no-referrer"), None) + # No-Referrer-When-Downgrade + self.assertEqual(get_referrer_url("https://example.com/page", "https://example.com/otherpage", "no-referrer-when-downgrade"), "https://example.com/page") + self.assertEqual(get_referrer_url("https://example.com/page", "https://mozilla.org", "no-referrer-when-downgrade"), "https://example.com/page") + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com", "no-referrer-when-downgrade"), None) + # Origin + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com", "origin"), "https://example.com/") + # Origin-When-Cross-Origin + self.assertEqual(get_referrer_url("https://example.com/page", "https://example.com/otherpage", "origin-when-cross-origin"), "https://example.com/page") + self.assertEqual(get_referrer_url("https://example.com/page", "http://mozilla.org", "origin-when-cross-origin"), "https://example.com/") + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com/page", "origin-when-cross-origin"), "https://example.com/") + # Same-Origin + self.assertEqual(get_referrer_url("https://example.com/page", "https://example.com/otherpage", "same-origin"), "https://example.com/page") + self.assertEqual(get_referrer_url("https://example.com/page", "https://mozilla.org", "same-origin"), None) + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com/page", "same-origin"), None) + # Strict-Origin + self.assertEqual(get_referrer_url("https://example.com/page", "https://mozilla.org", "strict-origin"), "https://example.com/") + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com", "strict-origin"), None) + self.assertEqual(get_referrer_url("http://example.com/page", "http://example.com", "strict-origin"), "http://example.com/") + # Strict-Origin-When-Cross-Origin + self.assertEqual(get_referrer_url("https://example.com/page", "https://example.com/otherpage", "strict-origin-when-cross-origin"), "https://example.com/page") + self.assertEqual(get_referrer_url("https://example.com/page", "https://mozilla.org", "strict-origin-when-cross-origin"), "https://example.com/") + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com/otherpage", "strict-origin-when-cross-origin"), None) + # Unsafe-Url + self.assertEqual(get_referrer_url("https://example.com/page?q=123", "https://example.com/page?q=123", "unsafe-url"), "https://example.com/page?q=123") + self.assertEqual(get_referrer_url("https://example.com/page?q=123", "https://mozilla.org", "unsafe-url"), "https://example.com/page?q=123") + self.assertEqual(get_referrer_url("https://example.com/page?q=123", "https://example.com/page?q=123", "unsafe-url"), "https://example.com/page?q=123") + def test_date_from_str(self): self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) self.assertEqual(date_from_str('now+7day'), date_from_str('now+1week')) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 8eb110f4e66..498f5e2138e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -81,6 +81,7 @@ xpath_element, xpath_text, xpath_with_ns, + get_referrer_url ) @@ -2487,6 +2488,14 @@ def parse_content_type(content_type): return f return {} + def get_referrer_policy(page): + mobj = re.search(r'', page) + if mobj: + return mobj.group(1) + else: + # If no policy is set, default value is no-refferer-when-downgrade + return "no-referrer-when-downgrade" + def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) ext = type_info.get('ext') or determine_ext(full_url) @@ -2509,6 +2518,7 @@ def _media_formats(src, cur_media_type, type_info={}): return is_plain_url, formats entries = [] + referrer_policy = get_referrer_policy(webpage) # amp-video and amp-audio are very similar to their HTML5 counterparts # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) @@ -2590,7 +2600,11 @@ def _media_formats(src, cur_media_type, type_info={}): 'url': absolute_url(src), }) for f in media_info['formats']: - f.setdefault('http_headers', {})['Referer'] = base_url + referrer = get_referrer_url(base_url, f["url"], referrer_policy) + if referrer: + f.setdefault('http_headers', {})['Referer'] = referrer + else: + f.setdefault('http_headers', {}) if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 88c3dcb9ecc..1a39c25c52d 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -1159,7 +1159,6 @@ from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE -from .streamwo import StreamwoIE from .streetvoice import StreetVoiceIE from .stretchinternet import StretchInternetIE from .stv import STVPlayerIE diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index c2b1b3bdfef..a1fc04375e6 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2248,6 +2248,16 @@ class GenericIE(InfoExtractor): }, 'playlist_mincount': 52, }, + { + # Test StreamWo + "url": 'https://streamwo.com/c11cf208', + 'md5': '64a3e444e10f90051725cc1776682b06', + 'info_dict': { + 'id': 'zrxKAY8', + 'ext': 'mp4', + 'title': "Imgur" + } + } ] def report_following_redirect(self, new_url): diff --git a/youtube_dl/extractor/streamwo.py b/youtube_dl/extractor/streamwo.py deleted file mode 100644 index 3edc296ad76..00000000000 --- a/youtube_dl/extractor/streamwo.py +++ /dev/null @@ -1,26 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class StreamwoIE(InfoExtractor): - _VALID_URL = r'https?://streamwo\.com/(?P\w+)' - _TEST = { - "url": 'https://streamwo.com/c11cf208', - 'md5': '64a3e444e10f90051725cc1776682b06', - 'info_dict': { - 'id': 'zrxKAY8', - 'ext': 'mp4', - 'title': "Imgur" - } - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - self.report_extraction(video_id) - video_url = self._html_search_regex(r'', webpage, 'video URL') - if video_url: - return self.url_result(video_url, ie="Generic") - return self.url_result(url, ie='Generic') diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 8e4d144c9ac..ba6e32f1873 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -2326,6 +2326,58 @@ def bug_reports_message(): return msg +def get_referrer_url(base_url, target_url, policy): + # Returns correct referrer url based on the site policy + # https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Referrer-Policy#examples + parsed_url = compat_urlparse.urlparse(base_url) + parsed_target_url = compat_urlparse.urlparse(target_url) + if policy == "no-referrer": + return None + elif policy == "no-referrer-when-downgrade": + if "http" == parsed_target_url.scheme and "https" == parsed_url.scheme: + return None + elif "https" == parsed_target_url.scheme: + return base_url + elif policy == "origin": + return compat_urlparse.urljoin(base_url, "/") + elif policy == "origin-when-cross-origin": + if parsed_url.netloc == parsed_target_url.netloc: + if parsed_url.scheme == parsed_target_url.scheme: + return base_url + elif "http" == parsed_target_url.scheme: + return compat_urlparse.urljoin(base_url, "/") + else: + return compat_urlparse.urljoin(base_url, "/") + elif policy == "same-origin": + if parsed_url.netloc == parsed_target_url.netloc: + if parsed_url.scheme == parsed_target_url.scheme: + return base_url + else: + return None + else: + return None + elif policy == "strict-origin": + if "http" == parsed_url.scheme: + return compat_urlparse.urljoin(base_url, "/") + elif parsed_url.netloc == parsed_target_url.netloc: + if parsed_url.scheme == parsed_target_url.scheme: + return base_url + else: + return None + else: + return compat_urlparse.urljoin(base_url, "/") + elif policy == "strict-origin-when-cross-origin": + if parsed_url.netloc == parsed_target_url.netloc: + if parsed_url.scheme == parsed_target_url.scheme: + return base_url + else: + return None + else: + return compat_urlparse.urljoin(base_url, "/") + elif policy == "unsafe-url": + return base_url + + class YoutubeDLError(Exception): """Base exception for YoutubeDL errors.""" pass From b3c422a6a67fd8ff7493e7c8d2c210a6678d488b Mon Sep 17 00:00:00 2001 From: Ahmed Date: Mon, 1 Mar 2021 13:23:52 +0400 Subject: [PATCH 007/394] Implemented requested changes, added test cases to test_utils for the default fallback option --- test/test_utils.py | 4 ++ youtube_dl/extractor/common.py | 16 ++----- youtube_dl/extractor/generic.py | 2 +- youtube_dl/utils.py | 77 +++++++++++++++++---------------- 4 files changed, 48 insertions(+), 51 deletions(-) diff --git a/test/test_utils.py b/test/test_utils.py index c867d72bba6..26743a813b4 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -336,6 +336,10 @@ def test_get_referrer_url(self): self.assertEqual(get_referrer_url("https://example.com/page?q=123", "https://example.com/page?q=123", "unsafe-url"), "https://example.com/page?q=123") self.assertEqual(get_referrer_url("https://example.com/page?q=123", "https://mozilla.org", "unsafe-url"), "https://example.com/page?q=123") self.assertEqual(get_referrer_url("https://example.com/page?q=123", "https://example.com/page?q=123", "unsafe-url"), "https://example.com/page?q=123") + # None + self.assertEqual(get_referrer_url("https://example.com/page", "https://example.com/otherpage", None), "https://example.com/page") + self.assertEqual(get_referrer_url("https://example.com/page", "https://mozilla.org", None), "https://example.com/") + self.assertEqual(get_referrer_url("https://example.com/page", "http://example.com/otherpage", None), None) def test_date_from_str(self): self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 498f5e2138e..0259608db7e 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -2488,13 +2488,8 @@ def parse_content_type(content_type): return f return {} - def get_referrer_policy(page): - mobj = re.search(r'', page) - if mobj: - return mobj.group(1) - else: - # If no policy is set, default value is no-refferer-when-downgrade - return "no-referrer-when-downgrade" + def get_referrer_policy_from_meta_element(page): + return self._html_search_meta('referrer', page) def _media_formats(src, cur_media_type, type_info={}): full_url = absolute_url(src) @@ -2518,7 +2513,7 @@ def _media_formats(src, cur_media_type, type_info={}): return is_plain_url, formats entries = [] - referrer_policy = get_referrer_policy(webpage) + referrer_policy = get_referrer_policy_from_meta_element(webpage) # amp-video and amp-audio are very similar to their HTML5 counterparts # so we wll include them right here (see # https://www.ampproject.org/docs/reference/components/amp-video) @@ -2601,10 +2596,7 @@ def _media_formats(src, cur_media_type, type_info={}): }) for f in media_info['formats']: referrer = get_referrer_url(base_url, f["url"], referrer_policy) - if referrer: - f.setdefault('http_headers', {})['Referer'] = referrer - else: - f.setdefault('http_headers', {}) + f.setdefault('http_headers', {})['Referer'] = referrer if media_info['formats'] or media_info['subtitles']: entries.append(media_info) return entries diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index a1fc04375e6..b1803372813 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -2249,7 +2249,7 @@ class GenericIE(InfoExtractor): 'playlist_mincount': 52, }, { - # Test StreamWo + # Test Native