From 9be945a064862639b2fef676f6d93644701ab9de Mon Sep 17 00:00:00 2001 From: nixxo Date: Wed, 6 Jan 2021 22:44:49 +0100 Subject: [PATCH 1/9] [rai] improved subtitles extraction (see #27698) --- youtube_dl/extractor/rai.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 0a68d16b038..2c4f1980690 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -103,22 +103,33 @@ def _extract_relinker_info(self, relinker_url, video_id): }.items() if v is not None) @staticmethod - def _extract_subtitles(url, subtitle_url): - subtitles = {} - if subtitle_url and isinstance(subtitle_url, compat_str): - subtitle_url = urljoin(url, subtitle_url) + def _extract_subtitles(url, video_data): + + def create_sub(url, lang): STL_EXT = '.stl' SRT_EXT = '.srt' - subtitles['it'] = [{ - 'ext': 'stl', - 'url': subtitle_url, + sub = {} + sub[lang] = [{ + 'ext': determine_ext(url), + 'url': url, }] - if subtitle_url.endswith(STL_EXT): - srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT - subtitles['it'].append({ + if url.endswith(STL_EXT): + srt_url = url[:-len(STL_EXT)] + SRT_EXT + sub[lang].append({ 'ext': 'srt', 'url': srt_url, }) + return sub + + subtitles = {} + subtitlesArray = video_data.get('subtitlesArray') + subtitlesArray.append({'url': video_data.get('subtitles')}) + for subtitle in subtitlesArray or []: + sub_url = subtitle.get('url') + if sub_url and isinstance(sub_url, compat_str): + subtitles.update(create_sub( + urljoin(url, sub_url), subtitle.get('language') or 'it')) + return subtitles @@ -138,6 +149,9 @@ class RaiPlayIE(RaiBaseIE): 'duration': 6160, 'series': 'Report', 'season': '2013/14', + 'subtitles': { + 'it': 'count:2', + }, }, 'params': { 'skip_download': True, @@ -172,7 +186,7 @@ def _real_extract(self, url): if date_published and time_published: date_published += ' ' + time_published - subtitles = self._extract_subtitles(url, 
video.get('subtitles')) + subtitles = self._extract_subtitles(url, video) program_info = media.get('program_info') or {} season = media.get('season') From 280be54fffff4b66cfc3760cced8f5e8965e9860 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 09:57:42 +0100 Subject: [PATCH 2/9] [rai] various fixes after review - naming convention fix: renamed subtitlesArray to subtitles_array - added 'subtitlesUrl' as another key for subtitles extraction - added TESTS for 'subtitlesArray' key --- youtube_dl/extractor/rai.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 2c4f1980690..dc441f8ba3b 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -122,9 +122,10 @@ def create_sub(url, lang): return sub subtitles = {} - subtitlesArray = video_data.get('subtitlesArray') - subtitlesArray.append({'url': video_data.get('subtitles')}) - for subtitle in subtitlesArray or []: + subtitles_array = video_data.get('subtitlesArray') or [] + subtitles_array.append({'url': video_data.get('subtitles')}) + subtitles_array.append({'url': video_data.get('subtitlesUrl')}) + for subtitle in subtitles_array: sub_url = subtitle.get('url') if sub_url and isinstance(sub_url, compat_str): subtitles.update(create_sub( @@ -159,6 +160,10 @@ class RaiPlayIE(RaiBaseIE): }, { 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', 'only_matching': True, + }, { + # subtitles at 'subtitlesArray' key (see #27698) + 'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -341,7 +346,7 @@ class RaiIE(RaiBaseIE): 'skip_download': True, }, }, { - # ContentItem in iframe (see #12652) + # ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key 'url': 
'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html', 'info_dict': { 'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd', @@ -349,6 +354,9 @@ class RaiIE(RaiBaseIE): 'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015', 'description': 'md5:d291b03407ec505f95f27970c0b025f4', 'upload_date': '20150913', + 'subtitles': { + 'it': 'count:2', + }, }, 'params': { 'skip_download': True, @@ -393,7 +401,7 @@ def _extract_from_content_id(self, content_id, url): 'url': compat_urlparse.urljoin(url, thumbnail_url), }) - subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + subtitles = self._extract_subtitles(url, media) info = { 'id': content_id, From e0a179c20613a23c68a29e29d01c7b6a55b56299 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 11:31:19 +0100 Subject: [PATCH 3/9] [rai] create_sub func inline fix --- youtube_dl/extractor/rai.py | 35 ++++++++++++++++------------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index dc441f8ba3b..25b3e140bbd 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -104,23 +104,8 @@ def _extract_relinker_info(self, relinker_url, video_id): @staticmethod def _extract_subtitles(url, video_data): - - def create_sub(url, lang): - STL_EXT = '.stl' - SRT_EXT = '.srt' - sub = {} - sub[lang] = [{ - 'ext': determine_ext(url), - 'url': url, - }] - if url.endswith(STL_EXT): - srt_url = url[:-len(STL_EXT)] + SRT_EXT - sub[lang].append({ - 'ext': 'srt', - 'url': srt_url, - }) - return sub - + STL_EXT = '.stl' + SRT_EXT = '.srt' subtitles = {} subtitles_array = video_data.get('subtitlesArray') or [] subtitles_array.append({'url': video_data.get('subtitles')}) @@ -128,8 +113,20 @@ def create_sub(url, lang): for subtitle in subtitles_array: sub_url = subtitle.get('url') if sub_url and isinstance(sub_url, compat_str): - subtitles.update(create_sub( - urljoin(url, 
sub_url), subtitle.get('language') or 'it')) + sub_lang = subtitle.get('language') or 'it' + sub_url = urljoin(url, sub_url) + sub = {} + sub[sub_lang] = [{ + 'ext': determine_ext(sub_url), + 'url': sub_url, + }] + if sub_url.endswith(STL_EXT): + srt_url = sub_url[:-len(STL_EXT)] + SRT_EXT + sub[sub_lang].append({ + 'ext': 'srt', + 'url': srt_url, + }) + subtitles.update(sub) return subtitles From 86d40d2d97eb2fe11df5df68f1f4606a8a83a6d9 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 11:32:04 +0100 Subject: [PATCH 4/9] [rai] added test for 'subtitlesArray' key in test_subtitles.py script --- test/test_subtitles.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/test/test_subtitles.py b/test/test_subtitles.py index 17aaaf20d9a..550e0ca0081 100644 --- a/test/test_subtitles.py +++ b/test/test_subtitles.py @@ -258,16 +258,24 @@ def test_allsubtitles(self): class TestRaiPlaySubtitles(BaseTestSubtitles): - url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' IE = RaiPlayIE - def test_allsubtitles(self): + def test_subtitles_key(self): + self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html' self.DL.params['writesubtitles'] = True self.DL.params['allsubtitles'] = True subtitles = self.getSubtitles() self.assertEqual(set(subtitles.keys()), set(['it'])) self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a') + def test_subtitles_array_key(self): + self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html' + self.DL.params['writesubtitles'] = True + self.DL.params['allsubtitles'] = True + subtitles = self.getSubtitles() + self.assertEqual(set(subtitles.keys()), set(['it'])) + self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd') + class TestVikiSubtitles(BaseTestSubtitles): url = 'http://www.viki.com/videos/1060846v-punch-episode-18' 
From 172419befd89d9f69ee478f7db147e5c79ee0f70 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 13:20:29 +0100 Subject: [PATCH 5/9] Update youtube_dl/extractor/rai.py Co-authored-by: remitamine --- youtube_dl/extractor/rai.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 25b3e140bbd..0d662d72675 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -108,8 +108,8 @@ def _extract_subtitles(url, video_data): SRT_EXT = '.srt' subtitles = {} subtitles_array = video_data.get('subtitlesArray') or [] - subtitles_array.append({'url': video_data.get('subtitles')}) - subtitles_array.append({'url': video_data.get('subtitlesUrl')}) + for k in ('subtitles', 'subtitlesUrl'): + subtitles_array.append({'url': video_data.get(k)}) for subtitle in subtitles_array: sub_url = subtitle.get('url') if sub_url and isinstance(sub_url, compat_str): From 1a2c6b10d2f31bf94e8a786c4c242d5f9a66e36a Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 13:31:39 +0100 Subject: [PATCH 6/9] Update youtube_dl/extractor/rai.py Co-authored-by: remitamine --- youtube_dl/extractor/rai.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 0d662d72675..5dd329328fa 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -115,11 +115,10 @@ def _extract_subtitles(url, video_data): if sub_url and isinstance(sub_url, compat_str): sub_lang = subtitle.get('language') or 'it' sub_url = urljoin(url, sub_url) - sub = {} - sub[sub_lang] = [{ + subtitles.setdefault(sub_lang, []).append({ 'ext': determine_ext(sub_url), 'url': sub_url, - }] + }) if sub_url.endswith(STL_EXT): srt_url = sub_url[:-len(STL_EXT)] + SRT_EXT sub[sub_lang].append({ From b913be2592be44a19d6cbb79bf5793dd7abd6933 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 13:32:36 +0100 Subject: [PATCH 7/9] [rai] fix 
suggestion "use the extension extracted from determine_ext." --- youtube_dl/extractor/rai.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 5dd329328fa..48082b9ceea 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -104,8 +104,8 @@ def _extract_relinker_info(self, relinker_url, video_id): @staticmethod def _extract_subtitles(url, video_data): - STL_EXT = '.stl' - SRT_EXT = '.srt' + STL_EXT = 'stl' + SRT_EXT = 'srt' subtitles = {} subtitles_array = video_data.get('subtitlesArray') or [] for k in ('subtitles', 'subtitlesUrl'): @@ -119,13 +119,11 @@ def _extract_subtitles(url, video_data): 'ext': determine_ext(sub_url), 'url': sub_url, }) - if sub_url.endswith(STL_EXT): - srt_url = sub_url[:-len(STL_EXT)] + SRT_EXT - sub[sub_lang].append({ - 'ext': 'srt', - 'url': srt_url, + if determine_ext(sub_url) == STL_EXT: + subtitles.setdefault(sub_lang, []).append({ + 'ext': SRT_EXT, + 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, }) - subtitles.update(sub) return subtitles From 9384d866ea610a9ca7f83fdf199e9e1d66ba7a27 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 14:10:51 +0100 Subject: [PATCH 8/9] [rai] dry --- youtube_dl/extractor/rai.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 48082b9ceea..7ecf240d830 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -119,12 +119,11 @@ def _extract_subtitles(url, video_data): 'ext': determine_ext(sub_url), 'url': sub_url, }) - if determine_ext(sub_url) == STL_EXT: - subtitles.setdefault(sub_lang, []).append({ + if STL_EXT == subtitles[sub_lang][0]['ext']: + subtitles[sub_lang].append({ 'ext': SRT_EXT, 'url': sub_url[:-len(STL_EXT)] + SRT_EXT, }) - return subtitles From 06c9a055f9f39acfe5542747ba65205614086714 Mon Sep 17 00:00:00 2001 From: nixxo Date: Thu, 7 Jan 2021 14:35:01 +0100 
Subject: [PATCH 9/9] [rai] improve determine_ext and fallback value --- youtube_dl/extractor/rai.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 7ecf240d830..67b86fc72c1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -115,11 +115,12 @@ def _extract_subtitles(url, video_data): if sub_url and isinstance(sub_url, compat_str): sub_lang = subtitle.get('language') or 'it' sub_url = urljoin(url, sub_url) + sub_ext = determine_ext(sub_url, SRT_EXT) subtitles.setdefault(sub_lang, []).append({ - 'ext': determine_ext(sub_url), + 'ext': sub_ext, 'url': sub_url, }) - if STL_EXT == subtitles[sub_lang][0]['ext']: + if STL_EXT == sub_ext: subtitles[sub_lang].append({ 'ext': SRT_EXT, 'url': sub_url[:-len(STL_EXT)] + SRT_EXT,