Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[rai] improved subtitles extraction #27705

Merged
merged 9 commits on Jan 7, 2021 (source and target branch names were not captured)
12 changes: 10 additions & 2 deletions test/test_subtitles.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,16 +258,24 @@ def test_allsubtitles(self):


class TestRaiPlaySubtitles(BaseTestSubtitles):
url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
IE = RaiPlayIE

def test_allsubtitles(self):
def test_subtitles_key(self):
self.url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['it']))
self.assertEqual(md5(subtitles['it']), 'b1d90a98755126b61e667567a1f6680a')

def test_subtitles_array_key(self):
self.url = 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html'
self.DL.params['writesubtitles'] = True
self.DL.params['allsubtitles'] = True
subtitles = self.getSubtitles()
self.assertEqual(set(subtitles.keys()), set(['it']))
self.assertEqual(md5(subtitles['it']), '4b3264186fbb103508abe5311cfcb9cd')


class TestVikiSubtitles(BaseTestSubtitles):
url = 'http://www.viki.com/videos/1060846v-punch-episode-18'
Expand Down
49 changes: 32 additions & 17 deletions youtube_dl/extractor/rai.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,22 +103,27 @@ def _extract_relinker_info(self, relinker_url, video_id):
}.items() if v is not None)

@staticmethod
def _extract_subtitles(url, subtitle_url):
def _extract_subtitles(url, video_data):
STL_EXT = 'stl'
SRT_EXT = 'srt'
subtitles = {}
if subtitle_url and isinstance(subtitle_url, compat_str):
subtitle_url = urljoin(url, subtitle_url)
STL_EXT = '.stl'
SRT_EXT = '.srt'
subtitles['it'] = [{
'ext': 'stl',
'url': subtitle_url,
}]
if subtitle_url.endswith(STL_EXT):
srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT
subtitles['it'].append({
'ext': 'srt',
'url': srt_url,
subtitles_array = video_data.get('subtitlesArray') or []
for k in ('subtitles', 'subtitlesUrl'):
subtitles_array.append({'url': video_data.get(k)})
for subtitle in subtitles_array:
sub_url = subtitle.get('url')
if sub_url and isinstance(sub_url, compat_str):
sub_lang = subtitle.get('language') or 'it'
sub_url = urljoin(url, sub_url)
subtitles.setdefault(sub_lang, []).append({
'ext': determine_ext(sub_url),
'url': sub_url,
})
if STL_EXT == subtitles[sub_lang][0]['ext']:
[Review thread on the line above: nixxo marked this conversation as resolved.]
subtitles[sub_lang].append({
'ext': SRT_EXT,
'url': sub_url[:-len(STL_EXT)] + SRT_EXT,
})
return subtitles


Expand All @@ -138,13 +143,20 @@ class RaiPlayIE(RaiBaseIE):
'duration': 6160,
'series': 'Report',
'season': '2013/14',
'subtitles': {
'it': 'count:2',
},
},
'params': {
'skip_download': True,
},
}, {
'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
'only_matching': True,
}, {
# subtitles at 'subtitlesArray' key (see #27698)
'url': 'https://www.raiplay.it/video/2020/12/Report---04-01-2021-2e90f1de-8eee-4de4-ac0e-78d21db5b600.html',
'only_matching': True,
}]

def _real_extract(self, url):
Expand Down Expand Up @@ -172,7 +184,7 @@ def _real_extract(self, url):
if date_published and time_published:
date_published += ' ' + time_published

subtitles = self._extract_subtitles(url, video.get('subtitles'))
subtitles = self._extract_subtitles(url, video)

program_info = media.get('program_info') or {}
season = media.get('season')
Expand Down Expand Up @@ -327,14 +339,17 @@ class RaiIE(RaiBaseIE):
'skip_download': True,
},
}, {
# ContentItem in iframe (see #12652)
# ContentItem in iframe (see #12652) and subtitle at 'subtitlesUrl' key
'url': 'http://www.presadiretta.rai.it/dl/portali/site/puntata/ContentItem-3ed19d13-26c2-46ff-a551-b10828262f1b.html',
'info_dict': {
'id': '1ad6dc64-444a-42a4-9bea-e5419ad2f5fd',
'ext': 'mp4',
'title': 'Partiti acchiappavoti - Presa diretta del 13/09/2015',
'description': 'md5:d291b03407ec505f95f27970c0b025f4',
'upload_date': '20150913',
'subtitles': {
'it': 'count:2',
},
},
'params': {
'skip_download': True,
Expand Down Expand Up @@ -379,7 +394,7 @@ def _extract_from_content_id(self, content_id, url):
'url': compat_urlparse.urljoin(url, thumbnail_url),
})

subtitles = self._extract_subtitles(url, media.get('subtitlesUrl'))
subtitles = self._extract_subtitles(url, media)

info = {
'id': content_id,
Expand Down