From f2d10584d3eeac0f90420048789b0ac67d19c042 Mon Sep 17 00:00:00 2001
From: Alexander Seiler
Date: Sat, 11 Nov 2017 19:30:10 +0100
Subject: [PATCH] [srgssr] Extract the correct video + add subtitle support (closes #14717)

---
 youtube_dl/extractor/srgssr.py | 225 ++++++++++++++++++++++-----------
 1 file changed, 151 insertions(+), 74 deletions(-)

diff --git a/youtube_dl/extractor/srgssr.py b/youtube_dl/extractor/srgssr.py
index bb73eb1d5c2..b79ed570026 100644
--- a/youtube_dl/extractor/srgssr.py
+++ b/youtube_dl/extractor/srgssr.py
@@ -7,13 +7,30 @@
 from ..compat import compat_urllib_parse_urlparse
 from ..utils import (
     ExtractorError,
+    float_or_none,
+    int_or_none,
+    mimetype2ext,
     parse_iso8601,
     qualities,
 )
 
 
 class SRGSSRIE(InfoExtractor):
-    _VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
+    _VALID_URL = r'''(?x)
+                    (?:
+                        https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
+                        srgssr
+                    ):
+                    (?P<bu>
+                        srf|rts|rsi|rtr|swi
+                    ):(?:[^:]+:)?
+                    (?P<type>
+                        video|audio
+                    ):
+                    (?P<id>
+                        [0-9a-f\-]{36}|\d+
+                    )
+                    '''
     _GEO_BYPASS = False
     _GEO_COUNTRIES = ['CH']
 
@@ -33,17 +50,24 @@ def _get_tokenized_src(self, url, video_id, format_id):
             video_id, 'Downloading %s token' % format_id, fatal=False) or {}
         auth_params = token.get('token', {}).get('authparams')
         if auth_params:
-            url += '?' + auth_params
+            url += ('?' if '?' not in url else '&') + auth_params
         return url
 
-    def get_media_data(self, bu, media_type, media_id):
-        media_data = self._download_json(
-            'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id),
-            media_id)[media_type.capitalize()]
-
-        if media_data.get('block') and media_data['block'] in self._ERRORS:
-            message = self._ERRORS[media_data['block']]
-            if media_data['block'] == 'GEOBLOCK':
+    def _get_media_data(self, bu, media_type, media_id):
+        query = {'onlyChapters': True} if media_type == 'video' else {}
+        full_media_data = self._download_json(
+            'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json'
+            % (bu, media_type, media_id),
+            media_id, query=query)['chapterList']
+        try:
+            media_data = next(
+                x for x in full_media_data if x['id'] == media_id)
+        except StopIteration:
+            raise ExtractorError('No media information found')
+
+        if media_data.get('blockReason') and media_data['blockReason'] in self._ERRORS:
+            message = self._ERRORS[media_data['blockReason']]
+            if media_data['blockReason'] == 'GEOBLOCK':
                 self.raise_geo_restricted(
                     msg=message, countries=self._GEO_COUNTRIES)
             raise ExtractorError(
@@ -51,46 +75,92 @@
 
         return media_data
 
-    def _real_extract(self, url):
-        bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
+    def _get_subtitles(self, media_data, bu, media_type):
+        subtitles = {}
+        if media_type == 'audio':
+            return subtitles
+
+        subtitle_data = media_data.get('subtitleList', [])
+        default_language_codes = {
+            'srf': 'de',
+            'rts': 'fr',
+            'rsi': 'it',
+            'rtr': 'rm',
+            'swi': 'en',
+        }
+        known_formats = ('TTML', 'VTT')
+        for sub in subtitle_data:
+            form = sub['format']
+            if form not in known_formats:
+                continue
+            lang = sub.get('locale') or default_language_codes[bu]
+            subtitles.setdefault(lang, []).append({
+                'ext': form.lower(),
+                'url': sub['url']
+            })
+        # Prefer VTT subtitles over TTML:
+        priorities = {
+            'ttml': 1,
+            'vtt': 2,
+        }
+        for lang in subtitles:
+            subtitles[lang].sort(key=lambda x: priorities[x['ext']])
 
-        media_data = self.get_media_data(bu, media_type, media_id)
+        return subtitles
 
-        metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
-        title = metadata['title']
-        description = metadata.get('description')
-        created_date = media_data.get('createdDate') or metadata.get('createdDate')
+    def _real_extract(self, url):
+        bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
+        media_data = self._get_media_data(bu, media_type, media_id)
+        title = media_data['title']
+        description = media_data.get('description')
+        thumbnail = media_data.get('imageUrl')
+        created_date = media_data.get('date')
         timestamp = parse_iso8601(created_date)
+        duration = float_or_none(media_data['duration'], scale=1000)
 
-        thumbnails = [{
-            'id': image.get('id'),
-            'url': image['url'],
-        } for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])]
+        subtitles = self.extract_subtitles(media_data, bu, media_type)
 
-        preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
         formats = []
-        for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
-            protocol = source.get('@protocol')
-            for asset in source['url']:
-                asset_url = asset['text']
-                quality = asset['@quality']
-                format_id = '%s-%s' % (protocol, quality)
-                if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'):
-                    asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
-                    if protocol.startswith('HTTP-HDS'):
-                        formats.extend(self._extract_f4m_formats(
-                            asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0',
-                            media_id, f4m_id=format_id, fatal=False))
-                    elif protocol.startswith('HTTP-HLS'):
-                        formats.extend(self._extract_m3u8_formats(
-                            asset_url, media_id, 'mp4', 'm3u8_native',
-                            m3u8_id=format_id, fatal=False))
+        preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
+        for source in media_data.get('resourceList', []):
+            protocol = source.get('protocol')
+            quality = source.get('quality')
+            encoding = source.get('encoding')
+            mime_type = source.get('mimeType')
+            format_url = source.get('url')
+            format_id = '%s-%s-%s' % (protocol, encoding, quality)
+
+            if protocol in ('HDS', 'HLS'):
+                asset_url = self._get_tokenized_src(
+                    format_url, media_id, format_id)
+                if protocol == 'HDS':
+                    formats.extend(self._extract_akamai_formats(
+                        asset_url, media_id))
                 else:
+                    formats.extend(self._extract_m3u8_formats(
+                        asset_url, media_id, 'mp4', 'm3u8_native',
+                        m3u8_id=format_id, fatal=False
+                    ))
+            elif protocol in ('HTTP', 'HTTPS', 'RTMP'):
+                formats.append({
+                    'format_id': format_id,
+                    'ext': mimetype2ext(mime_type) if mime_type else None,
+                    'url': format_url,
+                    'preference': preference(quality)
+                })
+        podcast_keys = ('podcastSdUrl', 'podcastHdUrl')
+        podcast_qualities = ('SD', 'HD')
+
+        # This is needed because for audio media the podcast URL is usually
+        # included even if the entry is only an audio segment and not the
+        # whole episode.
+        if int_or_none(media_data['position']) == 0:
+            for key, quality in zip(podcast_keys, podcast_qualities):
+                if media_data.get(key):
                     formats.append({
-                        'format_id': format_id,
-                        'url': asset_url,
+                        'format_id': 'PODCAST-%s' % quality,
+                        'url': media_data[key],
                         'preference': preference(quality),
-                        'ext': 'flv' if protocol == 'RTMP' else None,
                     })
 
         self._sort_formats(formats)
@@ -98,37 +168,49 @@ def _real_extract(self, url):
             'id': media_id,
             'title': title,
             'description': description,
+            'duration': duration,
             'timestamp': timestamp,
-            'thumbnails': thumbnails,
+            'thumbnail': thumbnail,
+            'subtitles': subtitles,
             'formats': formats,
         }
 
 
 class SRGSSRPlayIE(InfoExtractor):
     IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites'
-    _VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)'
+    _VALID_URL = r'''(?x)
+                    https?://
+                    (?:
+                        (?:
+                            www|play
+                        )\.
+                    )?
+                    (?P<bu>
+                        srf|rts|rsi|rtr|swissinfo
+                    )\.ch/play/
+                    (?:
+                        tv|radio
+                    )/[^/]+/
+                    (?P<type>
+                        video|audio
+                    )/[^?]+\?id=
+                    (?P<id>
+                        [0-9a-f\-]{36}|\d+
+                    )
+                    '''
 
     _TESTS = [{
         'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
-        'md5': 'da6b5b3ac9fa4761a942331cef20fcb3',
+        'md5': '9764693a295be9a24ce231440b200ba4',
        'info_dict': {
             'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
             'ext': 'mp4',
-            'upload_date': '20130701',
             'title': 'Snowden beantragt Asyl in Russland',
-            'timestamp': 1372713995,
-        }
-    }, {
-        # No Speichern (Save) button
-        'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
-        'md5': '0a274ce38fda48c53c01890651985bc6',
-        'info_dict': {
-            'id': '677f5829-e473-4823-ac83-a1087fe97faa',
-            'ext': 'flv',
-            'upload_date': '20130710',
-            'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
-            'description': 'md5:88604432b60d5a38787f152dec89cd56',
-            'timestamp': 1373493600,
+            'description': None,
+            'duration': 113.827,
+            'upload_date': '20130701',
+            'timestamp': 1372708215,
+            'thumbnail': r're:^https?://.*1383719781\.png$',
         },
     }, {
         'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc',
@@ -137,35 +219,30 @@ class SRGSSRPlayIE(InfoExtractor):
             'ext': 'mp3',
             'upload_date': '20151013',
             'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem',
-            'timestamp': 1444750398,
+            'duration': 336.839,
+            'timestamp': 1444709160,
         },
         'params': {
             # rtmp download
             'skip_download': True,
         },
     }, {
-        'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260',
-        'md5': '67a2a9ae4e8e62a68d0e9820cc9782df',
+        'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270',
         'info_dict': {
-            'id': '6348260',
-            'display_id': '6348260',
+            'id': '42960270',
             'ext': 'mp4',
-            'duration': 1796,
-            'title': 'Le 19h30',
-            'description': '',
-            'uploader': '19h30',
-            'upload_date': '20141201',
-            'timestamp': 1417458600,
-            'thumbnail': r're:^https?://.*\.image',
-            'view_count': int,
+            'title': 'Why people were against tax reforms',
+            'description': 'md5:8c5c1b6a2a37c17670cf87f608ff4755',
+            'duration': 94.0,
+            'upload_date': '20170215',
+            'timestamp': 1487173560,
+            'thumbnail': 'https://www.swissinfo.ch/srgscalableimage/42961964',
         },
         'params': {
-            # m3u8 download
            'skip_download': True,
-        }
+        },
     }]
 
     def _real_extract(self, url):
         bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
-        # other info can be extracted from url + '&layout=json'
         return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR')
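
A minimal usage sketch for the reworked extractor, assuming a youtube-dl build with this patch applied is importable as youtube_dl. The URL is the srf.ch test case added above; skip_download, writesubtitles, subtitleslangs and quiet are standard YoutubeDL options:

    from __future__ import unicode_literals

    import youtube_dl

    url = ('http://www.srf.ch/play/tv/10vor10/video/'
           'snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5')

    ydl_opts = {
        'skip_download': True,     # run the extractor only, fetch no media
        'writesubtitles': True,    # needed so the subtitle hook is actually called
        'subtitleslangs': ['de'],  # SRF falls back to 'de' via default_language_codes
        'quiet': True,
    }

    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=False)

    # Fields this patch adds or renames: duration, thumbnail and subtitles.
    print(info.get('title'), info.get('duration'))
    print(sorted(info.get('subtitles', {})))
    print(len(info.get('formats', [])))

Enabling writesubtitles (or listsubtitles) matters here: InfoExtractor.extract_subtitles() only invokes the new _get_subtitles() hook when one of those options is set, otherwise the extractor reports an empty subtitles dict.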
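The "extract the correct video" part of the change comes down to two steps that can be illustrated outside the extractor: pick the chapter whose id matches the requested media id from chapterList instead of taking the first entry, and order each language's subtitle variants so the VTT track sorts after TTML. The sketch below reproduces both steps; chapter_list is a made-up, abridged stand-in for an integration-layer response and only carries the fields the patch actually reads:

    # Hypothetical, abridged chapterList; field names mirror those read by the
    # patch (id, title, subtitleList), the values are invented for illustration.
    chapter_list = [
        {'id': 'full-episode', 'title': 'Full episode', 'subtitleList': []},
        {
            'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
            'title': 'Snowden beantragt Asyl in Russland',
            'subtitleList': [
                {'format': 'TTML', 'locale': 'de', 'url': 'https://example.invalid/sub.ttml'},
                {'format': 'VTT', 'locale': 'de', 'url': 'https://example.invalid/sub.vtt'},
            ],
        },
    ]
    media_id = '28e1a57d-5b76-4399-8ab3-9097f071e6c5'

    # Select the chapter matching the requested id instead of chapter_list[0],
    # which may describe the surrounding episode rather than the requested clip.
    media_data = next(x for x in chapter_list if x['id'] == media_id)

    # Group subtitle variants per language and sort TTML before VTT, so the VTT
    # track ends up last - the position the patch's "Prefer VTT over TTML"
    # ordering aims for.
    priorities = {'ttml': 1, 'vtt': 2}
    subtitles = {}
    for sub in media_data['subtitleList']:
        subtitles.setdefault(sub['locale'], []).append(
            {'ext': sub['format'].lower(), 'url': sub['url']})
    for lang in subtitles:
        subtitles[lang].sort(key=lambda x: priorities[x['ext']])

    print(media_data['title'])
    print(subtitles['de'])  # TTML entry first, VTT entry last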