-
Notifications
You must be signed in to change notification settings - Fork 10k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[srgssr] Extract the correct video + add subtitles (closes #14717) #14725
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,13 +7,30 @@ | |
from ..compat import compat_urllib_parse_urlparse | ||
from ..utils import ( | ||
ExtractorError, | ||
float_or_none, | ||
int_or_none, | ||
mimetype2ext, | ||
parse_iso8601, | ||
qualities, | ||
) | ||
|
||
|
||
class SRGSSRIE(InfoExtractor): | ||
_VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)' | ||
_VALID_URL = r'''(?x) | ||
(?: | ||
https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn| | ||
srgssr | ||
): | ||
(?P<bu> | ||
srf|rts|rsi|rtr|swi | ||
):(?:[^:]+:)? | ||
(?P<type> | ||
video|audio | ||
): | ||
(?P<id> | ||
[0-9a-f\-]{36}|\d+ | ||
) | ||
''' | ||
_GEO_BYPASS = False | ||
_GEO_COUNTRIES = ['CH'] | ||
|
||
|
@@ -33,102 +50,167 @@ def _get_tokenized_src(self, url, video_id, format_id): | |
video_id, 'Downloading %s token' % format_id, fatal=False) or {} | ||
auth_params = token.get('token', {}).get('authparams') | ||
if auth_params: | ||
url += '?' + auth_params | ||
url += ('?' if '?' not in url else '&') + auth_params | ||
return url | ||
|
||
def get_media_data(self, bu, media_type, media_id): | ||
media_data = self._download_json( | ||
'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id), | ||
media_id)[media_type.capitalize()] | ||
|
||
if media_data.get('block') and media_data['block'] in self._ERRORS: | ||
message = self._ERRORS[media_data['block']] | ||
if media_data['block'] == 'GEOBLOCK': | ||
def _get_media_data(self, bu, media_type, media_id): | ||
query = {'onlyChapters': True} if media_type == 'video' else {} | ||
full_media_data = self._download_json( | ||
'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json' | ||
% (bu, media_type, media_id), | ||
media_id, query=query)['chapterList'] | ||
try: | ||
media_data = next( | ||
x for x in full_media_data if x['id'] == media_id) | ||
except StopIteration: | ||
raise ExtractorError('No media information found') | ||
|
||
if media_data.get('blockReason') and media_data['blockReason'] in self._ERRORS: | ||
message = self._ERRORS[media_data['blockReason']] | ||
if media_data['blockReason'] == 'GEOBLOCK': | ||
self.raise_geo_restricted( | ||
msg=message, countries=self._GEO_COUNTRIES) | ||
raise ExtractorError( | ||
'%s said: %s' % (self.IE_NAME, message), expected=True) | ||
|
||
return media_data | ||
|
||
def _real_extract(self, url): | ||
bu, media_type, media_id = re.match(self._VALID_URL, url).groups() | ||
def _get_subtitles(self, media_data, bu, media_type): | ||
subtitles = {} | ||
if media_type == 'audio': | ||
return subtitles | ||
|
||
subtitle_data = media_data.get('subtitleList', []) | ||
default_language_codes = { | ||
'srf': 'de', | ||
'rts': 'fr', | ||
'rsi': 'it', | ||
'rtr': 'rm', | ||
'swi': 'en', | ||
} | ||
known_formats = ('TTML', 'VTT') | ||
for sub in subtitle_data: | ||
form = sub['format'] | ||
if form not in known_formats: | ||
continue | ||
lang = sub.get('locale') or default_language_codes[bu] | ||
subtitles.setdefault(lang, []).append({ | ||
'ext': form.lower(), | ||
'url': sub['url'] | ||
}) | ||
# Prefer VTT subtitles over TTML: | ||
priorities = { | ||
'ttml': 1, | ||
'vtt': 2, | ||
} | ||
for lang in subtitles: | ||
subtitles[lang].sort(key=lambda x: priorities[x['ext']]) | ||
|
||
media_data = self.get_media_data(bu, media_type, media_id) | ||
return subtitles | ||
|
||
metadata = media_data['AssetMetadatas']['AssetMetadata'][0] | ||
title = metadata['title'] | ||
description = metadata.get('description') | ||
created_date = media_data.get('createdDate') or metadata.get('createdDate') | ||
def _real_extract(self, url): | ||
bu, media_type, media_id = re.match(self._VALID_URL, url).groups() | ||
media_data = self._get_media_data(bu, media_type, media_id) | ||
title = media_data['title'] | ||
description = media_data.get('description') | ||
thumbnail = media_data.get('imageUrl') | ||
created_date = media_data.get('date') | ||
timestamp = parse_iso8601(created_date) | ||
duration = float_or_none(media_data['duration'], scale=1000) | ||
|
||
thumbnails = [{ | ||
'id': image.get('id'), | ||
'url': image['url'], | ||
} for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])] | ||
subtitles = self.extract_subtitles(media_data, bu, media_type) | ||
|
||
preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) | ||
formats = [] | ||
for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []): | ||
protocol = source.get('@protocol') | ||
for asset in source['url']: | ||
asset_url = asset['text'] | ||
quality = asset['@quality'] | ||
format_id = '%s-%s' % (protocol, quality) | ||
if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'): | ||
asset_url = self._get_tokenized_src(asset_url, media_id, format_id) | ||
if protocol.startswith('HTTP-HDS'): | ||
formats.extend(self._extract_f4m_formats( | ||
asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0', | ||
media_id, f4m_id=format_id, fatal=False)) | ||
elif protocol.startswith('HTTP-HLS'): | ||
formats.extend(self._extract_m3u8_formats( | ||
asset_url, media_id, 'mp4', 'm3u8_native', | ||
m3u8_id=format_id, fatal=False)) | ||
preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD']) | ||
for source in media_data.get('resourceList', []): | ||
protocol = source.get('protocol') | ||
quality = source.get('quality') | ||
encoding = source.get('encoding') | ||
mime_type = source.get('mimeType') | ||
format_url = source.get('url') | ||
format_id = '%s-%s-%s' % (protocol, encoding, quality) | ||
|
||
if protocol in ('HDS', 'HLS'): | ||
asset_url = self._get_tokenized_src( | ||
format_url, media_id, format_id) | ||
if protocol == 'HDS': | ||
formats.extend(self._extract_akamai_formats( | ||
asset_url, media_id)) | ||
else: | ||
formats.extend(self._extract_m3u8_formats( | ||
asset_url, media_id, 'mp4', 'm3u8_native', | ||
m3u8_id=format_id, fatal=False | ||
)) | ||
elif protocol in ('HTTP', 'HTTPS', 'RTMP'): | ||
formats.append({ | ||
'format_id': format_id, | ||
'ext': mimetype2ext(mime_type) if mime_type else None, | ||
'url': format_url, | ||
'preference': preference(quality) | ||
}) | ||
podcast_keys = ('podcastSdUrl', 'podcastHdUrl') | ||
podcast_qualities = ('SD', 'HD') | ||
|
||
# This check is needed because, for audio media, the podcast URL is | ||
# usually included even if the item is only an audio segment and not | ||
# the whole episode. | ||
Comment on lines
+154
to
+156
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do you have an example for this case (whole podcast URL present for an audio segment)? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Have a look at this: https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb The loaded […] We are only interested in the object […] |
||
if int_or_none(media_data['position']) == 0: | ||
for key, quality in zip(podcast_keys, podcast_qualities): | ||
if media_data.get(key): | ||
formats.append({ | ||
'format_id': format_id, | ||
'url': asset_url, | ||
'format_id': 'PODCAST-%s' % quality, | ||
'url': media_data[key], | ||
'preference': preference(quality), | ||
'ext': 'flv' if protocol == 'RTMP' else None, | ||
}) | ||
self._sort_formats(formats) | ||
|
||
return { | ||
'id': media_id, | ||
'title': title, | ||
'description': description, | ||
'duration': duration, | ||
'timestamp': timestamp, | ||
'thumbnails': thumbnails, | ||
'thumbnail': thumbnail, | ||
'subtitles': subtitles, | ||
'formats': formats, | ||
} | ||
|
||
|
||
class SRGSSRPlayIE(InfoExtractor): | ||
IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites' | ||
_VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)' | ||
_VALID_URL = r'''(?x) | ||
https?:// | ||
(?: | ||
(?: | ||
www|play | ||
)\. | ||
)? | ||
(?P<bu> | ||
srf|rts|rsi|rtr|swissinfo | ||
)\.ch/play/ | ||
(?: | ||
tv|radio | ||
)/[^/]+/ | ||
(?P<type> | ||
video|audio | ||
)/[^?]+\?id= | ||
(?P<id> | ||
[0-9a-f\-]{36}|\d+ | ||
) | ||
''' | ||
|
||
_TESTS = [{ | ||
'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5', | ||
'md5': 'da6b5b3ac9fa4761a942331cef20fcb3', | ||
'md5': '9764693a295be9a24ce231440b200ba4', | ||
'info_dict': { | ||
'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5', | ||
'ext': 'mp4', | ||
'upload_date': '20130701', | ||
'title': 'Snowden beantragt Asyl in Russland', | ||
'timestamp': 1372713995, | ||
} | ||
}, { | ||
# No Speichern (Save) button | ||
'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa', | ||
'md5': '0a274ce38fda48c53c01890651985bc6', | ||
'info_dict': { | ||
'id': '677f5829-e473-4823-ac83-a1087fe97faa', | ||
'ext': 'flv', | ||
'upload_date': '20130710', | ||
'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive', | ||
'description': 'md5:88604432b60d5a38787f152dec89cd56', | ||
'timestamp': 1373493600, | ||
'description': None, | ||
'duration': 113.827, | ||
'upload_date': '20130701', | ||
'timestamp': 1372708215, | ||
'thumbnail': r're:^https?://.*1383719781\.png$', | ||
}, | ||
}, { | ||
'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc', | ||
|
@@ -137,35 +219,30 @@ class SRGSSRPlayIE(InfoExtractor): | |
'ext': 'mp3', | ||
'upload_date': '20151013', | ||
'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem', | ||
'timestamp': 1444750398, | ||
'duration': 336.839, | ||
'timestamp': 1444709160, | ||
}, | ||
'params': { | ||
# rtmp download | ||
'skip_download': True, | ||
}, | ||
}, { | ||
'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260', | ||
'md5': '67a2a9ae4e8e62a68d0e9820cc9782df', | ||
'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270', | ||
'info_dict': { | ||
'id': '6348260', | ||
'display_id': '6348260', | ||
'id': '42960270', | ||
'ext': 'mp4', | ||
'duration': 1796, | ||
'title': 'Le 19h30', | ||
'description': '', | ||
'uploader': '19h30', | ||
'upload_date': '20141201', | ||
'timestamp': 1417458600, | ||
'thumbnail': r're:^https?://.*\.image', | ||
'view_count': int, | ||
'title': 'Why people were against tax reforms', | ||
'description': 'md5:8c5c1b6a2a37c17670cf87f608ff4755', | ||
'duration': 94.0, | ||
'upload_date': '20170215', | ||
'timestamp': 1487173560, | ||
'thumbnail': 'https://www.swissinfo.ch/srgscalableimage/42961964', | ||
}, | ||
'params': { | ||
# m3u8 download | ||
'skip_download': True, | ||
} | ||
}, | ||
}] | ||
|
||
def _real_extract(self, url): | ||
bu, media_type, media_id = re.match(self._VALID_URL, url).groups() | ||
# other info can be extracted from url + '&layout=json' | ||
return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR') |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you have an example with TTML subtitles? All the videos that I've tested have only VTT subtitles.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think TTML subtitles do not exist anymore.
This video (https://www.srf.ch/play/tv/rundschau/video/schwander-rot-gruene-stadtpolitik-min-li-marti-tamilen-kirche?urn=urn:srf:video:2da578e3-dbb4-4657-a539-f01089a67831) used to have TTML subtitles.