Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[srgssr] Extract the correct video + add subtitles (closes #14717) #14725

Closed
wants to merge 1 commit into from
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
225 changes: 151 additions & 74 deletions youtube_dl/extractor/srgssr.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,30 @@
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
mimetype2ext,
parse_iso8601,
qualities,
)


class SRGSSRIE(InfoExtractor):
_VALID_URL = r'(?:https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|srgssr):(?P<bu>srf|rts|rsi|rtr|swi):(?:[^:]+:)?(?P<type>video|audio):(?P<id>[0-9a-f\-]{36}|\d+)'
_VALID_URL = r'''(?x)
(?:
https?://tp\.srgssr\.ch/p(?:/[^/]+)+\?urn=urn|
srgssr
):
(?P<bu>
srf|rts|rsi|rtr|swi
):(?:[^:]+:)?
(?P<type>
video|audio
):
(?P<id>
[0-9a-f\-]{36}|\d+
)
'''
_GEO_BYPASS = False
_GEO_COUNTRIES = ['CH']

Expand All @@ -33,102 +50,167 @@ def _get_tokenized_src(self, url, video_id, format_id):
video_id, 'Downloading %s token' % format_id, fatal=False) or {}
auth_params = token.get('token', {}).get('authparams')
if auth_params:
url += '?' + auth_params
url += ('?' if '?' not in url else '&') + auth_params
return url

def get_media_data(self, bu, media_type, media_id):
media_data = self._download_json(
'http://il.srgssr.ch/integrationlayer/1.0/ue/%s/%s/play/%s.json' % (bu, media_type, media_id),
media_id)[media_type.capitalize()]

if media_data.get('block') and media_data['block'] in self._ERRORS:
message = self._ERRORS[media_data['block']]
if media_data['block'] == 'GEOBLOCK':
def _get_media_data(self, bu, media_type, media_id):
query = {'onlyChapters': True} if media_type == 'video' else {}
full_media_data = self._download_json(
'https://il.srgssr.ch/integrationlayer/2.0/%s/mediaComposition/%s/%s.json'
% (bu, media_type, media_id),
media_id, query=query)['chapterList']
try:
media_data = next(
x for x in full_media_data if x['id'] == media_id)
except StopIteration:
raise ExtractorError('No media information found')

if media_data.get('blockReason') and media_data['blockReason'] in self._ERRORS:
message = self._ERRORS[media_data['blockReason']]
if media_data['blockReason'] == 'GEOBLOCK':
self.raise_geo_restricted(
msg=message, countries=self._GEO_COUNTRIES)
raise ExtractorError(
'%s said: %s' % (self.IE_NAME, message), expected=True)

return media_data

def _real_extract(self, url):
bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
def _get_subtitles(self, media_data, bu, media_type):
subtitles = {}
if media_type == 'audio':
return subtitles

subtitle_data = media_data.get('subtitleList', [])
default_language_codes = {
'srf': 'de',
'rts': 'fr',
'rsi': 'it',
'rtr': 'rm',
'swi': 'en',
}
known_formats = ('TTML', 'VTT')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example with TTML subtitles? All the videos that I've tested have only VTT subtitles.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

for sub in subtitle_data:
form = sub['format']
if form not in known_formats:
continue
lang = sub.get('locale') or default_language_codes[bu]
subtitles.setdefault(lang, []).append({
'ext': form.lower(),
'url': sub['url']
})
# Prefer VTT subtitles over TTML:
priorities = {
'ttml': 1,
'vtt': 2,
}
for lang in subtitles:
subtitles[lang].sort(key=lambda x: priorities[x['ext']])

media_data = self.get_media_data(bu, media_type, media_id)
return subtitles

metadata = media_data['AssetMetadatas']['AssetMetadata'][0]
title = metadata['title']
description = metadata.get('description')
created_date = media_data.get('createdDate') or metadata.get('createdDate')
def _real_extract(self, url):
bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
media_data = self._get_media_data(bu, media_type, media_id)
title = media_data['title']
description = media_data.get('description')
thumbnail = media_data.get('imageUrl')
created_date = media_data.get('date')
timestamp = parse_iso8601(created_date)
duration = float_or_none(media_data['duration'], scale=1000)

thumbnails = [{
'id': image.get('id'),
'url': image['url'],
} for image in media_data.get('Image', {}).get('ImageRepresentations', {}).get('ImageRepresentation', [])]
subtitles = self.extract_subtitles(media_data, bu, media_type)

preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
formats = []
for source in media_data.get('Playlists', {}).get('Playlist', []) + media_data.get('Downloads', {}).get('Download', []):
protocol = source.get('@protocol')
for asset in source['url']:
asset_url = asset['text']
quality = asset['@quality']
format_id = '%s-%s' % (protocol, quality)
if protocol.startswith('HTTP-HDS') or protocol.startswith('HTTP-HLS'):
asset_url = self._get_tokenized_src(asset_url, media_id, format_id)
if protocol.startswith('HTTP-HDS'):
formats.extend(self._extract_f4m_formats(
asset_url + ('?' if '?' not in asset_url else '&') + 'hdcore=3.4.0',
media_id, f4m_id=format_id, fatal=False))
elif protocol.startswith('HTTP-HLS'):
formats.extend(self._extract_m3u8_formats(
asset_url, media_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False))
preference = qualities(['LQ', 'MQ', 'SD', 'HQ', 'HD'])
for source in media_data.get('resourceList', []):
protocol = source.get('protocol')
quality = source.get('quality')
encoding = source.get('encoding')
mime_type = source.get('mimeType')
format_url = source.get('url')
format_id = '%s-%s-%s' % (protocol, encoding, quality)

if protocol in ('HDS', 'HLS'):
asset_url = self._get_tokenized_src(
format_url, media_id, format_id)
if protocol == 'HDS':
formats.extend(self._extract_akamai_formats(
asset_url, media_id))
else:
formats.extend(self._extract_m3u8_formats(
asset_url, media_id, 'mp4', 'm3u8_native',
m3u8_id=format_id, fatal=False
))
elif protocol in ('HTTP', 'HTTPS', 'RTMP'):
formats.append({
'format_id': format_id,
'ext': mimetype2ext(mime_type) if mime_type else None,
'url': format_url,
'preference': preference(quality)
})
podcast_keys = ('podcastSdUrl', 'podcastHdUrl')
podcast_qualities = ('SD', 'HD')

# This check is needed because, for audio media, the podcast URL is
# usually included even if the item is only an audio segment and not
# the whole episode.
Comment on lines +154 to +156
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you have an example for this case (whole podcast URL present for an audio segment)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have a look at this: https://www.srf.ch/play/radio/popupaudioplayer?id=50b20dc8-f05b-4972-bf03-e438ff2833eb

The loaded json file is this: https://il.srgssr.ch/integrationlayer/2.0/mediaComposition/byUrn/urn:srf:audio:50b20dc8-f05b-4972-bf03-e438ff2833eb.json

We are only interested in the object chapterList -> 5. However, this object contains a key podcastSdUrl which holds the URL https://podcasts.srf.ch/world/audio/Echo-der-Zeit_23-02-2021-1800.1614103565876.mp3?assetId=7602dd05-7f30-4c9f-88d4-2cd9c445efa8. This URL links to the podcast of the whole episode, not just the segment we would like to retrieve.

if int_or_none(media_data['position']) == 0:
for key, quality in zip(podcast_keys, podcast_qualities):
if media_data.get(key):
formats.append({
'format_id': format_id,
'url': asset_url,
'format_id': 'PODCAST-%s' % quality,
'url': media_data[key],
'preference': preference(quality),
'ext': 'flv' if protocol == 'RTMP' else None,
})
self._sort_formats(formats)

return {
'id': media_id,
'title': title,
'description': description,
'duration': duration,
'timestamp': timestamp,
'thumbnails': thumbnails,
'thumbnail': thumbnail,
'subtitles': subtitles,
'formats': formats,
}


class SRGSSRPlayIE(InfoExtractor):
IE_DESC = 'srf.ch, rts.ch, rsi.ch, rtr.ch and swissinfo.ch play sites'
_VALID_URL = r'https?://(?:(?:www|play)\.)?(?P<bu>srf|rts|rsi|rtr|swissinfo)\.ch/play/(?:tv|radio)/[^/]+/(?P<type>video|audio)/[^?]+\?id=(?P<id>[0-9a-f\-]{36}|\d+)'
_VALID_URL = r'''(?x)
https?://
(?:
(?:
www|play
)\.
)?
(?P<bu>
srf|rts|rsi|rtr|swissinfo
)\.ch/play/
(?:
tv|radio
)/[^/]+/
(?P<type>
video|audio
)/[^?]+\?id=
(?P<id>
[0-9a-f\-]{36}|\d+
)
'''

_TESTS = [{
'url': 'http://www.srf.ch/play/tv/10vor10/video/snowden-beantragt-asyl-in-russland?id=28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'md5': 'da6b5b3ac9fa4761a942331cef20fcb3',
'md5': '9764693a295be9a24ce231440b200ba4',
'info_dict': {
'id': '28e1a57d-5b76-4399-8ab3-9097f071e6c5',
'ext': 'mp4',
'upload_date': '20130701',
'title': 'Snowden beantragt Asyl in Russland',
'timestamp': 1372713995,
}
}, {
# No Speichern (Save) button
'url': 'http://www.srf.ch/play/tv/top-gear/video/jaguar-xk120-shadow-und-tornado-dampflokomotive?id=677f5829-e473-4823-ac83-a1087fe97faa',
'md5': '0a274ce38fda48c53c01890651985bc6',
'info_dict': {
'id': '677f5829-e473-4823-ac83-a1087fe97faa',
'ext': 'flv',
'upload_date': '20130710',
'title': 'Jaguar XK120, Shadow und Tornado-Dampflokomotive',
'description': 'md5:88604432b60d5a38787f152dec89cd56',
'timestamp': 1373493600,
'description': None,
'duration': 113.827,
'upload_date': '20130701',
'timestamp': 1372708215,
'thumbnail': r're:^https?://.*1383719781\.png$',
},
}, {
'url': 'http://www.rtr.ch/play/radio/actualitad/audio/saira-tujetsch-tuttina-cuntinuar-cun-sedrun-muster-turissem?id=63cb0778-27f8-49af-9284-8c7a8c6d15fc',
Expand All @@ -137,35 +219,30 @@ class SRGSSRPlayIE(InfoExtractor):
'ext': 'mp3',
'upload_date': '20151013',
'title': 'Saira: Tujetsch - tuttina cuntinuar cun Sedrun Mustér Turissem',
'timestamp': 1444750398,
'duration': 336.839,
'timestamp': 1444709160,
},
'params': {
# rtmp download
'skip_download': True,
},
}, {
'url': 'http://www.rts.ch/play/tv/-/video/le-19h30?id=6348260',
'md5': '67a2a9ae4e8e62a68d0e9820cc9782df',
'url': 'http://play.swissinfo.ch/play/tv/business/video/why-people-were-against-tax-reforms?id=42960270',
'info_dict': {
'id': '6348260',
'display_id': '6348260',
'id': '42960270',
'ext': 'mp4',
'duration': 1796,
'title': 'Le 19h30',
'description': '',
'uploader': '19h30',
'upload_date': '20141201',
'timestamp': 1417458600,
'thumbnail': r're:^https?://.*\.image',
'view_count': int,
'title': 'Why people were against tax reforms',
'description': 'md5:8c5c1b6a2a37c17670cf87f608ff4755',
'duration': 94.0,
'upload_date': '20170215',
'timestamp': 1487173560,
'thumbnail': 'https://www.swissinfo.ch/srgscalableimage/42961964',
},
'params': {
# m3u8 download
'skip_download': True,
}
},
}]

def _real_extract(self, url):
bu, media_type, media_id = re.match(self._VALID_URL, url).groups()
# other info can be extracted from url + '&layout=json'
return self.url_result('srgssr:%s:%s:%s' % (bu[:3], media_type, media_id), 'SRGSSR')