From c856983d54021d9485b966d1ecddba031a0e84b7 Mon Sep 17 00:00:00 2001 From: dirkf Date: Tue, 9 Aug 2022 19:37:58 +0100 Subject: [PATCH] [ZDF] Overhaul ZDF extractors * pull some yt-dlp changes into ZDFBaseIE._extract_format() * add test cases from yt-dlp to ZDFIE * fix crash in ZDFIE._extract_mobile() when object had no `formitaeten` * improve title extraction in ZDFChannelIE (remove trailing station ident) * avoid extracting non-video playlist items (fixes #31149) --- youtube_dl/extractor/zdf.py | 174 +++++++++++++++++++++--------------- 1 file changed, 100 insertions(+), 74 deletions(-) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 3d39bb33aec..56cd2955db5 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, ExtractorError, + extract_attributes, float_or_none, int_or_none, merge_dicts, @@ -15,6 +16,7 @@ orderedSet, parse_codecs, qualities, + str_or_none, try_get, unified_timestamp, update_url_query, @@ -57,28 +59,39 @@ def _extract_format(self, video_id, formats, format_urls, meta): format_urls.add(format_url) mime_type = meta.get('mimeType') ext = determine_ext(format_url) + + join_nonempty = lambda s, l: s.join(filter(None, l)) + meta_map = lambda t: map(lambda x: str_or_none(meta.get(x)), t) + if mime_type == 'application/x-mpegURL' or ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( + new_formats = self._extract_m3u8_formats( format_url, video_id, 'mp4', m3u8_id='hls', - entry_protocol='m3u8_native', fatal=False)) + entry_protocol='m3u8_native', fatal=False) elif mime_type == 'application/f4m+xml' or ext == 'f4m': - formats.extend(self._extract_f4m_formats( - update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False)) + new_formats = self._extract_f4m_formats( + update_url_query(format_url, {'hdcore': '3.7.0'}), video_id, f4m_id='hds', fatal=False) else: f = parse_codecs(meta.get('mimeCodec')) + if not f: + data = meta.get('type', '').split('_') + if try_get(data, lambda x: x[2]) == ext: + f = dict(zip(('vcodec', 'acodec'), data[1])) + format_id = ['http'] - for p in (meta.get('type'), meta.get('quality')): - if p and isinstance(p, compat_str): - format_id.append(p) + format_id.extend(join_nonempty('-', meta_map(('type', 'quality')))) f.update({ 'url': format_url, 'format_id': '-'.join(format_id), - 'format_note': meta.get('quality'), - 'language': meta.get('language'), - 'quality': qualities(self._QUALITIES)(meta.get('quality')), - 'preference': -10, + 'tbr': int_or_none(self._search_regex(r'_(\d+)k_', format_url, 'tbr', default=None)) }) - formats.append(f) + new_formats = [f] + + formats.extend(merge_dicts(f, { + 'format_note': join_nonempty(',', meta_map(('quality', 'class'))), + 'language': meta.get('language'), + 'language_preference': 10 if meta.get('class') == 'main' else -10 if meta.get('class') == 'ad' else -1, + 'quality': qualities(self._QUALITIES)(meta.get('quality')), + }) for f in new_formats) def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): ptmd = self._call_api( @@ -107,6 +120,7 @@ def _extract_ptmd(self, ptmd_url, video_id, api_token, referrer): 'type': f.get('type'), 'mimeType': f.get('mimeType'), 'quality': quality.get('quality'), + 'class': track.get('class'), 'language': track.get('language'), }) self._sort_formats(formats) @@ -171,6 +185,20 @@ class ZDFIE(ZDFBaseIE): 'duration': 2615, 'timestamp': 1465021200, 'upload_date': '20160604', + 'thumbnail': 'https://www.zdf.de/assets/mauve-im-labor-100~768x432?cb=1464909117806', + }, + }, { + 'url': 'https://www.zdf.de/funk/druck-11790/funk-alles-ist-verzaubert-102.html', + 'md5': '1b93bdec7d02fc0b703c5e7687461628', + 'info_dict': { + 'ext': 'mp4', + 'id': 'video_funk_1770473', + 'duration': 1278, + 'description': 'Die Neue an der Schule verdreht Ismail den Kopf.', + 'title': 'Alles ist verzaubert', + 'timestamp': 1635520560, + 'upload_date': '20211029', + 'thumbnail': 'https://www.zdf.de/assets/teaser-funk-alles-ist-verzaubert-100~1920x1080?cb=1636466431799', }, }, { # Same as https://www.phoenix.de/sendungen/dokumentationen/gesten-der-maechtigen-i-a-89468.html?ref=suche @@ -204,6 +232,19 @@ class ZDFIE(ZDFBaseIE): 'timestamp': 1641355200, 'upload_date': '20220105', }, + 'skip': 'No longer available "Diese Seite wurde leider nicht gefunden"' + }, { + 'url': 'https://www.zdf.de/serien/soko-stuttgart/das-geld-anderer-leute-100.html', + 'info_dict': { + 'id': '191205_1800_sendung_sok8', + 'ext': 'mp4', + 'title': 'Das Geld anderer Leute', + 'description': 'md5:cb6f660850dc5eb7d1ab776ea094959d', + 'duration': 2581.0, + 'timestamp': 1654790700, + 'upload_date': '20220609', + 'thumbnail': 'https://epg-image.zdf.de/fotobase-webdelivery/images/e2d7e55a-09f0-424e-ac73-6cac4dd65f35?layout=2400x1350', + }, }] def _extract_entry(self, url, player, content, video_id): @@ -265,15 +306,16 @@ def _extract_mobile(self, video_id): 'https://zdf-cdn.live.cellular.de/mediathekV2/document/%s' % video_id, video_id) - document = video['document'] - - title = document['titel'] - content_id = document['basename'] - formats = [] - format_urls = set() - for f in document['formitaeten']: - self._extract_format(content_id, formats, format_urls, f) + formitaeten = try_get(video, lambda x: x['document']['formitaeten'], list) + document = formitaeten and video['document'] + if formitaeten: + title = document['titel'] + content_id = document['basename'] + + format_urls = set() + for f in formitaeten or []: + self._extract_format(content_id, formats, format_urls, f) self._sort_formats(formats) thumbnails = [] @@ -320,9 +362,9 @@ class ZDFChannelIE(ZDFBaseIE): 'url': 'https://www.zdf.de/sport/das-aktuelle-sportstudio', 'info_dict': { 'id': 'das-aktuelle-sportstudio', - 'title': 'das aktuelle sportstudio | ZDF', + 'title': 'das aktuelle sportstudio', }, - 'playlist_mincount': 23, + 'playlist_mincount': 18, }, { 'url': 'https://www.zdf.de/dokumentation/planet-e', 'info_dict': { @@ -330,6 +372,14 @@ class ZDFChannelIE(ZDFBaseIE): 'title': 'planet e.', }, 'playlist_mincount': 50, + }, { + 'url': 'https://www.zdf.de/gesellschaft/aktenzeichen-xy-ungeloest', + 'info_dict': { + 'id': 'aktenzeichen-xy-ungeloest', + 'title': 'Aktenzeichen XY... ungelöst', + 'entries': "lambda x: not any('xy580-fall1-kindermoerder-gesucht-100' in e['url'] for e in x)", + }, + 'playlist_mincount': 2, }, { 'url': 'https://www.zdf.de/filme/taunuskrimi/', 'only_matching': True, @@ -339,60 +389,36 @@ class ZDFChannelIE(ZDFBaseIE): def suitable(cls, url): return False if ZDFIE.suitable(url) else super(ZDFChannelIE, cls).suitable(url) + def _og_search_title(self, webpage, fatal=False): + title = super(ZDFChannelIE, self)._og_search_title(webpage, fatal=fatal) + return re.split(r'\s+[-|]\s+ZDF(?:mediathek)?$', title or '')[0] or None + def _real_extract(self, url): channel_id = self._match_id(url) webpage = self._download_webpage(url, channel_id) - entries = [ - self.url_result(item_url, ie=ZDFIE.ie_key()) - for item_url in orderedSet(re.findall( - r'data-plusbar-url=["\'](http.+?\.html)', webpage))] - - return self.playlist_result( - entries, channel_id, self._og_search_title(webpage, fatal=False)) - - r""" - player = self._extract_player(webpage, channel_id) - - channel_id = self._search_regex( - r'docId\s*:\s*(["\'])(?P(?!\1).+?)\1', webpage, - 'channel id', group='id') - - channel = self._call_api( - 'https://api.zdf.de/content/documents/%s.json' % channel_id, - player, url, channel_id) - - items = [] - for module in channel['module']: - for teaser in try_get(module, lambda x: x['teaser'], list) or []: - t = try_get( - teaser, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - items.extend(try_get( - t, - lambda x: x['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - items.extend(try_get( - module, - lambda x: x['filterRef']['resultsWithVideo']['http://zdf.de/rels/search/results'], - list) or []) - - entries = [] - entry_urls = set() - for item in items: - t = try_get(item, lambda x: x['http://zdf.de/rels/target'], dict) - if not t: - continue - sharing_url = t.get('http://zdf.de/rels/sharing-url') - if not sharing_url or not isinstance(sharing_url, compat_str): - continue - if sharing_url in entry_urls: - continue - entry_urls.add(sharing_url) - entries.append(self.url_result( - sharing_url, ie=ZDFIE.ie_key(), video_id=t.get('id'))) - - return self.playlist_result(entries, channel_id, channel.get('title')) - """ + matches = re.finditer( + r''']*?\sdata-plusbar-id\s*=\s*(["'])(?P[\w-]+)\1[^>]*?\sdata-plusbar-url=\1(?P%s)\1''' % ZDFIE._VALID_URL, + webpage) + + if self._downloader.params.get('noplaylist', False): + entry = next( + (self.url_result(m.group('url'), ie=ZDFIE.ie_key()) for m in matches), + None) + self.to_screen('Downloading just the main video because of --no-playlist') + if entry: + return entry + else: + self.to_screen('Downloading playlist %s - add --no-playlist to download just the main video' % (channel_id, )) + + def check_video(m): + v_ref = self._search_regex( + r'''(]*?\shref\s*=[^>]+?\sdata-target-id\s*=\s*(["'])%s\2[^>]*>)''' % (m.group('p_id'), ), + webpage, 'check id', default='') + v_ref = extract_attributes(v_ref) + return v_ref.get('data-target-video-type') != 'novideo' + + return self.playlist_from_matches( + (m.group('url') for m in matches if check_video(m)), + channel_id, self._og_search_title(webpage, fatal=False))