From 2eb372919991fdecd573cc868efd80c33e764303 Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Fri, 23 Aug 2019 11:42:12 +0200 Subject: [PATCH 1/2] [ZDF/3sat/phoenix] Let ZDF extractor work with 3sat.de This patch also fixes the brokenness of 3sat. --- youtube_dl/extractor/dreisat.py | 193 ----------------------------- youtube_dl/extractor/extractors.py | 1 - youtube_dl/extractor/phoenix.py | 130 ++++++++++++++++++- youtube_dl/extractor/zdf.py | 14 ++- 4 files changed, 141 insertions(+), 197 deletions(-) delete mode 100644 youtube_dl/extractor/dreisat.py diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py deleted file mode 100644 index 848d387d109..00000000000 --- a/youtube_dl/extractor/dreisat.py +++ /dev/null @@ -1,193 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - float_or_none, - ExtractorError, -) - - -class DreiSatIE(InfoExtractor): - IE_NAME = '3sat' - _GEO_COUNTRIES = ['DE'] - _VALID_URL = r'https?://(?:www\.)?3sat\.de/mediathek/(?:(?:index|mediathek)\.php)?\?(?:(?:mode|display)=[^&]+&)*obj=(?P[0-9]+)' - _TESTS = [ - { - 'url': 'http://www.3sat.de/mediathek/index.php?mode=play&obj=45918', - 'md5': 'be37228896d30a88f315b638900a026e', - 'info_dict': { - 'id': '45918', - 'ext': 'mp4', - 'title': 'Waidmannsheil', - 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': 'SCHWEIZWEIT', - 'uploader_id': '100000210', - 'upload_date': '20140913' - }, - 'params': { - 'skip_download': True, # m3u8 downloads - } - }, - { - 'url': 'http://www.3sat.de/mediathek/mediathek.php?mode=play&obj=51066', - 'only_matching': True, - }, - ] - - def _parse_smil_formats(self, smil, smil_url, video_id, namespace=None, f4m_params=None, transform_rtmp_url=None): - param_groups = {} - for param_group in smil.findall(self._xpath_ns('./head/paramGroup', namespace)): - group_id = param_group.get(self._xpath_ns( - 'id', 'http://www.w3.org/XML/1998/namespace')) - params = {} - for param in param_group: - params[param.get('name')] = param.get('value') - param_groups[group_id] = params - - formats = [] - for video in smil.findall(self._xpath_ns('.//video', namespace)): - src = video.get('src') - if not src: - continue - bitrate = int_or_none(self._search_regex(r'_(\d+)k', src, 'bitrate', None)) or float_or_none(video.get('system-bitrate') or video.get('systemBitrate'), 1000) - group_id = video.get('paramGroup') - param_group = param_groups[group_id] - for proto in param_group['protocols'].split(','): - formats.append({ - 'url': '%s://%s' % (proto, param_group['host']), - 'app': param_group['app'], - 'play_path': src, - 'ext': 'flv', - 'format_id': '%s-%d' % (proto, bitrate), - 'tbr': bitrate, - }) - self._sort_formats(formats) - return formats - - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - details_url = 'http://www.3sat.de/mediathek/xmlservice/web/beitragsDetails?id=%s' % video_id - return self.extract_from_xml_url(video_id, details_url) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index dc6a067717e..6b384c5b4b8 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -293,7 +293,6 @@ DiscoveryPlusIE, HGTVDeIE, ) -from .dreisat import DreiSatIE from .drbonanza import DRBonanzaIE from .drtuber import DrTuberIE from .drtv import ( diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index e435c28e171..374aae51910 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,9 +1,18 @@ from __future__ import unicode_literals -from .dreisat import DreiSatIE +import re +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_strdate, + xpath_text, + determine_ext, + ExtractorError, +) -class PhoenixIE(DreiSatIE): + +class PhoenixIE(InfoExtractor): IE_NAME = 'phoenix.de' _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ (?: @@ -33,6 +42,123 @@ class PhoenixIE(DreiSatIE): }, ] + def extract_from_xml_url(self, video_id, xml_url): + doc = self._download_xml( + xml_url, video_id, + note='Downloading video info', + errnote='Failed to download video info') + + status_code = xpath_text(doc, './status/statuscode') + if status_code and status_code != 'ok': + if status_code == 'notVisibleAnymore': + message = 'Video %s is not available' % video_id + else: + message = '%s returned error: %s' % (self.IE_NAME, status_code) + raise ExtractorError(message, expected=True) + + title = xpath_text(doc, './/information/title', 'title', True) + + urls = [] + formats = [] + for fnode in doc.findall('.//formitaeten/formitaet'): + video_url = xpath_text(fnode, 'url') + if not video_url or video_url in urls: + continue + urls.append(video_url) + + is_available = 'http://www.metafilegenerator' not in video_url + geoloced = 'static_geoloced_online' in video_url + if not is_available or geoloced: + continue + + format_id = fnode.attrib['basetype'] + format_m = re.match(r'''(?x) + (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ + (?P[^_]+)_(?P[^_]+)_(?P[^_]+) + ''', format_id) + + ext = determine_ext(video_url, None) or format_m.group('container') + + if ext == 'meta': + continue + elif ext == 'smil': + formats.extend(self._extract_smil_formats( + video_url, video_id, fatal=False)) + elif ext == 'm3u8': + # the certificates are misconfigured (see + # https://github.com/ytdl-org/youtube-dl/issues/8665) + if video_url.startswith('https://'): + continue + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + video_url, video_id, f4m_id=format_id, fatal=False)) + else: + quality = xpath_text(fnode, './quality') + if quality: + format_id += '-' + quality + + abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) + vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) + + tbr = int_or_none(self._search_regex( + r'_(\d+)k', video_url, 'bitrate', None)) + if tbr and vbr and not abr: + abr = tbr - vbr + + formats.append({ + 'format_id': format_id, + 'url': video_url, + 'ext': ext, + 'acodec': format_m.group('acodec'), + 'vcodec': format_m.group('vcodec'), + 'abr': abr, + 'vbr': vbr, + 'tbr': tbr, + 'width': int_or_none(xpath_text(fnode, './width')), + 'height': int_or_none(xpath_text(fnode, './height')), + 'filesize': int_or_none(xpath_text(fnode, './filesize')), + 'protocol': format_m.group('proto').lower(), + }) + + geolocation = xpath_text(doc, './/details/geolocation') + if not formats and geolocation and geolocation != 'none': + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + self._sort_formats(formats) + + thumbnails = [] + for node in doc.findall('.//teaserimages/teaserimage'): + thumbnail_url = node.text + if not thumbnail_url: + continue + thumbnail = { + 'url': thumbnail_url, + } + thumbnail_key = node.get('key') + if thumbnail_key: + m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) + if m: + thumbnail['width'] = int(m.group(1)) + thumbnail['height'] = int(m.group(2)) + thumbnails.append(thumbnail) + + upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) + + return { + 'id': video_id, + 'title': title, + 'description': xpath_text(doc, './/information/detail'), + 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), + 'thumbnails': thumbnails, + 'uploader': xpath_text(doc, './/details/originChannelTitle'), + 'uploader_id': xpath_text(doc, './/details/originChannelId'), + 'upload_date': upload_date, + 'formats': formats, + } + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) diff --git a/youtube_dl/extractor/zdf.py b/youtube_dl/extractor/zdf.py index 5ed2946c2b7..d9b393e6e76 100644 --- a/youtube_dl/extractor/zdf.py +++ b/youtube_dl/extractor/zdf.py @@ -39,11 +39,23 @@ def _extract_player(self, webpage, video_id, fatal=True): class ZDFIE(ZDFBaseIE): - _VALID_URL = r'https?://www\.zdf\.de/(?:[^/]+/)*(?P[^/?]+)\.html' + IE_NAME = "ZDF-3sat" + _VALID_URL = r'https?://www\.(zdf|3sat)\.de/(?:[^/]+/)*(?P[^/?]+)\.html' _QUALITIES = ('auto', 'low', 'med', 'high', 'veryhigh', 'hd') _GEO_COUNTRIES = ['DE'] _TESTS = [{ + 'url': 'https://www.3sat.de/wissen/wissenschaftsdoku/luxusgut-lebensraum-100.html', + 'info_dict': { + 'id': 'luxusgut-lebensraum-100', + 'ext': 'mp4', + 'title': 'Luxusgut Lebensraum', + 'description': 'md5:5c09b2f45ac3bc5233d1b50fc543d061', + 'duration': 2601, + 'timestamp': 1566497700, + 'upload_date': '20190822', + } + }, { 'url': 'https://www.zdf.de/dokumentation/terra-x/die-magie-der-farben-von-koenigspurpur-und-jeansblau-100.html', 'info_dict': { 'id': 'die-magie-der-farben-von-koenigspurpur-und-jeansblau-100', From deea93587c70b193ff924caf96ef67a4a9ea4d9b Mon Sep 17 00:00:00 2001 From: FliegendeWurst <2012gdwu@web.de> Date: Mon, 16 Sep 2019 22:53:49 +0200 Subject: [PATCH 2/2] [phoenix] Initial extractor for phoenix.de --- youtube_dl/extractor/phoenix.py | 182 ++++++-------------------------- 1 file changed, 31 insertions(+), 151 deletions(-) diff --git a/youtube_dl/extractor/phoenix.py b/youtube_dl/extractor/phoenix.py index 374aae51910..795276075ec 100644 --- a/youtube_dl/extractor/phoenix.py +++ b/youtube_dl/extractor/phoenix.py @@ -1,171 +1,51 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - int_or_none, - unified_strdate, - xpath_text, - determine_ext, - ExtractorError, -) +from ..utils import ExtractorError class PhoenixIE(InfoExtractor): IE_NAME = 'phoenix.de' - _VALID_URL = r'''(?x)https?://(?:www\.)?phoenix\.de/content/ - (?: - phoenix/die_sendungen/(?:[^/]+/)? - )? - (?P[0-9]+)''' + _VALID_URL = r'''https?://(?:www\.)?phoenix.de/\D+(?P\d+)\.html''' _TESTS = [ { - 'url': 'http://www.phoenix.de/content/884301', - 'md5': 'ed249f045256150c92e72dbb70eadec6', + 'url': 'https://www.phoenix.de/sendungen/dokumentationen/unsere-welt-in-zukunft---stadt-a-1283620.html', + 'md5': '342585dc397a3322e714741259696166', 'info_dict': { - 'id': '884301', + 'id': '0OB4HFc43Ns', 'ext': 'mp4', - 'title': 'Michael Krons mit Hans-Werner Sinn', - 'description': 'Im Dialog - Sa. 25.10.14, 00.00 - 00.35 Uhr', - 'upload_date': '20141025', - 'uploader': 'Im Dialog', + 'title': 'Unsere Welt in Zukunft - Stadt', + 'description': 'md5:ae05b0c55ca0d45ca40494a1c9d877dc', + 'upload_date': '20190912', + 'uploader': 'phoenix', + 'uploader_id': 'phoenix', } }, { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/869815', - 'only_matching': True, - }, - { - 'url': 'http://www.phoenix.de/content/phoenix/die_sendungen/diskussionen/928234', + 'url': 'https://www.phoenix.de/drohnenangriffe-in-saudi-arabien-a-1286995.html?ref=aktuelles', 'only_matching': True, }, + # an older page: https://www.phoenix.de/sendungen/gespraeche/phoenix-persoenlich/im-dialog-a-177727.html + # seems to not have an embedded video, even though it's uploaded on youtube: https://www.youtube.com/watch?v=4GxnoUHvOkM ] - def extract_from_xml_url(self, video_id, xml_url): - doc = self._download_xml( - xml_url, video_id, - note='Downloading video info', - errnote='Failed to download video info') - - status_code = xpath_text(doc, './status/statuscode') - if status_code and status_code != 'ok': - if status_code == 'notVisibleAnymore': - message = 'Video %s is not available' % video_id - else: - message = '%s returned error: %s' % (self.IE_NAME, status_code) - raise ExtractorError(message, expected=True) - - title = xpath_text(doc, './/information/title', 'title', True) - - urls = [] - formats = [] - for fnode in doc.findall('.//formitaeten/formitaet'): - video_url = xpath_text(fnode, 'url') - if not video_url or video_url in urls: - continue - urls.append(video_url) - - is_available = 'http://www.metafilegenerator' not in video_url - geoloced = 'static_geoloced_online' in video_url - if not is_available or geoloced: - continue - - format_id = fnode.attrib['basetype'] - format_m = re.match(r'''(?x) - (?P[^_]+)_(?P[^_]+)_(?P[^_]+)_ - (?P[^_]+)_(?P[^_]+)_(?P[^_]+) - ''', format_id) - - ext = determine_ext(video_url, None) or format_m.group('container') - - if ext == 'meta': - continue - elif ext == 'smil': - formats.extend(self._extract_smil_formats( - video_url, video_id, fatal=False)) - elif ext == 'm3u8': - # the certificates are misconfigured (see - # https://github.com/ytdl-org/youtube-dl/issues/8665) - if video_url.startswith('https://'): - continue - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', 'm3u8_native', - m3u8_id=format_id, fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id, fatal=False)) - else: - quality = xpath_text(fnode, './quality') - if quality: - format_id += '-' + quality - - abr = int_or_none(xpath_text(fnode, './audioBitrate'), 1000) - vbr = int_or_none(xpath_text(fnode, './videoBitrate'), 1000) - - tbr = int_or_none(self._search_regex( - r'_(\d+)k', video_url, 'bitrate', None)) - if tbr and vbr and not abr: - abr = tbr - vbr - - formats.append({ - 'format_id': format_id, - 'url': video_url, - 'ext': ext, - 'acodec': format_m.group('acodec'), - 'vcodec': format_m.group('vcodec'), - 'abr': abr, - 'vbr': vbr, - 'tbr': tbr, - 'width': int_or_none(xpath_text(fnode, './width')), - 'height': int_or_none(xpath_text(fnode, './height')), - 'filesize': int_or_none(xpath_text(fnode, './filesize')), - 'protocol': format_m.group('proto').lower(), - }) - - geolocation = xpath_text(doc, './/details/geolocation') - if not formats and geolocation and geolocation != 'none': - self.raise_geo_restricted(countries=self._GEO_COUNTRIES) - - self._sort_formats(formats) - - thumbnails = [] - for node in doc.findall('.//teaserimages/teaserimage'): - thumbnail_url = node.text - if not thumbnail_url: - continue - thumbnail = { - 'url': thumbnail_url, - } - thumbnail_key = node.get('key') - if thumbnail_key: - m = re.match('^([0-9]+)x([0-9]+)$', thumbnail_key) - if m: - thumbnail['width'] = int(m.group(1)) - thumbnail['height'] = int(m.group(2)) - thumbnails.append(thumbnail) - - upload_date = unified_strdate(xpath_text(doc, './/details/airtime')) - - return { - 'id': video_id, - 'title': title, - 'description': xpath_text(doc, './/information/detail'), - 'duration': int_or_none(xpath_text(doc, './/details/lengthSec')), - 'thumbnails': thumbnails, - 'uploader': xpath_text(doc, './/details/originChannelTitle'), - 'uploader_id': xpath_text(doc, './/details/originChannelId'), - 'upload_date': upload_date, - 'formats': formats, - } + def extract_from_json_api(self, video_id, api_url): + doc = self._download_json( + api_url, video_id, + note='Downloading webpage metadata', + errnote='Failed to load webpage metadata') + + for a in doc['absaetze']: + if a["typ"] == 'video-youtube': + return { + '_type': 'url_transparent', + 'id': a['id'], + 'url': 'https://www.youtube.com/watch?v=%s' % a['id'], + 'ie_key': 'Youtube', + } + raise ExtractorError("No downloadable video found", expected=True) def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - internal_id = self._search_regex( - r'