From a6f75e6e89d2f2deba783cb92369747c1b44797d Mon Sep 17 00:00:00 2001
From: Remita Amine
Date: Wed, 30 Dec 2020 16:45:53 +0100
Subject: [PATCH] [yandexdisk] extract info from webpage

the public API does not return metadata when the download limit is reached
---
 youtube_dl/extractor/yandexdisk.py | 89 ++++++++++++++++--------------
 1 file changed, 47 insertions(+), 42 deletions(-)

diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py
index 21f37c1927e..6fcd8ee7e9b 100644
--- a/youtube_dl/extractor/yandexdisk.py
+++ b/youtube_dl/extractor/yandexdisk.py
@@ -2,24 +2,23 @@
 from __future__ import unicode_literals
 
 import json
+import re
 
 from .common import InfoExtractor
-from ..compat import compat_HTTPError
 from ..utils import (
     determine_ext,
-    ExtractorError,
     float_or_none,
     int_or_none,
     mimetype2ext,
-    parse_iso8601,
+    try_get,
     urljoin,
 )
 
 
 class YandexDiskIE(InfoExtractor):
     _VALID_URL = r'''(?x)https?://
-        (?:
-            (?:www\.)?yadi\.sk|
+        (?P<domain>
+            yadi\.sk|
             disk\.yandex\.
                 (?:
                     az|
@@ -38,7 +37,7 @@ class YandexDiskIE(InfoExtractor):
 
     _TESTS = [{
         'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
-        'md5': '33955d7ae052f15853dc41f35f17581c',
+        'md5': 'a4a8d52958c8fddcf9845935070402ae',
         'info_dict': {
             'id': 'VdOeDou8eZs6Y',
             'ext': 'mp4',
@@ -46,10 +45,9 @@ class YandexDiskIE(InfoExtractor):
             'duration': 168.6,
             'uploader': 'y.botova',
             'uploader_id': '300043621',
-            'timestamp': 1421396809,
-            'upload_date': '20150116',
             'view_count': int,
         },
+        'expected_warnings': ['Unable to download JSON metadata'],
     }, {
         'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
         'only_matching': True,
@@ -59,51 +57,58 @@ class YandexDiskIE(InfoExtractor):
     }]
 
     def _real_extract(self, url):
-        video_id = self._match_id(url)
+        domain, video_id = re.match(self._VALID_URL, url).groups()
 
-        try:
-            resource = self._download_json(
-                'https://cloud-api.yandex.net/v1/disk/public/resources',
-                video_id, query={'public_key': url})
-        except ExtractorError as e:
-            if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
-                error_description = self._parse_json(
-                    e.cause.read().decode(), video_id)['description']
-                raise ExtractorError(error_description, expected=True)
-            raise
+        webpage = self._download_webpage(url, video_id)
+        store = self._parse_json(self._search_regex(
+            r'<script[^>]+id="store-prefetch"[^>]*>\s*({.+?})\s*</script>',
+            webpage, 'store'), video_id)
+        resource = store['resources'][store['rootResourceId']]
 
         title = resource['name']
-        public_url = resource.get('public_url')
+        meta = resource.get('meta') or {}
+
+        public_url = meta.get('short_url')
         if public_url:
             video_id = self._match_id(public_url)
 
-        self._set_cookie('yadi.sk', 'yandexuid', '0')
+        source_url = (self._download_json(
+            'https://cloud-api.yandex.net/v1/disk/public/resources/download',
+            video_id, query={'public_key': url}, fatal=False) or {}).get('href')
+        video_streams = resource.get('videoStreams') or {}
+        video_hash = resource.get('hash') or url
+        environment = store.get('environment') or {}
+        sk = environment.get('sk')
+        yandexuid = environment.get('yandexuid')
+        if sk and yandexuid and not (source_url and video_streams):
+            self._set_cookie(domain, 'yandexuid', yandexuid)
 
-        def call_api(action):
-            return (self._download_json(
-                urljoin(url, '/public/api/') + action, video_id, data=json.dumps({
-                    'hash': url,
-                    # obtain sk if needed from call_api('check-auth') while
-                    # the yandexuid cookie is set and sending an empty JSON object
-                    'sk': 'ya6b52f8c6b12abe91a66d22d3a31084b'
-                }).encode(), headers={
-                    'Content-Type': 'text/plain',
-                }, fatal=False) or {}).get('data') or {}
+            def call_api(action):
+                return (self._download_json(
+                    urljoin(url, '/public/api/') + action, video_id, data=json.dumps({
+                        'hash': video_hash,
+                        'sk': sk,
+                    }).encode(), headers={
+                        'Content-Type': 'text/plain',
+                    }, fatal=False) or {}).get('data') or {}
+            if not source_url:
+                # TODO: figure out how to detect if download limit has
+                # been reached and then avoid unnecessary source format
+                # extraction requests
+                source_url = call_api('download-url').get('url')
+            if not video_streams:
+                video_streams = call_api('get-video-streams')
 
         formats = []
-        source_url = resource.get('file')
-        if not source_url:
-            source_url = call_api('download-url').get('url')
         if source_url:
             formats.append({
                 'url': source_url,
                 'format_id': 'source',
-                'ext': determine_ext(title, mimetype2ext(resource.get('mime_type')) or 'mp4'),
+                'ext': determine_ext(title, meta.get('ext') or mimetype2ext(meta.get('mime_type')) or 'mp4'),
                 'quality': 1,
-                'filesize': int_or_none(resource.get('size'))
+                'filesize': int_or_none(meta.get('size'))
             })
 
-        video_streams = call_api('get-video-streams')
         for video in (video_streams.get('videos') or []):
             format_url = video.get('url')
             if not format_url:
@@ -128,15 +133,15 @@ def call_api(action):
             })
         self._sort_formats(formats)
 
-        owner = resource.get('owner') or {}
+        uid = resource.get('uid')
+        display_name = try_get(store, lambda x: x['users'][uid]['displayName'])
 
         return {
             'id': video_id,
             'title': title,
             'duration': float_or_none(video_streams.get('duration'), 1000),
-            'uploader': owner.get('display_name'),
-            'uploader_id': owner.get('uid'),
-            'view_count': int_or_none(resource.get('views_count')),
-            'timestamp': parse_iso8601(resource.get('created')),
+            'uploader': display_name,
+            'uploader_id': uid,
+            'view_count': int_or_none(meta.get('views_counter')),
             'formats': formats,
         }
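
Note for reviewers (not part of the patch): a minimal standalone sketch of the code path this change moves to, i.e. reading the JSON state embedded in the public page's store-prefetch script instead of relying on the cloud API resources endpoint. The fetch_store helper, the use of urllib, and the hard-coded sample URL are illustrative assumptions; the extractor itself goes through _download_webpage/_search_regex/_parse_json as in the hunk above.

import json
import re
from urllib.request import urlopen

def fetch_store(public_url):
    # Download the public page and pull the embedded JSON state blob out of
    # the <script id="store-prefetch"> tag, mirroring the regex used above.
    html = urlopen(public_url).read().decode('utf-8')
    m = re.search(
        r'<script[^>]+id="store-prefetch"[^>]*>\s*({.+?})\s*</script>',
        html, re.S)
    if not m:
        raise ValueError('store-prefetch script not found')
    return json.loads(m.group(1))

store = fetch_store('https://yadi.sk/i/VdOeDou8eZs6Y')  # sample URL taken from the test above
resource = store['resources'][store['rootResourceId']]
meta = resource.get('meta') or {}
print(resource['name'], meta.get('size'), meta.get('views_counter'))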