From 2e9c31369e0cb58b4073ab6e6a106ba46789ef9e Mon Sep 17 00:00:00 2001 From: "Jeremie J. Jarosh" Date: Sun, 25 Aug 2019 14:22:51 -0500 Subject: [PATCH 1/4] [utils] Introduce clean_html_markdown --- test/test_utils.py | 28 ++++++++++++++++++++++++ youtube_dl/utils.py | 53 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/test/test_utils.py b/test/test_utils.py index 659c6ece53d..b4974d9f4cc 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -20,6 +20,7 @@ args_to_str, encode_base_n, clean_html, + clean_html_markdown, date_from_str, DateRange, detect_exe_version, @@ -1025,6 +1026,33 @@ def test_clean_html(self): self.assertEqual(clean_html('a:\n "b"'), 'a: "b"') self.assertEqual(clean_html('a
\xa0b'), 'a\nb') + def test_clean_html_markdown(self): + self.assertEqual(clean_html_markdown( + '

Happy Text

\n' + '

When you do it your way you can go anywhere you choose. And just raise cain. I thought today we would make a happy little stream that\'s just running through the woods here. I was blessed with a very steady hand; and it comes in very handy when you\'re doing these little delicate things. You have to allow the paint to break to make it beautiful. Let\'s do it again then, what the heck.

\n' + '

This is your creation - and it\'s just as unique and special as you are.

\n' + '

Paint anything you want on the canvas. Create your own world. By now you should be quite happy about what\'s happening here. You can\'t have light without dark. You can\'t know happiness unless you\'ve known sorrow. Let\'s get crazy.

\n' + '\n' + '

I like to beat the brush. There we go.
\n' + 'We don\'t need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out.

\n' + '
def clean_html_markdown(html):
    """Clean an HTML snippet into readable markdown

    Paragraphs, line breaks, headings (h1-h6), list items and emphasis tags
    are rewritten to their Markdown counterparts; every remaining tag is
    dropped, HTML entities are decoded and runs of blank lines are collapsed
    into a single blank line.

    Tag names are matched case-insensitively, so ``<BR>`` or ``<H2>`` are
    converted as well instead of being silently removed by the generic tag
    strip at the end.

    Returns None when html is None, otherwise the Markdown text stripped of
    leading and trailing whitespace.
    """

    if html is None:  # Convenience for sanitizing descriptions etc.
        return html

    # Newlines in the HTML source carry no meaning; only tags introduce
    # line structure from here on
    html = html.replace('\n', ' ')

    # Paragraphs and Line Breaks
    # NOTE(review): a Markdown hard line break needs two trailing spaces;
    # confirm the literal below was not whitespace-collapsed in transit.
    html = re.sub(r'(?iu)<\s*br\s*/?\s*>\s*', ' \n', html)
    html = re.sub(r'(?iu)<\s*/?\s*p\b[^>]*>', '\n\n', html)

    # Headings: <hN> becomes N hash marks, </hN> ends the block
    html = re.sub(
        r'(?iu)<\s*h([1-6])\b[^>]*>',
        lambda mobj: '\n\n%s ' % ('#' * int(mobj.group(1))),
        html)
    html = re.sub(r'(?iu)<\s*/\s*h[1-6]\b[^>]*>', '\n\n', html)

    # Lists (ordered lists are rendered as bullet lists as well)
    html = re.sub(r'(?iu)<\s*/?\s*(ul|ol)\b[^>]*>', '\n\n', html)
    html = re.sub(r'(?iu)<\s*li\b[^>]*>', '\n- ', html)

    # Emphasis: opening and closing tags map to the same marker
    html = re.sub(r'(?iu)<\s*/?\s*(i|em)\b[^>]*>', '*', html)
    html = re.sub(r'(?iu)<\s*/?\s*(b|strong)\b[^>]*>', '**', html)

    # Strip the remaining html tags
    html = re.sub('<.*?>', '', html)
    # Replace html entities
    html = unescapeHTML(html)

    # Remove duplicate blank lines. Only leading whitespace is trimmed so
    # that the trailing space emitted for <br> survives.
    kept_lines = []
    previous_blank = False
    for line in html.splitlines():
        line = line.lstrip()
        if not line:
            if not previous_blank:
                kept_lines.append('')
            previous_blank = True
            continue
        previous_blank = False
        kept_lines.append(line)

    return '\n'.join(kept_lines).strip()
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}, skip_protocols=[]):
    """Extract HDS and/or HLS formats from an Akamai manifest URL.

    manifest_url may point at either flavour (``/z/.../manifest.f4m`` or
    ``/i/.../master.m3u8``); the URL of the other flavour is derived from it.

    hosts may map 'hds' and/or 'hls' to a host name that replaces the host
    of the respective manifest URL.
    skip_protocols may contain 'f4m' and/or 'm3u8' to suppress that
    flavour entirely.

    Neither default argument is mutated, so sharing them between calls is
    harmless. Both extractions are non-fatal; the combined (possibly empty)
    list of formats is returned.
    """

    def replace_host(url, host):
        # A callable replacement is used on purpose: concatenating the host
        # into a r'\1...' template breaks for hosts starting with a digit
        # (parsed as part of the group number or as an octal escape) and
        # for hosts containing a backslash.
        return re.sub(
            r'(https?://)[^/]+',
            lambda mobj: mobj.group(1) + host,
            url)

    formats = []

    if 'f4m' not in skip_protocols:
        hdcore_sign = 'hdcore=3.7.0'
        f4m_url = re.sub(
            r'(https?://[^/]+)/i/', r'\1/z/',
            manifest_url).replace('/master.m3u8', '/manifest.f4m')
        hds_host = hosts.get('hds')
        if hds_host:
            f4m_url = replace_host(f4m_url, hds_host)
        if 'hdcore=' not in f4m_url:
            f4m_url += ('&' if '?' in f4m_url else '?') + hdcore_sign
        f4m_formats = self._extract_f4m_formats(
            f4m_url, video_id, f4m_id='hds', fatal=False)
        # The same hdcore parameter is attached to every HDS format
        for entry in f4m_formats:
            entry.update({'extra_param_to_segment_url': hdcore_sign})
        formats.extend(f4m_formats)

    if 'm3u8' not in skip_protocols:
        m3u8_url = re.sub(
            r'(https?://[^/]+)/z/', r'\1/i/',
            manifest_url).replace('/manifest.f4m', '/master.m3u8')
        hls_host = hosts.get('hls')
        if hls_host:
            m3u8_url = replace_host(m3u8_url, hls_host)
        formats.extend(self._extract_m3u8_formats(
            m3u8_url, video_id, 'mp4', 'm3u8_native',
            m3u8_id='hls', fatal=False))

    return formats
Jarosh" Date: Sat, 24 Aug 2019 14:59:02 -0500 Subject: [PATCH 4/4] [audible] Add new extractor --- youtube_dl/extractor/audible.py | 317 +++++++++++++++++++++++++++++ youtube_dl/extractor/extractors.py | 4 + 2 files changed, 321 insertions(+) create mode 100644 youtube_dl/extractor/audible.py diff --git a/youtube_dl/extractor/audible.py b/youtube_dl/extractor/audible.py new file mode 100644 index 00000000000..afb168a9bb0 --- /dev/null +++ b/youtube_dl/extractor/audible.py @@ -0,0 +1,317 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import ( + clean_html, + clean_html_markdown, + ExtractorError, + extract_attributes, + get_element_by_class, + get_element_by_id, + get_elements_by_class, + unified_strdate, + urlencode_postdata, +) + + +class AudibleBaseIE(InfoExtractor): + _BASE_URL = 'https://www.audible.com' + + def _is_logged_in(self, html=None): + if not html: + html = self._download_webpage( + self._BASE_URL, None, + 'Checking login status') + + logged_in_elm = get_element_by_class('ui-it-credit-balance', html) + + if logged_in_elm is None: + self.report_warning( + 'You don\'t appear to be logged in. You will not be able to ' + 'download full audiobooks without being logged in. It is ' + 'currently not possible to automate the login process for ' + 'Audible. 
You must login via a browser, then export your ' + 'cookies and pass the cookie file to youtube-dl with ' + '--cookies.') + return False + + else: + return True + + +class AudibleIE(AudibleBaseIE): + IE_NAME = 'audible' + _VALID_URL = r'https?://(?:.+?\.)?audible\.com/pd/(?:.+)/(?P[^/?#&]+)' + _TESTS = [{ + 'url': 'https://www.audible.com/pd/Neil-Gaimans-How-the-Marquis-Got-His-Coat-Back-Audiobook/B01LZB4R8W', + 'md5': '7bcfd4aab323cee607d8425c9aba275b', + 'info_dict': { + 'id': 'B01LZB4R8W', + 'ext': 'mp3', + 'title': 'Neil Gaiman\'s How the Marquis Got His Coat Back', + 'description': 'md5:851082468b157f20c82caf10051c5a24', + 'thumbnail': 're:^https?://.*\.jpg$', + 'creator': 'Neil Gaiman', + 'album_artist': 'Neil Gaiman', + 'artist': 'Paterson Joseph, Bernard Cribbins, Samantha Beart, Adrian Lester, Mitch Benn, Don Warrington', + }, + 'expected_warnings': ['You don\'t appear to be logged in.'] + }, { + 'url': 'https://www.audible.com/pd/Merrick-Audiobook/B002UUKMKQ', + 'md5': '3bcbc2ed79201332db8d72b4c95a0269', + 'info_dict': { + 'id': 'B002UUKMKQ', + 'ext': 'mp3', + 'title': 'Merrick', + 'description': 'md5:82c8d4687e361ebb70162039288dcba2', + 'thumbnail': 're:^https?://.*\.jpg$', + 'creator': 'Anne Rice', + 'album_artist': 'Anne Rice', + 'artist': 'Graeme Malcolm', + 'series': 'The Vampire Chronicles', + 'album': 'The Vampire Chronicles', + 'episode_number': 7, + 'track_number': 7, + 'episode_id': 'Book 7', + }, + 'expected_warnings': ['You don\'t appear to be logged in.'] + }] + + @staticmethod + def _get_label_text(class_name, html, prefix=None): + label_text = None + + label_html = get_element_by_class(class_name, html) + if label_html: + label_text = re.sub(r'\s+', ' ', clean_html(label_html)) + if prefix and label_text.startswith(prefix): + label_text = label_text[len(prefix):].strip() + + return label_text + + def _real_extract(self, url): + book_id = self._match_id(url) + webpage = self._download_webpage(url, book_id) + + title = 
self._og_search_title(webpage) + + thumbnails = [] + og_thumbnail = self._og_search_thumbnail(webpage) + if og_thumbnail: + thumbnails.append({ + 'url': og_thumbnail, + 'preference': 210 + }) + thumb_element = self._search_regex( + r'(]+alt=["\'][^\'"]*\bcover art\b[^>]*>)', webpage, + 'thumbnail element', default=None) + if thumb_element: + lg_thumbnail_attrs = extract_attributes(thumb_element) + if lg_thumbnail_attrs.get('src'): + thumbnails.append({ + 'url': lg_thumbnail_attrs.get('src'), + 'preference': 500 + }) + + authors = self._get_label_text('authorLabel', webpage, prefix='By:') + narrators = self._get_label_text('narratorLabel', webpage, prefix='Narrated by:') + performance_type = self._get_label_text('format', webpage) + publisher = self._get_label_text('publisherLabel', webpage, prefix='Publisher:') + + release_date_yyyymmdd = None + release_date = self._get_label_text('releaseDateLabel', webpage, prefix='Release date:') + if release_date: + release_date_yyyymmdd = unified_strdate(release_date, False) + + book_series = None + book_in_series = None + book_number = None + in_multiple_series = False + all_series = self._get_label_text('seriesLabel', webpage, prefix='Series:') + if all_series: + series_sep = all_series.split(',') + book_series = series_sep[0].strip() + if len(series_sep) > 1: + book_in_series = series_sep[1].strip() + if book_in_series.startswith('Book'): + book_number = float(book_in_series[4:].strip()) + if len(series_sep) > 2 and len(series_sep) % 2 == 0: + in_multiple_series = True + + categories = [] + breadcrumbs_text = get_elements_by_class('navigation-link', webpage) + if breadcrumbs_text: + categories.extend(breadcrumbs_text) + + description = "" + # Not all summaries show up on a given book, but the publisher summary + # is the most common + editorial_summary_html = get_element_by_class('productEditorialSummary', webpage) + if editorial_summary_html: + editorial_summary_text = clean_html_markdown(editorial_summary_html) + 
description += editorial_summary_text + '\n\n' + publisher_summary_html = get_element_by_class('productPublisherSummary', webpage) + if publisher_summary_html: + publisher_summary_text = clean_html_markdown(publisher_summary_html) + description += publisher_summary_text + '\n\n' + critics_summary_html = get_element_by_class('productCriticsSummary', webpage) + if critics_summary_html: + critics_summary_text = clean_html_markdown(critics_summary_html) + description += critics_summary_text + '\n\n' + if in_multiple_series: + series_list_text = '## Series List\n\n' + for sidx in range(0, len(series_sep), 2): + series_list_text += '- %s, %s\n' % ( + series_sep[sidx].strip(), + series_sep[sidx + 1].strip()) + description += series_list_text + '\n' + + # Audio Sample + formats = [] + sample_audio = self._search_regex( + r'\s+data-mp3=(["\'])(?P.+?)\1', webpage, + 'Audio Sample', default=None, group='url') + sample_format = { + 'url': sample_audio, + 'format_id': 'sample', + 'format': 'sample - audio only', + 'vcodec': 'none', + } + formats.append(sample_format) + + is_logged_in = self._is_logged_in(webpage) + book_purchased = False + purchase_date_elm = get_element_by_id('adbl-buy-box-purchase-date', webpage) + if purchase_date_elm is not None: + book_purchased = True + + if is_logged_in and not book_purchased: + self.report_warning( + 'You don\'t appear to own this title.', + book_id) + + duration = None + chapters = [] + if is_logged_in and book_purchased: + cloud_player_url = 'https://www.audible.com/cloudplayer?asin=' + book_id + cloud_player_page = self._download_webpage( + cloud_player_url, book_id, 'Retrieving token') + cloud_player_form = self._hidden_inputs(cloud_player_page) + + token = cloud_player_form.get('token') + if token is None: + raise ExtractorError("Could not find token") + + metadata = self._download_json( + 'https://www.audible.com/contentlicenseajax', book_id, + data=urlencode_postdata({ + 'asin': book_id, + 'token': token, + 'key': 
'AudibleCloudPlayer', + 'action': 'getUrl' + }), + headers={'Referer': cloud_player_url}) + + m3u8_url = metadata.get('hlscontentLicenseUrl') + if m3u8_url: + m3u8_formats = self._extract_akamai_formats( + m3u8_url, book_id, skip_protocols=['f4m']) + formats.extend(m3u8_formats) + self._sort_formats(formats) + + duration = metadata.get('runTime') + + for md_chapter in metadata.get('cloudPlayerChapters', []): + ch_start_time = md_chapter.get('chapterStartPosition') + ch_end_time = md_chapter.get('chapterEndPosition') + ch_title = md_chapter.get('chapterTitle') + + if ch_start_time is None or ch_end_time is None: + self.report_warning('Missing chapter information') + chapters = [] + break + + chapter = { + 'start_time': float(ch_start_time) / 1000, + 'end_time': float(ch_end_time) / 1000 + } + + if ch_title: + chapter['title'] = ch_title + + chapters.append(chapter) + + return { + 'id': book_id, + 'title': title, + 'formats': formats, + 'duration': duration, + 'chapters': chapters if len(chapters) > 0 else None, + 'thumbnails': thumbnails if len(thumbnails) > 0 else None, + 'creator': authors, + 'album_artist': authors, + 'artist': narrators, + 'album_type': performance_type, + 'uploader': publisher, + 'release_date': release_date_yyyymmdd, + 'release_year': int(release_date_yyyymmdd[:4]) if release_date_yyyymmdd else None, + 'series': book_series, + 'album': book_series, + 'episode_number': book_number, + 'track_number': book_number, + 'episode_id': book_in_series, + 'categories': categories if len(categories) > 0 else None, + 'genre': ', '.join(categories) if len(categories) > 0 else None, + 'description': description if description is not "" else None, + } + + +class AudibleLibraryIE(AudibleBaseIE): + IE_NAME = 'audible:library' + _VALID_URL = r'https?://(?:.+?\.)?audible\.com/lib\b' + + def _real_initialize(self): + if not self._is_logged_in(): + raise ExtractorError('Not logged in.', expected=True) + + def _real_extract(self, url): + entries = [] + + last_page = 
None + page_num = 0 + while True: + page_num += 1 + page_id = "Page%d" % page_num + + # update url to current page number + parsed_url = compat_urlparse.urlparse(url) + qs = compat_urlparse.parse_qs(parsed_url.query) + qs['page'] = page_num + page_url = compat_urlparse.urlunparse( + parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True))) + + webpage = self._download_webpage(page_url, page_id) + + for book_link in re.findall(r'(]+aria-describedby=["\']product-list-flyout-[^"\'][^>]*>)', webpage): + book_link_attrs = extract_attributes(book_link) + if book_link_attrs.get('href'): + entries.append(self.url_result( + self._BASE_URL + book_link_attrs.get('href'), + ie=AudibleIE.ie_key())) + + if last_page is None: + pages = get_elements_by_class('pageNumberElement', webpage) + if pages: + last_page = int(pages[-1]) + + if page_num >= last_page: + break + + return self.playlist_result(entries) diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 06de556b7ab..53c4033eca5 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -69,6 +69,10 @@ from .atresplayer import AtresPlayerIE from .atttechchannel import ATTTechChannelIE from .atvat import ATVAtIE +from .audible import ( + AudibleIE, + AudibleLibraryIE, +) from .audimedia import AudiMediaIE from .audioboom import AudioBoomIE from .audiomack import AudiomackIE, AudiomackAlbumIE