[audible] Add new extractor #22293

Closed
wants to merge 4 commits
28 changes: 28 additions & 0 deletions test/test_utils.py
@@ -20,6 +20,7 @@
    args_to_str,
    encode_base_n,
    clean_html,
    clean_html_markdown,
    date_from_str,
    DateRange,
    detect_exe_version,
@@ -1025,6 +1026,33 @@ def test_clean_html(self):
        self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
        self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')

    def test_clean_html_markdown(self):
        self.assertEqual(clean_html_markdown(
            '<div id="out" class="markdown-body"><h1>Happy Text</h1>\n'
            '<p>When you do it your way you can go <em>anywhere</em> you choose. And just raise cain. I thought today we would make a happy little stream that\'s just running through the woods here. I was <strong>blessed</strong> with a very steady hand; and it comes in very handy when you\'re doing these little delicate things. You have to allow the paint to break to <strong><em>make it beautiful</em></strong>. Let\'s do it again then, what the heck.</p>\n'
            '<h2>This is your creation - and it\'s just as unique and special as you are.</h2>\n'
            '<p>Paint <b>anything</b> you want on the canvas. Create your own world. By now you should be quite happy about what\'s happening here. You can\'t have light without dark. You can\'t know <i>happiness</i> unless you\'ve known <em>sorrow</em>. Let\'s get crazy.</p>\n'
            '<ul>\n'
            '<li>You can spend all day playing with mountains.</li>\n'
            '<li>We\'ll put a happy little sky in here.</li>\n'
            '</ul>\n'
            '<p>I like to beat the brush. There we go.<br>\n'
            'We don\'t need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out.</p>\n'
            '</div>'),
            "# Happy Text\n"
            "\n"
            "When you do it your way you can go *anywhere* you choose. And just raise cain. I thought today we would make a happy little stream that's just running through the woods here. I was **blessed** with a very steady hand; and it comes in very handy when you're doing these little delicate things. You have to allow the paint to break to ***make it beautiful***. Let's do it again then, what the heck.\n"
            "\n"
            "## This is your creation - and it's just as unique and special as you are.\n"
            "\n"
            "Paint **anything** you want on the canvas. Create your own world. By now you should be quite happy about what's happening here. You can't have light without dark. You can't know *happiness* unless you've known *sorrow*. Let's get crazy.\n"
            "\n"
            "- You can spend all day playing with mountains. \n"
            "- We'll put a happy little sky in here. \n"
            "\n"
            "I like to beat the brush. There we go. \n"
            "We don't need any guidelines or formats. All we need to do is just let it flow right out of us. Trees live in your fan brush, but you have to scare them out.")

    def test_intlist_to_bytes(self):
        self.assertEqual(
            intlist_to_bytes([0, 1, 127, 128, 255]),
317 changes: 317 additions & 0 deletions youtube_dl/extractor/audible.py
@@ -0,0 +1,317 @@
# coding: utf-8
from __future__ import unicode_literals

import re

from .common import InfoExtractor
from ..compat import (
    compat_urllib_parse_urlencode,
    compat_urlparse,
)
from ..utils import (
    clean_html,
    clean_html_markdown,
    ExtractorError,
    extract_attributes,
    get_element_by_class,
    get_element_by_id,
    get_elements_by_class,
    unified_strdate,
    urlencode_postdata,
)


class AudibleBaseIE(InfoExtractor):
    _BASE_URL = 'https://www.audible.com'

    def _is_logged_in(self, html=None):
        if not html:
            html = self._download_webpage(
                self._BASE_URL, None,
                'Checking login status')

        logged_in_elm = get_element_by_class('ui-it-credit-balance', html)

        if logged_in_elm is None:
            self.report_warning(
                'You don\'t appear to be logged in. You will not be able to '
                'download full audiobooks without being logged in. It is '
                'currently not possible to automate the login process for '
                'Audible. You must log in via a browser, then export your '
                'cookies and pass the cookie file to youtube-dl with '
                '--cookies.')
            return False

        else:
            return True


class AudibleIE(AudibleBaseIE):
    IE_NAME = 'audible'
    _VALID_URL = r'https?://(?:.+?\.)?audible\.com/pd/(?:.+)/(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'https://www.audible.com/pd/Neil-Gaimans-How-the-Marquis-Got-His-Coat-Back-Audiobook/B01LZB4R8W',
        'md5': '7bcfd4aab323cee607d8425c9aba275b',
        'info_dict': {
            'id': 'B01LZB4R8W',
            'ext': 'mp3',
            'title': 'Neil Gaiman\'s How the Marquis Got His Coat Back',
            'description': 'md5:851082468b157f20c82caf10051c5a24',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'Neil Gaiman',
            'album_artist': 'Neil Gaiman',
            'artist': 'Paterson Joseph, Bernard Cribbins, Samantha Beart, Adrian Lester, Mitch Benn, Don Warrington',
        },
        'expected_warnings': ['You don\'t appear to be logged in.']
    }, {
        'url': 'https://www.audible.com/pd/Merrick-Audiobook/B002UUKMKQ',
        'md5': '3bcbc2ed79201332db8d72b4c95a0269',
        'info_dict': {
            'id': 'B002UUKMKQ',
            'ext': 'mp3',
            'title': 'Merrick',
            'description': 'md5:82c8d4687e361ebb70162039288dcba2',
            'thumbnail': r're:^https?://.*\.jpg$',
            'creator': 'Anne Rice',
            'album_artist': 'Anne Rice',
            'artist': 'Graeme Malcolm',
            'series': 'The Vampire Chronicles',
            'album': 'The Vampire Chronicles',
            'episode_number': 7,
            'track_number': 7,
            'episode_id': 'Book 7',
        },
        'expected_warnings': ['You don\'t appear to be logged in.']
    }]

    @staticmethod
    def _get_label_text(class_name, html, prefix=None):
        label_text = None

        label_html = get_element_by_class(class_name, html)
        if label_html:
            label_text = re.sub(r'\s+', ' ', clean_html(label_html))
            if prefix and label_text.startswith(prefix):
                label_text = label_text[len(prefix):].strip()

        return label_text

    def _real_extract(self, url):
        book_id = self._match_id(url)
        webpage = self._download_webpage(url, book_id)

        title = self._og_search_title(webpage)

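        # Thumbnails: the og:image plus the larger "cover art" <img>, which
        # is given a higher preference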
        thumbnails = []
        og_thumbnail = self._og_search_thumbnail(webpage)
        if og_thumbnail:
            thumbnails.append({
                'url': og_thumbnail,
                'preference': 210
            })
        thumb_element = self._search_regex(
            r'(<img[^>]+alt=["\'][^\'"]*\bcover art\b[^>]*>)', webpage,
            'thumbnail element', default=None)
        if thumb_element:
            lg_thumbnail_attrs = extract_attributes(thumb_element)
            if lg_thumbnail_attrs.get('src'):
                thumbnails.append({
                    'url': lg_thumbnail_attrs.get('src'),
                    'preference': 500
                })

        authors = self._get_label_text('authorLabel', webpage, prefix='By:')
        narrators = self._get_label_text('narratorLabel', webpage, prefix='Narrated by:')
        performance_type = self._get_label_text('format', webpage)
        publisher = self._get_label_text('publisherLabel', webpage, prefix='Publisher:')

        release_date_yyyymmdd = None
        release_date = self._get_label_text('releaseDateLabel', webpage, prefix='Release date:')
        if release_date:
            release_date_yyyymmdd = unified_strdate(release_date, False)

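        # The seriesLabel text is parsed as a comma-separated list that
        # alternates series name and "Book N"; a title can belong to more
        # than one series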
        book_series = None
        book_in_series = None
        book_number = None
        in_multiple_series = False
        all_series = self._get_label_text('seriesLabel', webpage, prefix='Series:')
        if all_series:
            series_sep = all_series.split(',')
            book_series = series_sep[0].strip()
            if len(series_sep) > 1:
                book_in_series = series_sep[1].strip()
                if book_in_series.startswith('Book'):
                    book_number = float(book_in_series[4:].strip())
            if len(series_sep) > 2 and len(series_sep) % 2 == 0:
                in_multiple_series = True

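        # Breadcrumb navigation links double as categories (and are joined
        # into the genre field below)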
        categories = []
        breadcrumbs_text = get_elements_by_class('navigation-link', webpage)
        if breadcrumbs_text:
            categories.extend(breadcrumbs_text)

        description = ""
        # Not all summaries show up on a given book, but the publisher summary
        # is the most common
        editorial_summary_html = get_element_by_class('productEditorialSummary', webpage)
        if editorial_summary_html:
            editorial_summary_text = clean_html_markdown(editorial_summary_html)
            description += editorial_summary_text + '\n\n'
        publisher_summary_html = get_element_by_class('productPublisherSummary', webpage)
        if publisher_summary_html:
            publisher_summary_text = clean_html_markdown(publisher_summary_html)
            description += publisher_summary_text + '\n\n'
        critics_summary_html = get_element_by_class('productCriticsSummary', webpage)
        if critics_summary_html:
            critics_summary_text = clean_html_markdown(critics_summary_html)
            description += critics_summary_text + '\n\n'
        if in_multiple_series:
            series_list_text = '## Series List\n\n'
            for sidx in range(0, len(series_sep), 2):
                series_list_text += '- %s, %s\n' % (
                    series_sep[sidx].strip(),
                    series_sep[sidx + 1].strip())
            description += series_list_text + '\n'

        # Audio Sample
        formats = []
        sample_audio = self._search_regex(
            r'\s+data-mp3=(["\'])(?P<url>.+?)\1', webpage,
            'Audio Sample', default=None, group='url')
        if sample_audio:
            formats.append({
                'url': sample_audio,
                'format_id': 'sample',
                'format': 'sample - audio only',
                'vcodec': 'none',
            })

        is_logged_in = self._is_logged_in(webpage)
        book_purchased = False
        purchase_date_elm = get_element_by_id('adbl-buy-box-purchase-date', webpage)
        if purchase_date_elm is not None:
            book_purchased = True

        if is_logged_in and not book_purchased:
            self.report_warning(
                'You don\'t appear to own this title.',
                book_id)

        duration = None
        chapters = []
        if is_logged_in and book_purchased:
            cloud_player_url = 'https://www.audible.com/cloudplayer?asin=' + book_id
            cloud_player_page = self._download_webpage(
                cloud_player_url, book_id, 'Retrieving token')
            cloud_player_form = self._hidden_inputs(cloud_player_page)

            token = cloud_player_form.get('token')
            if token is None:
                raise ExtractorError('Could not find token')

            metadata = self._download_json(
                'https://www.audible.com/contentlicenseajax', book_id,
                data=urlencode_postdata({
                    'asin': book_id,
                    'token': token,
                    'key': 'AudibleCloudPlayer',
                    'action': 'getUrl'
                }),
                headers={'Referer': cloud_player_url})

            m3u8_url = metadata.get('hlscontentLicenseUrl')
            if m3u8_url:
                m3u8_formats = self._extract_akamai_formats(
                    m3u8_url, book_id, skip_protocols=['f4m'])
                formats.extend(m3u8_formats)
                self._sort_formats(formats)

            duration = metadata.get('runTime')

            for md_chapter in metadata.get('cloudPlayerChapters', []):
                ch_start_time = md_chapter.get('chapterStartPosition')
                ch_end_time = md_chapter.get('chapterEndPosition')
                ch_title = md_chapter.get('chapterTitle')

                if ch_start_time is None or ch_end_time is None:
                    self.report_warning('Missing chapter information')
                    chapters = []
                    break

                chapter = {
                    'start_time': float(ch_start_time) / 1000,
                    'end_time': float(ch_end_time) / 1000
                }

                if ch_title:
                    chapter['title'] = ch_title

                chapters.append(chapter)

        return {
            'id': book_id,
            'title': title,
            'formats': formats,
            'duration': duration,
            'chapters': chapters if len(chapters) > 0 else None,
            'thumbnails': thumbnails if len(thumbnails) > 0 else None,
            'creator': authors,
            'album_artist': authors,
            'artist': narrators,
            'album_type': performance_type,
            'uploader': publisher,
            'release_date': release_date_yyyymmdd,
            'release_year': int(release_date_yyyymmdd[:4]) if release_date_yyyymmdd else None,
            'series': book_series,
            'album': book_series,
            'episode_number': book_number,
            'track_number': book_number,
            'episode_id': book_in_series,
            'categories': categories if len(categories) > 0 else None,
            'genre': ', '.join(categories) if len(categories) > 0 else None,
            'description': description if description else None,
        }


class AudibleLibraryIE(AudibleBaseIE):
    IE_NAME = 'audible:library'
    _VALID_URL = r'https?://(?:.+?\.)?audible\.com/lib\b'

    def _real_initialize(self):
        if not self._is_logged_in():
            raise ExtractorError('Not logged in.', expected=True)

    def _real_extract(self, url):
        entries = []

        last_page = None
        page_num = 0
        while True:
            page_num += 1
            page_id = "Page%d" % page_num

            # update url to current page number
            parsed_url = compat_urlparse.urlparse(url)
            qs = compat_urlparse.parse_qs(parsed_url.query)
            qs['page'] = page_num
            page_url = compat_urlparse.urlunparse(
                parsed_url._replace(query=compat_urllib_parse_urlencode(qs, True)))

            webpage = self._download_webpage(page_url, page_id)

            for book_link in re.findall(r'(<a[^>]+aria-describedby=["\']product-list-flyout-[^"\'][^>]*>)', webpage):
                book_link_attrs = extract_attributes(book_link)
                if book_link_attrs.get('href'):
                    entries.append(self.url_result(
                        self._BASE_URL + book_link_attrs.get('href'),
                        ie=AudibleIE.ie_key()))

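            # Read the last page number from the pagination widget once, and
            # stop when it is reached (or when no pagination is present)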
            if last_page is None:
                pages = get_elements_by_class('pageNumberElement', webpage)
                if pages:
                    last_page = int(pages[-1])

            if last_page is None or page_num >= last_page:
                break

        return self.playlist_result(entries)