[archiveorg] Use and fix get_element_by_class()

Use get_element_by_class() from utils to get rid of yet another regex. This function used to return only the content of the element, and not the element itself with its tag and attributes. The names of the whole group of get_element_by_X() functions are a bit of a misnomer, as they all return the *content* of the element and not the element itself. All these functions can now return the whole element when their `include_tag` parameter is set to `True`. It defaults to `False`, so no other code is affected by this change. Tests have been added to test/test_utils.py accordingly. Writing them uncovered a bug that prevented elements whose class name starts with a hyphen from being found: the regex used in get_elements_by_class() anchored the class name with `\b`, which cannot match in front of a leading hyphen. This has been fixed by replacing the `\b` anchors with explicit lookarounds.
ytdl-org · Feb 19, 2020 · b98d1c0 · b98d1c0
1 parent e910f49
commit b98d1c0
Show file tree

Hide file tree

Showing 3 changed files with 90 additions and 21 deletions.
diff --git a/test/test_utils.py b/test/test_utils.py
@@ -1401,8 +1401,49 @@ def test_get_element_by_class(self):
         '''
 
         self.assertEqual(get_element_by_class('foo', html), 'nice')
+        self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar">nice</span>')
         self.assertEqual(get_element_by_class('no-such-class', html), None)
 
+        html = '''
+            <span class="foo bar"/>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), None)
+        self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"/>')
+
+        html = '''
+            <span class="foo bar"></span>
+        '''
+
+        self.assertEqual(get_element_by_class('foo', html), '')
+        self.assertEqual(get_element_by_class('foo', html, include_tag=True), '<span class="foo bar"></span>')
+
+        html = '''
+            <span class="content-section__wrap bar">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('content-section__wrap', html), 'nice')
+        self.assertEqual(get_element_by_class('content-section__wrap', html, include_tag=True), '<span class="content-section__wrap bar">nice</span>')
+
+        html = '''
+            <span class="-test-hyphen">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('-test-hyphen', html), 'nice')
+
+        html = '''
+            <span class="_test_underscore">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('_test_underscore', html), 'nice')
+
+        html = '''
+            <span class="ä-umlaut ↑-unicode">nice</span>
+        '''
+
+        self.assertEqual(get_element_by_class('ä-umlaut', html), 'nice')
+        self.assertEqual(get_element_by_class('↑-unicode', html), 'nice')
+
     def test_get_element_by_attribute(self):
         html = '''
             <span class="foo bar">nice</span>

diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
@@ -4,6 +4,7 @@
 from ..utils import (
     clean_html,
     extract_attributes,
+    get_element_by_class,
     unified_strdate,
 )
 
@@ -41,9 +42,8 @@ def _real_extract(self, url):
         video_id = self._match_id(url)
         webpage = self._download_webpage(
             'http://archive.org/embed/' + video_id, video_id)
-        input_element_with_playlist = self._search_regex(
-            r'(<\s*input.*\s*class\s*=\s*[\'"].*\s*js-play8-playlist\s*.*[\'"]\s*.*>)',
-            webpage, 'jwplayer playlist')
+        input_element_with_playlist = get_element_by_class(
+            'js-play8-playlist', webpage, include_tag=True)
         jwplayer_playlist = self._parse_json(extract_attributes(
             input_element_with_playlist)['value'], video_id)
         info = self._parse_jwplayer_data(

diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
@@ -1926,32 +1926,55 @@ def xpath_attr(node, xpath, key, name=None, fatal=False, default=NO_DEFAULT):
     return n.attrib[key]
 
 
-def get_element_by_id(id, html):
-    """Return the content of the tag with the specified ID in the passed HTML document"""
-    return get_element_by_attribute('id', id, html)
+def get_element_by_id(id, html, include_tag=False):
+    """
+    Return the content of the tag with the specified ID in the passed HTML document.
+
+    The whole element, including its tag, is returned when `include_tag` is `True`.
+    """
+    return get_element_by_attribute('id', id, html, include_tag)
+
 
+def get_element_by_class(class_name, html, include_tag=False):
+    """
+    Return the content of the first tag with the specified class in the passed HTML document.
 
-def get_element_by_class(class_name, html):
-    """Return the content of the first tag with the specified class in the passed HTML document"""
-    retval = get_elements_by_class(class_name, html)
+    The whole element, including its tag, is returned when `include_tag` is `True`.
+    """
+    retval = get_elements_by_class(class_name, html, include_tag)
     return retval[0] if retval else None
 
 
-def get_element_by_attribute(attribute, value, html, escape_value=True):
-    retval = get_elements_by_attribute(attribute, value, html, escape_value)
+def get_element_by_attribute(attribute, value, html, escape_value=True,
+                             include_tag=False):
+    """
+    Return the content of the first tag with the specified attribute in the passed HTML document.
+
+    The whole element, including its tag, is returned when `include_tag` is `True`.
+    """
+    retval = get_elements_by_attribute(attribute, value, html, escape_value,
+                                       include_tag)
     return retval[0] if retval else None
 
 
-def get_elements_by_class(class_name, html):
-    """Return the content of all tags with the specified class in the passed HTML document as a list"""
+def get_elements_by_class(class_name, html, include_tag=False):
+    """
+    Return the content of all tags with the specified class in the passed HTML document as a list.
+
+    The whole elements, including their tags, are returned when `include_tag` is `True`.
+    """
     return get_elements_by_attribute(
-        'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name),
-        html, escape_value=False)
+        'class', r'[^\'"]*(?<![\w-])%s(?![\w-])[^\'"]*' % re.escape(class_name),
+        html, escape_value=False, include_tag=include_tag)
 
 
-def get_elements_by_attribute(attribute, value, html, escape_value=True):
-    """Return the content of the tag with the specified attribute in the passed HTML document"""
+def get_elements_by_attribute(attribute, value, html, escape_value=True,
+                              include_tag=False):
+    """
+    Return the content of all tags with the specified attribute in the passed HTML document.
 
+    The whole elements, including their tags, are returned when `include_tag` is `True`.
+    """
     value = re.escape(value) if escape_value else value
 
     retlist = []
@@ -1960,11 +1983,13 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
          \s+%s=['"]?%s['"]?
          (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
-        \s*>
+        \s*(?:\/\s*>|>
         (?P<content>.*?)
-        </\1>
+        </\1>)
     ''' % (re.escape(attribute), value), html):
-        res = m.group('content')
+        res = m.group(0) if include_tag else m.group('content')
+        if res is None:
+            continue
 
         if res.startswith('"') or res.startswith("'"):
             res = res[1:-1]
@@ -1981,7 +2006,10 @@ def __init__(self):
         compat_HTMLParser.__init__(self)
 
     def handle_starttag(self, tag, attrs):
-        self.attrs = dict(attrs)
+        # Make sure we're looking at the first attributes. Later ones are from
+        # embedded elements.
+        if not self.attrs:
+            self.attrs = dict(attrs)
 
 
 def extract_attributes(html_element):