Handle weird attribute inputs (#213)

Fixes #212
facelessuser · Feb 9, 2021 · 0ba6ce8 · 0ba6ce8
1 parent 0240305
commit 0ba6ce8
Show file tree

Hide file tree

Showing 6 changed files with 100 additions and 8 deletions.
diff --git a/docs/src/dictionary/en-custom.txt b/docs/src/dictionary/en-custom.txt
@@ -26,6 +26,7 @@ TODO
 Tox
 Twemoji
 URL's
+UTF
 Unescape
 Virtualenv
 WIP

diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md
@@ -4,6 +4,9 @@
 
 - **NEW**: `:link` and `:any-link` no longer include `#!html <link>` due to a change in the level 4 selector
   specification.
+- **FIX**: BeautifulSoup, when using `find`, is quite forgiving of odd types that a user may place in an element's
+  attribute value. Soup Sieve will now also be more forgiving and attempt to match these unexpected values in a sane
+  manner by normalizing them before comparison. (#212)
 
 ## 2.1.0
 

diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py
@@ -4,6 +4,7 @@
 import re
 from .import css_types as ct
 import unicodedata
+from collections.abc import Sequence
 
 import bs4
 
@@ -276,29 +277,61 @@ def split_namespace(el, attr_name):
 
         return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
 
-    @staticmethod
-    def get_attribute_by_name(el, name, default=None):
+    @classmethod
+    def normalize_value(cls, value):
+        """Normalize the value to be a string or list of strings."""
+
+        # Treat `None` as empty string.
+        if value is None:
+            return ''
+
+        # Pass through strings
+        if (isinstance(value, str)):
+            return value
+
+        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
+        if isinstance(value, bytes):
+            return value.decode("utf8")
+
+        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
+        if isinstance(value, Sequence):
+            new_value = []
+            for v in value:
+                if isinstance(v, Sequence):
+                    # This is most certainly a user error and will crash and burn later,
+                    # but to avoid excessive recursion, kick out now.
+                    new_value.append(v)
+                else:
+                    # Convert the child to a string
+                    new_value.append(cls.normalize_value(v))
+            return new_value
+
+        # Try and make anything else a string
+        return str(value)
+
+    @classmethod
+    def get_attribute_by_name(cls, el, name, default=None):
         """Get attribute by name."""
 
         value = default
         if el._is_xml:
             try:
-                value = el.attrs[name]
+                value = cls.normalize_value(el.attrs[name])
             except KeyError:
                 pass
         else:
             for k, v in el.attrs.items():
                 if util.lower(k) == name:
-                    value = v
+                    value = cls.normalize_value(v)
                     break
         return value
 
-    @staticmethod
-    def iter_attributes(el):
+    @classmethod
+    def iter_attributes(cls, el):
         """Iterate attributes."""
 
         for k, v in el.attrs.items():
-            yield k, v
+            yield k, cls.normalize_value(v)
 
     @classmethod
     def get_classes(cls, el):

diff --git a/tests/test_level2/test_attribute.py b/tests/test_level2/test_attribute.py
@@ -1,6 +1,7 @@
 """Test attribute selector."""
 from .. import util
 from soupsieve import SelectorSyntaxError
+from bs4 import BeautifulSoup as BS
 
 
 class TestAttribute(util.TestCase):
@@ -370,3 +371,49 @@ def test_attribute_contains_cannot_have_escaped_spaces(self):
             [],
             flags=util.HTML
         )
+
+    def test_none_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = None
+        self.assertEqual(len(soup.select('span[foo]')), 1)
+
+    def test_numeric_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = 3
+        self.assertEqual(len(soup.select('span[foo="3"]')), 1)
+        soup.span['foo'] = 3.3
+        self.assertEqual(len(soup.select('span[foo="3.3"]')), 1)
+
+    def test_sequence_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = [3, "4"]
+        self.assertEqual(len(soup.select('span[foo="3 4"]')), 1)
+
+    def test_bytes_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = b'test'
+        self.assertEqual(len(soup.select('span[foo="test"]')), 1)
+
+    def test_weird_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = {'3': '4'}
+        self.assertEqual(len(soup.select('span[foo="{\'3\': \'4\'}"]')), 1)
+
+    def test_nested_sequences(self):
+        """Nested sequences will crash and burn due to the way BS handles them."""
+
+        # The exact exception is not important as it can fail in various locations for different reasons
+        with self.assertRaises(Exception):
+            soup = BS('<span>text</span>', 'html.parser')
+            soup.span['foo'] = [['1']]
+            soup.select("span['foo']")
diff --git a/tests/test_level3/test_not.py b/tests/test_level3/test_not.py
@@ -1,5 +1,6 @@
 """Test not selectors."""
 from .. import util
+from bs4 import BeautifulSoup as BS
 
 
 class TestNot(util.TestCase):
@@ -47,3 +48,10 @@ def test_not_case(self):
             ["0", "2", "3", "4", "5", "6", "pre"],
             flags=util.HTML
         )
+
+    def test_none_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span foo-"something">text</span>', 'html.parser')
+        soup.span['foo'] = None
+        self.assertEqual(len(soup.select('span:not([foo])')), 0)
diff --git a/tox.ini b/tox.ini
@@ -48,7 +48,7 @@ commands =
 [flake8]
 exclude=build/*,.tox/*
 max-line-length=120
-ignore=D202,D203,D401,E741,W504
+ignore=D202,D203,D401,E741,W504,N817
 
 [pytest]
 filterwarnings =