From 0ba6ce8e79a8351211e4b8a42bc0f1770f95f61f Mon Sep 17 00:00:00 2001
From: Isaac Muse
Date: Tue, 9 Feb 2021 10:39:48 -0700
Subject: [PATCH] Handle weird attribute inputs (#213)

Fixes #212
---
Reviewer notes (free-form text in this position is not part of the commit):
* normalize_value: `str` and `bytes` are themselves `Sequence`s, so the nested-sequence guard now excludes
  them; a `bytes` item inside a list is decoded like a top-level `bytes` value instead of being kept raw.
* The `index` blob ids below are those recorded with the original commit.
* HTML markup in the test fixtures and in the changelog context line was missing from this copy of the
  patch and has been reconstructed - TODO confirm against the repository.

 docs/src/dictionary/en-custom.txt    |  1 +
 docs/src/markdown/about/changelog.md |  3 ++
 soupsieve/css_match.py               | 47 +++++++++++++++++++++++-----
 tests/test_level2/test_attribute.py  | 47 ++++++++++++++++++++++++++++
 tests/test_level3/test_not.py        |  8 +++++
 tox.ini                              |  2 +-
 6 files changed, 100 insertions(+), 8 deletions(-)

diff --git a/docs/src/dictionary/en-custom.txt b/docs/src/dictionary/en-custom.txt
index d6619d08..b6202349 100644
--- a/docs/src/dictionary/en-custom.txt
+++ b/docs/src/dictionary/en-custom.txt
@@ -26,6 +26,7 @@ TODO
 Tox
 Twemoji
 URL's
+UTF
 Unescape
 Virtualenv
 WIP
diff --git a/docs/src/markdown/about/changelog.md b/docs/src/markdown/about/changelog.md
index f3951d29..48a61c7b 100644
--- a/docs/src/markdown/about/changelog.md
+++ b/docs/src/markdown/about/changelog.md
@@ -4,6 +4,9 @@
 
 - **NEW**: `:link` and `:any-link` no longer include `#!html <link>` due to a change in the level 4 selector
   specification.
+- **FIX**: BeautifulSoup, when using `find`, is quite forgiving of odd types that a user may place in an element's
+  attribute value. Soup Sieve will also now be more forgiving and attempt to match these unexpected values in a sane
+  manner by normalizing them before compare. (#212)
 
 ## 2.1.0
 
diff --git a/soupsieve/css_match.py b/soupsieve/css_match.py
index 91aa30c2..a9eeaad2 100644
--- a/soupsieve/css_match.py
+++ b/soupsieve/css_match.py
@@ -4,6 +4,7 @@
 import re
 from .import css_types as ct
 import unicodedata
+from collections.abc import Sequence
 
 import bs4
 
@@ -276,29 +277,61 @@ def split_namespace(el, attr_name):
 
         return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)
 
-    @staticmethod
-    def get_attribute_by_name(el, name, default=None):
+    @classmethod
+    def normalize_value(cls, value):
+        """Normalize the value to be a string or list of strings."""
+
+        # Treat `None` as empty string.
+        if value is None:
+            return ''
+
+        # Pass through strings
+        if (isinstance(value, str)):
+            return value
+
+        # If it's a byte string, convert it to Unicode, treating it as UTF-8.
+        if isinstance(value, bytes):
+            return value.decode("utf8")
+
+        # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
+        if isinstance(value, Sequence):
+            new_value = []
+            for v in value:
+                if isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
+                    # A nested sequence (not `str`/`bytes`) is most certainly a user error and will crash
+                    # and burn later, but to avoid excessive recursion, kick out now.
+                    new_value.append(v)
+                else:
+                    # Convert the child to a string
+                    new_value.append(cls.normalize_value(v))
+            return new_value
+
+        # Try and make anything else a string
+        return str(value)
+
+    @classmethod
+    def get_attribute_by_name(cls, el, name, default=None):
         """Get attribute by name."""
 
         value = default
         if el._is_xml:
             try:
-                value = el.attrs[name]
+                value = cls.normalize_value(el.attrs[name])
             except KeyError:
                 pass
         else:
             for k, v in el.attrs.items():
                 if util.lower(k) == name:
-                    value = v
+                    value = cls.normalize_value(v)
                     break
         return value
 
-    @staticmethod
-    def iter_attributes(el):
+    @classmethod
+    def iter_attributes(cls, el):
         """Iterate attributes."""
 
         for k, v in el.attrs.items():
-            yield k, v
+            yield k, cls.normalize_value(v)
 
     @classmethod
     def get_classes(cls, el):
diff --git a/tests/test_level2/test_attribute.py b/tests/test_level2/test_attribute.py
index b452a894..920fc983 100644
--- a/tests/test_level2/test_attribute.py
+++ b/tests/test_level2/test_attribute.py
@@ -1,6 +1,7 @@
 """Test attribute selector."""
 from .. import util
 from soupsieve import SelectorSyntaxError
+from bs4 import BeautifulSoup as BS
 
 
 class TestAttribute(util.TestCase):
@@ -370,3 +371,49 @@ def test_attribute_contains_cannot_have_escaped_spaces(self):
             [],
             flags=util.HTML
         )
+
+    def test_none_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = None
+        self.assertEqual(len(soup.select('span[foo]')), 1)
+
+    def test_numeric_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = 3
+        self.assertEqual(len(soup.select('span[foo="3"]')), 1)
+        soup.span['foo'] = 3.3
+        self.assertEqual(len(soup.select('span[foo="3.3"]')), 1)
+
+    def test_sequence_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = [3, "4"]
+        self.assertEqual(len(soup.select('span[foo="3 4"]')), 1)
+
+    def test_bytes_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = b'test'
+        self.assertEqual(len(soup.select('span[foo="test"]')), 1)
+
+    def test_weird_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span>text</span>', 'html.parser')
+        soup.span['foo'] = {'3': '4'}
+        self.assertEqual(len(soup.select('span[foo="{\'3\': \'4\'}"]')), 1)
+
+    def test_nested_sequences(self):
+        """Nested sequences will crash and burn due to the way BS handles them."""
+
+        # The exact exception is not important as it can fail in various locations for different reasons
+        with self.assertRaises(Exception):
+            soup = BS('<span>text</span>', 'html.parser')
+            soup.span['foo'] = [['1']]
+            soup.select("span['foo']")
diff --git a/tests/test_level3/test_not.py b/tests/test_level3/test_not.py
index 8731d421..cde96068 100644
--- a/tests/test_level3/test_not.py
+++ b/tests/test_level3/test_not.py
@@ -1,5 +1,6 @@
 """Test not selectors."""
 from .. import util
+from bs4 import BeautifulSoup as BS
 
 
 class TestNot(util.TestCase):
@@ -47,3 +48,10 @@ def test_not_case(self):
             ["0", "2", "3", "4", "5", "6", "pre"],
             flags=util.HTML
         )
+
+    def test_none_inputs(self):
+        """Test weird inputs."""
+
+        soup = BS('<span foo="something">text</span>', 'html.parser')
+        soup.span['foo'] = None
+        self.assertEqual(len(soup.select('span:not([foo])')), 0)
diff --git a/tox.ini b/tox.ini
index 9849baff..cb360c58 100644
--- a/tox.ini
+++ b/tox.ini
@@ -48,7 +48,7 @@ commands =
 [flake8]
 exclude=build/*,.tox/*
 max-line-length=120
-ignore=D202,D203,D401,E741,W504
+ignore=D202,D203,D401,E741,W504,N817
 
 [pytest]
 filterwarnings =