Skip to content

Commit

Permalink
Handle weird attribute inputs (#213)
Browse files Browse the repository at this point in the history
Fixes #212
  • Loading branch information
facelessuser authored Feb 9, 2021
1 parent 0240305 commit 0ba6ce8
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 8 deletions.
1 change: 1 addition & 0 deletions docs/src/dictionary/en-custom.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ TODO
Tox
Twemoji
URL's
UTF
Unescape
Virtualenv
WIP
Expand Down
3 changes: 3 additions & 0 deletions docs/src/markdown/about/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@

- **NEW**: `:link` and `:any-link` no longer include `#!html <link>` due to a change in the level 4 selector
specification.
- **FIX**: BeautifulSoup, when using `find`, is quite forgiving of odd types that a user may place in an element's
attribute value. Soup Sieve will now also be more forgiving and attempt to match these unexpected values in a sane
manner by normalizing them before comparing. (#212)

## 2.1.0

Expand Down
47 changes: 40 additions & 7 deletions soupsieve/css_match.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import re
from .import css_types as ct
import unicodedata
from collections.abc import Sequence

import bs4

Expand Down Expand Up @@ -276,29 +277,61 @@ def split_namespace(el, attr_name):

return getattr(attr_name, 'namespace', None), getattr(attr_name, 'name', None)

@staticmethod
def get_attribute_by_name(el, name, default=None):
@classmethod
def normalize_value(cls, value):
    """Normalize the value to be a string or list of strings.

    `None` becomes an empty string, strings pass through untouched, byte strings are decoded as
    UTF-8, and any other non-sequence object is converted with `str()`. For a sequence, a new list
    is returned whose string, byte string and non-sequence members are normalized the same way;
    nested sequences are copied over unchanged.
    """

    # Treat `None` as empty string.
    if value is None:
        return ''

    # Pass through strings
    if isinstance(value, str):
        return value

    # If it's a byte string, convert it to Unicode, treating it as UTF-8.
    if isinstance(value, bytes):
        return value.decode("utf8")

    # BeautifulSoup supports sequences of attribute values, so make sure the children are strings.
    if isinstance(value, Sequence):
        new_value = []
        for v in value:
            # `str` and `bytes` are sequences too, so exclude them from the nested sequence check;
            # otherwise byte string children would be stored without being decoded.
            if isinstance(v, Sequence) and not isinstance(v, (str, bytes)):
                # This is most certainly a user error and will crash and burn later,
                # but to avoid excessive recursion, kick out now.
                new_value.append(v)
            else:
                # Convert the child to a string
                new_value.append(cls.normalize_value(v))
        return new_value

    # Try and make anything else a string
    return str(value)

@classmethod
def get_attribute_by_name(cls, el, name, default=None):
"""Get attribute by name."""

value = default
if el._is_xml:
try:
value = el.attrs[name]
value = cls.normalize_value(el.attrs[name])
except KeyError:
pass
else:
for k, v in el.attrs.items():
if util.lower(k) == name:
value = v
value = cls.normalize_value(v)
break
return value

@staticmethod
def iter_attributes(el):
@classmethod
def iter_attributes(cls, el):
    """Iterate attributes.

    Yield one `(name, value)` pair per attribute of `el`, with each value passed through
    `normalize_value` first.
    """

    for k, v in el.attrs.items():
        yield k, cls.normalize_value(v)

@classmethod
def get_classes(cls, el):
Expand Down
47 changes: 47 additions & 0 deletions tests/test_level2/test_attribute.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""Test attribute selector."""
from .. import util
from soupsieve import SelectorSyntaxError
from bs4 import BeautifulSoup as BS


class TestAttribute(util.TestCase):
Expand Down Expand Up @@ -370,3 +371,49 @@ def test_attribute_contains_cannot_have_escaped_spaces(self):
[],
flags=util.HTML
)

def test_none_inputs(self):
    """Check that an attribute set to `None` is still found by an attribute presence selector."""

    document = BS('<span>text</span>', 'html.parser')
    document.span['foo'] = None

    matches = document.select('span[foo]')
    self.assertEqual(len(matches), 1)

def test_numeric_inputs(self):
    """Check that integer and float attribute values are matched by their string form."""

    document = BS('<span>text</span>', 'html.parser')

    # Integer value
    document.span['foo'] = 3
    matches = document.select('span[foo="3"]')
    self.assertEqual(len(matches), 1)

    # Float value
    document.span['foo'] = 3.3
    matches = document.select('span[foo="3.3"]')
    self.assertEqual(len(matches), 1)

def test_sequence_inputs(self):
    """Check that a list mixing a number and a string matches as a space separated value."""

    document = BS('<span>text</span>', 'html.parser')
    document.span['foo'] = [3, "4"]

    matches = document.select('span[foo="3 4"]')
    self.assertEqual(len(matches), 1)

def test_bytes_inputs(self):
    """Check that a byte string attribute value is matched by its text equivalent."""

    document = BS('<span>text</span>', 'html.parser')
    document.span['foo'] = b'test'

    matches = document.select('span[foo="test"]')
    self.assertEqual(len(matches), 1)

def test_weird_inputs(self):
    """Check that a dictionary attribute value is matched by its string representation."""

    document = BS('<span>text</span>', 'html.parser')
    document.span['foo'] = {'3': '4'}

    matches = document.select('span[foo="{\'3\': \'4\'}"]')
    self.assertEqual(len(matches), 1)

def test_nested_sequences(self):
    """Nested sequences will crash and burn due to the way BS handles them."""

    # The exact exception is not important as it can fail in various locations for different reasons
    with self.assertRaises(Exception):
        soup = BS('<span>text</span>', 'html.parser')
        soup.span['foo'] = [['1']]
        # Use a syntactically valid selector so that the failure comes from handling the nested
        # value and not from a selector syntax error (a quoted attribute name is not valid CSS).
        soup.select('span[foo="1"]')
8 changes: 8 additions & 0 deletions tests/test_level3/test_not.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Test not selectors."""
from .. import util
from bs4 import BeautifulSoup as BS


class TestNot(util.TestCase):
Expand Down Expand Up @@ -47,3 +48,10 @@ def test_not_case(self):
["0", "2", "3", "4", "5", "6", "pre"],
flags=util.HTML
)

def test_none_inputs(self):
    """Check that an attribute reset to `None` still counts as present for `:not([attr])`."""

    # The markup must declare a real `foo` attribute (`foo="something"`) before it is overwritten.
    soup = BS('<span foo="something">text</span>', 'html.parser')
    soup.span['foo'] = None
    self.assertEqual(len(soup.select('span:not([foo])')), 0)
2 changes: 1 addition & 1 deletion tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ commands =
[flake8]
exclude=build/*,.tox/*
max-line-length=120
ignore=D202,D203,D401,E741,W504
ignore=D202,D203,D401,E741,W504,N817

[pytest]
filterwarnings =
Expand Down

0 comments on commit 0ba6ce8

Please sign in to comment.