Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove deep validation and instead use error hierarchy to improve error messages #2842

Merged
merged 11 commits into from
Feb 5, 2023
105 changes: 63 additions & 42 deletions altair/utils/schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import inspect
import json
import textwrap
from typing import Any
from typing import Any, Sequence, List

import jsonschema
import jsonschema.exceptions
Expand Down Expand Up @@ -66,9 +66,50 @@ def validate_jsonschema(spec, schema, rootschema=None):
if hasattr(validator_cls, "FORMAT_CHECKER"):
validator_kwargs["format_checker"] = validator_cls.FORMAT_CHECKER
validator = validator_cls(schema, **validator_kwargs)
error = jsonschema.exceptions.best_match(validator.iter_errors(spec))
if error is not None:
raise error
errors = list(validator.iter_errors(spec))
if errors:
errors = _get_most_relevant_errors(errors)
main_error = errors[0]
# They can be used to craft more helpful error messages when this error
# is being converted to a SchemaValidationError
main_error._additional_errors = errors[1:]
raise main_error


def _get_most_relevant_errors(
errors: Sequence[jsonschema.exceptions.ValidationError],
) -> List[jsonschema.exceptions.ValidationError]:
if len(errors) == 0:
return []
binste marked this conversation as resolved.
Show resolved Hide resolved

# Go to lowest level in schema where an error happened as these give
# the most relevant error messages
lowest_level = errors[0]
while lowest_level.context:
lowest_level = lowest_level.context[0]

most_relevant_errors = []
if lowest_level.validator == "additionalProperties":
# For these errors, the message is already informative enough and the other
# errors on the lowest level are not helpful for a user but instead contain
# the same information in a more verbose way
most_relevant_errors = [lowest_level]
else:
parent = lowest_level.parent
if parent is None:
# In this case we are still at the top level and can return all errors
most_relevant_errors = errors
else:
# Return all errors of the lowest level out of which
# we can construct more informative error messages
most_relevant_errors = lowest_level.parent.context

# This should never happen but might still be good to test for it as else
# the original error would just slip through without being raised
if len(most_relevant_errors) == 0:
raise Exception("Could not determine the most relevant errors") from errors[0]

return most_relevant_errors


def _subclasses(cls):
Expand All @@ -82,18 +123,14 @@ def _subclasses(cls):
yield cls


def _todict(obj, validate, context):
def _todict(obj, context):
"""Convert an object to a dict representation."""
if isinstance(obj, SchemaBase):
return obj.to_dict(validate=validate, context=context)
return obj.to_dict(validate=False, context=context)
elif isinstance(obj, (list, tuple, np.ndarray)):
return [_todict(v, validate, context) for v in obj]
return [_todict(v, context) for v in obj]
elif isinstance(obj, dict):
return {
k: _todict(v, validate, context)
for k, v in obj.items()
if v is not Undefined
}
return {k: _todict(v, context) for k, v in obj.items() if v is not Undefined}
elif hasattr(obj, "to_dict"):
return obj.to_dict()
elif isinstance(obj, np.number):
Expand All @@ -117,24 +154,9 @@ class SchemaValidationError(jsonschema.ValidationError):
"""A wrapper for jsonschema.ValidationError with friendlier traceback"""

def __init__(self, obj, err):
super(SchemaValidationError, self).__init__(**self._get_contents(err))
super(SchemaValidationError, self).__init__(**err._contents())
self.obj = obj

@staticmethod
def _get_contents(err):
"""Get a dictionary with the contents of a ValidationError"""
try:
# works in jsonschema 2.3 or later
contents = err._contents()
except AttributeError:
try:
# works in Python >=3.4
spec = inspect.getfullargspec(err.__init__)
except AttributeError:
# works in Python <3.4
spec = inspect.getargspec(err.__init__)
contents = {key: getattr(err, key) for key in spec.args[1:]}
return contents
self._additional_errors = getattr(err, "_additional_errors", [])

def __str__(self):
cls = self.obj.__class__
Expand All @@ -145,13 +167,18 @@ def __str__(self):
for val in schema_path[:-1]
if val not in ("properties", "additionalProperties", "patternProperties")
)
message = self.message
if self._additional_errors:
binste marked this conversation as resolved.
Show resolved Hide resolved
message += "\n " + "\n ".join(
[e.message for e in self._additional_errors]
)
return """Invalid specification

{}, validating {!r}

{}
""".format(
schema_path, self.validator, self.message
schema_path, self.validator, message
)


Expand Down Expand Up @@ -327,11 +354,9 @@ def to_dict(self, validate=True, ignore=None, context=None):

Parameters
----------
validate : boolean or string
validate : boolean
If True (default), then validate the output dictionary
binste marked this conversation as resolved.
Show resolved Hide resolved
against the schema. If "deep" then recursively validate
all objects in the spec. This takes much more time, but
it results in friendlier tracebacks for large objects.
against the schema.
ignore : list
A list of keys to ignore. This will *not* passed to child to_dict
function calls.
Expand All @@ -353,10 +378,9 @@ def to_dict(self, validate=True, ignore=None, context=None):
context = {}
if ignore is None:
ignore = []
sub_validate = "deep" if validate == "deep" else False

if self._args and not self._kwds:
result = _todict(self._args[0], validate=sub_validate, context=context)
result = _todict(self._args[0], context=context)
elif not self._args:
kwds = self._kwds.copy()
# parsed_shorthand is added by FieldChannelMixin.
Expand Down Expand Up @@ -384,7 +408,6 @@ def to_dict(self, validate=True, ignore=None, context=None):
}
result = _todict(
kwds,
validate=sub_validate,
context=context,
)
else:
Expand All @@ -406,11 +429,9 @@ def to_json(

Parameters
----------
validate : boolean or string
validate : boolean
If True (default), then validate the output dictionary
against the schema. If "deep" then recursively validate
all objects in the spec. This takes much more time, but
it results in friendlier tracebacks for large objects.
against the schema.
ignore : list
A list of keys to ignore. This will *not* passed to child to_dict
function calls.
Expand Down Expand Up @@ -516,7 +537,7 @@ def validate_property(cls, name, value, schema=None):
Validate a property against property schema in the context of the
rootschema
"""
value = _todict(value, validate=False, context={})
value = _todict(value, context={})
props = cls.resolve_references(schema or cls._schema).get("properties", {})
return validate_jsonschema(
value, props.get(name, {}), rootschema=cls._rootschema or cls._schema
Expand Down
12 changes: 1 addition & 11 deletions altair/vegalite/v5/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -556,17 +556,7 @@ def to_dict(self, *args, **kwargs):
context["top_level"] = False
kwargs["context"] = context

try:
dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)
except jsonschema.ValidationError:
dct = None

# If we hit an error, then re-convert with validate='deep' to get
# a more useful traceback. We don't do this by default because it's
# much slower in the case that there are no errors.
if dct is None:
kwargs["validate"] = "deep"
dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)
dct = super(TopLevelMixin, copy).to_dict(*args, **kwargs)

# TODO: following entries are added after validation. Should they be validated?
if is_top_level:
Expand Down
1 change: 1 addition & 0 deletions doc/releases/changes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ Enhancements
- Saving charts with HTML inline is now supported without having altair_saver installed (#2807).
- The documentation page has been revamped, both in terms of appearance and content.
- More informative autocompletion by removing deprecated methods (#2814) and adding support for completion in method chains for editors that rely on type hints (e.g. VS Code) (#2846)
- Improved error messages (#2842)

Grammar Changes
~~~~~~~~~~~~~~~
Expand Down
130 changes: 129 additions & 1 deletion tests/utils/tests/test_schemapi.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,14 @@
import io
import json
import jsonschema
import re
import pickle
import pytest
import warnings

import numpy as np
import pandas as pd
import pytest
from vega_datasets import data

import altair as alt
from altair import load_schema
Expand Down Expand Up @@ -380,6 +383,131 @@ def test_schema_validation_error():
assert the_err.message in message


def chart_example_layer():
points = (
alt.Chart(data.cars.url)
.mark_point()
.encode(
x="Horsepower:Q",
y="Miles_per_Gallon:Q",
)
)
return (points & points).properties(width=400)


def chart_example_hconcat():
source = data.cars()
points = (
alt.Chart(source)
.mark_point()
.encode(
x="Horsepower",
y="Miles_per_Gallon",
)
)

text = (
alt.Chart(source)
.mark_text(align="right")
.encode(alt.Text("Horsepower:N", title=dict(text="Horsepower", align="right")))
)

return points | text


def chart_example_invalid_channel_and_condition():
selection = alt.selection_point()
return (
alt.Chart(data.barley())
.mark_circle()
.add_params(selection)
.encode(
color=alt.condition(selection, alt.value("red"), alt.value("green")),
invalidChannel=None,
)
)


def chart_example_invalid_y_option():
return (
alt.Chart(data.barley())
.mark_bar()
.encode(
x=alt.X("variety", unknown=2),
y=alt.Y("sum(yield)", stack="asdf"),
)
)


def chart_example_invalid_y_option_value():
return (
alt.Chart(data.barley())
.mark_bar()
.encode(
x=alt.X("variety"),
y=alt.Y("sum(yield)", stack="asdf"),
)
)


def chart_example_invalid_y_option_value_with_condition():
return (
alt.Chart(data.barley())
.mark_bar()
.encode(
x="variety",
y=alt.Y("sum(yield)", stack="asdf"),
opacity=alt.condition("datum.yield > 0", alt.value(1), alt.value(0.2)),
)
)


@pytest.mark.parametrize(
"chart_func, expected_error_message",
[
(
chart_example_invalid_y_option,
r"Additional properties are not allowed \('unknown' was unexpected\)",
),
(
chart_example_invalid_y_option_value,
r"'asdf' is not one of \['zero', 'center', 'normalize'\].*"
+ r"'asdf' is not of type 'null'.*'asdf' is not of type 'boolean'",
),
(
chart_example_layer,
r"Additional properties are not allowed \('width' was unexpected\)",
),
(
chart_example_invalid_y_option_value_with_condition,
r"'asdf' is not one of \['zero', 'center', 'normalize'\].*"
+ r"'asdf' is not of type 'null'.*'asdf' is not of type 'boolean'",
),
(
chart_example_hconcat,
r"\{'text': 'Horsepower', 'align': 'right'\} is not of type 'string'.*"
+ r"\{'text': 'Horsepower', 'align': 'right'\} is not of type 'array'",
),
(
chart_example_invalid_channel_and_condition,
r"Additional properties are not allowed \('invalidChannel' was unexpected\)",
),
],
)
def test_chart_validation_errors(chart_func, expected_error_message):
# DOTALL flag makes that a dot (.) also matches new lines
pattern = re.compile(expected_error_message, re.DOTALL)
# For some wrong chart specifications such as an unknown encoding channel,
# Altair already raises a warning before the chart specifications are validated.
# We can ignore these warnings as we are interested in the errors being raised
# during validation which is triggered by to_dict
with warnings.catch_warnings():
warnings.filterwarnings("ignore", category=UserWarning)
chart = chart_func()
with pytest.raises(SchemaValidationError, match=pattern):
chart.to_dict()


def test_serialize_numpy_types():
m = MySchema(
a={"date": np.datetime64("2019-01-01")},
Expand Down
Loading