Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extend the validate() signature #804

Merged
merged 13 commits into from
Dec 12, 2023
Merged
2 changes: 1 addition & 1 deletion AUTHORS.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,5 +25,5 @@ The following persons contributed to the development of the |pyam| package:

| The core maintenance of the |pyam| package is done by
the *Scenario Services & Scientific Software* research theme
at the IIASA Energy, Climate, and Enviroment program.
at the IIASA Energy, Climate, and Environment program.
| Visit https://software.ece.iiasa.ac.at for more information.
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Next Release

- [#804](https://github.com/IAMconsortium/pyam/pull/804) Support filters as direct keyword arguments for `validate()` method
- [#801](https://github.com/IAMconsortium/pyam/pull/801) Support initializing with `meta` dataframe in long format
- [#796](https://github.com/IAMconsortium/pyam/pull/796) Raise explicit error message if no connection to IIASA manager service
- [#794](https://github.com/IAMconsortium/pyam/pull/794) Fix wrong color codes for AR6 Illustrative Pathways
Expand Down
4 changes: 2 additions & 2 deletions docs/data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ Background: The IAMC timeseries scenario data format

Over the past decade, the Integrated Assessment Modeling Consortium (IAMC)
developed a standardised tabular timeseries format to exchange scenario data
related to energy systems modelling, land-use change, demand sectors,
and economic indicators in the context of the Sustainable Development Goals.
related to energy systems modelling, land-use change, demand sectors, and economic
indicators in the context of climate change and the Sustainable Development Goals.
Previous high-level use cases include reports by the *Intergovernmental Panel
on Climate Change* (`IPCC`_) and model comparison exercises
within the *Energy Modeling Forum* (`EMF`_) hosted by Stanford University.
Expand Down
6 changes: 3 additions & 3 deletions docs/tutorials/pyam_first_steps.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -557,7 +557,7 @@
"metadata": {},
"outputs": [],
"source": [
"df_world.validate(criteria={'Primary Energy': {'lo': 540 * 0.9, 'year': 2010}})"
"df_world.validate(variable='Primary Energy', year=2010, lower_bound=540 * 0.9)"
]
},
{
Expand Down Expand Up @@ -595,7 +595,7 @@
"metadata": {},
"outputs": [],
"source": [
"df_world.validate(criteria={'Emissions|CO2': {'lo': 38000, 'year': 2020}}, exclude_on_fail=True)"
"df_world.validate(variable=\"Emissions|CO2\", year=2020, lower_bound=38000, exclude_on_fail=True)"
]
},
{
Expand All @@ -604,7 +604,7 @@
"metadata": {},
"outputs": [],
"source": [
"df_world.validate(criteria={'Emissions|CO2': {'up': 45000}}, exclude_on_fail=True)"
"df_world.validate(variable=\"Emissions|CO2\", upper_bound=45000, exclude_on_fail=True)"
]
},
{
Expand Down
60 changes: 34 additions & 26 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -1047,30 +1047,49 @@ def require_variable(self, *args, **kwargs):
# TODO: deprecated, remove for release >= 2.1
raise DeprecationWarning("Use `df.require_data()` instead.")

def validate(self, criteria={}, exclude_on_fail=False):
"""Validate scenarios using criteria on timeseries values
def validate(
self,
criteria: dict = None,
*,
upper_bound: float = None,
lower_bound: float = None,
exclude_on_fail: bool = False,
**kwargs,
) -> pd.DataFrame:
"""Validate scenarios using bounds on (filtered) timeseries 'data' values.

Returns all scenarios which do not match the criteria and prints a log
message, or returns None if all scenarios match the criteria.
Returns all data rows that do not match the criteria, or returns None if all
scenarios match the criteria.

When called with `exclude_on_fail=True`, scenarios not
satisfying the criteria will be marked as `exclude=True`.
When called with `exclude_on_fail=True`, scenarios not satisfying the criteria
will be marked as `exclude=True`.

Parameters
----------
criteria : dict
dictionary with variable keys and validation mappings
('up' and 'lo' for respective bounds, 'year' for years)
upper_bound, lower_bound : float, optional
Upper and lower bounds for validation criteria of timeseries :attr:`data`.
criteria : dict, optional, deprecated
This option is deprecated; dictionary with variable keys and validation
mappings ('up' and 'lo' for respective bounds, 'year' for years).
exclude_on_fail : bool, optional
If True, set :attr:`exclude` = *True* for all scenarios that do not satisfy
the criteria.
**kwargs
Passed to :meth:`slice` to downselect datapoints for validation.

Returns
-------
:class:`pandas.DataFrame` or None
All data points that do not satisfy the criteria.
"""
return _validate(self, criteria, exclude_on_fail=exclude_on_fail)
return _validate(
self,
criteria=criteria,
upper_bound=upper_bound,
lower_bound=lower_bound,
exclude_on_fail=exclude_on_fail,
**kwargs,
)

def rename(
self, mapping=None, inplace=False, append=False, check_duplicates=True, **kwargs
Expand Down Expand Up @@ -1800,9 +1819,10 @@ def slice(self, *, keep=True, **kwargs):
-----
The following arguments are available for filtering:

- 'meta' columns: filter by string value of that column
- 'model', 'scenario', 'region', 'variable', 'unit':
string or list of strings, where `*` can be used as a wildcard
- 'meta' columns: mapping of column name to allowed values
- 'exclude': values of :attr:`exclude`
- 'index': list of model, scenario 2-tuples or :class:`pandas.MultiIndex`
- 'level': the "depth" of entries in the variable column (number of '|')
(excluding the strings given in the 'variable' argument)
Expand Down Expand Up @@ -2532,21 +2552,9 @@ def _empty_iamframe(index):


def validate(df, criteria={}, exclude_on_fail=False, **kwargs):
"""Validate scenarios using criteria on timeseries values

Returns all scenarios which do not match the criteria and prints a log
message or returns None if all scenarios match the criteria.

When called with `exclude_on_fail=True`, scenarios in `df` not satisfying
the criteria will be marked as :attr:`exclude` = *True*.

Parameters
----------
df : IamDataFrame
args : passed to :meth:`IamDataFrame.validate`
kwargs : used for downselecting IamDataFrame
passed to :meth:`IamDataFrame.filter`
"""
"""This method is deprecated, use `df.validate()` instead."""
# TODO: method is deprecated, remove for release >= 3.0
deprecation_warning("Use `IamDataFrame.validate()` instead.")
fdf = df.filter(**kwargs)
if len(fdf.data) > 0:
vdf = fdf.validate(criteria=criteria, exclude_on_fail=exclude_on_fail)
Expand Down
39 changes: 37 additions & 2 deletions pyam/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,48 @@
import logging
import pandas as pd

from pyam.logging import deprecation_warning
from pyam.utils import META_IDX, make_index, s

logger = logging.getLogger(__name__)


def _validate(df, criteria, exclude_on_fail):
_df = _apply_criteria(df._data, criteria, in_range=False)
def _validate(df, criteria, upper_bound, lower_bound, exclude_on_fail, **kwargs):
# TODO: argument `criteria` is deprecated, remove for release >= 3.0
if criteria is not None:
deprecation_warning(
"Use `upper_bound`, `lower_bound`, and filter-arguments instead.",
"Argument `criteria`",
)
if upper_bound or lower_bound is not None and not kwargs.empty:
raise NotImplementedError(

Check warning on line 19 in pyam/validation.py

View check run for this annotation

Codecov / codecov/patch

pyam/validation.py#L19

Added line #L19 was not covered by tests
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems that Codecov and I came to the same conclusion here :D

"Using `criteria` and other arguments simultaneously is not supported."
)
# translate legacy `criteria` argument to explicit kwargs
if len(criteria) == 1:
key, value = list(criteria.items())[0]
kwargs = dict(variable=key)
upper_bound, lower_bound = value.get("up", None), value.get("lo", None)
kwargs["year"] = value.get("year", None)
criteria = None

if criteria is None:
_df = df._data[df.slice(**kwargs)]
if _df.empty:
logger.warning("No data matches filters, skipping validation.")

failed_validation = []
if upper_bound is not None:
failed_validation.append(_df[_df > upper_bound])
if lower_bound is not None:
failed_validation.append(_df[_df < lower_bound])
if not failed_validation:
return
_df = pd.concat(failed_validation).sort_index()

# legacy implementation for multiple validation within one dictionary
else:
_df = _apply_criteria(df._data, criteria, in_range=False)

if not _df.empty:
msg = "{} of {} data points do not satisfy the criteria"
Expand Down
131 changes: 110 additions & 21 deletions tests/test_feature_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,77 +68,166 @@ def test_require_data(test_df_year, kwargs, exclude_on_fail):
assert list(df.exclude) == [False, False]


def test_validate_pass(test_df):
obs = test_df.validate({"Primary Energy": {"up": 10}}, exclude_on_fail=True)
# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy"),
dict(criteria={"Primary Energy": {}}),
dict(variable="foo", upper_bound=10),
dict(criteria={"foo": {"up": 10}}),
),
)
def test_validate_none(test_df, args):
# validation for non-existing variables or without upper or lower bound passes
obs = test_df.validate(**args, exclude_on_fail=True)
assert obs is None
assert list(test_df.exclude) == [False, False] # none excluded


# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy", upper_bound=10),
dict(criteria={"Primary Energy": {"up": 10}}),
),
)
def test_validate_pass(test_df, args):
obs = test_df.validate(**args, exclude_on_fail=True)
assert obs is None
assert list(test_df.exclude) == [False, False] # none excluded


def test_validate_nonexisting(test_df):
# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy|Coal", upper_bound=2),
dict(criteria={"Primary Energy|Coal": {"up": 2}}),
),
)
def test_validate_nonexisting(test_df, args):
# checking that a scenario with no relevant value does not fail validation
obs = test_df.validate({"Primary Energy|Coal": {"up": 2}}, exclude_on_fail=True)
obs = test_df.validate(**args, exclude_on_fail=True)
# checking that the return-type is correct
pdt.assert_frame_equal(obs, test_df.data[3:4].reset_index(drop=True))
# scenario with failed validation excluded, scenario with no value passes
assert list(test_df.exclude) == [True, False]


def test_validate_up(test_df):
# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy", upper_bound=6.5),
dict(criteria={"Primary Energy": {"up": 6.5}}),
),
)
def test_validate_up(test_df, args):
# checking that the return-type is correct
obs = test_df.validate({"Primary Energy": {"up": 6.5}})
obs = test_df.validate(**args)
pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
assert list(test_df.exclude) == [False, False]

# checking exclude on fail
obs = test_df.validate({"Primary Energy": {"up": 6.5}}, exclude_on_fail=True)
obs = test_df.validate(**args, exclude_on_fail=True)
pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
assert list(test_df.exclude) == [False, True]


def test_validate_lo(test_df):
# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy", upper_bound=8, lower_bound=2),
dict(criteria={"Primary Energy": {"up": 8, "lo": 2}}),
),
)
def test_validate_lo(test_df, args):
# checking that the return-type is correct
obs = test_df.validate({"Primary Energy": {"up": 8, "lo": 2}})
obs = test_df.validate(**args)
pdt.assert_frame_equal(obs, test_df.data[0:1].reset_index(drop=True))
assert list(test_df.exclude) == [False, False]

# checking exclude on fail
obs = test_df.validate({"Primary Energy": {"up": 8, "lo": 2}}, exclude_on_fail=True)
obs = test_df.validate(**args, exclude_on_fail=True)
pdt.assert_frame_equal(obs, test_df.data[0:1].reset_index(drop=True))
assert list(test_df.exclude) == [True, False]


def test_validate_both(test_df):
# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy", upper_bound=6.5, lower_bound=2),
dict(criteria={"Primary Energy": {"up": 6.5, "lo": 2}}),
),
)
def test_validate_both(test_df, args):
# checking that the return-type is correct
obs = test_df.validate({"Primary Energy": {"up": 6.5, "lo": 2}})
obs = test_df.validate(**args)
pdt.assert_frame_equal(obs, test_df.data[0:6:5].reset_index(drop=True))
assert list(test_df.exclude) == [False, False]

# checking exclude on fail
obs = test_df.validate(
{"Primary Energy": {"up": 6.5, "lo": 2}}, exclude_on_fail=True
)
obs = test_df.validate(**args, exclude_on_fail=True)
pdt.assert_frame_equal(obs, test_df.data[0:6:5].reset_index(drop=True))
assert list(test_df.exclude) == [True, True]


def test_validate_year(test_df):
# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy", year=2005, upper_bound=6),
dict(criteria={"Primary Energy": {"up": 6, "year": 2005}}),
),
)
def test_validate_year_2010(test_df, args):
# checking that the year filter works as expected
obs = test_df.validate({"Primary Energy": {"up": 6, "year": 2005}})
obs = test_df.validate(**args)
assert obs is None


# include args for deprecated legacy signature
@pytest.mark.parametrize(
"args",
(
dict(variable="Primary Energy", year=2010, upper_bound=6),
dict(criteria={"Primary Energy": {"up": 6, "year": 2010}}),
),
)
def test_validate_year_201ß(test_df, args):
# checking that the return-type is correct
obs = test_df.validate({"Primary Energy": {"up": 6, "year": 2010}})
obs = test_df.validate(**args)
pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
assert list(test_df.exclude) == [False, False]

# checking exclude on fail
obs = test_df.validate(
{"Primary Energy": {"up": 6, "year": 2010}}, exclude_on_fail=True
)
obs = test_df.validate(**args, exclude_on_fail=True)
pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
assert list(test_df.exclude) == [False, True]


def test_validate_multiple_criteria(test_df):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You could assert here in addition that the deprecation warning is issued.

# test that validating with multiple criteria works as expected (deprecated feature)
criteria = {
"Primary Energy": {"lo": 7, "year": 2010},
"Primary Energy|Coal": {"lo": 3},
}
exp = test_df.data[1:3].reset_index(drop=True)

obs = test_df.validate(criteria=criteria)
pdt.assert_frame_equal(obs, exp)
assert list(test_df.exclude) == [False, False]

# checking exclude on fail
obs = test_df.validate(criteria=criteria, exclude_on_fail=True)
pdt.assert_frame_equal(obs, exp)
assert list(test_df.exclude) == [True, False]


def test_validate_top_level(test_df):
obs = validate(
test_df,
Expand Down
Loading