diff --git a/AUTHORS.rst b/AUTHORS.rst index 9af304068..8c4ede8b4 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -25,5 +25,5 @@ The following persons contributed to the development of the |pyam| package: | The core maintenance of the |pyam| package is done by the *Scenario Services & Scientific Software* research theme - at the IIASA Energy, Climate, and Enviroment program. + at the IIASA Energy, Climate, and Environment program. | Visit https://software.ece.iiasa.ac.at for more information. diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index 7108ca26c..0ba395f07 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,6 @@ # Next Release +- [#804](https://github.com/IAMconsortium/pyam/pull/804) Support filters as direct keyword arguments for `validate()` method - [#801](https://github.com/IAMconsortium/pyam/pull/801) Support initializing with `meta` dataframe in long format - [#796](https://github.com/IAMconsortium/pyam/pull/796) Raise explicit error message if no connection to IIASA manager service - [#794](https://github.com/IAMconsortium/pyam/pull/794) Fix wrong color codes for AR6 Illustrative Pathways diff --git a/docs/data.rst b/docs/data.rst index 3274009bd..6295395fb 100644 --- a/docs/data.rst +++ b/docs/data.rst @@ -14,8 +14,8 @@ Background: The IAMC timeseries scenario data format Over the past decade, the Integrated Assessment Modeling Consortium (IAMC) developed a standardised tabular timeseries format to exchange scenario data -related to energy systems modelling, land-use change, demand sectors, -and economic indicators in the context of the Sustainable Development Goals. +related to energy systems modelling, land-use change, demand sectors, and economic +indicators in the context of climate change and the Sustainable Development Goals. Previous high-level use cases include reports by the *Intergovernmental Panel on Climate Change* (`IPCC`_) and model comparison exercises within the *Energy Modeling Forum* (`EMF`_) hosted by Stanford University.
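Usage sketch for the new `validate()` signature announced in the release note for #804 above, shown next to the deprecated `criteria` form. This is a minimal illustration assuming the signature introduced in this diff; the model, scenario, and numeric values are made up for the example and are not taken from this diff.

    import pandas as pd
    import pyam

    # two scenarios in wide IAMC format (illustrative values)
    df = pyam.IamDataFrame(
        pd.DataFrame(
            [
                ["model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 1.0, 6.0],
                ["model_a", "scen_b", "World", "Primary Energy", "EJ/yr", 2.0, 7.0],
            ],
            columns=["model", "scenario", "region", "variable", "unit", 2005, 2010],
        )
    )

    # new signature (this PR): filter arguments plus explicit bounds;
    # returns the failing row (scen_b, 2010, value 7.0 > 6.5), scen_a passes
    failed = df.validate(variable="Primary Energy", year=2010, upper_bound=6.5)

    # deprecated equivalent using the legacy `criteria` dictionary
    failed_legacy = df.validate(criteria={"Primary Energy": {"up": 6.5, "year": 2010}})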
diff --git a/docs/tutorials/pyam_first_steps.ipynb b/docs/tutorials/pyam_first_steps.ipynb index 9e1c7842e..c1dd45cb8 100644 --- a/docs/tutorials/pyam_first_steps.ipynb +++ b/docs/tutorials/pyam_first_steps.ipynb @@ -557,7 +557,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_world.validate(criteria={'Primary Energy': {'lo': 540 * 0.9, 'year': 2010}})" + "df_world.validate(variable='Primary Energy', year=2010, lower_bound=540 * 0.9)" ] }, { @@ -595,7 +595,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_world.validate(criteria={'Emissions|CO2': {'lo': 38000, 'year': 2020}}, exclude_on_fail=True)" + "df_world.validate(variable=\"Emissions|CO2\", year=2020, lower_bound=38000, exclude_on_fail=True)" ] }, { @@ -604,7 +604,7 @@ "metadata": {}, "outputs": [], "source": [ - "df_world.validate(criteria={'Emissions|CO2': {'up': 45000}}, exclude_on_fail=True)" + "df_world.validate(variable=\"Emissions|CO2\", upper_bound=45000, exclude_on_fail=True)" ] }, { diff --git a/pyam/core.py b/pyam/core.py index c9defb030..8ba0f0bec 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -1047,30 +1047,49 @@ def require_variable(self, *args, **kwargs): # TODO: deprecated, remove for release >= 2.1 raise DeprecationWarning("Use `df.require_data()` instead.") - def validate(self, criteria={}, exclude_on_fail=False): - """Validate scenarios using criteria on timeseries values + def validate( + self, + criteria: dict = None, + *, + upper_bound: float = None, + lower_bound: float = None, + exclude_on_fail: bool = False, + **kwargs, + ) -> pd.DataFrame: + """Validate scenarios using bounds on (filtered) timeseries 'data' values. - Returns all scenarios which do not match the criteria and prints a log - message, or returns None if all scenarios match the criteria. + Returns all data rows that do not match the criteria, or returns None if all + scenarios match the criteria. - When called with `exclude_on_fail=True`, scenarios not - satisfying the criteria will be marked as `exclude=True`. + When called with `exclude_on_fail=True`, scenarios not satisfying the criteria + will be marked as `exclude=True`. Parameters ---------- - criteria : dict - dictionary with variable keys and validation mappings - ('up' and 'lo' for respective bounds, 'year' for years) + upper_bound, lower_bound : float, optional + Upper and lower bounds for validation criteria of timeseries :attr:`data`. + criteria : dict, optional, deprecated + This option is deprecated; dictionary with variable keys and validation + mappings ('up' and 'lo' for respective bounds, 'year' for years). exclude_on_fail : bool, optional If True, set :attr:`exclude` = *True* for all scenarios that do not satisfy the criteria. + **kwargs + Passed to :meth:`slice` to downselect datapoints for validation. Returns ------- :class:`pandas.DataFrame` or None All data points that do not satisfy the criteria. 
""" - return _validate(self, criteria, exclude_on_fail=exclude_on_fail) + return _validate( + self, + criteria=criteria, + upper_bound=upper_bound, + lower_bound=lower_bound, + exclude_on_fail=exclude_on_fail, + **kwargs, + ) def rename( self, mapping=None, inplace=False, append=False, check_duplicates=True, **kwargs @@ -1800,9 +1819,10 @@ def slice(self, *, keep=True, **kwargs): ----- The following arguments are available for filtering: - - 'meta' columns: filter by string value of that column - 'model', 'scenario', 'region', 'variable', 'unit': string or list of strings, where `*` can be used as a wildcard + - 'meta' columns: mapping of column name to allowed values + - 'exclude': values of :attr:`exclude` - 'index': list of model, scenario 2-tuples or :class:`pandas.MultiIndex` - 'level': the "depth" of entries in the variable column (number of '|') (excluding the strings given in the 'variable' argument) @@ -2532,21 +2552,9 @@ def _empty_iamframe(index): def validate(df, criteria={}, exclude_on_fail=False, **kwargs): - """Validate scenarios using criteria on timeseries values - - Returns all scenarios which do not match the criteria and prints a log - message or returns None if all scenarios match the criteria. - - When called with `exclude_on_fail=True`, scenarios in `df` not satisfying - the criteria will be marked as :attr:`exclude` = *True*. - - Parameters - ---------- - df : IamDataFrame - args : passed to :meth:`IamDataFrame.validate` - kwargs : used for downselecting IamDataFrame - passed to :meth:`IamDataFrame.filter` - """ + """This method is deprecated, use `df.validate()` instead.""" + # TODO: method is deprecated, remove for release >= 3.0 + deprecation_warning("Use `IamDataFrame.validate()` instead.") fdf = df.filter(**kwargs) if len(fdf.data) > 0: vdf = fdf.validate(criteria=criteria, exclude_on_fail=exclude_on_fail) diff --git a/pyam/validation.py b/pyam/validation.py index a196469b0..513e7ddbd 100644 --- a/pyam/validation.py +++ b/pyam/validation.py @@ -2,13 +2,48 @@ import logging import pandas as pd +from pyam.logging import deprecation_warning from pyam.utils import META_IDX, make_index, s logger = logging.getLogger(__name__) -def _validate(df, criteria, exclude_on_fail): - _df = _apply_criteria(df._data, criteria, in_range=False) +def _validate(df, criteria, upper_bound, lower_bound, exclude_on_fail, **kwargs): + # TODO: argument `criteria` is deprecated, remove for release >= 3.0 + if criteria is not None: + deprecation_warning( + "Use `upper_bound`, `lower_bound`, and filter-arguments instead.", + "Argument `criteria`", + ) + if upper_bound or lower_bound is not None and not kwargs.empty: + raise NotImplementedError( + "Using `criteria` and other arguments simultaneously is not supported." 
+ ) + # translate legacy `criteria` argument to explicit kwargs + if len(criteria) == 1: + key, value = list(criteria.items())[0] + kwargs = dict(variable=key) + upper_bound, lower_bound = value.get("up", None), value.get("lo", None) + kwargs["year"] = value.get("year", None) + criteria = None + + if criteria is None: + _df = df._data[df.slice(**kwargs)] + if _df.empty: + logger.warning("No data matches filters, skipping validation.") + + failed_validation = [] + if upper_bound is not None: + failed_validation.append(_df[_df > upper_bound]) + if lower_bound is not None: + failed_validation.append(_df[_df < lower_bound]) + if not failed_validation: + return + _df = pd.concat(failed_validation).sort_index() + + # legacy implementation for validation of multiple criteria within one dictionary + else: + _df = _apply_criteria(df._data, criteria, in_range=False) if not _df.empty: msg = "{} of {} data points do not satisfy the criteria" diff --git a/tests/test_feature_validation.py b/tests/test_feature_validation.py index 6aed06c2c..72b39486c 100644 --- a/tests/test_feature_validation.py +++ b/tests/test_feature_validation.py @@ -68,77 +68,166 @@ def test_require_data(test_df_year, kwargs, exclude_on_fail): assert list(df.exclude) == [False, False] -def test_validate_pass(test_df): - obs = test_df.validate({"Primary Energy": {"up": 10}}, exclude_on_fail=True) +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy"), + dict(criteria={"Primary Energy": {}}), + dict(variable="foo", upper_bound=10), + dict(criteria={"foo": {"up": 10}}), + ), +) +def test_validate_none(test_df, args): + # validation for non-existing variables or without upper or lower bound passes + obs = test_df.validate(**args, exclude_on_fail=True) + assert obs is None + assert list(test_df.exclude) == [False, False] # none excluded + + +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy", upper_bound=10), + dict(criteria={"Primary Energy": {"up": 10}}), + ), +) +def test_validate_pass(test_df, args): + obs = test_df.validate(**args, exclude_on_fail=True) assert obs is None assert list(test_df.exclude) == [False, False] # none excluded -def test_validate_nonexisting(test_df): +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy|Coal", upper_bound=2), + dict(criteria={"Primary Energy|Coal": {"up": 2}}), + ), +) +def test_validate_nonexisting(test_df, args): # checking that a scenario with no relevant value does not fail validation - obs = test_df.validate({"Primary Energy|Coal": {"up": 2}}, exclude_on_fail=True) + obs = test_df.validate(**args, exclude_on_fail=True) # checking that the return-type is correct pdt.assert_frame_equal(obs, test_df.data[3:4].reset_index(drop=True)) # scenario with failed validation excluded, scenario with no value passes assert list(test_df.exclude) == [True, False] -def test_validate_up(test_df): +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy", upper_bound=6.5), + dict(criteria={"Primary Energy": {"up": 6.5}}), + ), +) +def test_validate_up(test_df, args): # checking that the return-type is correct - obs = test_df.validate({"Primary Energy": {"up": 6.5}}) + obs = test_df.validate(**args) pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True)) assert list(test_df.exclude) == [False, False] # checking exclude on fail - obs = 
test_df.validate({"Primary Energy": {"up": 6.5}}, exclude_on_fail=True) + obs = test_df.validate(**args, exclude_on_fail=True) pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True)) assert list(test_df.exclude) == [False, True] -def test_validate_lo(test_df): +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy", upper_bound=8, lower_bound=2), + dict(criteria={"Primary Energy": {"up": 8, "lo": 2}}), + ), +) +def test_validate_lo(test_df, args): # checking that the return-type is correct - obs = test_df.validate({"Primary Energy": {"up": 8, "lo": 2}}) + obs = test_df.validate(**args) pdt.assert_frame_equal(obs, test_df.data[0:1].reset_index(drop=True)) assert list(test_df.exclude) == [False, False] # checking exclude on fail - obs = test_df.validate({"Primary Energy": {"up": 8, "lo": 2}}, exclude_on_fail=True) + obs = test_df.validate(**args, exclude_on_fail=True) pdt.assert_frame_equal(obs, test_df.data[0:1].reset_index(drop=True)) assert list(test_df.exclude) == [True, False] -def test_validate_both(test_df): +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy", upper_bound=6.5, lower_bound=2), + dict(criteria={"Primary Energy": {"up": 6.5, "lo": 2}}), + ), +) +def test_validate_both(test_df, args): # checking that the return-type is correct - obs = test_df.validate({"Primary Energy": {"up": 6.5, "lo": 2}}) + obs = test_df.validate(**args) pdt.assert_frame_equal(obs, test_df.data[0:6:5].reset_index(drop=True)) assert list(test_df.exclude) == [False, False] # checking exclude on fail - obs = test_df.validate( - {"Primary Energy": {"up": 6.5, "lo": 2}}, exclude_on_fail=True - ) + obs = test_df.validate(**args, exclude_on_fail=True) pdt.assert_frame_equal(obs, test_df.data[0:6:5].reset_index(drop=True)) assert list(test_df.exclude) == [True, True] -def test_validate_year(test_df): +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy", year=2005, upper_bound=6), + dict(criteria={"Primary Energy": {"up": 6, "year": 2005}}), + ), +) +def test_validate_year_2010(test_df, args): # checking that the year filter works as expected - obs = test_df.validate({"Primary Energy": {"up": 6, "year": 2005}}) + obs = test_df.validate(**args) assert obs is None + +# include args for deprecated legacy signature +@pytest.mark.parametrize( + "args", + ( + dict(variable="Primary Energy", year=2010, upper_bound=6), + dict(criteria={"Primary Energy": {"up": 6, "year": 2010}}), + ), +) +def test_validate_year_201ß(test_df, args): # checking that the return-type is correct - obs = test_df.validate({"Primary Energy": {"up": 6, "year": 2010}}) + obs = test_df.validate(**args) pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True)) assert list(test_df.exclude) == [False, False] # checking exclude on fail - obs = test_df.validate( - {"Primary Energy": {"up": 6, "year": 2010}}, exclude_on_fail=True - ) + obs = test_df.validate(**args, exclude_on_fail=True) pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True)) assert list(test_df.exclude) == [False, True] +def test_validate_multiple_criteria(test_df): + # test that validating with multiple criteria works as expected (deprecated feature) + criteria = { + "Primary Energy": {"lo": 7, "year": 2010}, + "Primary Energy|Coal": {"lo": 3}, + } + exp = test_df.data[1:3].reset_index(drop=True) + + obs = test_df.validate(criteria=criteria) 
+ pdt.assert_frame_equal(obs, exp) + assert list(test_df.exclude) == [False, False] + + # checking exclude on fail + obs = test_df.validate(criteria=criteria, exclude_on_fail=True) + pdt.assert_frame_equal(obs, exp) + assert list(test_df.exclude) == [True, False] + + def test_validate_top_level(test_df): obs = validate( test_df,
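The `test_validate_top_level` test above exercises the module-level `validate()` function, which this diff deprecates in favour of `IamDataFrame.validate()`. Below is a minimal sketch of the two call styles, assuming the deprecated function remains importable from the top-level `pyam` namespace; the model, scenario, and numeric values are illustrative only.

    import pandas as pd
    import pyam

    # one scenario in wide IAMC format (illustrative values)
    df = pyam.IamDataFrame(
        pd.DataFrame(
            [["model_a", "scen_a", "World", "Emissions|CO2", "Mt CO2/yr", 40000, 48000]],
            columns=["model", "scenario", "region", "variable", "unit", 2020, 2030],
        )
    )

    # deprecated top-level function: filter kwargs downselect, then criteria are checked
    failed = pyam.validate(df, criteria={"Emissions|CO2": {"up": 45000}}, year=2030)

    # preferred after this change: the method with explicit filters and bounds,
    # optionally marking failing scenarios with `exclude_on_fail=True`
    failed = df.validate(
        variable="Emissions|CO2", year=2030, upper_bound=45000, exclude_on_fail=True
    )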