IAMconsortium · danielhuppmann · Dec 12, 2023 · Dec 9, 2023 · Dec 9, 2023 · Dec 9, 2023
diff --git a/AUTHORS.rst b/AUTHORS.rst
@@ -25,5 +25,5 @@ The following persons contributed to the development of the |pyam| package:
 
 | The core maintenance of the |pyam| package is done by
   the *Scenario Services & Scientific Software* research theme
-  at the IIASA Energy, Climate, and Enviroment program.
+  at the IIASA Energy, Climate, and Environment program.
 | Visit https://software.ece.iiasa.ac.at for more information.
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,5 +1,6 @@
 # Next Release
 
+- [#804](https://github.com/IAMconsortium/pyam/pull/804) Support filters as direct keyword arguments for `validate()` method
 - [#801](https://github.com/IAMconsortium/pyam/pull/801) Support initializing with `meta` dataframe in long format
 - [#796](https://github.com/IAMconsortium/pyam/pull/796) Raise explicit error message if no connection to IIASA manager service
 - [#794](https://github.com/IAMconsortium/pyam/pull/794) Fix wrong color codes for AR6 Illustrative Pathways

diff --git a/docs/data.rst b/docs/data.rst
@@ -14,8 +14,8 @@ Background: The IAMC timeseries scenario data format
 
 Over the past decade, the Integrated Assessment Modeling Consortium (IAMC)
 developed a standardised tabular timeseries format to exchange scenario data
-related to energy systems modelling, land-use change, demand sectors,
-and economic indicators in the context of the Sustainable Development Goals.
+related to energy systems modelling, land-use change, demand sectors, and economic
+indicators in the context of climate change and the Sustainable Development Goals.
 Previous high-level use cases include reports by the *Intergovernmental Panel
 on Climate Change* (`IPCC`_) and model comparison exercises
 within the *Energy Modeling Forum* (`EMF`_) hosted by Stanford University.

diff --git a/docs/tutorials/pyam_first_steps.ipynb b/docs/tutorials/pyam_first_steps.ipynb
@@ -557,7 +557,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_world.validate(criteria={'Primary Energy': {'lo': 540 * 0.9, 'year': 2010}})"
+    "df_world.validate(variable='Primary Energy', year=2010, lower_bound=540 * 0.9)"
    ]
   },
   {
@@ -595,7 +595,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_world.validate(criteria={'Emissions|CO2': {'lo': 38000, 'year': 2020}}, exclude_on_fail=True)"
+    "df_world.validate(variable=\"Emissions|CO2\", year=2020, lower_bound=38000, exclude_on_fail=True)"
    ]
   },
   {
@@ -604,7 +604,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "df_world.validate(criteria={'Emissions|CO2': {'up': 45000}}, exclude_on_fail=True)"
+    "df_world.validate(variable=\"Emissions|CO2\", upper_bound=45000, exclude_on_fail=True)"
    ]
   },
   {

diff --git a/pyam/core.py b/pyam/core.py
@@ -1047,30 +1047,49 @@ def require_variable(self, *args, **kwargs):
         # TODO: deprecated, remove for release >= 2.1
         raise DeprecationWarning("Use `df.require_data()` instead.")
 
-    def validate(self, criteria={}, exclude_on_fail=False):
-        """Validate scenarios using criteria on timeseries values
+    def validate(
+        self,
+        criteria: dict = None,
+        *,
+        upper_bound: float = None,
+        lower_bound: float = None,
+        exclude_on_fail: bool = False,
+        **kwargs,
+    ) -> pd.DataFrame:
+        """Validate scenarios using bounds on (filtered) timeseries 'data' values.
 
-        Returns all scenarios which do not match the criteria and prints a log
-        message, or returns None if all scenarios match the criteria.
+        Returns all data rows that do not match the criteria, or returns None if all
+        scenarios match the criteria.
 
-        When called with `exclude_on_fail=True`, scenarios not
-        satisfying the criteria will be marked as `exclude=True`.
+        When called with `exclude_on_fail=True`, scenarios not satisfying the criteria
+        will be marked as `exclude=True`.
 
         Parameters
         ----------
-        criteria : dict
-           dictionary with variable keys and validation mappings
-            ('up' and 'lo' for respective bounds, 'year' for years)
+        upper_bound, lower_bound : float, optional
+            Upper and lower bounds for validation criteria of timeseries :attr:`data`.
+        criteria : dict, optional, deprecated
+            This option is deprecated; dictionary with variable keys and validation
+            mappings ('up' and 'lo' for respective bounds, 'year' for years).
         exclude_on_fail : bool, optional
             If True, set :attr:`exclude` = *True* for all scenarios that do not satisfy
             the criteria.
+        **kwargs
+            Passed to :meth:`slice` to downselect datapoints for validation.
 
         Returns
         -------
         :class:`pandas.DataFrame` or None
             All data points that do not satisfy the criteria.
         """
-        return _validate(self, criteria, exclude_on_fail=exclude_on_fail)
+        return _validate(
+            self,
+            criteria=criteria,
+            upper_bound=upper_bound,
+            lower_bound=lower_bound,
+            exclude_on_fail=exclude_on_fail,
+            **kwargs,
+        )
 
     def rename(
         self, mapping=None, inplace=False, append=False, check_duplicates=True, **kwargs
@@ -1800,9 +1819,10 @@ def slice(self, *, keep=True, **kwargs):
         -----
         The following arguments are available for filtering:
 
-         - 'meta' columns: filter by string value of that column
          - 'model', 'scenario', 'region', 'variable', 'unit':
            string or list of strings, where `*` can be used as a wildcard
+         - 'meta' columns: mapping of column name to allowed values
+         - 'exclude': values of :attr:`exclude`
          - 'index': list of model, scenario 2-tuples or :class:`pandas.MultiIndex`
          - 'level': the "depth" of entries in the variable column (number of '|')
            (excluding the strings given in the 'variable' argument)
@@ -2532,21 +2552,9 @@ def _empty_iamframe(index):
 
 
 def validate(df, criteria={}, exclude_on_fail=False, **kwargs):
-    """Validate scenarios using criteria on timeseries values
-
-    Returns all scenarios which do not match the criteria and prints a log
-    message or returns None if all scenarios match the criteria.
-
-    When called with `exclude_on_fail=True`, scenarios in `df` not satisfying
-    the criteria will be marked as :attr:`exclude` = *True*.
-
-    Parameters
-    ----------
-    df : IamDataFrame
-    args : passed to :meth:`IamDataFrame.validate`
-    kwargs : used for downselecting IamDataFrame
-        passed to :meth:`IamDataFrame.filter`
-    """
+    """This method is deprecated, use `df.validate()` instead."""
+    # TODO: method is deprecated, remove for release >= 3.0
+    deprecation_warning("Use `IamDataFrame.validate()` instead.")
     fdf = df.filter(**kwargs)
     if len(fdf.data) > 0:
         vdf = fdf.validate(criteria=criteria, exclude_on_fail=exclude_on_fail)

diff --git a/pyam/validation.py b/pyam/validation.py
@@ -2,13 +2,48 @@
 import logging
 import pandas as pd
 
+from pyam.logging import deprecation_warning
 from pyam.utils import META_IDX, make_index, s
 
 logger = logging.getLogger(__name__)
 
 
-def _validate(df, criteria, exclude_on_fail):
-    _df = _apply_criteria(df._data, criteria, in_range=False)
+def _validate(df, criteria, upper_bound, lower_bound, exclude_on_fail, **kwargs):
+    # TODO: argument `criteria` is deprecated, remove for release >= 3.0
+    if criteria is not None:
+        deprecation_warning(
+            "Use `upper_bound`, `lower_bound`, and filter-arguments instead.",
+            "Argument `criteria`",
+        )
+        if upper_bound or lower_bound is not None and not kwargs.empty:
+            raise NotImplementedError(
+                "Using `criteria` and other arguments simultaneously is not supported."
+            )
+        # translate legacy `criteria` argument to explicit kwargs
+        if len(criteria) == 1:
+            key, value = list(criteria.items())[0]
+            kwargs = dict(variable=key)
+            upper_bound, lower_bound = value.get("up", None), value.get("lo", None)
+            kwargs["year"] = value.get("year", None)
+            criteria = None
+
+    if criteria is None:
+        _df = df._data[df.slice(**kwargs)]
+        if _df.empty:
+            logger.warning("No data matches filters, skipping validation.")
+
+        failed_validation = []
+        if upper_bound is not None:
+            failed_validation.append(_df[_df > upper_bound])
+        if lower_bound is not None:
+            failed_validation.append(_df[_df < lower_bound])
+        if not failed_validation:
+            return
+        _df = pd.concat(failed_validation).sort_index()
+
+    # legacy implementation for multiple validations within one dictionary
+    else:
+        _df = _apply_criteria(df._data, criteria, in_range=False)
 
     if not _df.empty:
         msg = "{} of {} data points do not satisfy the criteria"

diff --git a/tests/test_feature_validation.py b/tests/test_feature_validation.py
@@ -68,77 +68,166 @@ def test_require_data(test_df_year, kwargs, exclude_on_fail):
         assert list(df.exclude) == [False, False]
 
 
-def test_validate_pass(test_df):
-    obs = test_df.validate({"Primary Energy": {"up": 10}}, exclude_on_fail=True)
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy"),
+        dict(criteria={"Primary Energy": {}}),
+        dict(variable="foo", upper_bound=10),
+        dict(criteria={"foo": {"up": 10}}),
+    ),
+)
+def test_validate_none(test_df, args):
+    # validation for non-existing variables or without upper or lower bound passes
+    obs = test_df.validate(**args, exclude_on_fail=True)
+    assert obs is None
+    assert list(test_df.exclude) == [False, False]  # none excluded
+
+
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy", upper_bound=10),
+        dict(criteria={"Primary Energy": {"up": 10}}),
+    ),
+)
+def test_validate_pass(test_df, args):
+    obs = test_df.validate(**args, exclude_on_fail=True)
     assert obs is None
     assert list(test_df.exclude) == [False, False]  # none excluded
 
 
-def test_validate_nonexisting(test_df):
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy|Coal", upper_bound=2),
+        dict(criteria={"Primary Energy|Coal": {"up": 2}}),
+    ),
+)
+def test_validate_nonexisting(test_df, args):
     # checking that a scenario with no relevant value does not fail validation
-    obs = test_df.validate({"Primary Energy|Coal": {"up": 2}}, exclude_on_fail=True)
+    obs = test_df.validate(**args, exclude_on_fail=True)
     # checking that the return-type is correct
     pdt.assert_frame_equal(obs, test_df.data[3:4].reset_index(drop=True))
     # scenario with failed validation excluded, scenario with no value passes
     assert list(test_df.exclude) == [True, False]
 
 
-def test_validate_up(test_df):
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy", upper_bound=6.5),
+        dict(criteria={"Primary Energy": {"up": 6.5}}),
+    ),
+)
+def test_validate_up(test_df, args):
     # checking that the return-type is correct
-    obs = test_df.validate({"Primary Energy": {"up": 6.5}})
+    obs = test_df.validate(**args)
     pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
     assert list(test_df.exclude) == [False, False]
 
     # checking exclude on fail
-    obs = test_df.validate({"Primary Energy": {"up": 6.5}}, exclude_on_fail=True)
+    obs = test_df.validate(**args, exclude_on_fail=True)
     pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
     assert list(test_df.exclude) == [False, True]
 
 
-def test_validate_lo(test_df):
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy", upper_bound=8, lower_bound=2),
+        dict(criteria={"Primary Energy": {"up": 8, "lo": 2}}),
+    ),
+)
+def test_validate_lo(test_df, args):
     # checking that the return-type is correct
-    obs = test_df.validate({"Primary Energy": {"up": 8, "lo": 2}})
+    obs = test_df.validate(**args)
     pdt.assert_frame_equal(obs, test_df.data[0:1].reset_index(drop=True))
     assert list(test_df.exclude) == [False, False]
 
     # checking exclude on fail
-    obs = test_df.validate({"Primary Energy": {"up": 8, "lo": 2}}, exclude_on_fail=True)
+    obs = test_df.validate(**args, exclude_on_fail=True)
     pdt.assert_frame_equal(obs, test_df.data[0:1].reset_index(drop=True))
     assert list(test_df.exclude) == [True, False]
 
 
-def test_validate_both(test_df):
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy", upper_bound=6.5, lower_bound=2),
+        dict(criteria={"Primary Energy": {"up": 6.5, "lo": 2}}),
+    ),
+)
+def test_validate_both(test_df, args):
     # checking that the return-type is correct
-    obs = test_df.validate({"Primary Energy": {"up": 6.5, "lo": 2}})
+    obs = test_df.validate(**args)
     pdt.assert_frame_equal(obs, test_df.data[0:6:5].reset_index(drop=True))
     assert list(test_df.exclude) == [False, False]
 
     # checking exclude on fail
-    obs = test_df.validate(
-        {"Primary Energy": {"up": 6.5, "lo": 2}}, exclude_on_fail=True
-    )
+    obs = test_df.validate(**args, exclude_on_fail=True)
     pdt.assert_frame_equal(obs, test_df.data[0:6:5].reset_index(drop=True))
     assert list(test_df.exclude) == [True, True]
 
 
-def test_validate_year(test_df):
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy", year=2005, upper_bound=6),
+        dict(criteria={"Primary Energy": {"up": 6, "year": 2005}}),
+    ),
+)
+def test_validate_year_2010(test_df, args):
     # checking that the year filter works as expected
-    obs = test_df.validate({"Primary Energy": {"up": 6, "year": 2005}})
+    obs = test_df.validate(**args)
     assert obs is None
 
+
+# include args for deprecated legacy signature
+@pytest.mark.parametrize(
+    "args",
+    (
+        dict(variable="Primary Energy", year=2010, upper_bound=6),
+        dict(criteria={"Primary Energy": {"up": 6, "year": 2010}}),
+    ),
+)
+def test_validate_year_201ß(test_df, args):
     # checking that the return-type is correct
-    obs = test_df.validate({"Primary Energy": {"up": 6, "year": 2010}})
+    obs = test_df.validate(**args)
     pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
     assert list(test_df.exclude) == [False, False]
 
     # checking exclude on fail
-    obs = test_df.validate(
-        {"Primary Energy": {"up": 6, "year": 2010}}, exclude_on_fail=True
-    )
+    obs = test_df.validate(**args, exclude_on_fail=True)
     pdt.assert_frame_equal(obs, test_df.data[5:6].reset_index(drop=True))
     assert list(test_df.exclude) == [False, True]
 
 
+def test_validate_multiple_criteria(test_df):
+    # test that validating with multiple criteria works as expected (deprecated feature)
+    criteria = {
+        "Primary Energy": {"lo": 7, "year": 2010},
+        "Primary Energy|Coal": {"lo": 3},
+    }
+    exp = test_df.data[1:3].reset_index(drop=True)
+
+    obs = test_df.validate(criteria=criteria)
+    pdt.assert_frame_equal(obs, exp)
+    assert list(test_df.exclude) == [False, False]
+
+    # checking exclude on fail
+    obs = test_df.validate(criteria=criteria, exclude_on_fail=True)
+    pdt.assert_frame_equal(obs, exp)
+    assert list(test_df.exclude) == [True, False]
+
+
 def test_validate_top_level(test_df):
     obs = validate(
         test_df,