Add a require_data() method #715

Merged 8 commits on Dec 12, 2022
3 changes: 2 additions & 1 deletion RELEASE_NOTES.md
@@ -7,7 +7,8 @@ Bump minimum version of **pandas** to v1.2.0 to support automatic engine selection

## Individual updates

- - [#713](https://github.com/IAMconsortium/pyam/pull/713) Informative error when using lists for filter by level. `Level` now a forbidden column.
+ - [#715](https://github.com/IAMconsortium/pyam/pull/715) Add a `require_data()` method
+ - [#713](https://github.com/IAMconsortium/pyam/pull/713) Informative error when using lists for filter by level, `level` now a forbidden column.
- [#709](https://github.com/IAMconsortium/pyam/pull/709) Hotfix ops to support `fillna=0`
- [#708](https://github.com/IAMconsortium/pyam/pull/708) Remove 'xls' as by-default-supported file format

77 changes: 77 additions & 0 deletions pyam/core.py
@@ -435,6 +435,11 @@ def dimensions(self):
        """Return the list of `data` columns (index names & data coordinates)"""
        return list(self._data.index.names)

    @property
    def coordinates(self):
        """Return the list of `data` coordinates (columns not including index names)"""
        return [i for i in self._data.index.names if i not in self.index.names]
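    # Editor's note: an illustration with hypothetical data, not part of the
    # diff. For an IamDataFrame `df` whose `index` names are ["model", "scenario"],
    # the two properties differ only by those index names:
    #
    #   df.dimensions   # ["model", "scenario", "region", "variable", "unit", "year"]
    #   df.coordinates  # ["region", "variable", "unit", "year"]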

    @property
    def time_domain(self):
        """Indicator of the time domain: 'year', 'datetime', or 'mixed'"""
@@ -928,6 +933,78 @@ def _new_meta_column(self, name):
        if name not in self.meta:
            self.meta[name] = np.nan

    def require_data(
        self, region=None, variable=None, unit=None, year=None, exclude_on_fail=False
    ):
        """Check whether scenarios have values for all combinations of given elements

        Parameters
        ----------
        region : str or list of str, optional
            Required region(s).
        variable : str or list of str, optional
            Required variable(s).
        unit : str or list of str, optional
            Required unit(s).
        year : int or list of int, optional
            Required year(s).
        exclude_on_fail : bool, optional
            Set *meta* indicator for scenarios failing validation as `exclude: True`.

        Returns
        -------
        pd.DataFrame
            A dataframe of the *index* of scenarios not satisfying the criteria.
        """

        # TODO: option to require values in certain ranges, see `_apply_criteria()`

        # create mapping of required dimensions
        required = {}
        n = 1  # expected number of rows per scenario
        for dim, value in [
            ("region", region),
            ("variable", variable),
            ("unit", unit),
            ("year", year),
        ]:
            if value is not None:
                required[dim] = value
                n *= len(to_list(value))

        # fast exit if no argument values are given
        if not required:
            return

        # downselect to relevant rows
        keep = self._apply_filters(**required)
        rows = self._data.index[keep]

        # identify scenarios that have none of the required values
        index_none = self.index.difference(
            rows.droplevel(level=self.coordinates).drop_duplicates()
        ).to_frame(index=False)

        # identify scenarios that have some but not all required values
        columns = [i for i in self.coordinates if i not in required]
        rows = rows.droplevel(level=columns).drop_duplicates()
Review comment (Contributor): In keeping with the theme of my comment about returning all identifiers of the missing values, I don't think we should drop the relevant columns.

Reply (Member, Author): This would make the later validation `len(df) != n` impossible, because there may be multiple entries along a not-required dimension.

        data = (
            pd.DataFrame(index=rows)
            .reset_index(level=list(required))
            .groupby(self.index.names)
        )

        index_incomplete = pd.DataFrame(
            [idx for idx, df in data if len(df) != n], columns=self.index.names
Review comment (Contributor): This is purely a clarifying question on my side: why is the `if len(df) != n` needed? As I understand it, `n` is the total number of combinations of required data fields, right? So, for example, two variables for two regions for three years would be 2x2x3=12?

Reply (Member, Author): If the number of required data (`n`, computed as the product of the number of values per dimension) is not equal to the number of existing data `len(df)`, then some of the required data is missing.

Reply (Contributor): Does that assumption not fall flat as soon as you provide multiple values for unit? Maybe we should restrict the unit attribute to a single one then.

Reply (Member, Author): That is why I remove the columns that are not required and then drop duplicates. (And this issue is not restricted to units; you have the same problem when you only require variables and the data has multiple regions.)

(A worked sketch of this counting logic follows after the method below.)

        )

        # merge all scenarios where not all required data is present
        index_missing_required = pd.concat([index_none, index_incomplete])
        if not index_missing_required.empty:
            if exclude_on_fail:
                self._exclude_on_fail(index_missing_required)
            return index_missing_required
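To make the review discussion above concrete, here is an editor's sketch in plain pandas (hypothetical data, not part of the PR) of why the not-required levels are dropped and de-duplicated before comparing the row count against `n`:

    import pandas as pd

    # Hypothetical long-format rows: "scen_a" reports the variable for region
    # "World" in both years, but for region "Asia" only in 2005.
    data = pd.DataFrame(
        [
            ("model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 2005),
            ("model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 2010),
            ("model_a", "scen_a", "Asia", "Primary Energy", "EJ/yr", 2005),
        ],
        columns=["model", "scenario", "region", "variable", "unit", "year"],
    )

    required = {"region": ["World", "Asia"], "year": [2005, 2010]}
    n = 1
    for value in required.values():
        n *= len(value)  # 2 regions x 2 years = 4 expected rows per scenario

    # Drop the not-required levels (variable, unit) and de-duplicate, so that
    # reporting several variables or units does not inflate the count beyond n.
    rows = data.set_index(list(data.columns)).index
    rows = rows.droplevel(["variable", "unit"]).drop_duplicates()

    grouped = (
        pd.DataFrame(index=rows)
        .reset_index(level=list(required))
        .groupby(["model", "scenario"])
    )
    for idx, df in grouped:
        print(idx, "complete" if len(df) == n else "incomplete")
    # ('model_a', 'scen_a') incomplete: only 3 of the 4 required rows exist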

def require_variable(self, variable, unit=None, year=None, exclude_on_fail=False):
"""Check whether all scenarios have a required variable

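For context, a short usage sketch of the new method (editor's illustration with hypothetical data; assumes a pyam version that includes this PR):

    import pandas as pd
    from pyam import IamDataFrame

    # Hypothetical scenario data in wide IAMC format; scen_b has no 2010 value.
    df = IamDataFrame(
        pd.DataFrame(
            [
                ["model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 1, 6],
                ["model_a", "scen_b", "World", "Primary Energy", "EJ/yr", 2, None],
            ],
            columns=["model", "scenario", "region", "variable", "unit", 2005, 2010],
        )
    )

    # Require "Primary Energy" values for both 2005 and 2010; scen_b is missing
    # 2010, so its index is returned (and, with exclude_on_fail=True, the
    # scenario is marked `exclude: True` in `meta`).
    missing = df.require_data(
        variable="Primary Energy", year=[2005, 2010], exclude_on_fail=True
    )
    print(missing)
    #      model scenario
    # 0  model_a   scen_b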
38 changes: 38 additions & 0 deletions tests/test_feature_validation.py
@@ -1,8 +1,46 @@
import pandas as pd
import pandas.testing as pdt
import pytest

from pyam import IamDataFrame, validate, categorize, require_variable, META_IDX


@pytest.mark.parametrize(
    "kwargs",
    (
        dict(),
        dict(variable="Primary Energy"),
        dict(variable=["Primary Energy"], year=[2005, 2010]),
    ),
)
def test_require_data_pass(test_df_year, kwargs):
    # check that IamDataFrame with all required data returns None
    assert test_df_year.require_data(**kwargs) is None


@pytest.mark.parametrize(
    "kwargs",
    (
        dict(variable="Primary Energy|Coal"),
        dict(variable=["Primary Energy"], year=[2005, 2010]),
    ),
)
@pytest.mark.parametrize("exclude_on_fail", (False, True))
def test_require_data(test_df_year, kwargs, exclude_on_fail):
    # check different ways of failing when not all required data is present

    test_df_year._data = test_df_year._data[0:5]  # remove value for scen_b & 2010

    obs = test_df_year.require_data(**kwargs, exclude_on_fail=exclude_on_fail)
    exp = pd.DataFrame([["model_a", "scen_b"]], columns=["model", "scenario"])
    pdt.assert_frame_equal(obs, exp)

    if exclude_on_fail:
        assert list(test_df_year.meta["exclude"]) == [False, True]
    else:
        assert list(test_df_year.meta["exclude"]) == [False, False]


def test_require_variable_pass(test_df):
    # checking that the return-type is correct
    obs = test_df.require_variable(variable="Primary Energy", exclude_on_fail=True)