Show all missing rows for require_data() (#772)
danielhuppmann authored Aug 22, 2023
1 parent 9c947a8 commit e70a0b7
Showing 3 changed files with 38 additions and 31 deletions.
1 change: 1 addition & 0 deletions RELEASE_NOTES.md
@@ -19,6 +19,7 @@ instead of `pyam.to_list()`.

## Individual updates

- [#772](https://github.com/IAMconsortium/pyam/pull/772) Show all missing rows for `require_data()`
- [#771](https://github.com/IAMconsortium/pyam/pull/771) Refactor to start a separate validation module
- [#764](https://github.com/IAMconsortium/pyam/pull/764) Clean-up exposing internal methods and attributes
- [#763](https://github.com/IAMconsortium/pyam/pull/763) Implement a fix against carrying over unused levels when initializing from an indexed pandas object
56 changes: 27 additions & 29 deletions pyam/core.py
@@ -43,6 +43,7 @@
IAMC_IDX,
SORT_IDX,
ILLEGAL_COLS,
remove_from_list,
)
from pyam.filter import (
datetime_match,
@@ -65,6 +66,7 @@
get_keep_col,
verify_index_integrity,
replace_index_values,
append_index_col,
)
from pyam.time import swap_time_for_year, swap_year_for_time
from pyam.logging import raise_data_error, deprecation_warning
@@ -972,7 +974,7 @@ def _new_meta_column(self, name):
def require_data(
self, region=None, variable=None, unit=None, year=None, exclude_on_fail=False
):
"""Check whether scenarios have values for all combinations of given elements
"""Check whether scenarios have values for all (combinations of) given elements.
Parameters
----------
@@ -991,56 +993,52 @@
Returns
-------
:class:`pandas.DataFrame` or None
A dataframe of the *index* of scenarios not satisfying the criteria.
A dataframe of missing (combinations of) elements for all scenarios.
"""

# TODO: option to require values in certain ranges, see `_apply_criteria()`

# create mapping of required dimensions
required = {}
n = 1 # expected number of rows per scenario
for dim, value in [
("region", region),
("variable", variable),
("unit", unit),
("year", year),
]:
if value is not None:
required[dim] = value
n *= len(to_list(value))
required[dim] = to_list(value)

# fast exit if no arguments values are given
# fast exit if no arguments are given
if not required:
logger.warning("No validation criteria provided.")
return

# downselect to relevant rows
keep = self._apply_filters(**required)
rows = self._data.index[keep]

# identify scenarios that have none of the required values
index_none = self.index.difference(
rows.droplevel(level=self.coordinates).drop_duplicates()
).to_frame(index=False)

# identify scenarios that have some but not all required values
columns = [i for i in self.coordinates if i not in required]
rows = rows.droplevel(level=columns).drop_duplicates()
data = (
pd.DataFrame(index=rows)
.reset_index(level=list(required))
.groupby(self.index.names)
# create index of required elements
index_required = pd.MultiIndex.from_product(
required.values(), names=list(required)
)

index_incomplete = pd.DataFrame(
[idx for idx, df in data if len(df) != n], columns=self.index.names
# create scenario index of suitable length, merge required elements as columns
n = len(self.index)
index = self.index.repeat(len(index_required))
for i, name in enumerate(required.keys()):
index = append_index_col(
index, list(index_required.get_level_values(i)) * n, name=name
)

# identify scenarios that do not have all required elements
rows = (
self._data.index[self._apply_filters(**required)]
.droplevel(level=remove_from_list(self.coordinates, required))
.drop_duplicates()
)
missing_required = index.difference(rows)

# merge all scenarios where not all required data is present
index_missing_required = pd.concat([index_none, index_incomplete])
if not index_missing_required.empty:
if not missing_required.empty:
if exclude_on_fail:
_exclude_on_fail(self, index_missing_required)
return index_missing_required
_exclude_on_fail(self, missing_required.droplevel(list(required)))
return missing_required.to_frame(index=False)

def require_variable(self, variable, unit=None, year=None, exclude_on_fail=False):
"""Check whether all scenarios have a required variable
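The core of this change replaces the old groupby-and-count logic with an index-difference computation: the full index of required (scenario × element) combinations is built with `pd.MultiIndex.from_product`, repeated across the scenario index, and compared against the rows actually present in the data via `Index.difference`. A minimal standalone pandas sketch of that idea follows; the scenario names, variables, and years are made up for illustration and do not come from the pyam test suite.

```python
import pandas as pd

# toy scenario index (model, scenario) and toy long-format data index,
# both invented for illustration
index = pd.MultiIndex.from_tuples(
    [("model_a", "scen_a"), ("model_a", "scen_b")], names=["model", "scenario"]
)
data_rows = pd.MultiIndex.from_tuples(
    [
        ("model_a", "scen_a", "Primary Energy", "EJ/yr", 2005),
        ("model_a", "scen_a", "Primary Energy", "EJ/yr", 2010),
        ("model_a", "scen_b", "Primary Energy", "EJ/yr", 2005),
    ],
    names=["model", "scenario", "variable", "unit", "year"],
)

# required elements, analogous to require_data(variable=..., year=...)
required = {"variable": ["Primary Energy"], "year": [2005, 2010]}

# index of all required combinations of elements
index_required = pd.MultiIndex.from_product(required.values(), names=list(required))

# repeat the scenario index and append the required elements as extra levels
full = pd.MultiIndex.from_tuples(
    [i + j for i in index for j in index_required],
    names=list(index.names) + list(index_required.names),
)

# combinations actually present in the data, reduced to the relevant dimensions
present = data_rows.droplevel(
    [n for n in data_rows.names if n not in full.names]
).drop_duplicates()

# everything that is required but not present in the data
missing = full.difference(present)
print(missing.to_frame(index=False))
# -> one row: model_a / scen_b is missing "Primary Energy" in 2010
```

The sketch returns one row per missing combination, which is exactly the behavioral change described in the commit title: previously only the scenario index of incomplete scenarios was reported, now all missing rows are listed.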
12 changes: 10 additions & 2 deletions tests/test_feature_validation.py
@@ -51,13 +51,21 @@ def test_require_data(test_df_year, kwargs, exclude_on_fail):
df = test_df_year.append(IamDataFrame(DATA_GAS))

obs = df.require_data(**kwargs, exclude_on_fail=exclude_on_fail)

exp = pd.DataFrame([["model_a", "scen_b"]], columns=["model", "scenario"])
# add parametrization-dependent columns to expected output
if kwargs["variable"] == "Primary Energy|Coal":
exp["variable"] = ["Primary Energy|Coal"]
else:
exp["variable"] = ["Primary Energy"]
exp["year"] = [2010]

pdt.assert_frame_equal(obs, exp)

if exclude_on_fail:
list(df.exclude) == [False, True]
assert list(df.exclude) == [False, True]
else:
list(df.exclude) == [False, False]
assert list(df.exclude) == [False, False]


def test_require_variable_pass(test_df):
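For context, a hedged usage sketch of the behavior the updated test pins down. The input data below is invented for illustration; only `require_data()`, its keyword arguments, and the `exclude` attribute are taken from the commit, and the NaN-dropping behavior of `IamDataFrame` is assumed.

```python
import pandas as pd
import pyam

# IAMC wide format, one column per year; the None for scen_b in 2010 is
# assumed to be dropped during initialization, so that data point is missing
df = pyam.IamDataFrame(
    pd.DataFrame(
        [
            ["model_a", "scen_a", "World", "Primary Energy", "EJ/yr", 1.0, 6.0],
            ["model_a", "scen_b", "World", "Primary Energy", "EJ/yr", 2.0, None],
        ],
        columns=["model", "scenario", "region", "variable", "unit", 2005, 2010],
    )
)

# require "Primary Energy" in both 2005 and 2010 for every scenario
missing = df.require_data(
    variable="Primary Energy", year=[2005, 2010], exclude_on_fail=True
)
print(missing)     # one row: model_a, scen_b, Primary Energy, 2010
print(df.exclude)  # scen_b is now flagged as excluded
```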
