From 045448e1fa83a4362858ed9b11dbeaaf0c9ebf2e Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Thu, 3 Mar 2022 17:08:49 +0100 Subject: [PATCH 1/5] Add initial IamSlice suggestion --- pyam/core.py | 74 +++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index c212170f1..23f7f8613 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -91,6 +91,60 @@ logger = logging.getLogger(__name__) +class IamSlice(pd.Series): + @property + def _constructor(self): + return IamSlice + + _internal_names = pd.Series._internal_names + ["_iamcache"] + _internal_names_set = set(_internal_names) + + def __init__(self, data=None, index=None, **kwargs): + super().__init__(data, index, **kwargs) + self._iamcache = dict() + + def __dir__(self): + return self.dimensions + super().__dir__() + + def __getattr__(self, attr): + ret = object.__getattribute__(self, "_iamcache").get(attr) + if ret is not None: + return ret + + if attr in self.dimensions: + ret = self._iamcache[attr] = ( + self.index[self].unique(level=attr).tolist() + ) + return ret + + return super().__getattr__(attr) + + @property + def dimensions(self): + return self.index.names + + def __repr__(self): + return self.info() + "\n\n" + super().__repr__() + + def info(self, n=80): + """Print a summary of the represented index dimensions + + Parameters + ---------- + n : int + The maximum line length + """ + # concatenate list of index dimensions and levels + info = f'{type(self)}\nIndex dimensions:\n' + c1 = max([len(i) for i in self.dimensions]) + 1 + c2 = n - c1 - 5 + info += '\n'.join( + [f' * {i:{c1}}: {print_list(getattr(self, i), c2)}' + for i in self.dimensions]) + + return info + + class IamDataFrame(object): """Scenario timeseries data and meta indicators @@ -255,7 +309,9 @@ def _finalize(self, data, append, **args): def __getitem__(self, key): _key_check = [key] if isstr(key) else key - if key == "value": + if isinstance(key, IamSlice): + return IamDataFrame(self._data.loc[key]) + elif key == "value": return pd.Series(self._data.values, name="value") elif set(_key_check).issubset(self.meta.columns): return self.meta.__getitem__(key) @@ -1712,6 +1768,15 @@ def _exclude_on_fail(self, df): ) ) + def slice(self, keep=True, **kwargs): + if not isinstance(keep, bool): + raise ValueError(f"Cannot filter by `keep={keep}`, must be a boolean!") + + _keep = self._apply_filters(**kwargs) + _keep = _keep if keep else ~_keep + + return IamSlice(_keep.values, self._data.index) + def filter(self, keep=True, inplace=False, **kwargs): """Return a (copy of a) filtered (downselected) IamDataFrame @@ -1736,12 +1801,9 @@ def filter(self, keep=True, inplace=False, **kwargs): ('month', 'hour', 'time') - 'regexp=True' disables pseudo-regexp syntax in `pattern_match()` """ - if not isinstance(keep, bool): - raise ValueError(f"Cannot filter by `keep={keep}`, must be a boolean!") - # downselect `data` rows and clean up index - _keep = self._apply_filters(**kwargs) - _keep = _keep if keep else ~_keep + _keep = self.slice(keep=keep, **kwargs) + ret = self.copy() if not inplace else self ret._data = ret._data[_keep] ret._data.index = ret._data.index.remove_unused_levels() From 8fe4376b653469647ebdc37d240ac4d35c34fac7 Mon Sep 17 00:00:00 2001 From: Jonas Hoersch Date: Thu, 3 Mar 2022 17:37:49 +0100 Subject: [PATCH 2/5] core: Fix type instability of _apply_filter Depending on what is filtered it returns a boolean mask as a pd.Series or as a numpy array :/ --- pyam/core.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 23f7f8613..7e772483a 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -112,13 +112,11 @@ def __getattr__(self, attr): return ret if attr in self.dimensions: - ret = self._iamcache[attr] = ( - self.index[self].unique(level=attr).tolist() - ) + ret = self._iamcache[attr] = self.index[self].unique(level=attr).tolist() return ret return super().__getattr__(attr) - + @property def dimensions(self): return self.index.names @@ -135,12 +133,15 @@ def info(self, n=80): The maximum line length """ # concatenate list of index dimensions and levels - info = f'{type(self)}\nIndex dimensions:\n' + info = f"{type(self)}\nIndex dimensions:\n" c1 = max([len(i) for i in self.dimensions]) + 1 c2 = n - c1 - 5 - info += '\n'.join( - [f' * {i:{c1}}: {print_list(getattr(self, i), c2)}' - for i in self.dimensions]) + info += "\n".join( + [ + f" * {i:{c1}}: {print_list(getattr(self, i), c2)}" + for i in self.dimensions + ] + ) return info @@ -1774,8 +1775,12 @@ def slice(self, keep=True, **kwargs): _keep = self._apply_filters(**kwargs) _keep = _keep if keep else ~_keep - - return IamSlice(_keep.values, self._data.index) + + return ( + IamSlice(_keep) + if isinstance(_keep, pd.Series) + else IamSlice(_keep, self._data.index) + ) def filter(self, keep=True, inplace=False, **kwargs): """Return a (copy of a) filtered (downselected) IamDataFrame From b67483e2f71450915548937830ffc177ab8fefe3 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 15 Mar 2022 13:17:06 +0100 Subject: [PATCH 3/5] Add tests and set correct return type for `time` (#642) * Add tests and set correct return type for `time` * Implement suggestion by @coroa * Add `name` to pd.Index returned by `time` * Deactivate unfccc-test --- pyam/core.py | 10 ++++++---- tests/conftest.py | 2 +- tests/test_filter.py | 21 ++++++++++++--------- tests/test_time.py | 12 ++++++------ tests/test_unfccc.py | 2 +- 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/pyam/core.py b/pyam/core.py index 7e772483a..b9183086e 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -109,11 +109,11 @@ def __dir__(self): def __getattr__(self, attr): ret = object.__getattribute__(self, "_iamcache").get(attr) if ret is not None: - return ret + return ret.tolist() if attr != "time" else ret if attr in self.dimensions: - ret = self._iamcache[attr] = self.index[self].unique(level=attr).tolist() - return ret + ret = self._iamcache[attr] = self.index[self].unique(level=attr) + return ret.tolist() if attr != "time" else ret return super().__getattr__(attr) @@ -477,7 +477,9 @@ def time(self): - :class:`pandas.Index` if the time domain is 'mixed' """ if self._time is None: - self._time = pd.Index(get_index_levels(self._data, self.time_col)) + self._time = pd.Index( + self._data.index.unique(level=self.time_col).values, name="time" + ) return self._time diff --git a/tests/conftest.py b/tests/conftest.py index 5428c5d36..f596779ee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -38,7 +38,7 @@ DTS_MAPPING = {2005: TEST_DTS[0], 2010: TEST_DTS[1]} -EXP_DATETIME_INDEX = pd.DatetimeIndex(["2005-06-17T00:00:00"]) +EXP_DATETIME_INDEX = pd.DatetimeIndex(["2005-06-17T00:00:00"], name="time") TEST_DF = pd.DataFrame( diff --git a/tests/test_filter.py b/tests/test_filter.py index 26c667e75..54283039e 100644 --- a/tests/test_filter.py +++ b/tests/test_filter.py @@ -11,19 +11,22 @@ from .conftest import EXP_DATETIME_INDEX -def test_filter_error_illegal_column(test_df): +@pytest.mark.parametrize("method", ("filter", "slice")) +def test_filter_error_illegal_column(test_df, method): # filtering by column `foo` is not valid - pytest.raises(ValueError, test_df.filter, foo="test") + pytest.raises(ValueError, getattr(test_df, method), foo="test") -def test_filter_error_keep(test_df): +@pytest.mark.parametrize("method", ("filter", "slice")) +def test_filter_error_keep(test_df, method): # string or non-starred dict was mis-interpreted as `keep` kwarg, see #253 - pytest.raises(ValueError, test_df.filter, model="foo", keep=1) - pytest.raises(ValueError, test_df.filter, dict(model="foo")) + pytest.raises(ValueError, getattr(test_df, method), model="foo", keep=1) + pytest.raises(ValueError, getattr(test_df, method), dict(model="foo")) -def test_filter_year(test_df): - obs = test_df.filter(year=2005) +@pytest.mark.parametrize("method", ("filter", "slice")) +def test_filter_year(test_df, method): + obs = getattr(test_df, method)(year=2005) if test_df.time_col == "year": assert obs.year == [2005] else: @@ -45,14 +48,14 @@ def test_filter_mixed_time_domain(test_df_mixed, arg_year, arg_time): # filtering to datetime-only works as expected obs = test_df_mixed.filter(**arg_time) assert obs.time_domain == "datetime" - pdt.assert_index_equal(obs.time, pd.DatetimeIndex(["2010-07-21"])) + pdt.assert_index_equal(obs.time, pd.DatetimeIndex(["2010-07-21"], name="time")) # filtering to year-only works as expected including changing of time domain obs = test_df_mixed.filter(**arg_year) assert obs.time_col == "year" assert obs.time_domain == "year" assert obs.year == [2005] - pdt.assert_index_equal(obs.time, pd.Int64Index([2005])) + pdt.assert_index_equal(obs.time, pd.Int64Index([2005], name="time")) def test_filter_time_domain_raises(test_df_year): diff --git a/tests/test_time.py b/tests/test_time.py index 1d2f4c896..4aad10ec8 100644 --- a/tests/test_time.py +++ b/tests/test_time.py @@ -38,11 +38,11 @@ def get_subannual_df(date1, date2): @pytest.mark.parametrize( "time, domain, index", [ - (TEST_YEARS, "year", pd.Int64Index([2005, 2010])), - (TEST_DTS, "datetime", pd.DatetimeIndex(TEST_DTS)), - (TEST_TIME_STR, "datetime", pd.DatetimeIndex(TEST_DTS)), - (TEST_TIME_STR_HR, "datetime", pd.DatetimeIndex(TEST_TIME_STR_HR)), - (TEST_TIME_MIXED, "mixed", pd.Index(TEST_TIME_MIXED)), + (TEST_YEARS, "year", pd.Int64Index([2005, 2010], name="time")), + (TEST_DTS, "datetime", pd.DatetimeIndex(TEST_DTS, name="time")), + (TEST_TIME_STR, "datetime", pd.DatetimeIndex(TEST_DTS, name="time")), + (TEST_TIME_STR_HR, "datetime", pd.DatetimeIndex(TEST_TIME_STR_HR, name="time")), + (TEST_TIME_MIXED, "mixed", pd.Index(TEST_TIME_MIXED, name="time")), ], ) def test_time_domain(test_pd_df, time, domain, index): @@ -74,7 +74,7 @@ def test_swap_time_to_year(test_df, inplace): obs = test_df assert_iamframe_equal(obs, exp) - pdt.assert_index_equal(obs.time, pd.Index([2005, 2010])) + pdt.assert_index_equal(obs.time, pd.Index([2005, 2010], name="time")) @pytest.mark.parametrize( diff --git a/tests/test_unfccc.py b/tests/test_unfccc.py index 6f7b82ab2..047b21aab 100644 --- a/tests/test_unfccc.py +++ b/tests/test_unfccc.py @@ -11,7 +11,7 @@ INDEX_ARGS = dict(model="UNFCCC", scenario="Data Inventory") -def test_unfccc_tier1(): +def _test_unfccc_tier1(): # test that UNFCCC API returns expected data and units exp = IamDataFrame( UNFCCC_DF, From 478112fde641966a7ed7cd851a72d82c20135208 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 15 Mar 2022 13:18:04 +0100 Subject: [PATCH 4/5] Add a proper length attribute (#643) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Add a proper length attribute * Use pandas native method Co-authored-by: Jonas Hörsch --- pyam/core.py | 3 +++ tests/test_slice.py | 4 ++++ 2 files changed, 7 insertions(+) create mode 100644 tests/test_slice.py diff --git a/pyam/core.py b/pyam/core.py index b9183086e..e296cdc6a 100755 --- a/pyam/core.py +++ b/pyam/core.py @@ -117,6 +117,9 @@ def __getattr__(self, attr): return super().__getattr__(attr) + def __len__(self): + return self.sum() + @property def dimensions(self): return self.index.names diff --git a/tests/test_slice.py b/tests/test_slice.py new file mode 100644 index 000000000..7adbf018d --- /dev/null +++ b/tests/test_slice.py @@ -0,0 +1,4 @@ +def test_slice_len(test_df_year): + """Check the length of a slice""" + + assert len(test_df_year.slice(scenario="scen_a")) == 4 From 0b7993b943b2a87db51d5827462d2daff7341d7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonas=20H=C3=B6rsch?= Date: Tue, 15 Mar 2022 20:43:32 +0100 Subject: [PATCH 5/5] Reactivate test fixed #647. Co-authored-by: Daniel Huppmann --- tests/test_unfccc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_unfccc.py b/tests/test_unfccc.py index f488ba958..649213b04 100644 --- a/tests/test_unfccc.py +++ b/tests/test_unfccc.py @@ -11,7 +11,7 @@ INDEX_ARGS = dict(model="UNFCCC", scenario="Data Inventory") -def _test_unfccc_tier1(): +def test_unfccc_tier1(): # test that UNFCCC API returns expected data and units exp = IamDataFrame( UNFCCC_DF,