Add an IamSlice class (#657)

Co-authored-by: Jonas Hoersch <jonas.hoersch@climateanalytics.org>
IAMconsortium · May 12, 2022 · c5e8f9c · c5e8f9c
1 parent 764a85e
commit c5e8f9c
Show file tree

Hide file tree

Showing 10 changed files with 240 additions and 49 deletions.
diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md
@@ -1,3 +1,7 @@
+# Next release
+
+- [#657](https://github.com/IAMconsortium/pyam/pull/657) Add an `IamSlice` class
+
 # Release v1.4.0
 
 ## Highlights

diff --git a/doc/source/api.rst b/doc/source/api.rst
@@ -13,6 +13,7 @@ and methods.
    api/general
    api/iamdataframe
    api/database
+   api/slice
    api/filtering
    api/compute
    api/plotting

diff --git a/doc/source/api/slice.rst b/doc/source/api/slice.rst
@@ -0,0 +1,7 @@
+.. currentmodule:: pyam
+
+The **IamSlice** class
+======================
+
+.. autoclass:: IamSlice
+   :members: dimensions, time, info
diff --git a/pyam/__init__.py b/pyam/__init__.py
@@ -10,16 +10,17 @@
     from importlib_metadata import version, PackageNotFoundError
 
 from pyam.core import *
+from pyam.slice import IamSlice  # noqa: F401
 from pyam.utils import *
 from pyam.statistics import *
 from pyam.timeseries import *
 from pyam.read_ixmp import *
 from pyam.logging import *
 from pyam.run_control import *
-from pyam.iiasa import read_iiasa
-from pyam.datareader import read_worldbank
-from pyam.unfccc import read_unfccc
-from pyam.testing import assert_iamframe_equal
+from pyam.iiasa import read_iiasa  # noqa: F401
+from pyam.datareader import read_worldbank  # noqa: F401
+from pyam.unfccc import read_unfccc  # noqa: F401
+from pyam.testing import assert_iamframe_equal  # noqa: F401
 
 from pyam.logging import defer_logging_config
 

diff --git a/pyam/core.py b/pyam/core.py
@@ -23,6 +23,7 @@
 
 from tempfile import TemporaryDirectory
 
+from pyam.slice import IamSlice
 from pyam.filter import filter_by_time_domain, filter_by_year, filter_by_dt_arg
 
 try:
@@ -255,7 +256,9 @@ def _finalize(self, data, append, **args):
 
     def __getitem__(self, key):
         _key_check = [key] if isstr(key) else key
-        if key == "value":
+        if isinstance(key, IamSlice):
+            return IamDataFrame(self._data.loc[key])
+        elif key == "value":
             return pd.Series(self._data.values, name="value")
         elif set(_key_check).issubset(self.meta.columns):
             return self.meta.__getitem__(key)
@@ -291,7 +294,7 @@ def info(self, n=80, meta_rows=5, memory_usage=False):
         c2 = n - c1 - 5
         info += "\n".join(
             [
-                f" * {i:{c1}}: {print_list(get_index_levels(self._data, i), c2)}"
+                f" * {i:{c1}}: {print_list(getattr(self, i), c2)}"
                 for i in self.index.names
             ]
         )
@@ -300,7 +303,7 @@ def info(self, n=80, meta_rows=5, memory_usage=False):
         info += "\nTimeseries data coordinates:\n"
         info += "\n".join(
             [
-                f"   {i:{c1}}: {print_list(get_index_levels(self._data, i), c2)}"
+                f"   {i:{c1}}: {print_list(getattr(self, i), c2)}"
                 for i in self.dimensions
                 if i not in self.index.names
             ]
@@ -414,13 +417,16 @@ def list_or_str(x):
     def time(self):
         """The time index, i.e., axis labels related to the time domain.
 
-        The returned type is
-        - :class:`pandas.Int64Index` if the time_domain is 'year'
-        - :class:`pandas.DatetimeIndex` if the time domain is 'datetime'
-        - :class:`pandas.Index` if the time domain is 'mixed'
+        Returns
+        -------
+        - A :class:`pandas.Int64Index` if the :attr:`time_domain` is 'year'
+        - A :class:`pandas.DatetimeIndex` if the :attr:`time_domain` is 'datetime'
+        - A :class:`pandas.Index` if the :attr:`time_domain` is 'mixed'
         """
         if self._time is None:
-            self._time = pd.Index(get_index_levels(self._data, self.time_col))
+            self._time = pd.Index(
+                self._data.index.unique(level=self.time_col).values, name="time"
+            )
 
         return self._time
 
@@ -1712,38 +1718,67 @@ def _exclude_on_fail(self, df):
             )
         )
 
+    def slice(self, keep=True, **kwargs):
+        """Return a (filtered) slice object of the IamDataFrame timeseries data index
+
+        Parameters
+        ----------
+        keep : bool, optional
+            Keep all scenarios satisfying the filters (if *True*) or the inverse.
+        **kwargs
+            Arguments for filtering, see the "Notes" section below.
+
+        Returns
+        -------
+        :class:`IamSlice`
+
+        Notes
+        -----
+        The following arguments are available for filtering:
+
+         - 'meta' columns: filter by string value of that column
+         - 'model', 'scenario', 'region', 'variable', 'unit':
+           string or list of strings, where `*` can be used as a wildcard
+         - 'level': the "depth" of entries in the variable column (number of '|')
+           (excluding the strings given in the 'variable' argument)
+         - 'year': takes an integer (int/np.int64), a list of integers or
+           a range. Note that the last year of a range is not included,
+           so `range(2010, 2015)` is interpreted as `[2010, ..., 2014]`
+         - 'time_domain': can be "year" or "datetime"
+         - arguments for filtering by `datetime.datetime` or np.datetime64
+           ('month', 'hour', 'time')
+         - 'regexp=True' disables pseudo-regexp syntax in `pattern_match()`
+
+        """
+
+        if not isinstance(keep, bool):  # e.g. a filter dict passed positionally
+            raise ValueError(f"Value of `keep` must be a boolean, found: {keep}")
+
+        _keep = self._apply_filters(**kwargs)  # row mask for `self._data`
+        _keep = _keep if keep else ~_keep  # invert the selection if keep=False
+
+        return (
+            IamSlice(_keep)
+            if isinstance(_keep, pd.Series)
+            else IamSlice(_keep, self._data.index)  # bare mask: attach the data index
+        )
+
     def filter(self, keep=True, inplace=False, **kwargs):
         """Return a (copy of a) filtered (downselected) IamDataFrame
 
         Parameters
         ----------
         keep : bool, optional
-            keep all scenarios satisfying the filters (if True) or the inverse
+            Keep all scenarios satisfying the filters (if *True*) or the inverse.
         inplace : bool, optional
-            if True, do operation inplace and return None
-        filters by kwargs:
-            The following columns are available for filtering:
-             - 'meta' columns: filter by string value of that column
-             - 'model', 'scenario', 'region', 'variable', 'unit':
-               string or list of strings, where `*` can be used as a wildcard
-             - 'level': the maximum "depth" of IAM variables (number of '|')
-               (excluding the strings given in the 'variable' argument)
-             - 'year': takes an integer (int/np.int64), a list of integers or
-               a range. Note that the last year of a range is not included,
-               so `range(2010, 2015)` is interpreted as `[2010, ..., 2014]`
-             - 'time_domain': can be "year" or "datetime"
-             - arguments for filtering by `datetime.datetime` or np.datetime64
-               ('month', 'hour', 'time')
-             - 'regexp=True' disables pseudo-regexp syntax in `pattern_match()`
+            If *True*, do operation inplace and return *None*.
+        **kwargs
+            Passed to :meth:`slice`.
         """
-        if not isinstance(keep, bool):
-            raise ValueError(f"Cannot filter by `keep={keep}`, must be a boolean!")
 
         # downselect `data` rows and clean up index
-        _keep = self._apply_filters(**kwargs)
-        _keep = _keep if keep else ~_keep
         ret = self.copy() if not inplace else self
-        ret._data = ret._data[_keep]
+        ret._data = ret._data[self.slice(keep=keep, **kwargs)]
         ret._data.index = ret._data.index.remove_unused_levels()
 
         # swap time for year if downselected to years-only

diff --git a/pyam/slice.py b/pyam/slice.py
@@ -0,0 +1,88 @@
+import pandas as pd
+from pyam.utils import print_list
+
+
+class IamSlice(pd.Series):
+    """A mask ("slice") on the timeseries data index of an IamDataFrame"""
+
+    @property
+    def _constructor(self):
+        return IamSlice  # pandas hook: results of Series operations keep this type
+
+    _internal_names = pd.Series._internal_names + ["_iamcache"]
+    _internal_names_set = set(_internal_names)  # register the cache attribute
+
+    def __init__(self, data=None, index=None, **kwargs):
+        super().__init__(data, index, **kwargs)
+        self._iamcache = dict()  # per-dimension unique values, filled lazily
+
+    def __dir__(self):
+        return self.dimensions + super().__dir__()  # advertise dimension attributes
+
+    def __getattr__(self, attr):
+        try:
+            return super().__getattr__(attr)  # regular pandas lookup first
+        except AttributeError:
+            cache = object.__getattribute__(self, "_iamcache")  # bypass __getattr__
+            ret = cache.get(attr)
+            if ret is not None:  # cache hit
+                return ret.tolist()
+
+            if attr in self.dimensions:  # unique level values of the kept rows
+                ret = cache[attr] = self.index[self].unique(level=attr)
+                return ret.tolist()
+
+            raise  # not a dimension: re-raise the AttributeError caught above
+
+    def __len__(self):
+        return self.sum()  # sum of the mask values, not the length of the index
+
+    @property
+    def dimensions(self):
+        """Return the list of index names & data coordinates"""
+        return self.index.names
+
+    @property
+    def time(self):
+        """The time index, i.e., axis labels related to the time domain.
+
+        Returns
+        -------
+        - A :class:`pandas.Int64Index` if the time-domain is 'year'
+        - A :class:`pandas.DatetimeIndex` if the time-domain is 'datetime'
+        - A :class:`pandas.Index` if the time-domain is 'mixed'
+        """
+        ret = self._iamcache.get("time")  # computed on first access, then reused
+        if ret is None:
+            ret = self._iamcache["time"] = (
+                self.index[self].unique(level=self.time_col).rename("time")
+            )
+        return ret
+
+    @property
+    def time_col(self):
+        return "year" if "year" in self.dimensions else "time"  # time level name
+
+    def __repr__(self):
+        return self.info()  # repr is the dimension summary, not the mask values
+
+    def info(self, n=80):
+        """Return a summary of the represented index dimensions and data coordinates
+
+        Parameters
+        ----------
+        n : int
+            The maximum line length
+        """
+        # build one line per dimension, listing the values found via `getattr`
+        info = f"{type(self)}\nIndex dimensions and data coordinates:\n"
+        c1 = max([len(i) for i in self.dimensions]) + 1  # width of the name column
+        c2 = n - c1 - 5  # remaining width, handed to `print_list`
+        info += "\n".join(
+            [
+                f"   {i:{c1}}: {print_list(getattr(self, i), c2)}"
+                for i in self.dimensions
+            ]
+        )
+
+        return info
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -38,7 +38,7 @@
 
 DTS_MAPPING = {2005: TEST_DTS[0], 2010: TEST_DTS[1]}
 
-EXP_DATETIME_INDEX = pd.DatetimeIndex(["2005-06-17T00:00:00"])
+EXP_DATETIME_INDEX = pd.DatetimeIndex(["2005-06-17T00:00:00"], name="time")
 
 
 TEST_DF = pd.DataFrame(

diff --git a/tests/test_filter.py b/tests/test_filter.py
@@ -11,19 +11,22 @@
 from .conftest import EXP_DATETIME_INDEX
 
 
-def test_filter_error_illegal_column(test_df):
+@pytest.mark.parametrize("method", ("filter", "slice"))
+def test_filter_error_illegal_column(test_df, method):
     # filtering by column `foo` is not valid
-    pytest.raises(ValueError, test_df.filter, foo="test")
+    pytest.raises(ValueError, getattr(test_df, method), foo="test")
 
 
-def test_filter_error_keep(test_df):
+@pytest.mark.parametrize("method", ("filter", "slice"))
+def test_filter_error_keep(test_df, method):
     # string or non-starred dict was mis-interpreted as `keep` kwarg, see #253
-    pytest.raises(ValueError, test_df.filter, model="foo", keep=1)
-    pytest.raises(ValueError, test_df.filter, dict(model="foo"))
+    pytest.raises(ValueError, getattr(test_df, method), model="foo", keep=1)
+    pytest.raises(ValueError, getattr(test_df, method), dict(model="foo"))
 
 
-def test_filter_year(test_df):
-    obs = test_df.filter(year=2005)
+@pytest.mark.parametrize("method", ("filter", "slice"))
+def test_filter_year(test_df, method):
+    obs = getattr(test_df, method)(year=2005)
     if test_df.time_col == "year":
         assert obs.year == [2005]
     else:
@@ -45,14 +48,14 @@ def test_filter_mixed_time_domain(test_df_mixed, arg_year, arg_time):
     # filtering to datetime-only works as expected
     obs = test_df_mixed.filter(**arg_time)
     assert obs.time_domain == "datetime"
-    pdt.assert_index_equal(obs.time, pd.DatetimeIndex(["2010-07-21"]))
+    pdt.assert_index_equal(obs.time, pd.DatetimeIndex(["2010-07-21"], name="time"))
 
     # filtering to year-only works as expected including changing of time domain
     obs = test_df_mixed.filter(**arg_year)
     assert obs.time_col == "year"
     assert obs.time_domain == "year"
     assert obs.year == [2005]
-    pdt.assert_index_equal(obs.time, pd.Int64Index([2005]))
+    pdt.assert_index_equal(obs.time, pd.Int64Index([2005], name="time"))
 
 
 def test_filter_time_domain_raises(test_df_year):

diff --git a/tests/test_slice.py b/tests/test_slice.py
@@ -0,0 +1,52 @@
+import pandas as pd
+import pytest
+
+
+def test_slice_len(test_df_year):
+    """Check that `len()` of a slice counts the rows selected by the filter"""
+
+    assert len(test_df_year.slice(scenario="scen_a")) == 4
+
+
+def test_slice_index_attributes(test_df):
+    # assert that each index dimension is exposed as a list attribute of an IamSlice
+
+    s = test_df.slice()  # no filter arguments given
+
+    assert s.model == ["model_a"]
+    assert s.scenario == ["scen_a", "scen_b"]
+    assert s.region == ["World"]
+    assert s.variable == ["Primary Energy", "Primary Energy|Coal"]
+    assert s.unit == ["EJ/yr"]
+    if test_df.time_col == "year":
+        assert s.year == [2005, 2010]
+    else:  # time column is not "year", so the attribute lookup must fail
+        match = "'IamSlice' object has no attribute 'year'"
+        with pytest.raises(AttributeError, match=match):
+            s.year
+    assert s.time.equals(pd.Index(test_df.data[test_df.time_col].unique()))
+
+
+def test_filtered_slice_index_attributes(test_df_year):
+    # assert that dimension attributes only list values of the selected rows
+
+    s = test_df_year.slice(scenario="scen_b")
+    assert s.scenario == ["scen_b"]  # "scen_a" is filtered out
+
+
+def test_print(test_df_year):
+    """Assert that `info()` (also used for `repr()`) returns the expected text"""
+    exp = "\n".join(
+        [
+            "<class 'pyam.slice.IamSlice'>",
+            "Index dimensions and data coordinates:",
+            "   model    : model_a (1)",
+            "   scenario : scen_a, scen_b (2)",
+            "   region   : World (1)",
+            "   variable : Primary Energy, Primary Energy|Coal (2)",
+            "   unit     : EJ/yr (1)",
+            "   year     : 2005, 2010 (2)",
+        ]
+    )
+    obs = test_df_year.slice().info()  # unfiltered slice of the year-based fixture
+    assert obs == exp