add weighted quantiles

IAMconsortium · Aug 12, 2022 · 3563b43 · 3563b43
1 parent d104630
commit 3563b43
Show file tree

Hide file tree

Showing 3 changed files with 101 additions and 0 deletions.
diff --git a/pyam/core.py b/pyam/core.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import sys
+import wquantiles
 
 import numpy as np
 import pandas as pd
@@ -2310,6 +2311,73 @@ def diff(self, mapping, periods=1, append=False):
         # append to `self` or return as `IamDataFrame`
         return self._finalize(_value, append=append)
 
+    def quantiles(self, quantiles, weights=None, level=['model', 'scenario'], append=False):
+        """Compute the optionally weighted quantiles of data grouped by `level`.
+
+        For example, the following will provide the interquartile range and median value
+        of CO2 emissions across all models and scenarios in a given dataset:
+
+        .. code-block:: python
+
+            df.filter(variable='Emissions|CO2').quantiles([0.25, 0.5, 0.75])
+
+        Parameters
+        ----------
+        quantiles : collection
+            Group of quantile values to compute
+        weights : pd.Series, optional
+            Series indexed by `level`
+        level : collection, optional
+            The index columns to compute quantiles over
+        append : bool, optional
+            Whether to append computed timeseries data to this instance.
+        """
+        if len(self.variable) > 1:
+            raise ValueError(
+                'quantiles() currently supports only 1 variable, and this'
+                f'dataframe has {len(self.variable)}'
+            )
+        df = self.timeseries()
+        model = 'unweighted' if weights is None else 'weighted' # can make this a kwarg
+
+        # get weights aligned with model/scenario in data
+        if weights is None:
+            df['weight'] = 1.0
+        else:
+            df = df.join(weights, how='inner')
+        w = df['weight']
+        df.drop('weight', axis='columns', inplace=True)
+
+        # prep data for processing
+        df = (
+            df
+            .reset_index(level=level)
+            .drop(columns=level)
+        )
+
+        dfs = []
+        # indexed over region, variable, and unit
+        idxs = df.index.drop_duplicates()
+        for idx, q in itertools.product(idxs, quantiles):
+            data = pd.Series(
+                wquantiles.quantile(df.loc[idx].values.T, w.values, q),
+                index=pd.Series(df.columns, name='year'),
+                name='value',
+            )
+            kwargs = {idxs.names[i]: idx[i] for i in range(len(idx))}
+            dfs.append(
+                IamDataFrame(
+                    data,
+                    model=model,
+                    scenario=f'quantile_{q}', # can make this a kwarg
+                    **kwargs
+                )
+            )
+
+        # append to `self` or return as `IamDataFrame`
+        return self._finalize(concat(dfs), append=append)
+
+
     def _to_file_format(self, iamc_index):
         """Return a dataframe suitable for writing to a file"""
         df = self.timeseries(iamc_index=iamc_index).reset_index()

diff --git a/setup.cfg b/setup.cfg
@@ -41,6 +41,7 @@ install_requires =
     xlrd >= 2.0
     setuptools >= 41
     setuptools_scm
+    wquantiles
     # required explicitly for Python 3.7
     importlib_metadata
 setup_requires =

diff --git a/tests/test_feature_quantiles.py b/tests/test_feature_quantiles.py
@@ -0,0 +1,32 @@
+from pyam import IamDataFrame
+import pytest
+from pyam.testing import assert_iamframe_equal
+import pandas as pd
+
+
+def test_qunatile_one_variable(test_pd_df):
+    """Tests interquartile range of standard test df
+
+    Because it is only two datapoints, the only 'new' computation 
+    is the median
+    """
+    df = IamDataFrame(test_pd_df)
+    quantiles = (0.25, 0.5, 0.75)
+    obs = df.filter(variable='Primary Energy').quantiles(quantiles)
+    exp = IamDataFrame(
+        pd.DataFrame({
+            'scenario': [f'quantile_{q}' for q in quantiles],
+            '2005': [1, (1. + 2) / 2, 2],
+            '2010': [6, (6 + 7) / 2, 7],
+            }),
+        model="unweighted",
+        region="World",
+        variable="Primary Energy",
+        unit="EJ/yr",
+    )
+    assert_iamframe_equal(exp, obs)
+
+def test_quantile_multiple_variables(test_pd_df):
+    df = IamDataFrame(test_pd_df)
+    with pytest.raises(ValueError):
+        df.quantiles((0.25, 0.5))