diff --git a/pyam/core.py b/pyam/core.py
index b917fee21..51cb3040a 100755
--- a/pyam/core.py
+++ b/pyam/core.py
@@ -4,6 +4,7 @@
 import logging
 import os
 import sys
 
+import wquantiles
 import numpy as np
 import pandas as pd
@@ -2310,6 +2311,87 @@ def diff(self, mapping, periods=1, append=False):
         # append to `self` or return as `IamDataFrame`
         return self._finalize(_value, append=append)
 
+    def quantiles(
+        self, quantiles, weights=None, level=('model', 'scenario'), append=False
+    ):
+        """Compute the optionally weighted quantiles of data grouped by `level`.
+
+        For example, the following will provide the interquartile range and median value
+        of CO2 emissions across all models and scenarios in a given dataset:
+
+        .. code-block:: python
+
+            df.filter(variable='Emissions|CO2').quantiles([0.25, 0.5, 0.75])
+
+        The computed timeseries use the model name 'unweighted' or 'weighted'
+        (depending on whether `weights` is given) and the scenario name
+        'quantile_<q>' for each quantile value `q`.
+
+        Parameters
+        ----------
+        quantiles : collection of float
+            Quantile values to compute, each between 0 and 1
+        weights : pd.Series, optional
+            Weight of each timeseries, indexed by `level`; the name of the series
+            is ignored. If not given, all timeseries are weighted equally.
+        level : str or collection of str, optional
+            The index columns to compute quantiles over
+        append : bool, optional
+            Whether to append computed timeseries data to this instance.
+
+        Raises
+        ------
+        ValueError
+            If this instance has more than one variable.
+        """
+        if len(self.variable) > 1:
+            raise ValueError(
+                'quantiles() currently supports only 1 variable, and this '
+                f'dataframe has {len(self.variable)}'
+            )
+
+        # accept a single column name and never rely on a mutable default argument
+        level = [level] if isinstance(level, str) else list(level)
+        # can make this a kwarg
+        model = 'unweighted' if weights is None else 'weighted'
+
+        # keep the weights as a column so that they stay aligned with the data rows
+        df = self.timeseries()
+        if weights is None:
+            df['weight'] = 1.0
+        else:
+            # rename so that the joined column does not depend on `weights.name`
+            df = df.join(weights.rename('weight'), how='inner')
+
+        # the index levels not in `level` (e.g., region, variable, unit) define groups
+        group_levels = [name for name in df.index.names if name not in level]
+
+        dfs = []
+        for idx, group in df.groupby(level=group_levels, sort=False):
+            # grouping by a single level can yield a scalar key instead of a tuple
+            idx = idx if isinstance(idx, tuple) else (idx,)
+            kwargs = dict(zip(group_levels, idx))
+            # use only the weights of the rows in this group
+            w = group['weight'].values
+            values = group.drop(columns='weight')
+            for q in quantiles:
+                data = pd.Series(
+                    wquantiles.quantile(values.values.T, w, q),
+                    index=pd.Index(values.columns, name='year'),
+                    name='value',
+                )
+                dfs.append(
+                    IamDataFrame(
+                        data,
+                        model=model,
+                        scenario=f'quantile_{q}',  # can make this a kwarg
+                        **kwargs
+                    )
+                )
+
+        # append to `self` or return as `IamDataFrame`
+        return self._finalize(concat(dfs), append=append)
+
     def _to_file_format(self, iamc_index):
         """Return a dataframe suitable for writing to a file"""
         df = self.timeseries(iamc_index=iamc_index).reset_index()
diff --git a/setup.cfg b/setup.cfg
index 00f8099f9..257668bb0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -41,6 +41,7 @@ install_requires =
     xlrd >= 2.0
     setuptools >= 41
     setuptools_scm
+    wquantiles
     # required explicitly for Python 3.7
     importlib_metadata
 setup_requires =
diff --git a/tests/test_feature_quantiles.py b/tests/test_feature_quantiles.py
new file mode 100644
index 000000000..8c441d348
--- /dev/null
+++ b/tests/test_feature_quantiles.py
@@ -0,0 +1,35 @@
+import pandas as pd
+import pytest
+
+from pyam import IamDataFrame
+from pyam.testing import assert_iamframe_equal
+
+
+def test_quantile_one_variable(test_pd_df):
+    """Tests interquartile range of standard test df
+
+    Because it is only two datapoints, the only 'new' computation
+    is the median
+    """
+    df = IamDataFrame(test_pd_df)
+    quantiles = (0.25, 0.5, 0.75)
+    obs = df.filter(variable='Primary Energy').quantiles(quantiles)
+    exp = IamDataFrame(
+        pd.DataFrame({
+            'scenario': [f'quantile_{q}' for q in quantiles],
+            '2005': [1, (1. + 2) / 2, 2],
+            '2010': [6, (6 + 7) / 2, 7],
+        }),
+        model="unweighted",
+        region="World",
+        variable="Primary Energy",
+        unit="EJ/yr",
+    )
+    assert_iamframe_equal(exp, obs)
+
+
+def test_quantile_multiple_variables(test_pd_df):
+    """Tests that computing quantiles of more than one variable raises"""
+    df = IamDataFrame(test_pd_df)
+    with pytest.raises(ValueError, match='supports only 1 variable'):
+        df.quantiles((0.25, 0.5))