Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add weighted quantiles #686

Merged
merged 13 commits into from
Dec 15, 2022
65 changes: 65 additions & 0 deletions pyam/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,71 @@ class IamComputeAccessor:
def __init__(self, df):
    """Keep a reference to `df`, the object this accessor operates on."""
    self._df = df

def quantiles(
    self, quantiles, weights=None, level=["model", "scenario"], append=False
):
    """Compute the optionally weighted quantiles of data grouped by `level`.

    For example, the following will provide the interquartile range and median value
    of CO2 emissions across all models and scenarios in a given dataset:

    .. code-block:: python

        df.filter(variable='Emissions|CO2').compute.quantiles([0.25, 0.5, 0.75])

    Parameters
    ----------
    quantiles : collection
        Group of quantile values to compute
    weights : pd.Series, optional
        Series indexed by `level`. It is joined onto the timeseries data with
        an inner join, so rows without a matching weight are dropped. A series
        with a name other than "weight" is renamed before joining. If omitted,
        every row is given a weight of 1.0.
    level : collection, optional
        The index columns to compute quantiles over
    append : bool, optional
        Whether to append computed timeseries data to this instance.

    Returns
    -------
    The result of `_finalize(..., append=append)` on the underlying data object
    (the computed data is either appended to it or returned as `IamDataFrame`).

    Raises
    ------
    ValueError
        If the data holds more than one variable.
    """
    self_df = self._df
    n_variables = len(self_df.variable)
    if n_variables > 1:
        raise ValueError(
            "quantiles() currently supports only 1 variable, and this "
            f"dataframe has {n_variables}"
        )

    # work on a private list: the shared default is never handed out, and any
    # collection of level names (e.g. a tuple) is accepted
    level = list(level)
    # materialize once, because the values are iterated for every index group
    q_values = list(quantiles)

    df = self_df.timeseries()
    model = "unweighted" if weights is None else "weighted"  # can make this a kwarg

    # get weights aligned with model/scenario in data
    if weights is None:
        df["weight"] = 1.0
    else:
        if weights.name != "weight":
            # the joined column is looked up as "weight" below
            weights = weights.rename("weight")
        df = df.join(weights, how="inner")

    # prep data for processing; the weight column stays attached until after
    # the re-indexing so that it remains row-aligned with the data
    df = df.reset_index(level=level).drop(columns=level)
    w = df.pop("weight")

    years = pd.Series(df.columns, name="year")
    dfs = []
    # with the default `level`, indexed over region, variable, and unit
    for idx in df.index.drop_duplicates():
        # list-based lookup always yields 2-D data, even for a single row,
        # and selects only the weights belonging to this group
        values = df.loc[[idx]].values.T
        group_weights = w.loc[[idx]].values
        labels = idx if isinstance(idx, tuple) else (idx,)
        kwargs = dict(zip(df.index.names, labels))
        for q in q_values:
            data = pd.Series(
                wquantiles.quantile(values, group_weights, q),
                index=years,
                name="value",
            )
            dfs.append(
                IamDataFrame(
                    data,
                    model=model,
                    scenario=f"quantile_{q}",  # can make this a kwarg
                    **kwargs,
                )
            )

    # append to `self` or return as `IamDataFrame`
    return self_df._finalize(concat(dfs), append=append)

def growth_rate(self, mapping, append=False):
"""Compute the annualized growth rate of a timeseries along the time dimension

Expand Down
1 change: 1 addition & 0 deletions pyam/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import logging
import os
import sys
import wquantiles
gidden marked this conversation as resolved.
Show resolved Hide resolved

import numpy as np
import pandas as pd
Expand Down
1 change: 1 addition & 0 deletions setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ install_requires =
six
setuptools >= 41
setuptools_scm
wquantiles
# required explicitly for Python 3.7
importlib_metadata
xlsxwriter
Expand Down
35 changes: 35 additions & 0 deletions tests/test_feature_quantiles.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from pyam import IamDataFrame
import pytest
from pyam.testing import assert_iamframe_equal
import pandas as pd


def test_qunatile_one_variable(test_pd_df):
    """Tests interquartile range of standard test df

    Because it is only two datapoints, the only 'new' computation
    is the median
    """
    df = IamDataFrame(test_pd_df)
    quantiles = (0.25, 0.5, 0.75)
    # `quantiles` is defined on the compute accessor, so call it through
    # `.compute` like the multiple-variables test does
    obs = df.filter(variable="Primary Energy").compute.quantiles(quantiles)
    exp = IamDataFrame(
        pd.DataFrame(
            {
                "scenario": [f"quantile_{q}" for q in quantiles],
                "2005": [1, (1.0 + 2) / 2, 2],
                "2010": [6, (6 + 7) / 2, 7],
            }
        ),
        model="unweighted",
        region="World",
        variable="Primary Energy",
        unit="EJ/yr",
    )
    assert_iamframe_equal(exp, obs)


def test_quantile_multiple_variables(test_pd_df):
    """Computing quantiles on data with more than one variable must fail."""
    df = IamDataFrame(test_pd_df)
    # pin the message so an unrelated ValueError cannot satisfy the test
    with pytest.raises(ValueError, match="supports only 1 variable"):
        df.compute.quantiles((0.25, 0.5))