From 5b2f3b66b08a3dfbf9a6965c49ad046a0e6a585d Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Thu, 19 Oct 2023 10:53:21 +0200 Subject: [PATCH] Facilitate region-aggregation with inconsistent weights index (#792) --- RELEASE_NOTES.md | 2 + pyam/aggregation.py | 29 ++++++++++++-- tests/test_feature_aggregate.py | 65 ++++++++++++++++++++++++++----- tests/test_feature_growth_rate.py | 2 +- tests/test_io.py | 2 +- 5 files changed, 85 insertions(+), 15 deletions(-) diff --git a/RELEASE_NOTES.md b/RELEASE_NOTES.md index e7d85bcce..d373650f1 100644 --- a/RELEASE_NOTES.md +++ b/RELEASE_NOTES.md @@ -1,5 +1,7 @@ # Release v2.0.0 +- [#792](https://github.com/IAMconsortium/pyam/pull/792) Support region-aggregation with weights-index >> data-index + ## Highlights - Use **ixmp4** as dependency for better integration with the IIASA Scenario Explorer database infrastructure diff --git a/pyam/aggregation.py b/pyam/aggregation.py index ee781b660..a02c02818 100644 --- a/pyam/aggregation.py +++ b/pyam/aggregation.py @@ -4,7 +4,7 @@ from itertools import compress from pyam.index import replace_index_values -from pyam.logging import adjust_log_level +from pyam.logging import adjust_log_level, format_log_message from pyam.str import find_depth, is_str, reduce_hierarchy from pyam.utils import KNOWN_FUNCS, is_list_like, to_list from pyam._compare import _compare @@ -116,7 +116,10 @@ def _aggregate_region( raise ValueError("Using weights and components in one operation not supported.") # default subregions to all regions other than `region` - subregions = subregions or df._all_other_regions(region, variable) + if weight is None: + subregions = subregions or df._all_other_regions(region, variable) + else: + subregions = subregions or df._all_other_regions(region, [variable, weight]) if not len(subregions): logger.info( @@ -214,10 +217,28 @@ def _agg_weight(data, weight, method, drop_negative_weights): raise ValueError("Only method 'np.sum' allowed for weighted average.") weight = weight.droplevel(["variable", "unit"]) + data_index = data.droplevel(["variable", "unit"]).index + + # check that weights exist for all data rows + missing_weights = data_index.difference(weight.index) + if not missing_weights.empty: + raise ValueError( + format_log_message( + "Missing weights for the following data rows", missing_weights + ) + ) - if not data.droplevel(["variable", "unit"]).index.equals(weight.index): - raise ValueError("Inconsistent index between variable and weight!") + # warn if no data exists for available weights + missing_data = weight.index.difference(data_index) + if not missing_data.empty: + logger.warning( + format_log_message( + "Ignoring weights for the following missing data rows", missing_data + ) + ) + weight[missing_data] = np.nan + # remove (and warn) negative values from weights due to strange behavior if drop_negative_weights is True: if any(weight < 0): logger.warning( diff --git a/tests/test_feature_aggregate.py b/tests/test_feature_aggregate.py index 10c27e61b..54f75ba9a 100644 --- a/tests/test_feature_aggregate.py +++ b/tests/test_feature_aggregate.py @@ -1,5 +1,5 @@ import pytest -import logging +import re import numpy as np import pandas as pd @@ -238,7 +238,7 @@ def test_check_aggregate_region_log(simple_df, caplog): @pytest.mark.parametrize( "variable", ( - ("Primary Energy"), + "Primary Energy", (["Primary Energy", "Primary Energy|Coal", "Primary Energy|Wind"]), ), ) @@ -252,7 +252,7 @@ def test_aggregate_region_append(simple_df, variable): @pytest.mark.parametrize( "variable", ( - ("Primary Energy"), + "Primary Energy", (["Primary Energy", "Primary Energy|Coal", "Primary Energy|Wind"]), ), ) @@ -315,7 +315,13 @@ def test_aggregate_region_with_weights(simple_df, caplog): exp = simple_df.filter(variable=v, region="World") assert_iamframe_equal(simple_df.aggregate_region(v, weight=w), exp) - # test that dropping negative weights works as expected + +def test_aggregate_region_with_negative_weights(simple_df, caplog): + # carbon price shouldn't be summed but be weighted by emissions + v = "Price|Carbon" + w = "Emissions|CO2" + + # dropping negative weights works as expected neg_weights_df = simple_df.copy() neg_weights_df._data[18] = -6 exp = simple_df.filter(variable=v, region="World", year=2010) @@ -329,7 +335,7 @@ def test_aggregate_region_with_weights(simple_df, caplog): idx = caplog.messages.index(msg) assert caplog.records[idx].levelname == "WARNING" - # test that not dropping negative weights works as expected + # *not* dropping negative weights works as expected exp = simple_df.filter(variable=v, region="World") exp._data[0] = -8 assert_iamframe_equal( @@ -337,16 +343,57 @@ def test_aggregate_region_with_weights(simple_df, caplog): ) -def test_aggregate_region_with_weights_raises(simple_df): +@pytest.mark.parametrize( + "filter_arg,log_message", + ( + (dict(year=2010), ""), + (dict(), "model_a scen_a reg_b 2005\n1 "), + ), +) +def test_aggregate_region_with_weights_inconsistent_index( + simple_df, caplog, filter_arg, log_message +): # carbon price shouldn't be summed but be weighted by emissions v = "Price|Carbon" w = "Emissions|CO2" - # inconsistent index of variable and weight raises an error - _df = simple_df.filter(variable=w, region="reg_b", keep=False) - with pytest.raises(ValueError, match="Inconsistent index between variable and wei"): + log_message = "\n0 " + log_message + "model_a scen_a reg_b 2010" + if simple_df.time_domain == "datetime": + time_col = " time" + log_message = log_message.replace(" 2005", "2005-06-17").replace( + " 2010", "2010-07-21" + ) + else: + time_col = "year" + + # missing weight row raises an error + _df = simple_df.filter(variable=w, region="reg_b", keep=False, **filter_arg) + match = r"Missing weights for the following data.*\n.*" + re.escape(log_message) + with pytest.raises(ValueError, match=match): _df.aggregate_region(v, weight=w) + # missing data row prints a warning (data-index is a subset of weight-index) + exp = simple_df.filter(variable=v, region="World") + if not filter_arg: + exp._data[0] = 1.0 + exp._data[1] = 30.0 + _df = simple_df.filter(variable=v, region="reg_b", keep=False, **filter_arg) + assert_iamframe_equal(_df.aggregate_region(v, weight=w), exp) + + msg = ( + "Ignoring weights for the following missing data rows:\n" + f" model scenario region {time_col}" + log_message + ) + + idx = caplog.messages.index(msg) + assert caplog.records[idx].levelname == "WARNING" + + +def test_aggregate_region_with_weights_raises(simple_df): + # carbon price shouldn't be summed but be weighted by emissions + v = "Price|Carbon" + w = "Emissions|CO2" + # using weight and method other than 'sum' raises an error pytest.raises(ValueError, simple_df.aggregate_region, v, method="max", weight="bar") diff --git a/tests/test_feature_growth_rate.py b/tests/test_feature_growth_rate.py index fa9ea3b61..708e98bf1 100644 --- a/tests/test_feature_growth_rate.py +++ b/tests/test_feature_growth_rate.py @@ -66,5 +66,5 @@ def test_growth_rate_timeseries(x2010, rates): def test_growth_rate_timeseries_fails(value): """Check that a timeseries reaching/crossing 0 raises""" - with pytest.raises(ValueError, match="Cannot compute growth rate when*."): + with pytest.raises(ValueError, match="Cannot compute growth rate when"): growth_rate(pd.Series([1.0, value])) diff --git a/tests/test_io.py b/tests/test_io.py index c358e9531..fddf9079c 100644 --- a/tests/test_io.py +++ b/tests/test_io.py @@ -43,7 +43,7 @@ def test_not_a_file(): def test_io_list(): # initializing with a list raises an error - match = r"Initializing from list is not supported,*." + match = "Initializing from list is not supported," with pytest.raises(ValueError, match=match): IamDataFrame([1, 2])