Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bootstrap weights #485

Merged
merged 15 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion src/estimagic/bootstrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def bootstrap(
existing_result=None,
outcome_kwargs=None,
n_draws=1_000,
weight_by=None,
cluster_by=None,
seed=None,
n_cores=1,
Expand All @@ -41,6 +42,7 @@ def bootstrap(
n_draws (int): Number of bootstrap samples to draw.
If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
is used.
weight_by (str): Column name of variable with weights or None.
cluster_by (str): Column name of variable to cluster by or None.
seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
numpy.random.default_rng is used seeded with seed. If seed is already a
Expand All @@ -59,7 +61,7 @@ def bootstrap(

"""
if callable(outcome):
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)

if outcome_kwargs is not None:
outcome = functools.partial(outcome, **outcome_kwargs)
Expand All @@ -82,6 +84,7 @@ def bootstrap(
new_outcomes = get_bootstrap_outcomes(
data=data,
outcome=outcome,
weight_by=weight_by,
cluster_by=cluster_by,
rng=rng,
n_draws=n_draws - n_existing,
Expand Down
12 changes: 11 additions & 1 deletion src/estimagic/bootstrap_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,18 @@


def check_inputs(
data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
data=None,
weight_by=None,
cluster_by=None,
ci_method="percentile",
ci_level=0.95,
skipdata=False,
):
"""Check validity of inputs.

Args:
data (pd.DataFrame): Dataset.
weight_by (str): Column name of variable with weights.
cluster_by (str): Column name of variable to cluster by.
ci_method (str): Method of choice for computing confidence intervals.
The default is "percentile".
Expand All @@ -21,6 +27,10 @@ def check_inputs(
if not skipdata:
if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
elif (weight_by is not None) and (weight_by not in data.columns.tolist()):
raise ValueError(
"Input 'weight_by' must be None or a column name of 'data'."
)
elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
raise ValueError(
"Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
5 changes: 4 additions & 1 deletion src/estimagic/bootstrap_outcomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
def get_bootstrap_outcomes(
data,
outcome,
weight_by=None,
cluster_by=None,
rng=None,
n_draws=1000,
Expand All @@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
data (pandas.DataFrame): original dataset.
outcome (callable): function of the dataset calculating statistic of interest.
Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
rng (numpy.random.Generator): A random number generator.
n_draws (int): number of bootstrap draws.
Expand All @@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
estimates (list): List of pytrees of estimated bootstrap outcomes.

"""
check_inputs(data=data, cluster_by=cluster_by)
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
batch_evaluator = process_batch_evaluator(batch_evaluator)

indices = get_bootstrap_indices(
data=data,
rng=rng,
weight_by=weight_by,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
54 changes: 50 additions & 4 deletions src/estimagic/bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,13 @@
import pandas as pd


def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_indices(
data,
rng,
weight_by=None,
cluster_by=None,
n_draws=1000,
):
"""Draw positional indices for the construction of bootstrap samples.

Storing the positional indices instead of the full bootstrap samples saves a lot
Expand All @@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):

"""
n_obs = len(data)
probs = _calculate_bootstrap_indices_weights(data, weight_by, cluster_by)

if cluster_by is None:
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
bootstrap_indices = list(
rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs)
)
else:
clusters = data[cluster_by].unique()
drawn_clusters = rng.choice(
clusters, size=(n_draws, len(clusters)), replace=True
clusters, size=(n_draws, len(clusters)), replace=True, p=probs
)

bootstrap_indices = _convert_cluster_ids_to_indices(
Expand All @@ -34,6 +45,33 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
return bootstrap_indices


def _calculate_bootstrap_indices_weights(data, weight_by, cluster_by):
"""Calculate weights for drawing bootstrap indices.

If weights_by is not None and cluster_by is None, the weights are normalized to sum
to one. If weights_by and cluster_by are both not None, the weights are normalized
to sum to one within each cluster.

Args:
data (pandas.DataFrame): original dataset.
weight_by (str): column name of the variable with weights.
cluster_by (str): column name of the variable to cluster by.

Returns:
list: None or pd.Series of weights.

"""
if weight_by is None:
probs = None
else:
if cluster_by is None:
probs = data[weight_by] / data[weight_by].sum()
else:
cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum()
probs = cluster_weights / cluster_weights.sum()
return probs


def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
"""Convert the drawn clusters to positional indices of individual observations.

Expand All @@ -48,7 +86,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
return bootstrap_indices


def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
def get_bootstrap_samples(
data,
rng,
weight_by=None,
cluster_by=None,
n_draws=1000,
):
"""Draw bootstrap samples.

If you have memory issues you should use get_bootstrap_indices instead and construct
Expand All @@ -57,6 +101,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
Args:
data (pandas.DataFrame): original dataset.
rng (numpy.random.Generator): A random number generator.
weight_by (str): weights for the observations.
cluster_by (str): column name of the variable to cluster by.
n_draws (int): number of draws, only relevant if seeds is None.

Expand All @@ -67,6 +112,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
indices = get_bootstrap_indices(
data=data,
rng=rng,
weight_by=weight_by,
cluster_by=cluster_by,
n_draws=n_draws,
)
Expand Down
5 changes: 3 additions & 2 deletions src/estimagic/msm_weighting.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ def get_moments_cov(
moment_kwargs (dict): Additional keyword arguments for calculate_moments.
bootstrap_kwargs (dict): Additional keyword arguments that govern the
bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
"batch_evaluator", "cluster_by" and "error_handling". For details see the
bootstrap function.
"batch_evaluator", "weight_by", "cluster_by" and "error_handling".
For details see the bootstrap function.

Returns:
pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
Expand All @@ -39,6 +39,7 @@ def get_moments_cov(
"n_draws",
"seed",
"batch_evaluator",
"weight_by",
"cluster_by",
"error_handling",
"existing_result",
Expand Down
25 changes: 25 additions & 0 deletions tests/estimagic/test_bootstrap_ci.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
from pybaum import tree_just_flatten

from estimagic.bootstrap_ci import calculate_ci, check_inputs
from estimagic.bootstrap_samples import get_bootstrap_indices
from optimagic.parameters.tree_registry import get_registry
from optimagic.utilities import get_rng


def aaae(obj1, obj2, decimal=6):
Expand Down Expand Up @@ -88,6 +90,29 @@ def test_check_inputs_data():
assert str(error.value) == expected_msg


def test_check_inputs_weight_by(setup):
expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'."
with pytest.raises(ValueError, match=expected_error_msg):
check_inputs(data=setup["df"], weight_by="this is not a column name of df")


def test_get_bootstrap_indices_heterogeneous_weights():
data = pd.DataFrame(
{"id": [0, 1], "w_homogenous": [0.5, 0.5], "w_heterogenous": [0.1, 0.9]}
)

res_homogenous = get_bootstrap_indices(
data, weight_by="w_homogenous", n_draws=1_000, rng=get_rng(seed=0)
)
res_heterogenous = get_bootstrap_indices(
data, weight_by="w_heterogenous", n_draws=1_000, rng=get_rng(seed=0)
)

# Given the weights, the first sample mean should be close to 0.5,
# while the second one should be close to 0.9
assert np.mean(res_homogenous) < 0.75 < np.mean(res_heterogenous)


def test_check_inputs_cluster_by(setup):
cluster_by = "this is not a column name of df"
expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."
Expand Down
84 changes: 84 additions & 0 deletions tests/estimagic/test_bootstrap_samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@
import pytest
from numpy.testing import assert_array_equal as aae
from pandas.testing import assert_frame_equal as afe
from pandas.testing import assert_series_equal as ase

from estimagic.bootstrap_samples import (
_calculate_bootstrap_indices_weights,
_convert_cluster_ids_to_indices,
_get_bootstrap_samples_from_indices,
get_bootstrap_indices,
Expand All @@ -18,6 +20,7 @@ def data():
df = pd.DataFrame()
df["id"] = np.arange(900)
df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
df["weights"] = np.ones(900)
return df


Expand All @@ -33,6 +36,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
rng = get_rng(seed=12345)
res = get_bootstrap_indices(
data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng
)
assert set(res[0]) != set(res[1])


def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data):
rng1 = get_rng(seed=12345)
rng2 = get_rng(seed=12345)
res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1)
res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2)
assert not np.array_equal(res1, res2)


def test_get_boostrap_indices_randomization_works_with_extreme_case(data):
rng = get_rng(seed=12345)
weights = np.zeros(900)
weights[0] = 1.0
data["weights"] = weights
res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng)
assert len(np.unique(res)) == 1


def test_clustering_leaves_households_intact(data):
rng = get_rng(seed=12345)
indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
Expand Down Expand Up @@ -63,3 +97,53 @@ def test_get_bootstrap_samples_from_indices():
def test_get_bootstrap_samples_runs(data):
rng = get_rng(seed=12345)
get_bootstrap_samples(data, n_draws=2, rng=rng)


@pytest.fixture
def sample_data():
return pd.DataFrame({"weight": [1, 2, 3, 4], "cluster": ["A", "A", "B", "B"]})


def test_no_weights_no_clusters(sample_data):
result = _calculate_bootstrap_indices_weights(sample_data, None, None)
assert result is None


def test_weights_no_clusters(sample_data):
result = _calculate_bootstrap_indices_weights(sample_data, "weight", None)
expected = pd.Series([0.1, 0.2, 0.3, 0.4], index=sample_data.index, name="weight")
pd.testing.assert_series_equal(result, expected)


def test_weights_and_clusters(sample_data):
result = _calculate_bootstrap_indices_weights(sample_data, "weight", "cluster")
expected = pd.Series(
[0.3, 0.7], index=pd.Index(["A", "B"], name="cluster"), name="weight"
)
ase(result, expected)


def test_invalid_weight_column():
data = pd.DataFrame({"x": [1, 2, 3]})
with pytest.raises(KeyError):
_calculate_bootstrap_indices_weights(data, "weight", None)


def test_invalid_cluster_column(sample_data):
with pytest.raises(KeyError):
_calculate_bootstrap_indices_weights(sample_data, "weight", "invalid_cluster")


def test_empty_dataframe():
empty_df = pd.DataFrame()
result = _calculate_bootstrap_indices_weights(empty_df, None, None)
assert result is None


def test_some_zero_weights_with_clusters():
data = pd.DataFrame({"weight": [0, 1, 0, 2], "cluster": ["A", "A", "B", "B"]})
result = _calculate_bootstrap_indices_weights(data, "weight", "cluster")
expected = pd.Series(
[1 / 3, 2 / 3], index=pd.Index(["A", "B"], name="cluster"), name="weight"
)
ase(result, expected)