Skip to content

Commit 15ac4c2

Browse files
authored
Bootstrap weights (#485)
1 parent 84be118 commit 15ac4c2

File tree

7 files changed

+181
-9
lines changed

7 files changed

+181
-9
lines changed

src/estimagic/bootstrap.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ def bootstrap(
2424
existing_result=None,
2525
outcome_kwargs=None,
2626
n_draws=1_000,
27+
weight_by=None,
2728
cluster_by=None,
2829
seed=None,
2930
n_cores=1,
@@ -41,6 +42,7 @@ def bootstrap(
4142
n_draws (int): Number of bootstrap samples to draw.
4243
If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
4344
is used.
45+
weight_by (str): Column name of variable with weights or None.
4446
cluster_by (str): Column name of variable to cluster by or None.
4547
seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
4648
numpy.random.default_rng is used seeded with seed. If seed is already a
@@ -59,7 +61,7 @@ def bootstrap(
5961
6062
"""
6163
if callable(outcome):
62-
check_inputs(data=data, cluster_by=cluster_by)
64+
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
6365

6466
if outcome_kwargs is not None:
6567
outcome = functools.partial(outcome, **outcome_kwargs)
@@ -82,6 +84,7 @@ def bootstrap(
8284
new_outcomes = get_bootstrap_outcomes(
8385
data=data,
8486
outcome=outcome,
87+
weight_by=weight_by,
8588
cluster_by=cluster_by,
8689
rng=rng,
8790
n_draws=n_draws - n_existing,

src/estimagic/bootstrap_helpers.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,18 @@
22

33

44
def check_inputs(
5-
data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
5+
data=None,
6+
weight_by=None,
7+
cluster_by=None,
8+
ci_method="percentile",
9+
ci_level=0.95,
10+
skipdata=False,
611
):
712
"""Check validity of inputs.
813
914
Args:
1015
data (pd.DataFrame): Dataset.
16+
weight_by (str): Column name of variable with weights.
1117
cluster_by (str): Column name of variable to cluster by.
1218
ci_method (str): Method of choice for computing confidence intervals.
1319
The default is "percentile".
@@ -21,6 +27,10 @@ def check_inputs(
2127
if not skipdata:
2228
if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
2329
raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
30+
elif (weight_by is not None) and (weight_by not in data.columns.tolist()):
31+
raise ValueError(
32+
"Input 'weight_by' must be None or a column name of 'data'."
33+
)
2434
elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
2535
raise ValueError(
2636
"Input 'cluster_by' must be None or a column name of 'data'."

src/estimagic/bootstrap_outcomes.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
def get_bootstrap_outcomes(
77
data,
88
outcome,
9+
weight_by=None,
910
cluster_by=None,
1011
rng=None,
1112
n_draws=1000,
@@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
1920
data (pandas.DataFrame): original dataset.
2021
outcome (callable): function of the dataset calculating statistic of interest.
2122
Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
23+
weight_by (str): column name of the variable with weights.
2224
cluster_by (str): column name of the variable to cluster by.
2325
rng (numpy.random.Generator): A random number generator.
2426
n_draws (int): number of bootstrap draws.
@@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
3436
estimates (list): List of pytrees of estimated bootstrap outcomes.
3537
3638
"""
37-
check_inputs(data=data, cluster_by=cluster_by)
39+
check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
3840
batch_evaluator = process_batch_evaluator(batch_evaluator)
3941

4042
indices = get_bootstrap_indices(
4143
data=data,
4244
rng=rng,
45+
weight_by=weight_by,
4346
cluster_by=cluster_by,
4447
n_draws=n_draws,
4548
)

src/estimagic/bootstrap_samples.py

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,13 @@
22
import pandas as pd
33

44

5-
def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
5+
def get_bootstrap_indices(
6+
data,
7+
rng,
8+
weight_by=None,
9+
cluster_by=None,
10+
n_draws=1000,
11+
):
612
"""Draw positional indices for the construction of bootstrap samples.
713
814
Storing the positional indices instead of the full bootstrap samples saves a lot
@@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
1117
Args:
1218
data (pandas.DataFrame): original dataset.
1319
rng (numpy.random.Generator): A random number generator.
20+
weight_by (str): column name of the variable with weights.
1421
cluster_by (str): column name of the variable to cluster by.
1522
n_draws (int): number of draws, only relevant if seeds is None.
1623
@@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
1926
2027
"""
2128
n_obs = len(data)
29+
probs = _calculate_bootstrap_indices_weights(data, weight_by, cluster_by)
30+
2231
if cluster_by is None:
23-
bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
32+
bootstrap_indices = list(
33+
rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs)
34+
)
2435
else:
2536
clusters = data[cluster_by].unique()
2637
drawn_clusters = rng.choice(
27-
clusters, size=(n_draws, len(clusters)), replace=True
38+
clusters, size=(n_draws, len(clusters)), replace=True, p=probs
2839
)
2940

3041
bootstrap_indices = _convert_cluster_ids_to_indices(
@@ -34,6 +45,33 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
3445
return bootstrap_indices
3546

3647

48+
def _calculate_bootstrap_indices_weights(data, weight_by, cluster_by):
49+
"""Calculate weights for drawing bootstrap indices.
50+
51+
If weights_by is not None and cluster_by is None, the weights are normalized to sum
52+
to one. If weights_by and cluster_by are both not None, the weights are normalized
53+
to sum to one within each cluster.
54+
55+
Args:
56+
data (pandas.DataFrame): original dataset.
57+
weight_by (str): column name of the variable with weights.
58+
cluster_by (str): column name of the variable to cluster by.
59+
60+
Returns:
61+
pandas.Series or None: normalized weights, or None if weight_by is None.
62+
63+
"""
64+
if weight_by is None:
65+
probs = None
66+
else:
67+
if cluster_by is None:
68+
probs = data[weight_by] / data[weight_by].sum()
69+
else:
70+
cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum()
71+
probs = cluster_weights / cluster_weights.sum()
72+
return probs
73+
74+
3775
def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
3876
"""Convert the drawn clusters to positional indices of individual observations.
3977
@@ -48,7 +86,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
4886
return bootstrap_indices
4987

5088

51-
def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
89+
def get_bootstrap_samples(
90+
data,
91+
rng,
92+
weight_by=None,
93+
cluster_by=None,
94+
n_draws=1000,
95+
):
5296
"""Draw bootstrap samples.
5397
5498
If you have memory issues you should use get_bootstrap_indices instead and construct
@@ -57,6 +101,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
57101
Args:
58102
data (pandas.DataFrame): original dataset.
59103
rng (numpy.random.Generator): A random number generator.
104+
weight_by (str): column name of the variable with weights.
60105
cluster_by (str): column name of the variable to cluster by.
61106
n_draws (int): number of draws, only relevant if seeds is None.
62107
@@ -67,6 +112,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
67112
indices = get_bootstrap_indices(
68113
data=data,
69114
rng=rng,
115+
weight_by=weight_by,
70116
cluster_by=cluster_by,
71117
n_draws=n_draws,
72118
)

src/estimagic/msm_weighting.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,8 @@ def get_moments_cov(
2424
moment_kwargs (dict): Additional keyword arguments for calculate_moments.
2525
bootstrap_kwargs (dict): Additional keyword arguments that govern the
2626
bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
27-
"batch_evaluator", "cluster_by" and "error_handling". For details see the
28-
bootstrap function.
27+
"batch_evaluator", "weight_by", "cluster_by" and "error_handling".
28+
For details see the bootstrap function.
2929
3030
Returns:
3131
pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
@@ -39,6 +39,7 @@ def get_moments_cov(
3939
"n_draws",
4040
"seed",
4141
"batch_evaluator",
42+
"weight_by",
4243
"cluster_by",
4344
"error_handling",
4445
"existing_result",

tests/estimagic/test_bootstrap_ci.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,9 @@
66
from pybaum import tree_just_flatten
77

88
from estimagic.bootstrap_ci import calculate_ci, check_inputs
9+
from estimagic.bootstrap_samples import get_bootstrap_indices
910
from optimagic.parameters.tree_registry import get_registry
11+
from optimagic.utilities import get_rng
1012

1113

1214
def aaae(obj1, obj2, decimal=6):
@@ -88,6 +90,29 @@ def test_check_inputs_data():
8890
assert str(error.value) == expected_msg
8991

9092

93+
def test_check_inputs_weight_by(setup):
94+
expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'."
95+
with pytest.raises(ValueError, match=expected_error_msg):
96+
check_inputs(data=setup["df"], weight_by="this is not a column name of df")
97+
98+
99+
def test_get_bootstrap_indices_heterogeneous_weights():
100+
data = pd.DataFrame(
101+
{"id": [0, 1], "w_homogenous": [0.5, 0.5], "w_heterogenous": [0.1, 0.9]}
102+
)
103+
104+
res_homogenous = get_bootstrap_indices(
105+
data, weight_by="w_homogenous", n_draws=1_000, rng=get_rng(seed=0)
106+
)
107+
res_heterogenous = get_bootstrap_indices(
108+
data, weight_by="w_heterogenous", n_draws=1_000, rng=get_rng(seed=0)
109+
)
110+
111+
# Given the weights, the first sample mean should be close to 0.5,
112+
# while the second one should be close to 0.9
113+
assert np.mean(res_homogenous) < 0.75 < np.mean(res_heterogenous)
114+
115+
91116
def test_check_inputs_cluster_by(setup):
92117
cluster_by = "this is not a column name of df"
93118
expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."

tests/estimagic/test_bootstrap_samples.py

Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@
33
import pytest
44
from numpy.testing import assert_array_equal as aae
55
from pandas.testing import assert_frame_equal as afe
6+
from pandas.testing import assert_series_equal as ase
67

78
from estimagic.bootstrap_samples import (
9+
_calculate_bootstrap_indices_weights,
810
_convert_cluster_ids_to_indices,
911
_get_bootstrap_samples_from_indices,
1012
get_bootstrap_indices,
@@ -18,6 +20,7 @@ def data():
1820
df = pd.DataFrame()
1921
df["id"] = np.arange(900)
2022
df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
23+
df["weights"] = np.ones(900)
2124
return df
2225

2326

@@ -33,6 +36,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
3336
assert set(res[0]) != set(res[1])
3437

3538

39+
def test_get_bootstrap_indices_randomization_works_with_weights(data):
40+
rng = get_rng(seed=12345)
41+
res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng)
42+
assert set(res[0]) != set(res[1])
43+
44+
45+
def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
46+
rng = get_rng(seed=12345)
47+
res = get_bootstrap_indices(
48+
data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng
49+
)
50+
assert set(res[0]) != set(res[1])
51+
52+
53+
def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data):
54+
rng1 = get_rng(seed=12345)
55+
rng2 = get_rng(seed=12345)
56+
res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1)
57+
res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2)
58+
assert not np.array_equal(res1, res2)
59+
60+
61+
def test_get_boostrap_indices_randomization_works_with_extreme_case(data):
62+
rng = get_rng(seed=12345)
63+
weights = np.zeros(900)
64+
weights[0] = 1.0
65+
data["weights"] = weights
66+
res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng)
67+
assert len(np.unique(res)) == 1
68+
69+
3670
def test_clustering_leaves_households_intact(data):
3771
rng = get_rng(seed=12345)
3872
indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
@@ -63,3 +97,53 @@ def test_get_bootstrap_samples_from_indices():
6397
def test_get_bootstrap_samples_runs(data):
6498
rng = get_rng(seed=12345)
6599
get_bootstrap_samples(data, n_draws=2, rng=rng)
100+
101+
102+
@pytest.fixture
103+
def sample_data():
104+
return pd.DataFrame({"weight": [1, 2, 3, 4], "cluster": ["A", "A", "B", "B"]})
105+
106+
107+
def test_no_weights_no_clusters(sample_data):
108+
result = _calculate_bootstrap_indices_weights(sample_data, None, None)
109+
assert result is None
110+
111+
112+
def test_weights_no_clusters(sample_data):
113+
result = _calculate_bootstrap_indices_weights(sample_data, "weight", None)
114+
expected = pd.Series([0.1, 0.2, 0.3, 0.4], index=sample_data.index, name="weight")
115+
pd.testing.assert_series_equal(result, expected)
116+
117+
118+
def test_weights_and_clusters(sample_data):
119+
result = _calculate_bootstrap_indices_weights(sample_data, "weight", "cluster")
120+
expected = pd.Series(
121+
[0.3, 0.7], index=pd.Index(["A", "B"], name="cluster"), name="weight"
122+
)
123+
ase(result, expected)
124+
125+
126+
def test_invalid_weight_column():
127+
data = pd.DataFrame({"x": [1, 2, 3]})
128+
with pytest.raises(KeyError):
129+
_calculate_bootstrap_indices_weights(data, "weight", None)
130+
131+
132+
def test_invalid_cluster_column(sample_data):
133+
with pytest.raises(KeyError):
134+
_calculate_bootstrap_indices_weights(sample_data, "weight", "invalid_cluster")
135+
136+
137+
def test_empty_dataframe():
138+
empty_df = pd.DataFrame()
139+
result = _calculate_bootstrap_indices_weights(empty_df, None, None)
140+
assert result is None
141+
142+
143+
def test_some_zero_weights_with_clusters():
144+
data = pd.DataFrame({"weight": [0, 1, 0, 2], "cluster": ["A", "A", "B", "B"]})
145+
result = _calculate_bootstrap_indices_weights(data, "weight", "cluster")
146+
expected = pd.Series(
147+
[1 / 3, 2 / 3], index=pd.Index(["A", "B"], name="cluster"), name="weight"
148+
)
149+
ase(result, expected)

0 commit comments

Comments
 (0)