Bootstrap weights (#485)

alanlujan91 · web-flow · commit 15ac4c286f1f · 2024-11-20T09:53:08.000+01:00
diff --git a/src/estimagic/bootstrap.py b/src/estimagic/bootstrap.py
@@ -24,6 +24,7 @@ def bootstrap(
     existing_result=None,
     outcome_kwargs=None,
     n_draws=1_000,
+    weight_by=None,
     cluster_by=None,
     seed=None,
     n_cores=1,
@@ -41,6 +42,7 @@ def bootstrap(
         n_draws (int): Number of bootstrap samples to draw.
             If len(existing_outcomes) >= n_draws, a random subset of existing_outcomes
             is used.
+        weight_by (str): Column name of variable with weights or None.
         cluster_by (str): Column name of variable to cluster by or None.
         seed (Union[None, int, numpy.random.Generator]): If seed is None or int the
             numpy.random.default_rng is used seeded with seed. If seed is already a
@@ -59,7 +61,7 @@ def bootstrap(
 
     """
     if callable(outcome):
-        check_inputs(data=data, cluster_by=cluster_by)
+        check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
 
         if outcome_kwargs is not None:
             outcome = functools.partial(outcome, **outcome_kwargs)
@@ -82,6 +84,7 @@ def bootstrap(
         new_outcomes = get_bootstrap_outcomes(
             data=data,
             outcome=outcome,
+            weight_by=weight_by,
             cluster_by=cluster_by,
             rng=rng,
             n_draws=n_draws - n_existing,
diff --git a/src/estimagic/bootstrap_helpers.py b/src/estimagic/bootstrap_helpers.py
@@ -2,12 +2,18 @@
 
 
 def check_inputs(
-    data=None, cluster_by=None, ci_method="percentile", ci_level=0.95, skipdata=False
+    data=None,
+    weight_by=None,
+    cluster_by=None,
+    ci_method="percentile",
+    ci_level=0.95,
+    skipdata=False,
 ):
     """Check validity of inputs.
 
     Args:
         data (pd.DataFrame): Dataset.
+        weight_by (str): Column name of variable with weights.
         cluster_by (str): Column name of variable to cluster by.
         ci_method (str): Method of choice for computing confidence intervals.
             The default is "percentile".
@@ -21,6 +27,10 @@ def check_inputs(
     if not skipdata:
         if not isinstance(data, pd.DataFrame) and not isinstance(data, pd.Series):
             raise TypeError("Data must be a pandas.DataFrame or pandas.Series.")
+        elif (weight_by is not None) and (weight_by not in data.columns.tolist()):
+            raise ValueError(
+                "Input 'weight_by' must be None or a column name of 'data'."
+            )
         elif (cluster_by is not None) and (cluster_by not in data.columns.tolist()):
             raise ValueError(
                 "Input 'cluster_by' must be None or a column name of 'data'."
diff --git a/src/estimagic/bootstrap_outcomes.py b/src/estimagic/bootstrap_outcomes.py
@@ -6,6 +6,7 @@
 def get_bootstrap_outcomes(
     data,
     outcome,
+    weight_by=None,
     cluster_by=None,
     rng=None,
     n_draws=1000,
@@ -19,6 +20,7 @@ def get_bootstrap_outcomes(
         data (pandas.DataFrame): original dataset.
         outcome (callable): function of the dataset calculating statistic of interest.
             Returns a general pytree (e.g. pandas Series, dict, numpy array, etc.).
+        weight_by (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         rng (numpy.random.Generator): A random number generator.
         n_draws (int): number of bootstrap draws.
@@ -34,12 +36,13 @@ def get_bootstrap_outcomes(
         estimates (list):  List of pytrees of estimated bootstrap outcomes.
 
     """
-    check_inputs(data=data, cluster_by=cluster_by)
+    check_inputs(data=data, weight_by=weight_by, cluster_by=cluster_by)
     batch_evaluator = process_batch_evaluator(batch_evaluator)
 
     indices = get_bootstrap_indices(
         data=data,
         rng=rng,
+        weight_by=weight_by,
         cluster_by=cluster_by,
         n_draws=n_draws,
     )
diff --git a/src/estimagic/bootstrap_samples.py b/src/estimagic/bootstrap_samples.py
@@ -2,7 +2,13 @@
 import pandas as pd
 
 
-def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
+def get_bootstrap_indices(
+    data,
+    rng,
+    weight_by=None,
+    cluster_by=None,
+    n_draws=1000,
+):
     """Draw positional indices for the construction of bootstrap samples.
 
     Storing the positional indices instead of the full bootstrap samples saves a lot
@@ -11,6 +17,7 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
     Args:
         data (pandas.DataFrame): original dataset.
         rng (numpy.random.Generator): A random number generator.
+        weight_by (str): column name of the variable with weights.
         cluster_by (str): column name of the variable to cluster by.
         n_draws (int): number of draws, only relevant if seeds is None.
 
@@ -19,12 +26,16 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
 
     """
     n_obs = len(data)
+    probs = _calculate_bootstrap_indices_weights(data, weight_by, cluster_by)
+
     if cluster_by is None:
-        bootstrap_indices = list(rng.integers(0, n_obs, size=(n_draws, n_obs)))
+        bootstrap_indices = list(
+            rng.choice(n_obs, size=(n_draws, n_obs), replace=True, p=probs)
+        )
     else:
         clusters = data[cluster_by].unique()
         drawn_clusters = rng.choice(
-            clusters, size=(n_draws, len(clusters)), replace=True
+            clusters, size=(n_draws, len(clusters)), replace=True, p=probs
         )
 
         bootstrap_indices = _convert_cluster_ids_to_indices(
@@ -34,6 +45,33 @@ def get_bootstrap_indices(data, rng, cluster_by=None, n_draws=1000):
     return bootstrap_indices
 
 
+def _calculate_bootstrap_indices_weights(data, weight_by, cluster_by):
+    """Calculate weights for drawing bootstrap indices.
+
+    If weights_by is not None and cluster_by is None, the weights are normalized to sum
+    to one. If weights_by and cluster_by are both not None, the weights are normalized
+    to sum to one within each cluster.
+
+    Args:
+        data (pandas.DataFrame): original dataset.
+        weight_by (str): column name of the variable with weights.
+        cluster_by (str): column name of the variable to cluster by.
+
+    Returns:
+        list: None or pd.Series of weights.
+
+    """
+    if weight_by is None:
+        probs = None
+    else:
+        if cluster_by is None:
+            probs = data[weight_by] / data[weight_by].sum()
+        else:
+            cluster_weights = data.groupby(cluster_by, sort=False)[weight_by].sum()
+            probs = cluster_weights / cluster_weights.sum()
+    return probs
+
+
 def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
     """Convert the drawn clusters to positional indices of individual observations.
 
@@ -48,7 +86,13 @@ def _convert_cluster_ids_to_indices(cluster_col, drawn_clusters):
     return bootstrap_indices
 
 
-def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
+def get_bootstrap_samples(
+    data,
+    rng,
+    weight_by=None,
+    cluster_by=None,
+    n_draws=1000,
+):
     """Draw bootstrap samples.
 
     If you have memory issues you should use get_bootstrap_indices instead and construct
@@ -57,6 +101,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
     Args:
         data (pandas.DataFrame): original dataset.
         rng (numpy.random.Generator): A random number generator.
+        weight_by (str): weights for the observations.
         cluster_by (str): column name of the variable to cluster by.
         n_draws (int): number of draws, only relevant if seeds is None.
 
@@ -67,6 +112,7 @@ def get_bootstrap_samples(data, rng, cluster_by=None, n_draws=1000):
     indices = get_bootstrap_indices(
         data=data,
         rng=rng,
+        weight_by=weight_by,
         cluster_by=cluster_by,
         n_draws=n_draws,
     )
diff --git a/src/estimagic/msm_weighting.py b/src/estimagic/msm_weighting.py
@@ -24,8 +24,8 @@ def get_moments_cov(
         moment_kwargs (dict): Additional keyword arguments for calculate_moments.
         bootstrap_kwargs (dict): Additional keyword arguments that govern the
             bootstrapping. Allowed arguments are "n_draws", "seed", "n_cores",
-            "batch_evaluator", "cluster_by" and "error_handling". For details see the
-            bootstrap function.
+            "batch_evaluator", "weight_by", "cluster_by" and "error_handling".
+            For details see the bootstrap function.
 
     Returns:
         pandas.DataFrame or numpy.ndarray: The covariance matrix of the moment
@@ -39,6 +39,7 @@ def get_moments_cov(
         "n_draws",
         "seed",
         "batch_evaluator",
+        "weight_by",
         "cluster_by",
         "error_handling",
         "existing_result",
diff --git a/tests/estimagic/test_bootstrap_ci.py b/tests/estimagic/test_bootstrap_ci.py
@@ -6,7 +6,9 @@
 from pybaum import tree_just_flatten
 
 from estimagic.bootstrap_ci import calculate_ci, check_inputs
+from estimagic.bootstrap_samples import get_bootstrap_indices
 from optimagic.parameters.tree_registry import get_registry
+from optimagic.utilities import get_rng
 
 
 def aaae(obj1, obj2, decimal=6):
@@ -88,6 +90,29 @@ def test_check_inputs_data():
     assert str(error.value) == expected_msg
 
 
+def test_check_inputs_weight_by(setup):
+    expected_error_msg = "Input 'weight_by' must be None or a column name of 'data'."
+    with pytest.raises(ValueError, match=expected_error_msg):
+        check_inputs(data=setup["df"], weight_by="this is not a column name of df")
+
+
+def test_get_bootstrap_indices_heterogeneous_weights():
+    data = pd.DataFrame(
+        {"id": [0, 1], "w_homogenous": [0.5, 0.5], "w_heterogenous": [0.1, 0.9]}
+    )
+
+    res_homogenous = get_bootstrap_indices(
+        data, weight_by="w_homogenous", n_draws=1_000, rng=get_rng(seed=0)
+    )
+    res_heterogenous = get_bootstrap_indices(
+        data, weight_by="w_heterogenous", n_draws=1_000, rng=get_rng(seed=0)
+    )
+
+    # Given the weights, the first sample mean should be close to 0.5,
+    # while the second one should be close to 0.9
+    assert np.mean(res_homogenous) < 0.75 < np.mean(res_heterogenous)
+
+
 def test_check_inputs_cluster_by(setup):
     cluster_by = "this is not a column name of df"
     expected_msg = "Input 'cluster_by' must be None or a column name of 'data'."
diff --git a/tests/estimagic/test_bootstrap_samples.py b/tests/estimagic/test_bootstrap_samples.py
@@ -3,8 +3,10 @@
 import pytest
 from numpy.testing import assert_array_equal as aae
 from pandas.testing import assert_frame_equal as afe
+from pandas.testing import assert_series_equal as ase
 
 from estimagic.bootstrap_samples import (
+    _calculate_bootstrap_indices_weights,
     _convert_cluster_ids_to_indices,
     _get_bootstrap_samples_from_indices,
     get_bootstrap_indices,
@@ -18,6 +20,7 @@ def data():
     df = pd.DataFrame()
     df["id"] = np.arange(900)
     df["hh"] = [3, 1, 2, 0, 0, 2, 5, 4, 5] * 100
+    df["weights"] = np.ones(900)
     return df
 
 
@@ -33,6 +36,37 @@ def test_get_bootstrap_indices_radomization_works_with_clustering(data):
     assert set(res[0]) != set(res[1])
 
 
+def test_get_bootstrap_indices_randomization_works_with_weights(data):
+    rng = get_rng(seed=12345)
+    res = get_bootstrap_indices(data, weight_by="weights", n_draws=2, rng=rng)
+    assert set(res[0]) != set(res[1])
+
+
+def test_get_bootstrap_indices_randomization_works_with_weights_and_clustering(data):
+    rng = get_rng(seed=12345)
+    res = get_bootstrap_indices(
+        data, weight_by="weights", cluster_by="hh", n_draws=2, rng=rng
+    )
+    assert set(res[0]) != set(res[1])
+
+
+def test_get_bootstrap_indices_randomization_works_with_and_without_weights(data):
+    rng1 = get_rng(seed=12345)
+    rng2 = get_rng(seed=12345)
+    res1 = get_bootstrap_indices(data, n_draws=1, rng=rng1)
+    res2 = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng2)
+    assert not np.array_equal(res1, res2)
+
+
+def test_get_boostrap_indices_randomization_works_with_extreme_case(data):
+    rng = get_rng(seed=12345)
+    weights = np.zeros(900)
+    weights[0] = 1.0
+    data["weights"] = weights
+    res = get_bootstrap_indices(data, weight_by="weights", n_draws=1, rng=rng)
+    assert len(np.unique(res)) == 1
+
+
 def test_clustering_leaves_households_intact(data):
     rng = get_rng(seed=12345)
     indices = get_bootstrap_indices(data, cluster_by="hh", n_draws=1, rng=rng)[0]
@@ -63,3 +97,53 @@ def test_get_bootstrap_samples_from_indices():
 def test_get_bootstrap_samples_runs(data):
     rng = get_rng(seed=12345)
     get_bootstrap_samples(data, n_draws=2, rng=rng)
+
+
+@pytest.fixture
+def sample_data():
+    return pd.DataFrame({"weight": [1, 2, 3, 4], "cluster": ["A", "A", "B", "B"]})
+
+
+def test_no_weights_no_clusters(sample_data):
+    result = _calculate_bootstrap_indices_weights(sample_data, None, None)
+    assert result is None
+
+
+def test_weights_no_clusters(sample_data):
+    result = _calculate_bootstrap_indices_weights(sample_data, "weight", None)
+    expected = pd.Series([0.1, 0.2, 0.3, 0.4], index=sample_data.index, name="weight")
+    pd.testing.assert_series_equal(result, expected)
+
+
+def test_weights_and_clusters(sample_data):
+    result = _calculate_bootstrap_indices_weights(sample_data, "weight", "cluster")
+    expected = pd.Series(
+        [0.3, 0.7], index=pd.Index(["A", "B"], name="cluster"), name="weight"
+    )
+    ase(result, expected)
+
+
+def test_invalid_weight_column():
+    data = pd.DataFrame({"x": [1, 2, 3]})
+    with pytest.raises(KeyError):
+        _calculate_bootstrap_indices_weights(data, "weight", None)
+
+
+def test_invalid_cluster_column(sample_data):
+    with pytest.raises(KeyError):
+        _calculate_bootstrap_indices_weights(sample_data, "weight", "invalid_cluster")
+
+
+def test_empty_dataframe():
+    empty_df = pd.DataFrame()
+    result = _calculate_bootstrap_indices_weights(empty_df, None, None)
+    assert result is None
+
+
+def test_some_zero_weights_with_clusters():
+    data = pd.DataFrame({"weight": [0, 1, 0, 2], "cluster": ["A", "A", "B", "B"]})
+    result = _calculate_bootstrap_indices_weights(data, "weight", "cluster")
+    expected = pd.Series(
+        [1 / 3, 2 / 3], index=pd.Index(["A", "B"], name="cluster"), name="weight"
+    )
+    ase(result, expected)