Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

5,788 changes: 3,378 additions & 2,410 deletions pixi.lock

Large diffs are not rendered by default.

69 changes: 69 additions & 0 deletions pyfixest/estimation/models/feiv_.py
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,75 @@ def get_fit(self) -> None:
D = np.linalg.inv(self._tXZ @ self._tZZinv @ self._tZX)
self._bread = H.T @ D @ H

def _bootstrap_one_rep(self, nz: np.ndarray, w_combined: np.ndarray) -> np.ndarray:
    """
    Fit a single 2SLS bootstrap replication.

    Restricts Y, X (exogenous + endogenous) and Z (instruments) to the
    positive-weight observations, demeans them with the bootstrap weights
    when fixed effects are present, rescales by sqrt(weight) for WLS, and
    evaluates the closed-form 2SLS estimator.
    """
    weights_rep = w_combined[nz]

    def _subset(df):
        # Keep only positive-weight rows and renumber the index.
        return df.iloc[nz].reset_index(drop=True)

    y_rep = _subset(self._Y_untransformed)
    x_rep = _subset(self._X_untransformed_df[list(self._coefnames)])
    z_rep = _subset(self._Z_untransformed_df[list(self._coefnames_z)])
    fe_rep = None if self._fe is None else _subset(self._fe)

    if self._has_fixef:
        # Shared demeaning arguments for the X and Z passes.
        demean_kwargs = dict(
            Y=y_rep,
            fe=fe_rep,
            weights=weights_rep,
            lookup_demeaned_data={},
            na_index=self._na_index,
            fixef_tol=self._fixef_tol,
            fixef_maxiter=self._fixef_maxiter,
            demean_func=self._demean_func,
        )
        y_dm, x_dm = demean_model(X=x_rep, **demean_kwargs)  # type: ignore[arg-type]
        # Y is passed again only as a placeholder — just the Z columns matter.
        _, z_dm = demean_model(X=z_rep, **demean_kwargs)  # type: ignore[arg-type]
        y_arr = y_dm.to_numpy().flatten()
        x_arr = x_dm.to_numpy()
        z_arr = z_dm.to_numpy()
    else:
        y_arr = y_rep.to_numpy().flatten()
        x_arr = x_rep.to_numpy()
        z_arr = z_rep.to_numpy()

    # WLS transform: scale rows by the square root of the combined weights.
    root_w = np.sqrt(weights_rep)
    y_w = y_arr * root_w
    x_w = x_arr * root_w[:, None]
    z_w = z_arr * root_w[:, None]

    # 2SLS formula: β = (X'Z (Z'Z)⁻¹ Z'X)⁻¹ (X'Z (Z'Z)⁻¹ Z'Y)
    ztz_inv = np.linalg.inv(z_w.T @ z_w)
    bread = (x_w.T @ z_w) @ ztz_inv
    return solve_ols(bread @ (z_w.T @ x_w), bread @ (z_w.T @ y_w), self._solver)

def first_stage(self) -> None:
"""Implement First stage regression."""
# Store names of instruments from Z matrix
Expand Down
231 changes: 231 additions & 0 deletions pyfixest/estimation/models/feols_.py
Original file line number Diff line number Diff line change
Expand Up @@ -412,9 +412,17 @@ def prepare_model_matrix(self):
self._Y = model_matrix.dependent
self._Y_untransformed = model_matrix.dependent.copy()
self._X = model_matrix.independent
self._X_untransformed_df = (
model_matrix.independent.copy()
) # pre-demeaning DataFrame, for bootstrap
self._fe = model_matrix.fixed_effects
self._endogvar = model_matrix.endogenous
self._Z = model_matrix.instruments
self._Z_untransformed_df = (
model_matrix.instruments.copy()
if model_matrix.instruments is not None
else None
) # pre-demeaning instrument DataFrame; used by Feiv bootstrap
self._weights_df = model_matrix.weights
self._na_index = model_matrix.na_index
# TODO: set dynamically based on naming set in pyfixest.estimation.formula.factor_interaction._encode_i
Expand Down Expand Up @@ -1104,6 +1112,8 @@ def _clear_attributes(self):
"_Y_hat_link",
"_Y_hat_response",
"_Y_untransformed",
"_X_untransformed_df",
"_Z_untransformed_df",
]

for attr in attributes:
Expand Down Expand Up @@ -1459,6 +1469,227 @@ def wildboottest(
else:
return res_df

def weightingboottest(
self,
reps: int,
method: Literal["bayesian", "multinomial"] = "bayesian",
concentration: float = 1.0,
cluster: str | None = None,
ci_level: float = 0.95,
seed: int | None = None,
return_draws: bool = False,
) -> pd.DataFrame | tuple[pd.DataFrame, np.ndarray]:
"""
Run a weighting bootstrap (Bayesian or Multinomial) for inference on
all regression coefficients.

Unlike the wild bootstrap, which perturbs residuals, the weighting
bootstrap reweights observations. Each iteration draws a new weight
vector, re-demeans (if fixed effects are present) with those weights,
fits a weighted OLS, and collects the coefficient estimates. Standard
errors and confidence intervals are derived directly from the bootstrap
distribution.

When a cluster variable is provided (or set via vcov at estimation
time), weights are drawn once per unit and broadcast to all observations
of that unit, correctly handling unbalanced panels.

Parameters
----------
reps : int
Number of bootstrap iterations.
method : str, optional
Either ``"bayesian"`` (Dirichlet/Gamma weights) or
``"multinomial"`` (integer counts). Defaults to ``"bayesian"``.
concentration : float, optional
Concentration parameter for the Dirichlet distribution.
Only used when ``method="bayesian"``. ``concentration=1`` is the
standard Bayesian bootstrap of Rubin (1981); larger values shrink
weights toward uniform (less variable draws). Defaults to 1.0.
cluster : str or None, optional
Column name in the data used to define sampling units for
panel-aware weighting. If None, falls back to the model's
``clustervar`` if one was set at estimation time, otherwise
draws weights independently per observation.
ci_level : float, optional
Confidence level for percentile intervals. Defaults to 0.95.
seed : int or None, optional
Random seed for reproducibility. Defaults to None.
return_draws : bool, optional
If True, also returns the full ``(reps x k)`` matrix of bootstrap
coefficient draws. Defaults to False.

Returns
-------
pd.DataFrame
Indexed by coefficient name with columns:
``Estimate``, ``Bootstrap SE``, ``CI lower``, ``CI upper``,
``p-value``, ``method``.
tuple[pd.DataFrame, np.ndarray]
If ``return_draws=True``, returns a tuple of the DataFrame and
the ``(reps x k)`` bootstrap draws matrix.

Examples
--------
```{python}
import pyfixest as pf

data = pf.get_data()
fit = pf.feols("Y ~ X1 + X2 | f1", data)

fit.weightingboottest(reps=500, method="bayesian", concentration=1.0, seed=42)
fit.weightingboottest(reps=500, method="multinomial", seed=42)
```
"""
if method not in ("bayesian", "multinomial"):
raise ValueError(
f"method must be 'bayesian' or 'multinomial', got '{method}'."
)
if method == "bayesian" and concentration <= 0:
raise ValueError(f"concentration must be positive, got {concentration}.")

rng = np.random.default_rng(seed)

# --- resolve cluster variable (same logic as wildboottest) ---
cluster_var = None
if cluster is not None:
cluster_var = cluster
elif self._clustervar:
cluster_var = (
self._clustervar[0]
if isinstance(self._clustervar, list)
else self._clustervar
)

if cluster_var is not None and cluster_var not in self._data.columns:
raise ValueError(f"Cluster variable '{cluster_var}' not found in the data.")

# --- unit-to-observation mapping for panel-aware weighting ---
# np.unique returns sorted unique values and inverse maps each
# observation back to its unit index — handles unbalanced panels
# because mapping is by value, not by position.
if cluster_var is not None:
cluster_arr = self._data[cluster_var].to_numpy()
_, inverse = np.unique(cluster_arr, return_inverse=True)
n_units = int(inverse.max()) + 1
else:
N_obs = len(self._Y)
n_units = N_obs
inverse = np.arange(N_obs)

# --- original estimate on already-fitted arrays ---
# self._X and self._Y are post-demean, post-wls-transform numpy arrays.
# self._coefnames tracks the coefficient names after collinearity drop.
beta_hat = self._beta_hat
coefnames = self._coefnames

# --- original analytic weights (ones if no WLS) ---
w_original = self._weights.flatten() # shape (N_obs,)

# --- bootstrap loop ---
k = len(beta_hat)
beta_boots = np.empty((reps, k))

for b in range(reps):
# draw unit-level weights
if method == "multinomial":
w_unit = rng.multinomial(n_units, np.ones(n_units) / n_units).astype(
float
)
else: # bayesian
w_unit = rng.dirichlet(np.full(n_units, concentration)) * n_units

# broadcast to observations — handles unbalanced panels correctly
w_boot = w_unit[inverse]

# combine with original analytic weights if WLS
w_combined = w_original * w_boot

# With multinomial draws, whole FE groups can get zero total weight,
# causing division-by-zero in the demeaning kernel. Filter them out;
# zero-weight rows contribute nothing to the WLS objective anyway.
nz = w_combined > 0
beta_boots[b] = self._bootstrap_one_rep(nz, w_combined)

# --- inference from bootstrap distribution ---
boot_se = beta_boots.std(axis=0, ddof=1)
q_lo = (1 - ci_level) / 2
q_hi = 1 - q_lo
ci_lower = np.quantile(beta_boots, q_lo, axis=0)
ci_upper = np.quantile(beta_boots, q_hi, axis=0)
# symmetric p-value: fraction of draws at least as extreme as observed
p_values = np.mean(np.abs(beta_boots - beta_hat) >= np.abs(beta_hat), axis=0)

res_df = pd.DataFrame(
{
"Estimate": beta_hat,
"Bootstrap SE": boot_se,
f"CI {int(q_lo * 100)}%": ci_lower,
f"CI {int(q_hi * 100)}%": ci_upper,
"p-value": p_values,
"method": method,
},
index=coefnames,
)

if return_draws:
return res_df, beta_boots
return res_df

def _bootstrap_one_rep(self, nz: np.ndarray, w_combined: np.ndarray) -> np.ndarray:
    """
    Fit a single OLS/WLS bootstrap replication.

    Model-specific subclasses override this hook (e.g. 2SLS for Feiv,
    IRLS for Fepois/Feglm); the shared weight-drawing and inference logic
    stays in ``weightingboottest()``.

    Parameters
    ----------
    nz : np.ndarray
        Boolean mask selecting observations with positive combined weight.
    w_combined : np.ndarray
        Combined weights (bootstrap * original analytic weights), length N.

    Returns
    -------
    np.ndarray
        Bootstrap coefficient vector of shape ``(k,)``.
    """
    weights_rep = w_combined[nz]
    y_rep = self._Y_untransformed.iloc[nz].reset_index(drop=True)
    x_rep = (
        self._X_untransformed_df[list(self._coefnames)]
        .iloc[nz]
        .reset_index(drop=True)
    )
    fe_rep = None if self._fe is None else self._fe.iloc[nz].reset_index(drop=True)

    if not self._has_fixef:
        y_arr = y_rep.to_numpy().flatten()
        x_arr = x_rep.to_numpy()
    else:
        y_dm, x_dm = demean_model(
            Y=y_rep,  # type: ignore[arg-type]
            X=x_rep,
            fe=fe_rep,
            weights=weights_rep,
            lookup_demeaned_data={},
            na_index=self._na_index,
            fixef_tol=self._fixef_tol,
            fixef_maxiter=self._fixef_maxiter,
            demean_func=self._demean_func,
        )
        y_arr = y_dm.to_numpy().flatten()
        x_arr = x_dm.to_numpy()

    # WLS transform: scale rows by sqrt of the combined weights, then
    # solve the normal equations.
    root_w = np.sqrt(weights_rep)
    x_w = x_arr * root_w[:, None]
    y_w = y_arr * root_w
    return solve_ols(x_w.T @ x_w, x_w.T @ y_w, self._solver)

def ccv(
self,
treatment,
Expand Down
2 changes: 1 addition & 1 deletion pyfixest/utils/dgps.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ def get_ivf_data(N=2000, seed=1234):
TRUE_EFFECT = -0.15

OVB formula for naive OLS (earnings ~ num_children):
bias ≈ γ_ambition * Cov(num_children, ambition) / Var(num_children)
bias ≈ γ_ambition * Cov(num_children, ambition) / Var(num_children)
≈ 0.6 * (-0.4) / 0.57 ≈ -0.42
β_OLS ≈ -0.15 + (-0.42) ≈ -0.57 (overstates the penalty)
β_IV ≈ -0.15 (recovers the true effect)
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ polars = ">=1.0.0"
pyarrow = ">=14.0"
doubleml = ">=0.10.1"
pytest-sugar = ">=1.1.1"
jupyterlab = ">=4.5.6,<5"

[tool.pixi.pypi-dependencies]
pyfixest = { path = ".", editable = true, extras = ["jax", "plots"] }
Expand Down
Loading
Loading