Commit e5f6869

fix: add normalizer bias support. fix trainer callback. add tests
1 parent 40d05a4 commit e5f6869

3 files changed: +257 additions, -22 deletions

bergson/gradients.py

Lines changed: 39 additions & 6 deletions
@@ -68,14 +68,24 @@ def state_dict(self) -> dict[str, str | Tensor]:
 class AdafactorNormalizer(Normalizer):
     """
     Row and column sums of second moments of gradients for a matrix-valued parameter.
+
+    Args:
+        row: Row statistics [O]
+        col: Column statistics [I]
+        bias_avg_sq: Optional second moments for bias [O]
     """
 
     row: Tensor  # shape [O]
     col: Tensor  # shape [I]
+    bias_avg_sq: Tensor | None = None  # shape [O]
 
     def __post_init__(self):
         assert self.row.ndim == 1, f"Expected 1D tensor for row, got {self.row.ndim}D"
         assert self.col.ndim == 1, f"Expected 1D tensor for col, got {self.col.ndim}D"
+        if self.bias_avg_sq is not None:
+            assert self.bias_avg_sq.ndim == 1, (
+                f"Expected 1D tensor for bias_avg_sq, got {self.bias_avg_sq.ndim}D"
+            )
 
     @torch.compile
     def normalize_(
@@ -120,22 +130,29 @@ def to_adam(self) -> "AdamNormalizer":
         """
         Convert this Adafactor normalizer to an Adam normalizer by materializing the
         rank-one second moment matrix.
+
+        Preserves bias_avg_sq if present.
         """
         # Compute the second moment matrix as a square matrix of shape [O, I]
         # NOTE: We don't add the epsilon here, since the AdamNormalizer is going to
         # add it outside the square root. This could cause infs though if there are
         # any exactly zero rows or columns, so we should be careful.
         avg_sq = torch.outer(self.row, self.col) / self.row.mean()
-        return AdamNormalizer(avg_sq=avg_sq)
+        return AdamNormalizer(avg_sq=avg_sq, bias_avg_sq=self.bias_avg_sq)
 
 
 @dataclass
 class AdamNormalizer(Normalizer):
     """
     Contains the second moments of the gradients.
+
+    Args:
+        avg_sq: Second moments for weights [O, I]
+        bias_avg_sq: Optional second moments for bias [O]
     """
 
     avg_sq: Tensor
+    bias_avg_sq: Tensor | None = None
 
     @torch.compile
     def normalize_(
@@ -153,6 +170,8 @@ def to_adafactor(self) -> AdafactorNormalizer:
         Convert this Adam normalizer to an Adafactor normalizer, minimizing the
         I-divergence (generalized Kullback-Leibler divergence) between the original
         and the factored second moments.
+
+        Preserves bias_avg_sq if present.
         """
         # We assume avg_sq is a square matrix of shape [O, I]
         assert self.avg_sq.ndim == 2, (
@@ -163,6 +182,7 @@ def to_adafactor(self) -> AdafactorNormalizer:
         return AdafactorNormalizer(
             row=self.avg_sq.mean(dim=1),  # shape [O]
             col=self.avg_sq.mean(dim=0),  # shape [I]
+            bias_avg_sq=self.bias_avg_sq,  # Preserve bias second moments
         )
 
 
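
As a side note on the two conversions above: to_adafactor keeps only the row and column means of the second-moment matrix, and to_adam rebuilds a rank-one approximation from them. The following standalone sketch applies the same formulas to a random matrix; the shapes and the outer(row, col) / row.mean() reconstruction come from the diff above, while the variable names and values are illustrative.

import torch

O, I = 3, 5
avg_sq = torch.rand(O, I) + 0.1        # Adam-style second moments, shape [O, I]

# to_adafactor: keep only row/col means
row = avg_sq.mean(dim=1)               # shape [O]
col = avg_sq.mean(dim=0)               # shape [I]

# to_adam: materialize the rank-one second moment matrix again
reconstructed = torch.outer(row, col) / row.mean()

# The round trip is exact only when the input is itself rank one;
# bias_avg_sq is simply carried through both conversions unchanged.
print(torch.allclose(reconstructed, avg_sq))  # usually False for a random matrix

rank_one = torch.outer(torch.rand(O) + 0.1, torch.rand(I) + 0.1)
r, c = rank_one.mean(dim=1), rank_one.mean(dim=0)
print(torch.allclose(torch.outer(r, c) / r.mean(), rank_one))  # True for rank-one input
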
@@ -551,8 +571,22 @@ def _process_grad(self, module: nn.Module, _, grad_out):
         i = getattr(module, LayerAdapter.in_attr(module))
         o = getattr(module, LayerAdapter.out_attr(module))
 
-        # Pre-scale G by the Adafactor row statistics
+        # Handle bias gradients if needed (must be computed from raw G)
         norm = self.processor.normalizers.get(name)
+        bias_grad = None
+        if include_bias:
+            # Compute bias from raw G (before any normalization)
+            bias_grad = G.sum(dim=1)  # [N, S, O] -> [N, O]
+
+            # Normalize bias with appropriate second moments
+            if (
+                isinstance(norm, (AdamNormalizer, AdafactorNormalizer))
+                and hasattr(norm, "bias_avg_sq")
+                and norm.bias_avg_sq is not None
+            ):
+                bias_grad = bias_grad / norm.bias_avg_sq.sqrt().add_(1e-8)
+
+        # Pre-scale G by the Adafactor row statistics (for weight gradients)
         if isinstance(norm, AdafactorNormalizer):
             # Compare to the normalize_ method in AdafactorNormalizer
             r = norm.row.add(1e-30)
@@ -568,11 +602,10 @@ def _process_grad(self, module: nn.Module, _, grad_out):
             # Normalize the gradients using the second moment matrix
             P /= norm.avg_sq.sqrt().add_(1e-8)
 
-        if include_bias:
-            # TODO: should we normalize the bias gradients?
-            # Append the raw bias gradient to the input
+        if include_bias and bias_grad is not None:
+            # Append pre-computed and normalized bias gradient
             P = torch.cat(
-                [P, G.sum(dim=1).unsqueeze(2)],  # [N, S, O] -> [N, O] # [N, O, 1]
+                [P, bias_grad.unsqueeze(2)],  # [N, O, 1]
                 dim=2,
             )
             i += 1
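
To make the bias path above concrete: for a linear layer, the per-example bias gradient is the output gradient summed over the sequence dimension, and when second moments are available it is divided by their square root plus a small epsilon before being appended as an extra column. A minimal sketch using the [N, S, O] shape convention from the diff (values illustrative):

import torch

N, S, O = 4, 6, 3
G = torch.randn(N, S, O)             # per-token gradients w.r.t. the layer output
bias_avg_sq = torch.rand(O) + 0.1    # optional bias second moments, shape [O]

# Per-example bias gradient: sum over the sequence dimension, [N, S, O] -> [N, O]
bias_grad = G.sum(dim=1)

# Adam-style normalization, mirroring bias_avg_sq.sqrt().add_(1e-8) in the diff
bias_grad = bias_grad / (bias_avg_sq.sqrt() + 1e-8)

# Appended to the per-example gradient features as a trailing column, [N, O, 1]
print(bias_grad.unsqueeze(2).shape)  # torch.Size([4, 3, 1])
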

bergson/huggingface.py

Lines changed: 64 additions & 15 deletions
@@ -239,7 +239,6 @@ def on_step_end(
         **kwargs,
     ):
         self.on_substep_end(args, state, control)
-        print("Step end")
 
         # Record training order if enabled
         if self.order is not None:
@@ -279,32 +278,82 @@ def on_step_end(
 
         # Read normalizers off of the optimizer state. We need to figure out
         # what type of optimizer this is first.
+        # Collect references to both weight and bias second moments per layer
+        layer_second_moments: dict[str, dict[str, Tensor]] = {}
+
         for group in optimizer.param_groups:
-            lr_sqrt = group["lr"] ** 0.5
+            group_lr = group["lr"]
 
             for param in group["params"]:
-                name = param_to_name[param].removesuffix(".weight")
-                if name not in self.collector.target_info:
+                param_name = param_to_name[param]
+
+                # Extract layer name (remove .weight or .bias suffix)
+                if param_name.endswith(".weight"):
+                    param_type = "weight"
+                    layer_name = param_name.removesuffix(".weight")
+                elif param_name.endswith(".bias"):
+                    param_type = "bias"
+                    layer_name = param_name.removesuffix(".bias")
+                else:
+                    continue
+
+                if layer_name not in self.collector.target_info:
                     continue
 
                 p_state = optimizer.state[param]
 
+                # Initialize layer dict if needed, storing this group's learning rate
+                if layer_name not in layer_second_moments:
+                    layer_second_moments[layer_name] = {"lr": group_lr}
+
                 # Adam-like optimizer
                 if (eas := p_state.get("exp_avg_sq")) is not None:
-                    norm = AdamNormalizer(eas).to_adafactor()
-
+                    layer_second_moments[layer_name][param_type] = eas
                 # Adafactor-like optimizer
                 elif (vr := p_state.get("exp_avg_sq_row")) is not None:
                     vc = p_state.get("exp_avg_sq_col")
-                    norm = AdafactorNormalizer(vr, vc)
-                else:
-                    continue
-
-                # Scale the gradient by the current learning rate. It's factorized
-                # so we multiply each factor by the square root of the LR.
-                norm.row *= lr_sqrt
-                norm.col *= lr_sqrt
-                normalizers[name] = norm
+                    if param_type == "weight":
+                        # Factorized second moments for weights
+                        layer_second_moments[layer_name]["row"] = vr
+                        layer_second_moments[layer_name]["col"] = vc
+                    elif param_type == "bias":
+                        # Adafactor stores bias as regular exp_avg_sq
+                        bias_eas = p_state.get("exp_avg_sq")
+                        if bias_eas is not None:
+                            layer_second_moments[layer_name]["bias"] = bias_eas
+
+        # Build normalizers from collected second moments
+        for layer_name, moments in layer_second_moments.items():
+            lr_sqrt = moments["lr"] ** 0.5
+
+            # Adam-like: has weight exp_avg_sq
+            if "weight" in moments:
+                weight_eas = moments["weight"]
+                bias_eas = moments.get("bias")  # May be None
+
+                # Create Adam normalizer with optional bias, then convert to Adafactor
+                # TODO: always convert to adafactor?
+                norm = AdamNormalizer(weight_eas, bias_eas).to_adafactor()
+
+                # Scale by LR (factorized) - use non-in-place ops to avoid modifying optimizer state
+                norm.row = norm.row * lr_sqrt
+                norm.col = norm.col * lr_sqrt
+                if norm.bias_avg_sq is not None:
+                    norm.bias_avg_sq = norm.bias_avg_sq * (lr_sqrt**2)
+
+            # Adafactor-like: has row/col
+            elif "row" in moments and "col" in moments:
+                bias_eas = moments.get("bias")  # May be present
+                norm = AdafactorNormalizer(moments["row"], moments["col"], bias_eas)
+                # Scale by LR (factorized) - use non-in-place ops to avoid modifying optimizer state
+                norm.row = norm.row * lr_sqrt
+                norm.col = norm.col * lr_sqrt
+                if norm.bias_avg_sq is not None:
+                    norm.bias_avg_sq = norm.bias_avg_sq * (lr_sqrt**2)
+            else:
+                continue
+
+            normalizers[layer_name] = norm
 
         proc.normalizers = normalizers
 
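
For context on the collection pass above: torch.optim.Adam keeps a full exp_avg_sq tensor per parameter, while the Hugging Face Adafactor optimizer keeps factored exp_avg_sq_row / exp_avg_sq_col for matrix parameters and a plain exp_avg_sq for 1-D parameters such as biases, which is why weight and bias entries are collected differently. A rough standalone sketch of that state layout (the layer and hyperparameters here are illustrative):

import torch
from torch import nn
from transformers import Adafactor

layer = nn.Linear(5, 3)
opt = Adafactor(
    layer.parameters(), scale_parameter=False, relative_step=False, lr=1e-3
)

loss = layer(torch.randn(4, 5)).pow(2).sum()
loss.backward()
opt.step()

# Matrix parameter: factored second moments
print(sorted(opt.state[layer.weight].keys()))  # includes 'exp_avg_sq_row' and 'exp_avg_sq_col'
# Vector parameter (bias): unfactored second moments
print(sorted(opt.state[layer.bias].keys()))    # includes 'exp_avg_sq'
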
tests/test_trainer_callback.py

Lines changed: 154 additions & 1 deletion
@@ -1,12 +1,24 @@
 import os
+from pathlib import Path
+
+from torch import nn
+
+from bergson import GradientProcessor
+from bergson.gradients import AdafactorNormalizer
 
 os.environ["CUDA_VISIBLE_DEVICES"] = "0"
 os.environ["WANDB_MODE"] = "disabled"
 
 import pytest
 import torch
 from datasets import Dataset
-from transformers import AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments
+from transformers import (
+    Adafactor,
+    AutoConfig,
+    AutoModelForCausalLM,
+    Trainer,
+    TrainingArguments,
+)
 from trl import SFTConfig, SFTTrainer
 
 from bergson.data import load_gradients
@@ -245,3 +257,144 @@ def test_sft_trainer(self, tmp_path, model, dataset):
         saved_order = Dataset.load_from_disk(str(order_file))
         assert len(saved_order) > 0
         assert all(key in saved_order[0] for key in ["_idx", "global_step", "epoch"])
+
+    @pytest.mark.parametrize("optimizer_name", ["adam", "adafactor"])
+    @pytest.mark.parametrize("include_bias", [True, False])
+    def test_optimizer_state_extraction(self, optimizer_name: str, include_bias: bool):
+        """Test that normalizers are correctly extracted from optimizer state.
+
+        This tests the huggingface.py callback by:
+        1. Training a model with an optimizer
+        2. Calling the callback's on_step_end method
+        3. Verifying against raw optimizer state
+        """
+        torch.manual_seed(42)
+        N = 4
+        S = 6
+        I = 5
+        O = 3
+
+        class SimpleModel(nn.Module):
+            def __init__(self):
+                super().__init__()
+                self.fc1 = nn.Linear(I, O * 2, bias=include_bias)
+                self.relu = nn.ReLU()
+                self.fc2 = nn.Linear(O * 2, O, bias=include_bias)
+
+            def forward(self, x):
+                return self.fc2(self.relu(self.fc1(x)))
+
+        torch.manual_seed(42)
+        model = SimpleModel()
+
+        # Create optimizer
+        if optimizer_name == "adam":
+            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
+        else:
+            optimizer = Adafactor(
+                model.parameters(), scale_parameter=False, relative_step=False, lr=0.001
+            )
+
+        # Train a few steps to build up second moments
+        for _ in range(5):
+            optimizer.zero_grad()
+            out = model(torch.randn(N, S, I))
+            loss = (out**2).sum()
+            loss.backward()
+            optimizer.step()
+
+        # Extract normalizers using the ACTUAL callback
+        from unittest.mock import Mock, patch
+
+        from bergson.huggingface import GradientCollectorCallback
+
+        # Create callback with minimal setup
+        callback = GradientCollectorCallback(
+            path=Path("/tmp/test"),
+            use_optimizer_state=True,
+            include_bias=include_bias,
+        )
+
+        # Mock the collector and processor
+        mock_collector = Mock()
+        mock_collector.processor = GradientProcessor(
+            normalizers={}, include_bias=include_bias
+        )
+        mock_collector.target_info = {"fc1": None, "fc2": None}  # Track these layers
+        callback.collector = mock_collector
+
+        # Mock on_substep_end to avoid needing train_grad_buffer
+        with patch.object(callback, "on_substep_end"):
+            # Call the ACTUAL callback method
+            callback.on_step_end(
+                args=Mock(),
+                state=Mock(epoch=0, global_step=1),
+                control=Mock(),
+                model=model,
+                optimizer=optimizer,
+            )
+
+        # Get the normalizers the callback extracted
+        normalizers = callback.collector.processor.normalizers
+
+        # Verify against raw optimizer state (independent ground truth)
+        for layer_name in ["fc1", "fc2"]:
+            layer = model.get_submodule(layer_name)
+            norm = normalizers[layer_name]
+
+            # Check normalizer type
+            assert isinstance(norm, AdafactorNormalizer)
+
+            # Get raw state from optimizer
+            weight_state = optimizer.state[layer.weight]
+            lr = optimizer.param_groups[0]["lr"]
+            lr_sqrt = lr**0.5
+
+            if optimizer_name == "adam":
+                # Ground truth: Adam stores full exp_avg_sq
+                raw_exp_avg_sq = weight_state["exp_avg_sq"]
+
+                # NOTE: We convert Adam's full second moments to Adafactor's factorized
+                # form (row + col vectors) for memory efficiency. This is a lossy
+                # rank-1 approximation that can have large reconstruction errors.
+                # We can't verify correctness here, only sanity check the factorization.
+
+                # Sanity checks on the factorized representation
+                assert norm.row.shape == (raw_exp_avg_sq.shape[0],)
+                assert norm.col.shape == (raw_exp_avg_sq.shape[1],)
+                assert (
+                    not torch.isnan(norm.row).any() and not torch.isinf(norm.row).any()
+                )
+                assert (
+                    not torch.isnan(norm.col).any() and not torch.isinf(norm.col).any()
+                )
+                assert (norm.row > 0).all() and (
+                    norm.col > 0
+                ).all()  # Second moments are positive
+
+            elif optimizer_name == "adafactor":
+                # Ground truth: Adafactor stores row/col directly
+                raw_row = weight_state["exp_avg_sq_row"]
+                raw_col = weight_state["exp_avg_sq_col"]
+
+                # Our normalizer should match (scaled by LR)
+                expected_row = raw_row * lr_sqrt
+                expected_col = raw_col * lr_sqrt
+
+                torch.testing.assert_close(norm.row, expected_row)
+                torch.testing.assert_close(norm.col, expected_col)
+
+            # Verify bias handling
+            if include_bias and layer.bias is not None:
+                bias_state = optimizer.state[layer.bias]
+                raw_bias_exp_avg_sq = bias_state["exp_avg_sq"]
+                expected_bias = raw_bias_exp_avg_sq * lr
+
+                assert norm.bias_avg_sq is not None, (
+                    f"Expected bias_avg_sq for {layer_name}"
+                )
+                torch.testing.assert_close(norm.bias_avg_sq, expected_bias)
+            else:
+                assert norm.bias_avg_sq is None, (
+                    f"Unexpected bias_avg_sq for {layer_name}"
+                )
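
One subtlety the assertions above pin down: the factored weight statistics are expected at raw * lr ** 0.5 per factor, while the unfactored bias statistic is expected at raw * lr, which matches the callback's norm.bias_avg_sq * (lr_sqrt**2) scaling. A tiny sketch of that bookkeeping (values illustrative):

import torch

lr = 0.001
lr_sqrt = lr**0.5

raw_bias = torch.rand(3)

# Callback side: bias second moments scaled by lr_sqrt ** 2
callback_scaled = raw_bias * (lr_sqrt**2)

# Test side: the expected value is the raw bias second moments scaled by lr
torch.testing.assert_close(callback_scaled, raw_bias * lr)
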
