Commit 4722e6e

fix: sum bias gradients over sequence dim only, not batch + tests
1 parent 78fa643 commit 4722e6e

2 files changed (+178, -15 lines)

bergson/gradients.py

Lines changed: 9 additions & 14 deletions
@@ -334,8 +334,8 @@ class GradientCollector(ContextDecorator):
     of the parameters, which are expected to be precomputed and passed in.

     We assume that the input to `model` is of shape `[N, S, I]`, where `N` is the
-    batch size, `S` is the sequence length, and `I` is the input dimension. We take the
-    mean over the sequence length to obtain a single gradient per sequence.
+    batch size, `S` is the sequence length, and `I` is the input dimension. We
+    sum over the sequence dimension to obtain a single gradient per sequence.
     """

     model: nn.Module
@@ -565,24 +565,19 @@ def _process_grad(self, module: nn.Module, _, grad_out):
         if isinstance(norm, AdamNormalizer) or include_bias:

             P = G.mT @ I  # [N, O, S] @ [N, S, I] → [N, O, I]
+            if isinstance(norm, AdamNormalizer):
+                # Normalize the gradients using the second moment matrix
+                P /= norm.avg_sq.sqrt().add_(1e-8)
+
             if include_bias:
-                # Append the bias gradient to the input
+                # TODO: should we normalize the bias gradients?
+                # Append the raw bias gradient to the input
                 P = torch.cat(
-                    [
-                        P,
-                        G.sum(dim=(0, 1))
-                        .unsqueeze(0)
-                        .unsqueeze(2)
-                        .expand(P.shape[0], -1, 1),
-                    ],
+                    [P, G.sum(dim=1).unsqueeze(2)],  # [N, S, O] -> [N, O] -> [N, O, 1]
                     dim=2,
                 )
                 i += 1

-            if isinstance(norm, AdamNormalizer):
-                # Normalize the gradients using the second moment matrix
-                P /= norm.avg_sq.sqrt().add_(1e-8)
-
         if self.processor.reshape_to_square:
             P = reshape_to_nearest_square(P)
             o, i = P.shape[-2:]
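
Why the new reduction is correct: for a linear layer `y = x @ W.T + b`, the gradient of a per-sequence loss with respect to the bias is the sum of the output gradients over that sequence's positions only, so the old `G.sum(dim=(0, 1))` collapsed the batch dimension and handed every sample the same batch-aggregated bias gradient. A minimal sketch in plain PyTorch (illustrative names, not Bergson internals; `G` stands in for the `grad_out` captured by the backward hook) that checks the new reduction against autograd:

    import torch

    N, S, I_dim, O = 4, 6, 5, 3
    layer = torch.nn.Linear(I_dim, O)
    x = torch.randn(N, S, I_dim)

    y = layer(x)                              # [N, S, O]
    per_sample_loss = (y**2).sum(dim=(1, 2))  # [N]

    # G plays the role of grad_out in the hook: dL/dy for every token.
    G = torch.autograd.grad(per_sample_loss.sum(), y, retain_graph=True)[0]  # [N, S, O]

    fixed = G.sum(dim=1)       # [N, O]: one bias gradient per sequence (new code)
    buggy = G.sum(dim=(0, 1))  # [O]: summed over the batch as well (old code)

    # Ground truth for sample 0 via its own backward pass.
    g0 = torch.autograd.grad(per_sample_loss[0], layer.bias, retain_graph=True)[0]
    torch.testing.assert_close(fixed[0], g0)
    assert not torch.allclose(buggy, g0)  # the old reduction mixes samples together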

tests/test_gradients.py

Lines changed: 169 additions & 1 deletion
@@ -1,7 +1,10 @@
 import tempfile
+from collections import defaultdict
 from pathlib import Path

+import pytest
 import torch
+import torch.nn as nn
 from transformers import AutoConfig, AutoModelForCausalLM

 from bergson.gradients import (
@@ -13,7 +16,7 @@
 )


-def test_phi3():
+def test_gradient_collector_proj_norm():
     temp_dir = Path(tempfile.mkdtemp())

     config = AutoConfig.from_pretrained("trl-internal-testing/tiny-Phi3ForCausalLM")
@@ -105,3 +108,168 @@ def closure(name: str, g: torch.Tensor):
         )

        previous_collected_grads = collected_grads.copy()
+
+
+@pytest.mark.parametrize("include_bias", [True, False])
+def test_gradient_collector_batched(include_bias: bool):
+    torch.manual_seed(42)
+    N = 4
+    S = 6
+    I = 5
+    O = 3
+
+    class SimpleModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc1 = nn.Linear(I, O * 2, bias=include_bias)
+            self.relu = nn.ReLU()
+            self.fc2 = nn.Linear(O * 2, O, bias=include_bias)
+
+        def forward(self, x):
+            return self.fc2(self.relu(self.fc1(x)))
+
+    torch.manual_seed(42)
+    model = SimpleModel()
+
+    optimizer = torch.optim.Adam(model.parameters())
+
+    # Run a few training steps to build up second moments
+    for _ in range(5):
+        optimizer.zero_grad()
+        out = model(torch.randn(N, S, I))
+        loss = (out**2).sum()
+        loss.backward()
+        optimizer.step()
+
+    normalizers = {}
+    for name, param in model.named_parameters():
+        if "weight" in name:
+            layer_name = name.replace(".weight", "")
+            # Adam stores second moments as 'exp_avg_sq'
+            exp_avg_sq = optimizer.state[param]["exp_avg_sq"]
+            normalizers[layer_name] = AdamNormalizer(exp_avg_sq)
+
+    # collect gradients
+    collected_grads = {}
+
+    def closure(name: str, g: torch.Tensor):
+        """Store the gradients in a dictionary for later comparison."""
+        collected_grads[name] = g
+
+    processor = GradientProcessor(
+        normalizers=normalizers, projection_dim=None, include_bias=include_bias
+    )
+    collector = GradientCollector(model, closure, processor)
+
+    x = torch.randn(N, S, I)
+    with collector:
+        model.zero_grad()
+        out = model(x)
+        loss = (out**2).sum()
+        loss.backward()
+
+    def compute_ground_truth():
+        """Compute gradients using individual backward passes, with normalization."""
+        model.zero_grad()
+        output = model(x)  # [N, S, O]
+
+        # Per-sample losses
+        per_sample_losses = (output**2).sum(dim=(1, 2))  # [N]
+
+        ground_truth_grads = defaultdict(list)
+        for n in range(N):
+            model.zero_grad()
+            per_sample_losses[n].backward(retain_graph=True)
+
+            # manually normalize
+            for layer_name in ["fc1", "fc2"]:
+                layer = model.get_submodule(layer_name)
+                grad = layer.weight.grad.clone()

+                grad = normalizers[layer_name].normalize_(grad)
+
+                if include_bias:
+                    bias_grad = layer.bias.grad.clone()
+                    bias_grad = bias_grad.unsqueeze(1)
+                    grad = torch.cat([grad, bias_grad], dim=1)
+
+                ground_truth_grads[layer_name].append(grad)
+
+        for layer_name in ["fc1", "fc2"]:
+            ground_truth_grads[layer_name] = torch.stack(ground_truth_grads[layer_name])
+
+        return ground_truth_grads
+
+    ground_truth = compute_ground_truth()
+    for layer_name in ["fc1", "fc2"]:
+        torch.testing.assert_close(
+            collected_grads[layer_name], ground_truth[layer_name]
+        )
+
+
+def test_bias_gradients():
+    """Test that per-sample bias gradients are correctly computed."""
+    torch.manual_seed(42)
+    N = 4
+    S = 6
+    I = 5
+    O = 3
+
+    class SimpleModel(nn.Module):
+        def __init__(self):
+            super().__init__()
+            self.fc = torch.nn.Linear(I, O, bias=True)
+
+        def forward(self, x):
+            return self.fc(x)
+
+    model = SimpleModel()
+    x = torch.randn(N, S, I)
+
+    # bias gradient is a sum over sequence dimension for each n
+    def compute_ground_truth(model) -> torch.Tensor:
+        """Compute gradients using individual backward passes."""
+        model.zero_grad()
+        output = model(x)  # [N, S, O]
+
+        per_sample_losses = (output**2).sum(dim=(1, 2))  # [N]
+
+        bias_grads = []
+        for n in range(N):
+            model.zero_grad()
+            per_sample_losses[n].backward(retain_graph=True)
+            bias_grads.append(model.fc.bias.grad.clone())
+
+        return torch.stack(bias_grads, dim=0)  # [N, O]
+
+    ground_truth = compute_ground_truth(model)
+
+    # GradientCollector with include_bias=True
+    collected_grads = {}
+
+    def closure(name: str, g: torch.Tensor):
+        collected_grads[name] = g
+
+    processor = GradientProcessor(include_bias=True, projection_dim=None)
+    collector = GradientCollector(model, closure, processor, target_modules={"fc"})
+
+    with collector:
+        model.zero_grad()
+        output = model(x)
+        loss = (output**2).sum()
+        loss.backward()
+
+    # the last column is bias
+    bias_grads = collected_grads["fc"][..., -1]
+
+    assert bias_grads.shape == (
+        N,
+        3,
+    ), f"Expected shape ({N}, {O}), got {bias_grads.shape}"
+    assert ground_truth.shape == (
+        N,
+        3,
+    ), f"Expected shape ({N}, {O}), got {ground_truth.shape}"
+
+    # Compare to ground truth
+    torch.testing.assert_close(bias_grads, ground_truth)
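
For reference, the layout that `test_bias_gradients` relies on: with `include_bias=True` and `projection_dim=None`, the collected gradient for a `Linear(I, O)` layer is the `[N, O, I]` per-sample weight gradient with the per-sample bias gradient appended as one extra column, giving `[N, O, I + 1]`. A small sketch mirroring the `torch.cat` in the diff (illustrative tensors; `X` stands for the hook's saved input, written as `I` in the library code):

    import torch

    N, S, I_dim, O = 4, 6, 5, 3
    G = torch.randn(N, S, O)      # gradient of the loss w.r.t. the layer output
    X = torch.randn(N, S, I_dim)  # layer input

    P = G.mT @ X                                          # [N, O, I] weight gradients
    P = torch.cat([P, G.sum(dim=1).unsqueeze(2)], dim=2)  # [N, O, I + 1]

    weight_part = P[..., :-1]  # [N, O, I]
    bias_part = P[..., -1]     # [N, O], identical to G.sum(dim=1)
    assert torch.equal(bias_part, G.sum(dim=1))

Reading the bias back out is just the last column, which is exactly what the test asserts against per-sample autograd gradients.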
