Skip to content

Commit 31e5008

Browse files
committed
add scale_by_lr method to AdafactorNormalizer
1 parent e5f6869 commit 31e5008

File tree

2 files changed

+22
-14
lines changed

2 files changed

+22
-14
lines changed

bergson/gradients.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,21 @@ def to_adam(self) -> "AdamNormalizer":
140140
avg_sq = torch.outer(self.row, self.col) / self.row.mean()
141141
return AdamNormalizer(avg_sq=avg_sq, bias_avg_sq=self.bias_avg_sq)
142142

143+
def scale_by_lr(self, lr: float | Tensor) -> "AdafactorNormalizer":
    """Return a new normalizer with the learning rate folded in.

    The factorized second moments each absorb ``sqrt(lr)``, so their
    outer product (the reconstructed full second moment) scales by
    ``lr``; the unfactorized bias moment absorbs ``lr`` directly.
    Non-in-place ops are used throughout so any shared optimizer
    state is left untouched.
    """
    root = lr**0.5
    # Bias moment is optional; scale it only when present.  Shape [O].
    scaled_bias = None if self.bias_avg_sq is None else self.bias_avg_sq * lr
    return AdafactorNormalizer(
        row=self.row * root,  # shape [O]
        col=self.col * root,  # shape [I]
        bias_avg_sq=scaled_bias,
    )
157+
143158

144159
@dataclass
145160
class AdamNormalizer(Normalizer):

bergson/huggingface.py

Lines changed: 7 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,7 @@ def on_step_end(
324324

325325
# Build normalizers from collected second moments
326326
for layer_name, moments in layer_second_moments.items():
327-
lr_sqrt = moments["lr"] ** 0.5
327+
lr = moments["lr"]
328328

329329
# Adam-like: has weight exp_avg_sq
330330
if "weight" in moments:
@@ -333,23 +333,16 @@ def on_step_end(
333333

334334
# Create Adam normalizer with optional bias, then convert to Adafactor
335335
# TODO: always convert to adafactor?
336-
norm = AdamNormalizer(weight_eas, bias_eas).to_adafactor()
337-
338-
# Scale by LR (factorized) - use non-in-place ops to avoid modifying optimizer state
339-
norm.row = norm.row * lr_sqrt
340-
norm.col = norm.col * lr_sqrt
341-
if norm.bias_avg_sq is not None:
342-
norm.bias_avg_sq = norm.bias_avg_sq * (lr_sqrt**2)
336+
norm = (
337+
AdamNormalizer(weight_eas, bias_eas).to_adafactor().scale_by_lr(lr)
338+
)
343339

344340
# Adafactor-like: has row/col
345341
elif "row" in moments and "col" in moments:
346342
bias_eas = moments.get("bias") # May be present
347-
norm = AdafactorNormalizer(moments["row"], moments["col"], bias_eas)
348-
# Scale by LR (factorized) - use non-in-place ops to avoid modifying optimizer state
349-
norm.row = norm.row * lr_sqrt
350-
norm.col = norm.col * lr_sqrt
351-
if norm.bias_avg_sq is not None:
352-
norm.bias_avg_sq = norm.bias_avg_sq * (lr_sqrt**2)
343+
norm = AdafactorNormalizer(
344+
moments["row"], moments["col"], bias_eas
345+
).scale_by_lr(lr)
353346
else:
354347
continue
355348

0 commit comments

Comments
 (0)