
Commit c88e5a9

correctly plumb quantization_rule to kernel
1 parent dd7b6f7 commit c88e5a9

5 files changed: +39 -32 lines changed

MaxText/configs/base.yml

Lines changed: 6 additions & 1 deletion
@@ -120,7 +120,12 @@ save_quantized_params_path: ""
 model_call_mode: ""
 use_qwix_quantization: False # Whether to use qwix for quantization. If set to True, the model will be quantized using qwix.
 # Quantization calibration method used for weights and activations. Supported methods can be found in https://github.com/google/qwix/blob/dc2a0770351c740e5ab3cce7c0efe9f7beacce9e/qwix/qconfig.py#L70-L80
-quantization_calibration_method: "absmax"
+fwd_weight_calibration_method: "absmax"
+fwd_act_calibration_method: "absmax"
+dlhs_lhs_calibration_method: "absmax"
+dlhs_rhs_calibration_method: "absmax"
+drhs_lhs_calibration_method: "absmax"
+drhs_rhs_calibration_method: "absmax"
 # Shard the range finding operation for quantization. By default this is set to number of slices.
 quantization_local_shard_count: -1
 
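The single quantization_calibration_method knob is split into six per-pass knobs: the two fwd_* keys calibrate the forward weights and activations, while the dlhs_*/drhs_* keys cover the left- and right-hand operands of the two backward matmuls (gradient w.r.t. lhs and gradient w.r.t. rhs). A minimal sketch of how these keys feed the qwix rule, mirroring the quantizations.py hunk later in this commit; build_rule is a hypothetical standalone helper and only calibration-related fields are shown:

# Hypothetical helper, not part of this commit: shows how the six new base.yml
# keys map onto the qwix rule fields used in MaxText/layers/quantizations.py below.
import qwix
from flax.core import FrozenDict

def build_rule(config):
  return qwix.QtRule(
      op_names=("dot_general",),
      # forward matmul
      weight_calibration_method=config.fwd_weight_calibration_method,
      act_calibration_method=config.fwd_act_calibration_method,
      # backward matmuls, keyed per operand inside additional_qt_config
      additional_qt_config=FrozenDict({
          "dlhs_lhs_calibration_method": config.dlhs_lhs_calibration_method,
          "dlhs_rhs_calibration_method": config.dlhs_rhs_calibration_method,
          "drhs_lhs_calibration_method": config.drhs_lhs_calibration_method,
          "drhs_rhs_calibration_method": config.drhs_rhs_calibration_method,
      }),
  )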

MaxText/kernels/megablox/gmm.py

Lines changed: 4 additions & 1 deletion
@@ -33,6 +33,7 @@
 
 from qwix.pallas import QArray
 import qwix.pallas as qpl
+import qwix
 
 from MaxText.kernels.megablox import common
 
@@ -305,6 +306,7 @@ def _zero_uninitialized_memory(
         "interpret",
         "lhs_quantize_dtype",
         "rhs_quantize_dtype",
+        "quantization_rule",
         "use_qwix_quantization",
     ],
 )
@@ -320,6 +322,7 @@ def gmm(
     interpret: bool = False,
     lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
+    quantization_rule: qwix.QtRule | None = None,
     use_qwix_quantization: bool = False,
 ) -> jnp.ndarray:
   """Compute lhs[sizes[i-1]:sizes[i], :] @ rhs for each group 'i'.
@@ -780,7 +783,7 @@ def _do():
         qvalue = lax.select(
             rhs_mask[...],
             rhs.qvalue[...],
-            jnp.zeros_like(rhs.qvalue, lhs.qvalue.dtype),
+            jnp.zeros_like(rhs.qvalue, rhs.qvalue.dtype),
         )
         loaded_rhs = dataclasses.replace(loaded_rhs, qvalue=qvalue)
       else:
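Two things change in the kernel wrapper. First, quantization_rule becomes an explicit parameter of gmm and is added to the wrapper's static argument list (apparently the jit static_argnames list that already holds interpret and the quantize dtypes), so the kernel is specialized per rule instead of reading ambient qwix state. Second, the zero fill for masked-out rhs rows now uses rhs.qvalue.dtype rather than lhs.qvalue.dtype; both branches of lax.select must share the dtype of the tensor being masked, which matters once lhs and rhs are quantized to different dtypes. A tiny standalone illustration of that constraint (not MaxText code):

# Standalone illustration: lax.select rejects operands with mismatched dtypes,
# so the zero fill must match the dtype of rhs.qvalue, not of lhs.qvalue.
import jax.numpy as jnp
from jax import lax

rhs_q = jnp.zeros((4, 4), dtype=jnp.float8_e4m3fn)   # stand-in for quantized rhs values
mask = jnp.ones((4, 4), dtype=bool)

ok = lax.select(mask, rhs_q, jnp.zeros_like(rhs_q, rhs_q.dtype))        # dtypes agree
# lax.select(mask, rhs_q, jnp.zeros((4, 4), dtype=jnp.float8_e5m2))     # raises: mismatched dtypes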

MaxText/kernels/megablox/ops.py

Lines changed: 18 additions & 25 deletions
@@ -30,15 +30,9 @@
 
 gmm = jax.custom_vjp(
     backend.gmm,
-    nondiff_argnums=(3, 4, 7, 8, 9, 10, 11),
+    nondiff_argnums=(3, 4, 7, 8, 9, 10, 11, 12),
 )
 
-def _get_current_rule(op_name: str):
-  rule = qpl.get_current_rule(op_name)
-  if rule is not None and not isinstance(rule, qwix.QtRule):
-    rule = qwix.QtRule(**dataclasses.asdict(rule))
-  return rule
-
 def _gmm_fwd(
     lhs: jnp.ndarray,
     rhs: jnp.ndarray | aqt_tensor.QTensor,
@@ -51,6 +45,7 @@ def _gmm_fwd(
     interpret: bool = False,
     lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None = None,
+    quantization_rule: qwix.QtRule | None = None,
     use_qwix_quantization: bool = False,
 ) -> tuple[
     jnp.ndarray,
@@ -65,10 +60,9 @@ def _gmm_fwd(
   """Forward function for GMM VJP."""
   if use_qwix_quantization:
     lhs_quantize_dtype, rhs_quantize_dtype = None, None
-    rule = _get_current_rule("dot_general")
-    if rule is not None:
-      lhs_quantize_dtype = rule.act_qtype
-      rhs_quantize_dtype = rule.weight_qtype
+    if quantization_rule is not None:
+      lhs_quantize_dtype = quantization_rule.act_qtype
+      rhs_quantize_dtype = quantization_rule.weight_qtype
   out = backend.gmm(
       lhs,
       rhs,
@@ -93,6 +87,7 @@ def _gmm_bwd(
     interpret: bool,
     lhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None,
     rhs_quantize_dtype: Literal[jnp.int4, jnp.int8] | None,
+    quantization_rule: qwix.QtRule | None,
     use_qwix_quantization: bool,
     residual: tuple[
         jnp.ndarray,
@@ -106,14 +101,13 @@
   """Backward function for throughput GMM VJP."""
   if use_qwix_quantization:
     lhs_quantize_dtype, rhs_quantize_dtype = None, None
-    rule = _get_current_rule("dot_general")
-    if rule is not None:
-      if rule.additional_qt_config is not None:
-        lhs_quantize_dtype = rule.additional_qt_config["dlhs_lhs_qtype"]
-        rhs_quantize_dtype = rule.additional_qt_config["dlhs_rhs_qtype"]
+    if quantization_rule is not None:
+      if quantization_rule.additional_qt_config is not None:
+        lhs_quantize_dtype = quantization_rule.additional_qt_config["dlhs_lhs_qtype"]
+        rhs_quantize_dtype = quantization_rule.additional_qt_config["dlhs_rhs_qtype"]
       else:
-        lhs_quantize_dtype = rule.act_qtype
-        rhs_quantize_dtype = rule.bwd_qtype
+        lhs_quantize_dtype = quantization_rule.act_qtype
+        rhs_quantize_dtype = quantization_rule.bwd_qtype
   del preferred_element_type
   lhs, rhs, group_sizes, group_offset, num_actual_groups = residual
   grad_lhs = backend.gmm(
@@ -131,14 +125,13 @@
   )
   if use_qwix_quantization:
     lhs_quantize_dtype, rhs_quantize_dtype = None, None
-    rule = _get_current_rule("dot_general")
-    if rule is not None:
-      if rule.additional_qt_config is not None:
-        lhs_quantize_dtype = rule.additional_qt_config["drhs_lhs_qtype"]
-        rhs_quantize_dtype = rule.additional_qt_config["drhs_rhs_qtype"]
+    if quantization_rule is not None:
+      if quantization_rule.additional_qt_config is not None:
+        lhs_quantize_dtype = quantization_rule.additional_qt_config["drhs_lhs_qtype"]
+        rhs_quantize_dtype = quantization_rule.additional_qt_config["drhs_rhs_qtype"]
       else:
-        lhs_quantize_dtype = rule.bwd_qtype
-        rhs_quantize_dtype = rule.act_qtype
+        lhs_quantize_dtype = quantization_rule.bwd_qtype
+        rhs_quantize_dtype = quantization_rule.act_qtype
   grad_rhs = backend.tgmm(
       lhs.swapaxes(0, 1),
       grad,
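Previously _gmm_fwd and _gmm_bwd re-derived the rule themselves via the now-deleted _get_current_rule helper; the rule is now handed in by the caller as one more non-differentiable argument (nondiff_argnums grows to include index 12), so the forward pass and both backward matmuls are guaranteed to see the same rule object. The dtype selection itself is unchanged; a compact restatement of it as a hypothetical helper (not part of the commit):

def _pick_quantize_dtypes(rule, pass_name):
  # Hypothetical helper restating the selection above; pass_name is "fwd",
  # "dlhs" (grad w.r.t. lhs) or "drhs" (grad w.r.t. rhs).
  if rule is None:
    return None, None
  if pass_name == "fwd":
    return rule.act_qtype, rule.weight_qtype
  cfg = rule.additional_qt_config
  if cfg is not None:
    return cfg[pass_name + "_lhs_qtype"], cfg[pass_name + "_rhs_qtype"]
  # Fallbacks mirror _gmm_bwd when no additional_qt_config is set.
  if pass_name == "dlhs":
    return rule.act_qtype, rule.bwd_qtype
  return rule.bwd_qtype, rule.act_qtype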

MaxText/layers/moe.py

Lines changed: 2 additions & 0 deletions
@@ -736,6 +736,7 @@ def gmm(inputs, kernel, group_sizes, expert_assignments):
         quant_dg = self.quant.quant_dg
         lhs_quantize_dtype = quant_dg.fwd.dg_quantizer.lhs.numerics.get_dtype()
         rhs_quantize_dtype = quant_dg.fwd.dg_quantizer.rhs.numerics.get_dtype()
+      quantization_rule = None
       if self.config.use_qwix_quantization:
         quantization_rule = qpl.get_current_rule("dot_general")
         if quantization_rule is not None:
@@ -756,6 +757,7 @@ def gmm(inputs, kernel, group_sizes, expert_assignments):
           tiling=tiling,
           lhs_quantize_dtype=lhs_quantize_dtype,
           rhs_quantize_dtype=rhs_quantize_dtype,
+          quantization_rule=quantization_rule,
           use_qwix_quantization=self.config.use_qwix_quantization,
       )
     else:
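The call site initializes quantization_rule to None on every path, fetches the active rule via qpl.get_current_rule("dot_general") only when use_qwix_quantization is on, and forwards it through the new keyword. A minimal sketch of the pattern; names are illustrative, unrelated arguments are elided, and gmm stands for the megablox kernel entry point shown above:

# Illustrative call-site sketch, not the exact MaxText code.
import qwix.pallas as qpl

def call_gmm(config, lhs, rhs, group_sizes, tiling, lhs_quantize_dtype, rhs_quantize_dtype):
  quantization_rule = None            # always defined, even on non-qwix paths
  if config.use_qwix_quantization:
    # Presumably returns the rule registered for this op when tracing under
    # qwix's quantization interception, and None otherwise.
    quantization_rule = qpl.get_current_rule("dot_general")
  return gmm(
      lhs, rhs, group_sizes,
      tiling=tiling,
      lhs_quantize_dtype=lhs_quantize_dtype,
      rhs_quantize_dtype=rhs_quantize_dtype,
      quantization_rule=quantization_rule,   # new keyword plumbed into the kernel
      use_qwix_quantization=config.use_qwix_quantization,
  )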

MaxText/layers/quantizations.py

Lines changed: 9 additions & 5 deletions
@@ -35,6 +35,7 @@
 from flax.linen import fp8_ops
 from flax.linen import initializers as flax_initializers
 import flax.linen as nn
+from flax.core import FrozenDict
 
 from MaxText.common_types import DType, Config
 from MaxText.inference.kvcache import KVQuant
@@ -667,16 +668,19 @@ def get_quantization_rule(config: Config):
         bwd_qtype=jnp.float8_e5m2,
         bwd_use_original_residuals=True,
         disable_channelwise_axes=True,  # per_tensor calibration
-        weight_calibration_method=config.quantization_calibration_method,
-        act_calibration_method=config.quantization_calibration_method,
-        bwd_calibration_method=config.quantization_calibration_method,
+        weight_calibration_method=config.fwd_weight_calibration_method,
+        act_calibration_method=config.fwd_act_calibration_method,
         op_names=("dot_general",),
-        additional_qt_config={
+        additional_qt_config=FrozenDict({
             "dlhs_lhs_qtype": jnp.float8_e5m2,
             "dlhs_rhs_qtype": jnp.float8_e4m3fn,
             "drhs_lhs_qtype": jnp.float8_e4m3fn,
             "drhs_rhs_qtype": jnp.float8_e5m2,
-        },
+            "dlhs_lhs_calibration_method": config.dlhs_lhs_calibration_method,
+            "dlhs_rhs_calibration_method": config.dlhs_rhs_calibration_method,
+            "drhs_lhs_calibration_method": config.drhs_lhs_calibration_method,
+            "drhs_rhs_calibration_method": config.drhs_rhs_calibration_method,
+        }),
       )
     case "fp8_gpu":
       return qwix.QtRule(
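Two related changes here: the per-operand backward calibration methods move into additional_qt_config (replacing the single bwd_calibration_method), and the config dict is wrapped in FrozenDict. The wrapping is presumably needed because the rule now travels into the gmm kernel as a static argument (see gmm.py above), and static arguments must be hashable; a plain dict is not, while FrozenDict is:

# Quick check of the hashability difference (assumption about the motivation,
# based on "quantization_rule" being added to the static argument list in gmm.py).
from flax.core import FrozenDict

hash(FrozenDict({"dlhs_lhs_qtype": "float8_e5m2"}))   # works: FrozenDict is hashable
try:
  hash({"dlhs_lhs_qtype": "float8_e5m2"})
except TypeError as e:
  print(e)                                            # unhashable type: 'dict'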
