
Commit 31c4c2f

[Bugfix] fix ragged attention kernel auto-tuning table key (#9497)
1 parent: e82631e

File tree

2 files changed: +14 −10 lines


torch_xla/experimental/pallas_kernels/ragged_paged_attention_v2.py

Lines changed: 11 additions & 6 deletions
@@ -25,6 +25,7 @@
 from jax.experimental import pallas as pl
 from jax.experimental.pallas import tpu as pltpu
 import jax.numpy as jnp
+import logging

 DEFAULT_MASK_VALUE = -0.7 * float(jnp.finfo(jnp.dtype("float32")).max)
 # The page size is too small. We only have 32 SREGs in TC. If the pages
@@ -1421,12 +1422,12 @@ def simplify_key(key):
   return (
       jnp.dtype(q_dtype).name,
       jnp.dtype(kv_dtype).name,
-      next_power_of_2(num_q_heads_per_blk),
-      next_power_of_2(num_kv_heads_per_blk),
+      num_q_heads_per_blk,
+      num_kv_heads_per_blk,
       (head_dim + 127) // 128 * 128,
       next_power_of_2(page_size),
       next_power_of_2(max_num_batched_tokens),
-      next_power_of_2(page_size * pages_per_seq),
+      page_size * pages_per_seq,
   )
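Why the key change matters, as a minimal standalone sketch: rounding the head counts (and the total context length) up to a power of two at lookup time can produce a key that no longer matches a tuning-table entry recorded with the exact values, so the tuned block sizes were silently skipped. The next_power_of_2 helper and the table entry below are stand-ins with assumed behavior, not the kernel's code.

# Hedged sketch, not the kernel's implementation.
def next_power_of_2(x: int) -> int:
  # Assumed behavior of the kernel's helper of the same name.
  return 1 if x <= 0 else 1 << (x - 1).bit_length()

# Hypothetical tuning-table entry keyed on exact head counts
# (12 q heads, 2 kv heads).
tuned = {("bfloat16", "bfloat16", 12, 2): (128, 64)}

old_key = ("bfloat16", "bfloat16", next_power_of_2(12), next_power_of_2(2))
new_key = ("bfloat16", "bfloat16", 12, 2)

print(old_key in tuned)  # False -- 12 rounds up to 16, so the lookup misses
print(new_key in tuned)  # True  -- exact head counts match the entry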

@@ -1472,7 +1473,7 @@ def get_tuned_block_sizes(
       max_num_batched_tokens,
       pages_per_seq,
   )
-  key = simplify_key(key)
+  simplified_key = simplify_key(key)
   device_name = get_device_name()

   # Default block sizes.
@@ -1500,8 +1501,12 @@ def compute_actual_vmem_bytes(num_kv_pages_per_blk):
     # OOM in vmem
     bkv, bq = (32, 32)
   elif device_name in TUNED_BLOCK_SIZES:
-    if key in TUNED_BLOCK_SIZES[device_name]:
-      bkv, bq = TUNED_BLOCK_SIZES[device_name][key]
+    if simplified_key in TUNED_BLOCK_SIZES[device_name]:
+      bkv, bq = TUNED_BLOCK_SIZES[device_name][simplified_key]
+    else:
+      logging.warning(
+          f"simplified_key({simplified_key}) is not in the ragged attention kernel's tuning table! The key before simplification is {key}"
+      )
   return (min(pages_per_seq, bkv), min(max_num_batched_tokens, bq))
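The lookup now warns on a table miss and falls back to the default block sizes instead of failing silently. A self-contained sketch of that pattern follows; the device name, key shape, and default values are illustrative assumptions, not the kernel's actual table.

import logging

# Illustrative stand-ins; the real defaults and TUNED_BLOCK_SIZES live in
# ragged_paged_attention_v2.py.
DEFAULT_BKV_BQ = (128, 32)
TUNED_BLOCK_SIZES = {
    "TPU v5e": {
        ("bfloat16", "bfloat16", 8, 1, 128, 16, 512, 2048): (256, 64),
    },
}

def lookup_block_sizes(device_name, simplified_key, key):
  bkv, bq = DEFAULT_BKV_BQ
  table = TUNED_BLOCK_SIZES.get(device_name, {})
  if simplified_key in table:
    bkv, bq = table[simplified_key]
  else:
    # Same fallback-plus-warning behavior the commit adds: keep the defaults
    # but make the table miss visible in the logs.
    logging.warning(
        "simplified_key(%s) is not in the ragged attention kernel's tuning "
        "table! The key before simplification is %s", simplified_key, key)
  return bkv, bq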

torchax/torchax/interop.py

Lines changed: 3 additions & 4 deletions
@@ -239,8 +239,7 @@ def j2t_autograd(fn, call_jax=call_jax):

   @wraps(fn)
   def inner(*args, **kwargs):
-    from jax.tree_util import tree_flatten, tree_unflatten
-    from jax.util import safe_zip
+    from jax.tree_util import tree_flatten

     class JaxFun(torch.autograd.Function):

@@ -275,8 +274,8 @@ def backward(ctx, *grad_out):
         # The subsequent gradients correspond to flat_inputs.
         # We need to put a None for inputs that did not require gradients.
         final_grads = [None]
-        for needs_grad, grad in safe_zip(ctx.needs_input_grad[1:],
-                                         input_grads_structured):
+        for needs_grad, grad in zip(
+            ctx.needs_input_grad[1:], input_grads_structured, strict=True):
           final_grads.append(grad if needs_grad else None)

         return tuple(final_grads)
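The switch from jax.util.safe_zip to the builtin zip(..., strict=True) (Python 3.10+) keeps the same length check without relying on the jax.util helper. A small illustration with made-up values:

# zip(..., strict=True) raises ValueError when the iterables differ in
# length, matching the check that jax.util.safe_zip performed.
needs_input_grad = (True, False, True)
input_grads = ("g0", None, "g2")

pairs = list(zip(needs_input_grad, input_grads, strict=True))  # OK: lengths match

try:
  list(zip(needs_input_grad, input_grads[:2], strict=True))
except ValueError as err:
  print(err)  # zip() argument 2 is shorter than argument 1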
