
Commit 1b9ba25

xslingcn and yzh119 authored
bugfix: softmax NaN results caused by large -inf masks (#1178)
## 📌 Description

For inputs with many `-inf` masks (e.g. top-k masked logits), our current softmax kernel produces all-`NaN` results when a thread sees a slice of input made up entirely of `-inf`s. This PR adds checks to fix it.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

---------

Co-authored-by: yzh119 <[email protected]>
1 parent f70b66d commit 1b9ba25
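To make the failure mode concrete, here is a minimal Python sketch (illustrative only, not FlashInfer code): once a thread's slice of logits is entirely `-inf`, the slice maximum is also `-inf`, so the max-shifted exponentials evaluate `exp(-inf - (-inf))`, which is `NaN`, and that `NaN` poisons the block reduction.

```python
import math

# Hypothetical slice seen by one thread after heavy -inf masking
# (e.g. top-k masked logits): every element is -inf.
neg_inf = float("-inf")
slice_logits = [neg_inf] * 4

block_max = max(slice_logits)                    # -inf
shifted = [x - block_max for x in slice_logits]  # (-inf) - (-inf) == nan
exps = [math.exp(s) for s in shifted]            # exp(nan) == nan
print(sum(exps))                                 # nan -> poisons the softmax denominator
```

The fix below guards this case: when the block maximum is `-inf`, the block contains no finite logits and is skipped when updating the running max/denominator.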

File tree

- include/flashinfer/sampling.cuh
- tests/test_sampling.py

2 files changed: +50 -47 lines changed

include/flashinfer/sampling.cuh

Lines changed: 40 additions & 45 deletions
@@ -358,33 +358,31 @@ __global__ void OnlineSoftmaxFusedKernel(DType* logits, DType* output, DType* te
     __syncthreads();
     block_max = temp_storage.shared_state.max_val;

-    float thread_sum = 0.0f;
+    // if block_max is -inf, then this block contains all -inf values, so we can skip updating
+    if (!isinf(block_max)) {
+      float thread_sum = 0.0f;
 #pragma unroll
-    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
-      thread_sum += __expf(logits_vec[j] - block_max);  // e^(-inf) is safe to add
-    }
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        thread_sum += __expf(logits_vec[j] - block_max);
+      }

-    float block_sum =
-        cub::BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce).Sum(thread_sum);
-    if (tx == 0) {
-      temp_storage.shared_state.denominator = block_sum;
-    }
-    __syncthreads();
-    block_sum = temp_storage.shared_state.denominator;
+      float block_sum =
+          cub::BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce).Sum(thread_sum);
+      __syncthreads();

-    if (tx == 0) {
-      float new_max = max(running_max, block_max);
-      running_denominator = running_denominator * __expf(running_max - new_max) +
-                            block_sum * __expf(block_max - new_max);
-      running_max = new_max;
+      if (tx == 0) {
+        float new_max = max(running_max, block_max);
+        running_denominator = running_denominator * __expf(running_max - new_max) +
+                              block_sum * __expf(block_max - new_max);
+        running_max = new_max;

-      temp_storage.shared_state.max_val = running_max;
-      temp_storage.shared_state.denominator = running_denominator;
+        temp_storage.shared_state.max_val = running_max;
+        temp_storage.shared_state.denominator = running_denominator;
+      }
+      __syncthreads();
+      running_max = temp_storage.shared_state.max_val;
+      running_denominator = temp_storage.shared_state.denominator;
     }
-    __syncthreads();
-
-    running_max = temp_storage.shared_state.max_val;
-    running_denominator = temp_storage.shared_state.denominator;
   }

   const float final_max = running_max;
@@ -476,34 +474,31 @@ __global__ void OnlineSoftmaxMapKernel(DType* logits, PartialSoftmaxResult* part
     __syncthreads();
     block_max = temp_storage.shared_state.max_val;

-    float thread_sum = 0.0f;
+    // if block_max is -inf, then this block contains all -inf values, so we can skip updating
+    if (!isinf(block_max)) {
+      float thread_sum = 0.0f;
 #pragma unroll
-    for (uint32_t j = 0; j < VEC_SIZE; ++j) {
-      thread_sum += __expf(logits_vec[j] - block_max);
-    }
-
-    float block_sum =
-        cub::BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce).Sum(thread_sum);
+      for (uint32_t j = 0; j < VEC_SIZE; ++j) {
+        thread_sum += __expf(logits_vec[j] - block_max);
+      }

-    if (tx == 0) {
-      temp_storage.shared_state.denominator = block_sum;
-    }
-    __syncthreads();
-    block_sum = temp_storage.shared_state.denominator;
+      float block_sum =
+          cub::BlockReduce<float, BLOCK_THREADS>(temp_storage.block_prim.reduce).Sum(thread_sum);
+      __syncthreads();

-    if (tx == 0) {
-      float new_max = max(running_max, block_max);
-      running_denominator = running_denominator * __expf(running_max - new_max) +
-                            block_sum * __expf(block_max - new_max);
-      running_max = new_max;
+      if (tx == 0) {
+        float new_max = max(running_max, block_max);
+        running_denominator = running_denominator * __expf(running_max - new_max) +
+                              block_sum * __expf(block_max - new_max);
+        running_max = new_max;

-      temp_storage.shared_state.max_val = running_max;
-      temp_storage.shared_state.denominator = running_denominator;
+        temp_storage.shared_state.max_val = running_max;
+        temp_storage.shared_state.denominator = running_denominator;
+      }
+      __syncthreads();
+      running_max = temp_storage.shared_state.max_val;
+      running_denominator = temp_storage.shared_state.denominator;
     }
-    __syncthreads();
-
-    running_max = temp_storage.shared_state.max_val;
-    running_denominator = temp_storage.shared_state.denominator;
   }

   if (tx == 0) {
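For readers who want the guarded merge outside CUDA, the following NumPy sketch shows the same online-softmax update as a serial loop (an illustration only, not the library's implementation). The key point matches the change in both kernels: a chunk whose maximum is `-inf` contains no finite logits and contributes nothing, so the running `(max, denominator)` pair is left untouched instead of being updated through `exp(-inf - (-inf))`.

```python
import numpy as np

def online_softmax(logits: np.ndarray, chunk: int = 4) -> np.ndarray:
    """Serial sketch of the chunked online softmax used by the kernels above."""
    running_max, running_den = -np.inf, 0.0
    for start in range(0, logits.size, chunk):
        block = logits[start:start + chunk]
        block_max = block.max()
        # Mirror of the fix: an all -inf chunk has no finite logits, so skip the
        # update instead of evaluating exp(-inf - (-inf)) == nan.
        if np.isinf(block_max):
            continue
        block_sum = np.exp(block - block_max).sum()
        new_max = max(running_max, block_max)
        running_den = (running_den * np.exp(running_max - new_max)
                       + block_sum * np.exp(block_max - new_max))
        running_max = new_max
    return np.exp(logits - running_max) / running_den

# One chunk ([-inf] * 4) is entirely masked and exercises the guard.
x = np.array([1.0, 2.0, 0.5] + [-np.inf] * 8 + [3.0])
ref = np.exp(x - x.max()) / np.exp(x - x.max()).sum()
assert np.allclose(online_softmax(x), ref)
```

A side effect visible in the diff: the rewritten kernels only consume `block_sum` on thread 0 (which holds the CUB reduction result), so the earlier shared-memory broadcast of the per-block denominator is no longer needed.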

tests/test_sampling.py

Lines changed: 10 additions & 2 deletions
@@ -50,9 +50,17 @@ def gumbel_noise(shape, device):
 )
 @pytest.mark.parametrize("temperature", [1.0, 0.5, 0.1])
 @pytest.mark.parametrize("temperature_arr", [True, False])
-def test_softmax(batch_size, vocab_size, distribution, temperature, temperature_arr):
+@pytest.mark.parametrize("neg_inf_input", [True, False])
+def test_softmax(
+    batch_size, vocab_size, distribution, temperature, temperature_arr, neg_inf_input
+):
     torch.manual_seed(42)
     logits = distribution((batch_size, vocab_size), "cuda:0")
+    if neg_inf_input:
+        # assign random logits to -inf
+        num_inf = torch.randint(0, logits.numel() - 1, (), device=logits.device).item()
+        inf_idx = torch.randperm(logits.numel(), device=logits.device)[:num_inf]
+        logits.view(-1).index_fill_(0, inf_idx, float("-inf"))

     if temperature_arr:
         temperature_arr = torch.full((batch_size,), temperature, device="cuda:0")
@@ -64,7 +72,7 @@ def test_softmax(batch_size, vocab_size, distribution, temperature, temperature_

     probs_ref = torch.softmax(logits_scaled, dim=-1)

-    assert torch.allclose(probs, probs_ref, atol=1e-3)
+    assert torch.allclose(probs, probs_ref, atol=1e-5)


 @pytest.mark.parametrize("vocab_size", [111, 32000, 128256])
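As a usage-level illustration of the "top-k masked logits" scenario from the description (plain PyTorch only, independent of FlashInfer's API; the shapes and `k` are arbitrary), top-k masking produces exactly the heavily `-inf` rows that the new `neg_inf_input` parametrization exercises, and `torch.softmax` remains the finite reference the test compares against:

```python
import torch

# Top-k masking: everything outside the k largest logits becomes -inf,
# so most of each row is -inf (the case that used to produce NaNs).
batch_size, vocab_size, k = 4, 128, 8
logits = torch.randn(batch_size, vocab_size)
topk_val, topk_idx = torch.topk(logits, k, dim=-1)
masked = torch.full_like(logits, float("-inf")).scatter(-1, topk_idx, topk_val)

# The reference used in test_softmax: finite because each row keeps k finite logits.
probs_ref = torch.softmax(masked, dim=-1)
assert torch.isfinite(probs_ref).all()
assert torch.allclose(probs_ref.sum(-1), torch.ones(batch_size))
```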
