 import torch
 from ninetoothed import Tensor
 
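+# Block size along the softmax dimension: a tunable meta-parameter instead of
+# hard-coding the full row length.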
+BLOCK_SIZE = ninetoothed.block_size()
+
 
 def arrangement(input, output, dim):
     assert input.ndim == output.ndim
 
     def create_axis_tile_shape(dim, dim_block):
-        return tuple(1 for _ in range(dim)) + (dim_block,) + tuple(1 for _ in range(input.ndim - dim - 1))
-
-    inner_block_shape = create_axis_tile_shape(dim, input.shape[dim])
+        return (
+            tuple(1 for _ in range(dim))
+            + (dim_block,)
+            + tuple(1 for _ in range(input.ndim - dim - 1))
+        )
+
+    inner_block_shape = create_axis_tile_shape(dim, BLOCK_SIZE)
     outer_block_shape = create_axis_tile_shape(dim, -1)
-
+
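+    # arrange tiles a tensor into BLOCK_SIZE-element blocks along dim, then
+    # gathers all of a row's blocks into a single outer tile.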
     def arrange(input):
         input_arranged = input.tile(inner_block_shape).tile(outer_block_shape)
 
@@ -25,25 +31,36 @@ def arrange(input):
             tuple(d for d in range(input.ndim) if d != dim)
         )
         return input_arranged
-
-    input_arranged = arrange(input)
-    output_arranged = arrange(output)
 
-    return input_arranged, output_arranged
+    return arrange(input), arrange(output)
+
+
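+# exp is evaluated in float32 when the data is float16, presumably to avoid the
+# accuracy loss of a half-precision exp, then cast back to the target dtype.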
+def _exp(x, dtype):
+    exp_dtype = dtype if dtype != ntl.float16 else ntl.float32
+    return ntl.cast(ntl.exp(ntl.cast(x, exp_dtype)), dtype)
 
 
 def application(input, output):
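+    # output.dtype is a block here, so .dtype.dtype reaches the element dtype.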
+    dtype = output.dtype.dtype
+    prev_max = ntl.cast(float("-inf"), dtype)
+    denominator = ntl.cast(0, dtype)
+
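+    # First pass (online softmax): track the running maximum and rescale the
+    # partial denominator by exp(prev_max - curr_max) whenever the maximum grows.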
+    for i in range(input.shape[0]):
+        input_i = ntl.cast(input[i], dtype)
+        curr_max = ntl.cast(ntl.maximum(prev_max, ntl.max(input_i)), dtype)
+        input_max_diff_exp = _exp(input_i - curr_max, dtype)
+        prev_curr_max_diff_exp = _exp(prev_max - curr_max, dtype)
+        denominator = denominator * prev_curr_max_diff_exp + ntl.sum(input_max_diff_exp)
+        prev_max = curr_max
+
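+    # Second pass: normalize every block with the final maximum and denominator.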
     for i in range(input.shape[0]):
-        input_i = input[i]
-        row_minus_max = input_i - ntl.max(input_i)
-        numerator = ntl.exp(ntl.cast(row_minus_max, ntl.float32))
-        denominator = ntl.sum(numerator)
-        output[i] = numerator / denominator  # noqa: F841
+        numerator = _exp(input[i] - prev_max, dtype)
+        output[i] = numerator / denominator
 
 
-def softmax(input, dim, output=None):
-    if output is None:
-        output = torch.empty_like(input)
+def softmax(input, dim, dtype=None):
+    tensor_dtype = dtype if dtype is not None else input.dtype
+    output = torch.empty_like(input, dtype=tensor_dtype)
 
     kernel = _make(input.ndim, dim)
 
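The change replaces the single-shot softmax, which loaded an entire row at once
and used the row length as the block size, with a two-pass online softmax over
BLOCK_SIZE-wide blocks; softmax() now also allocates its own output and accepts
an optional dtype for the result instead of a preallocated output tensor. A
minimal NumPy sketch of the same recurrence, useful as a cross-check of the
algebra (the block size of 4 and the random test row are arbitrary and not part
of the kernel):

    import numpy as np

    def online_softmax(row, block=4):
        m, d = float("-inf"), 0.0
        # First pass: fold each block into a running max and a rescaled denominator.
        for start in range(0, row.size, block):
            x = row[start:start + block]
            m_new = max(m, float(x.max()))
            d = d * np.exp(m - m_new) + float(np.exp(x - m_new).sum())
            m = m_new
        # Second pass: normalize with the final max and denominator.
        return np.exp(row - m) / d

    row = np.random.randn(16).astype(np.float32)
    reference = np.exp(row - row.max()) / np.exp(row - row.max()).sum()
    assert np.allclose(online_softmax(row), reference, atol=1e-6)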