Commit 1b0a155

Authored by yewentao256
[Perf] Using __nv_fp8_e4m3 instead of c10::e4m3 for per_token_group_quant (#21867)
Signed-off-by: yewentao256 <[email protected]>
1 parent 44bc46d commit 1b0a155

File tree

1 file changed (+2, -4)

csrc/quantization/fp8/per_token_group_quant.cu

Lines changed: 2 additions & 4 deletions
@@ -1,12 +1,10 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <c10/util/Float8_e4m3fn.h>
 
 #include "../per_token_group_quant_8bit.h"
 
 #include <cmath>
 
-#include <cuda_fp16.h>
-#include <cuda_bf16.h>
+#include <cuda_fp8.h>
 
 #include <torch/all.h>
 
@@ -199,7 +197,7 @@ void per_token_group_quant_8bit(const torch::Tensor& input,
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "per_token_group_quant_8bit", ([&] {
         if (dst_type == at::ScalarType::Float8_e4m3fn) {
-          LAUNCH_KERNEL(scalar_t, c10::Float8_e4m3fn);
+          LAUNCH_KERNEL(scalar_t, __nv_fp8_e4m3);
         } else if (dst_type == at::ScalarType::Char) {
           LAUNCH_KERNEL(scalar_t, int8_t);
         }
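
The "[Perf]" in the title presumably refers to the conversion path: __nv_fp8_e4m3 is the native CUDA FP8 type from <cuda_fp8.h>, and constructing it from a float can lower to the hardware float->E4M3 conversion on GPUs that support FP8 (e.g. Ada/Hopper), whereas c10::Float8_e4m3fn rounds and packs the value in portable software code. A minimal sketch of the idea, assuming a standalone example kernel (quant_to_fp8 is hypothetical and not part of this commit):

    #include <cuda_fp8.h>

    // Hypothetical kernel illustrating the conversion path this commit switches to:
    // storing through __nv_fp8_e4m3 lets nvcc use the native float->E4M3 conversion
    // where the hardware provides it, instead of the software conversion performed
    // by c10::Float8_e4m3fn.
    __global__ void quant_to_fp8(const float* __restrict__ in,
                                 __nv_fp8_e4m3* __restrict__ out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        out[i] = __nv_fp8_e4m3(in[i]);  // native float -> e4m3 conversion
      }
    }

In the diff above, the second argument to LAUNCH_KERNEL is the destination element type, so after this change the per-token-group quantization kernel writes its FP8 output as __nv_fp8_e4m3 directly.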
