
Commit 34a20c4

[Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by: mgoin <[email protected]>
1 parent 31084b3 commit 34a20c4

File tree: 1 file changed, +8 −7 lines changed


vllm/v1/sample/ops/topk_topp_sampler.py

Lines changed: 8 additions & 7 deletions
@@ -33,7 +33,7 @@ def __init__(self):
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
                 if flashinfer_version < "0.2.3":
-                    logger.warning(
+                    logger.warning_once(
                         "FlashInfer version >= 0.2.3 required. "
                         "Falling back to default sampling implementation.")
                     self.forward = self.forward_native
@@ -46,17 +46,18 @@ def __init__(self):
                     # None means False, while in V1, None means True. This is
                     # why we use the condition
                     # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
-                    logger.info("Using FlashInfer for top-p & top-k sampling.")
+                    logger.info_once(
+                        "Using FlashInfer for top-p & top-k sampling.")
                     self.forward = self.forward_cuda
                 else:
-                    logger.warning(
+                    logger.warning_once(
                         "FlashInfer is available, but it is not enabled. "
                         "Falling back to the PyTorch-native implementation of "
                         "top-p & top-k sampling. For the best performance, "
                         "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
                     self.forward = self.forward_native
             else:
-                logger.warning(
+                logger.warning_once(
                     "FlashInfer is not available. Falling back to the PyTorch-"
                     "native implementation of top-p & top-k sampling. For the "
                     "best performance, please install FlashInfer.")
@@ -97,9 +98,9 @@ def forward_cuda(
             probs = logits.softmax(dim=-1, dtype=torch.float32)
             return random_sample(probs, generators)
         if generators:
-            logger.warning("FlashInfer 0.2.3+ does not support "
-                           "per-request generators. Falling back to "
-                           "PyTorch-native implementation.")
+            logger.warning_once("FlashInfer 0.2.3+ does not support "
+                                "per-request generators. Falling back to "
+                                "PyTorch-native implementation.")
             return self.forward_native(logits, generators, k, p)
         # flashinfer sampling functions expect contiguous logits.
         # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
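
Why the change: switching logger.warning/logger.info to their *_once variants means a message that would otherwise be repeated (for example, when the fallback path in forward_cuda is hit on every sampling step, or when the sampler is constructed more than once) is only emitted the first time. Below is a minimal, self-contained sketch of how a "log once" helper can be built with functools.lru_cache. It is an illustration of the deduplication idea only, not vLLM's actual logger implementation; in the diff itself, warning_once and info_once are methods on vLLM's logger object, whereas the standalone warning_once/info_once helpers here are hypothetical names used for the example.

    # Minimal sketch of "*_once" logging (assumption: not vLLM's real logger).
    import functools
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("topk_topp_sampler_sketch")


    @functools.lru_cache(maxsize=None)
    def _log_once(level: int, msg: str) -> None:
        # lru_cache memoizes on the (level, msg) pair, so an identical
        # message at the same level is only written the first time.
        logger.log(level, msg)


    def warning_once(msg: str) -> None:
        _log_once(logging.WARNING, msg)


    def info_once(msg: str) -> None:
        _log_once(logging.INFO, msg)


    if __name__ == "__main__":
        # Simulate the same fallback path being hit several times:
        # the warning below is printed exactly once.
        for _ in range(3):
            warning_once("FlashInfer is not available. Falling back to the "
                         "PyTorch-native implementation of top-p & top-k "
                         "sampling.")

The design trade-off is the usual one for init-time and hot-path warnings: the condition that triggers the message does not change between calls, so repeating it adds log noise without adding information.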
