
Commit 34a20c4

[Logs] Change flashinfer sampler logs to once (#21759)
Signed-off-by: mgoin <[email protected]>
1 parent 31084b3 commit 34a20c4

File tree: 1 file changed, +8 −7 lines changed


vllm/v1/sample/ops/topk_topp_sampler.py

Lines changed: 8 additions & 7 deletions
@@ -33,7 +33,7 @@ def __init__(self):
             if is_flashinfer_available:
                 flashinfer_version = flashinfer.__version__
                 if flashinfer_version < "0.2.3":
-                    logger.warning(
+                    logger.warning_once(
                         "FlashInfer version >= 0.2.3 required. "
                         "Falling back to default sampling implementation.")
                     self.forward = self.forward_native
@@ -46,17 +46,18 @@ def __init__(self):
                     # None means False, while in V1, None means True. This is
                     # why we use the condition
                     # `envs.VLLM_USE_FLASHINFER_SAMPLER is not False` here.
-                    logger.info("Using FlashInfer for top-p & top-k sampling.")
+                    logger.info_once(
+                        "Using FlashInfer for top-p & top-k sampling.")
                     self.forward = self.forward_cuda
                 else:
-                    logger.warning(
+                    logger.warning_once(
                         "FlashInfer is available, but it is not enabled. "
                         "Falling back to the PyTorch-native implementation of "
                         "top-p & top-k sampling. For the best performance, "
                         "please set VLLM_USE_FLASHINFER_SAMPLER=1.")
                     self.forward = self.forward_native
             else:
-                logger.warning(
+                logger.warning_once(
                     "FlashInfer is not available. Falling back to the PyTorch-"
                     "native implementation of top-p & top-k sampling. For the "
                     "best performance, please install FlashInfer.")
@@ -97,9 +98,9 @@ def forward_cuda(
             probs = logits.softmax(dim=-1, dtype=torch.float32)
             return random_sample(probs, generators)
         if generators:
-            logger.warning("FlashInfer 0.2.3+ does not support "
-                           "per-request generators. Falling back to "
-                           "PyTorch-native implementation.")
+            logger.warning_once("FlashInfer 0.2.3+ does not support "
+                                "per-request generators. Falling back to "
+                                "PyTorch-native implementation.")
             return self.forward_native(logits, generators, k, p)
         # flashinfer sampling functions expect contiguous logits.
         # In flex_attn/triton_attn fp32 inference, logits can be non-contiguous
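
Why the change: switching logger.warning/logger.info to their *_once variants means a message that would otherwise be repeated (for example, when the fallback path in forward_cuda is hit on every sampling step, or when the sampler is constructed more than once) is only emitted the first time. Below is a minimal, self-contained sketch of how a "log once" helper can be built with functools.lru_cache. It is an illustration of the deduplication idea only, not vLLM's actual logger implementation; in the diff itself, warning_once and info_once are methods on vLLM's logger object, whereas the standalone warning_once/info_once helpers here are hypothetical names used for the example.

    # Minimal sketch of "*_once" logging (assumption: not vLLM's real logger).
    import functools
    import logging

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger("topk_topp_sampler_sketch")


    @functools.lru_cache(maxsize=None)
    def _log_once(level: int, msg: str) -> None:
        # lru_cache memoizes on the (level, msg) pair, so an identical
        # message at the same level is only written the first time.
        logger.log(level, msg)


    def warning_once(msg: str) -> None:
        _log_once(logging.WARNING, msg)


    def info_once(msg: str) -> None:
        _log_once(logging.INFO, msg)


    if __name__ == "__main__":
        # Simulate the same fallback path being hit several times:
        # the warning below is printed exactly once.
        for _ in range(3):
            warning_once("FlashInfer is not available. Falling back to the "
                         "PyTorch-native implementation of top-p & top-k "
                         "sampling.")

The design trade-off is the usual one for init-time and hot-path warnings: the condition that triggers the message does not change between calls, so repeating it adds log noise without adding information.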
