Use functools.cache to memoize vllm_version_is() instead of storing its result in the scheduler's self.lock_version attribute

lianyiibo · lianyiibo · commit 37f3c78d8f5d · 2025-07-16T18:29:51.000+08:00
Signed-off-by: lianyibo <lianyibo1@kunlunit.com>
diff --git a/vllm_ascend/core/scheduler.py b/vllm_ascend/core/scheduler.py
@@ -53,7 +53,6 @@ def __init__(
                          include_finished_set, log_stats)
         self.scheduled_req_ids: set[str] = set()
         self.running: list[Request] = []
-        self.lock_version = vllm_version_is("0.9.2")
 
     def schedule(self) -> SchedulerOutput:
         if self.scheduler_config.chunked_prefill_enabled:
@@ -284,13 +283,13 @@ def skip_cur_request():
                     # allow the lower-priority requests to be scheduled.
                     req_index += 1
                     continue
-                if self.lock_version:
+                if vllm_version_is("0.9.2"):
                     num_draft_tokens = max(
                         num_new_tokens + request.num_computed_tokens -
                         request.num_tokens, 0)
 
                 while True:
-                    if self.lock_version:
+                    if vllm_version_is("0.9.2"):
                         new_blocks = self.kv_cache_manager.allocate_slots(
                             request,
                             num_new_tokens,
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
@@ -22,6 +22,7 @@
 import math
 import os
 import shutil
+import functools
 from contextlib import contextmanager, nullcontext
 from enum import Enum
 from threading import Lock
@@ -280,6 +281,7 @@ def adapt_patch(is_global_patch: bool = False):
         from vllm_ascend.patch import worker  # noqa: F401
 
 
+@functools.cache
 def vllm_version_is(target_vllm_version: str):
     if envs.VLLM_VERSION is not None:
         vllm_version = envs.VLLM_VERSION