
Commit cd4e928

[HybridKV][Bugfix] Fix hybrid KV cache sharing bug within the same attention type
Signed-off-by: MengqingCao <[email protected]>
1 parent 1f25d60 commit cd4e928

2 files changed: +26 -20 lines

tests/e2e/multicard/test_qwen3_next.py

Lines changed: 2 additions & 2 deletions
@@ -27,12 +27,12 @@
 def test_models_distributed_Qwen3_NEXT_TP4():
     example_prompts = [
         "Hello, my name is",
-    ]
+    ] * 4
     max_tokens = 5
     with VllmRunner("Qwen/Qwen3-Next-80B-A3B-Instruct",
                     tensor_parallel_size=4,
                     max_model_len=4096,
-                    gpu_memory_utilization=0.7,
+                    gpu_memory_utilization=0.8,
                     distributed_executor_backend="mp",
                     enforce_eager=True) as vllm_model:
         vllm_model.generate_greedy(example_prompts, max_tokens)

vllm_ascend/worker/model_runner_v1.py

Lines changed: 24 additions & 18 deletions
@@ -3175,25 +3175,26 @@ def initialize_kv_cache_tensors(
             # TODO: REFACTOR ME to sharing hybrid cache
             for idx in range(len(kv_cache_tensor.shared_by)):
                 layer_name = kv_cache_tensor.shared_by[idx]
-                if "linear_attn" in layer_name:
+                if "linear_attn" in layer_name and layer_name not in kv_cache_raw_tensors.keys(
+                ):
                     # for mamba linear attention
+                    if self.vllm_config.kv_transfer_config is None:
+                        tensor = torch.zeros(kv_cache_tensor.size,
+                                             dtype=torch.int8,
+                                             device=self.device)
+                    else:
+                        cache_size_aligned = kv_cache_tensor.size + alignment
+                        tensor = torch.zeros(cache_size_aligned,
+                                             dtype=torch.int8,
+                                             device=self.device)
+                        tensor = self._align_memory(
+                            tensor, alignment)[:kv_cache_tensor.size]
                     for layer_name_inner in kv_cache_tensor.shared_by:
-                        if ("attn" in layer_name_inner and "linear_attn" not in layer_name_inner) or \
-                            layer_name_inner in kv_cache_raw_tensors.keys():
-                            continue
-                        if self.vllm_config.kv_transfer_config is None:
-                            tensor = torch.zeros(kv_cache_tensor.size,
-                                                 dtype=torch.int8,
-                                                 device=self.device)
-                        else:
-                            cache_size_aligned = kv_cache_tensor.size + alignment
-                            tensor = torch.zeros(cache_size_aligned,
-                                                 dtype=torch.int8,
-                                                 device=self.device)
-                            tensor = self._align_memory(
-                                tensor, alignment)[:kv_cache_tensor.size]
-                        kv_cache_raw_tensors[layer_name_inner] = tensor
-                elif "attn" in layer_name:
+                        # share the kvcache between the linear_attn specs in the same group
+                        if "linear_attn" in layer_name_inner:
+                            kv_cache_raw_tensors[layer_name_inner] = tensor
+                elif "attn" in layer_name and layer_name not in kv_cache_raw_tensors.keys(
+                ):
                     # for other attentions, e.g., self_attn, sliding window attn
                     if self.vllm_config.kv_transfer_config is None:
                         k_tensor = torch.zeros(kv_cache_tensor.size // 2,
@@ -3215,7 +3216,12 @@ def initialize_kv_cache_tensors(
                                                       alignment)[:cache_size]
                         v_tensor = self._align_memory(v_tensor,
                                                       alignment)[:cache_size]
-                    kv_cache_raw_tensors[layer_name] = (k_tensor, v_tensor)
+                    for layer_name_inner in kv_cache_tensor.shared_by:
+                        # share the kvcache between the self_attn specs in the same group
+                        if ("attn" in layer_name_inner
+                                and "linear_attn" not in layer_name_inner):
+                            kv_cache_raw_tensors[layer_name_inner] = (k_tensor,
+                                                                      v_tensor)
 
         layer_names = set()
         for group in kv_cache_config.kv_cache_groups: