vllm-project
diff --git a/‎tests/v1/sample/test_tree_rejection_sampler.py
Lines changed: 117 additions & 0 deletions b/‎tests/v1/sample/test_tree_rejection_sampler.py
Lines changed: 117 additions & 0 deletions
diff --git a/‎vllm/attention/layer.py
Lines changed: 4 additions & 1 deletion b/‎vllm/attention/layer.py
Lines changed: 4 additions & 1 deletion
diff --git a/‎vllm/config/__init__.py
Lines changed: 14 additions & 13 deletions b/‎vllm/config/__init__.py
Lines changed: 14 additions & 13 deletions
diff --git a/‎vllm/model_executor/models/llama.py
Lines changed: 9 additions & 0 deletions b/‎vllm/model_executor/models/llama.py
Lines changed: 9 additions & 0 deletions
diff --git a/‎vllm/tree_drafter_params.py
Lines changed: 103 additions & 0 deletions b/‎vllm/tree_drafter_params.py
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,117 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+from torch import Generator
+
+from vllm.platforms import current_platform
+from vllm.v1.sample.ops.topk_topp_sampler import (
+    apply_top_k_top_p,
+    is_flashinfer_available,
+)
+
+# Device type reported by the current platform; the tests below install it
+# as torch's default device.
+DEVICE = current_platform.device_type
+
+# Problem size shared by the tests in this module (vocab size is 131072).
+BATCH_SIZE = 1024
+VOCAB_SIZE = 128 * 1024
+
+# The FlashInfer comparison is only enabled on CUDA when FlashInfer is
+# reported as available.
+# NOTE(review): is_flashinfer_available is used as a value here, not called;
+# confirm it is a boolean flag rather than a function.
+FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available
+if is_flashinfer_available:
+    from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs
+
+
+@pytest.fixture(autouse=True)
+def reset_default_device():
+    """
+    Restore torch's default device after each test.
+
+    The tests in this module call ``torch.set_default_device``, which changes
+    global state and can affect subsequent tests. This autouse fixture records
+    the default device before the test runs and reinstates it afterwards.
+    """
+    original_device = torch.get_default_device()
+    yield
+    torch.set_default_device(original_device)
+
+
+def test_topk_impl_equivalence():
+    """
+    Check that apply_top_k_top_p gives the same result when called with
+    top-k only (p=None) and when called with top-k plus a top-p of 1.0.
+    """
+
+    torch.set_default_device(DEVICE)
+    generator = Generator(device=DEVICE).manual_seed(33)
+
+    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
+
+    # Random top-k values between 1 and 9.
+    k = torch.randint(1, 10, (BATCH_SIZE,), generator=generator)
+
+    # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
+    k.masked_fill_(
+        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=bool), VOCAB_SIZE
+    )
+
+    # Top-k only implementation
+    result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)
+
+    # Top-k combined with top-p, where p=1.0 is expected to be a no-op.
+    no_op_top_p = torch.tensor([1.0])
+    result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p)
+
+    # Both code paths must agree on the filtered logits.
+    assert torch.allclose(result1, result2)
+
+
+def test_tree_rejection_sampler():
+    """
+    This test verifies that the FlashInfer top-k and top-p renormalization
+    produces the same probabilities as the Python implementation
+    (apply_top_k_top_p followed by softmax).
+
+    NOTE: FlashInfer did not directly expose an interface for fused top-k and
+    top-p prob renorm (it did provide fused sampling but we cannot compare
+    sampling results due to randomness), so we compare the probabilities
+    renormalized consecutively by FlashInfer's top-k and then top-p renorm.
+
+    NOTE(review): despite its name, nothing in this test exercises a tree
+    rejection sampler; it only compares top-k/top-p implementations. Confirm
+    whether the test name and file name are intended.
+    """
+
+    if not FLASHINFER_ENABLED:
+        pytest.skip("FlashInfer not installed or not available on this platform.")
+
+    torch.set_default_device(DEVICE)
+    generator = Generator(device=DEVICE).manual_seed(42)
+
+    # Generate random logits
+    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)
+
+    # Generate various top-k and top-p values
+    k_values = torch.randint(1, 1000, (BATCH_SIZE,), generator=generator)
+    p_values = (
+        torch.rand((BATCH_SIZE,), generator=generator) * 0.5 + 0.5
+    )  # range in [0.5, 1.0]
+
+    # Sometimes disable top-k (k=vocab_size)
+    k_values.masked_fill_(
+        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=torch.bool),
+        VOCAB_SIZE,
+    )
+
+    # Sometimes disable top-p (p=1.0)
+    p_values.masked_fill_(
+        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=torch.bool), 1.0
+    )
+
+    # Reference path: filter the logits in Python, then convert to probs.
+    python_logits = apply_top_k_top_p(
+        logits=logits.clone(),
+        k=k_values,
+        p=p_values,
+    )
+    python_probs = torch.softmax(python_logits, dim=-1)
+
+    # FlashInfer only exposed renorm interfaces for probs so convert first
+    flashinfer_probs = torch.softmax(logits.clone(), dim=-1)
+    flashinfer_probs = top_k_renorm_probs(
+        probs=flashinfer_probs,
+        top_k=k_values,
+    )
+    flashinfer_probs = top_p_renorm_probs(
+        probs=flashinfer_probs,
+        top_p=p_values,
+    )
+
+    # Compare the results
+    assert torch.allclose(
+        python_probs, flashinfer_probs, atol=2e-2
+    ), "FlashInfer and Python sampling implementations do not match!"
@@ -320,6 +320,10 @@ def process_weights_after_loading(self, act_dtype: torch.dtype):
     def get_attn_backend(self) -> type[AttentionBackend]:
         return self.attn_backend
 
+    def get_kv_cache(self) -> torch.Tensor:
+        """Return this layer's KV cache entry for the active virtual engine.
+
+        The entry is selected from ``self.kv_cache`` using the
+        ``virtual_engine`` index of the current forward context.
+        """
+        virtual_engine = get_forward_context().virtual_engine
+        return self.kv_cache[virtual_engine]
+
 
 class MultiHeadAttention(nn.Module):
     """Multi-headed attention without any cache, used for ViT."""
@@ -409,7 +413,6 @@ def forward(
 
         return out.reshape(bsz, q_len, -1)
 
-
 def wait_for_kv_layer_from_connector(layer_name: str):
     if not has_kv_transfer_group() or not is_v1_kv_transfer_group():
         return
 
@@ -48,6 +48,7 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
+from vllm.tree_drafter_params import TreeDrafterParams
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
                         LazyLoader, common_broadcastable_dtype, random_uuid)
 
@@ -1980,6 +1981,9 @@ class SpeculativeConfig:
         ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
+    # params generated in the post-init stage for tree drafting.
+    tree_drafter_params: SkipValidation[TreeDrafterParams] = None
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -2201,12 +2205,9 @@ def __post_init__(self):
                         (i + 1) * (0, )
                         for i in range(self.num_speculative_tokens)
                     ])
-                else:
-                    # Sort the token tree breadth-first.
-                    tree_choices = ast.literal_eval(
-                        self.speculative_token_tree)
-                    self.speculative_token_tree = str(
-                        sorted(tree_choices, key=lambda t: (len(t), t)))
+                # Construct tree drafter params from the serialized token tree.
+                self.tree_drafter_params = TreeDrafterParams.from_spec_token_tree(
+                    self.speculative_token_tree)
 
                 self.draft_tensor_parallel_size = \
                     SpeculativeConfig._verify_and_get_draft_tp(
@@ -2518,7 +2519,7 @@ class MultiModalConfig:
 
     skip_mm_profiling: bool = False
     """
-    When enabled, skips multimodal memory profiling and only profiles with 
+    When enabled, skips multimodal memory profiling and only profiles with
     language backbone model during engine initialization.
 
     This reduces engine startup time but shifts the responsibility to users for
@@ -2581,24 +2582,24 @@ class PoolerConfig:
     ## for embeddings models
     normalize: Optional[bool] = None
     """
-    Whether to normalize the embeddings outputs. 
+    Whether to normalize the embeddings outputs.
     """
     dimensions: Optional[int] = None
     """
-    Reduce the dimensions of embeddings if model 
+    Reduce the dimensions of embeddings if model
     support matryoshka representation.
     """
 
     ## for classification models
     activation: Optional[bool] = None
     """
-    Whether to apply activation function to the classification outputs. 
+    Whether to apply activation function to the classification outputs.
     """
 
     ## for reward models
     softmax: Optional[bool] = None
     """
-    Whether to apply softmax to the reward outputs. 
+    Whether to apply softmax to the reward outputs.
     """
     step_tag_id: Optional[int] = None
     """
@@ -2624,9 +2625,9 @@ class PoolerConfig:
 
     max_embed_len: Optional[int] = None
     """
-    Maximum input length allowed for embedding generation. When set, allows 
+    Maximum input length allowed for embedding generation. When set, allows
     inputs longer than max_embed_len to be accepted for embedding models.
-    This parameter enables accepting long inputs without requiring 
+    This parameter enables accepting long inputs without requiring
     VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
     max_embed_len, it will be handled according to the original max_model_len
     validation logic. Defaults to None (i.e. set to max_model_len).
 
@@ -304,6 +304,9 @@ def forward(
         hidden_states = self.mlp(hidden_states)
         return hidden_states, residual
 
+    def get_kv_cache(self) -> torch.Tensor:
+        """Return the KV cache exposed by this layer's attention module."""
+        attention = self.self_attn.attn
+        return attention.get_kv_cache()
+
 
 @support_torch_compile
 class LlamaModel(nn.Module):
@@ -556,6 +559,12 @@ def get_eagle3_aux_hidden_state_layers(self) -> tuple[int]:
         num_layers = len(self.model.layers)
         return (2, num_layers // 2, num_layers - 3)
 
+    def get_layer_kv_caches(self) -> list[torch.Tensor]:
+        """Return the KV cache of every decoder layer, in layer order."""
+        return [layer.get_kv_cache() for layer in self.model.layers]
+
     def _init_model(self,
                     vllm_config: VllmConfig,
                     prefix: str = "",
 
@@ -0,0 +1,103 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Parameters derived from the speculative token tree for tree drafting."""
+
+import ast
+from dataclasses import dataclass
+from typing import Optional
+
+
+@dataclass
+class TreeDrafterParams:
+    """Properties of a speculative draft token tree, precomputed once.
+
+    Each node is identified by its path from the root: a tuple with one
+    child index per level. The root itself is not listed in the tree.
+    """
+
+    # Tree nodes sorted breadth-first: by path length, then by path.
+    tree_choices: list[tuple[int, ...]]
+    # Square boolean mask with len(tree_choices) + 1 rows (index 0 is the
+    # root). Each node attends to itself, the root and its ancestors.
+    attn_mask: list[list[bool]]
+    # First level whose entry in child_drafts_per_level exceeds one, or None
+    # when no level branches.
+    first_branching_level: Optional[int]
+    # Cumulative number of nodes up to and including each level.
+    cu_drafts_per_level: list[int]
+    # Entry 0 is the node count of the first level; every later entry is the
+    # node count of that level floor-divided by the previous level's count.
+    child_drafts_per_level: list[int]
+
+    @staticmethod
+    def from_spec_token_tree(spec_token_tree: str) -> "TreeDrafterParams":
+        """Build the drafter parameters from a serialized token tree.
+
+        Args:
+            spec_token_tree: Python literal for a sequence of node paths,
+                e.g. ``"[(0,), (1,), (0, 0)]"``.
+
+        Returns:
+            The parameters for the tree, with nodes sorted breadth-first.
+
+        Raises:
+            ValueError: If the tree is empty, contains an empty path, or has
+                a level without any node.
+        """
+        # Parse the speculative token tree and sort it breadth-first.
+        # sorted() accepts any literal sequence of paths, not only a list.
+        tree_choices: list[tuple[int, ...]] = sorted(
+            ast.literal_eval(spec_token_tree), key=lambda t: (len(t), t)
+        )
+        # After sorting, the shortest path comes first, so checking it is
+        # enough to detect an empty path anywhere in the tree.
+        if not tree_choices or not tree_choices[0]:
+            raise ValueError(
+                "Speculative token tree must contain at least one node and "
+                f"no empty paths, got: {spec_token_tree!r}"
+            )
+
+        # The deepest node is last after the breadth-first sort.
+        tree_depth = len(tree_choices[-1])
+        # Precompute per-level properties of the tree.
+        num_drafts_per_level = [0] * tree_depth
+        for node in tree_choices:
+            num_drafts_per_level[len(node) - 1] += 1
+        # A level without nodes means deeper nodes have no parent; it would
+        # also cause a division by zero below.
+        if 0 in num_drafts_per_level:
+            raise ValueError(
+                "Speculative token tree has a level without nodes: "
+                f"{spec_token_tree!r}"
+            )
+
+        cu_drafts_per_level = []
+        running_total = 0
+        for num_drafts in num_drafts_per_level:
+            running_total += num_drafts
+            cu_drafts_per_level.append(running_total)
+
+        # NOTE(review): the floor division presumably assumes every node of
+        # a level has the same number of children; confirm with callers.
+        child_drafts_per_level = [num_drafts_per_level[0]]
+        for level in range(1, tree_depth):
+            child_drafts_per_level.append(
+                num_drafts_per_level[level] // num_drafts_per_level[level - 1]
+            )
+
+        # Find the first level where the tree branches off into more than
+        # one child.
+        first_branching_level = next(
+            (
+                level
+                for level, num_children in enumerate(child_drafts_per_level)
+                if num_children > 1
+            ),
+            None,
+        )
+
+        # Construct the tree attention bias.
+        depth_counts = _get_depth_counts(tree_choices)
+        attn_mask = _prepare_tree_attn_bias(
+            tree_choices,
+            depth_counts,
+        )
+
+        return TreeDrafterParams(
+            tree_choices=tree_choices,
+            attn_mask=attn_mask,
+            first_branching_level=first_branching_level,
+            cu_drafts_per_level=cu_drafts_per_level,
+            child_drafts_per_level=child_drafts_per_level,
+        )
+
+
+def _get_depth_counts(sorted_tree_choices: list[tuple[int, ...]]) -> list[int]:
+    """Count how many paths the tree has at each depth.
+
+    The paths are walked in the given order; a new counter is opened
+    whenever the path length differs from the previous path's length.
+    """
+    counts: list[int] = []
+    last_depth = 0
+    for choice in sorted_tree_choices:
+        cur_depth = len(choice)
+        if cur_depth != last_depth:
+            # Depth changed: open the counter for the next level.
+            counts.append(0)
+            last_depth = cur_depth
+        counts[cur_depth - 1] += 1
+    return counts
+
+
+def _prepare_tree_attn_bias(
+    sorted_tree_choices: list[tuple[int, ...]],
+    depth_counts: list[int],
+) -> list[list[bool]]:
+    """Build the boolean attention mask of a draft token tree.
+
+    Row and column 0 stand for the root; row and column ``i + 1`` stand for
+    ``sorted_tree_choices[i]``. Each node attends to itself, to the root and
+    to every ancestor on its path.
+
+    Args:
+        sorted_tree_choices: Node paths of the tree.
+        depth_counts: Number of nodes per depth, used to walk the nodes
+            level by level.
+
+    Returns:
+        A square mask with ``len(sorted_tree_choices) + 1`` rows.
+
+    Raises:
+        ValueError: If an ancestor of a node is missing from the tree.
+    """
+    # +1 comes from the additional root node.
+    tree_len = len(sorted_tree_choices) + 1
+    # All tokens attend to the root (column 0) and to themselves (diagonal).
+    tree_attn_mask = [
+        [col == 0 or col == row for col in range(tree_len)]
+        for row in range(tree_len)
+    ]
+
+    # Map each path to its mask index once, instead of scanning the list for
+    # every ancestor. setdefault keeps the first occurrence, like list.index.
+    node_index: dict[tuple[int, ...], int] = {}
+    for idx, choice in enumerate(sorted_tree_choices, start=1):
+        node_index.setdefault(tuple(choice), idx)
+
+    # Set all ancestors to True.
+    start = 0
+    for count in depth_counts:
+        for row in range(start + 1, start + count + 1):
+            path = sorted_tree_choices[row - 1]
+            # Proper prefixes of the path are its ancestors; a path of
+            # length one only has the root, which is already set.
+            for prefix_len in range(1, len(path)):
+                ancestor = tuple(path[:prefix_len])
+                ancestor_idx = node_index.get(ancestor)
+                if ancestor_idx is None:
+                    raise ValueError(f"{ancestor!r} is not in the token tree")
+                tree_attn_mask[row][ancestor_idx] = True
+        start += count
+    return tree_attn_mask