Commit 9c59df6

[V1] implement tree sampler for draft token acceptance
Signed-off-by: Giancarlo Delfin <[email protected]>
1 parent 4bf70cd commit 9c59df6

File tree

7 files changed: +513 −146 lines changed

Lines changed: 117 additions & 0 deletions
@@ -0,0 +1,117 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
import torch
from torch import Generator

from vllm.platforms import current_platform
from vllm.v1.sample.ops.topk_topp_sampler import (
    apply_top_k_top_p,
    is_flashinfer_available,
)

DEVICE = current_platform.device_type

BATCH_SIZE = 1024
VOCAB_SIZE = 128 * 1024

FLASHINFER_ENABLED = current_platform.is_cuda() and is_flashinfer_available
if is_flashinfer_available:
    from flashinfer.sampling import top_k_renorm_probs, top_p_renorm_probs


@pytest.fixture(autouse=True)
def reset_default_device():
    """
    Tests in this file explicitly set the default device, which can affect
    subsequent tests. This fixture restores the original default device
    afterward to avoid that problem.
    """
    original_device = torch.get_default_device()
    yield
    torch.set_default_device(original_device)


def test_topk_impl_equivalence():
    torch.set_default_device(DEVICE)
    generator = Generator(device=DEVICE).manual_seed(33)

    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)

    # Random top-k values between 1 and 9.
    k = torch.randint(1, 10, (BATCH_SIZE,), generator=generator)

    # Set k=vocab_size for ~50% of requests in the batch (top-k disabled).
    k.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=bool), VOCAB_SIZE
    )

    # Top-k only implementation
    result1 = apply_top_k_top_p(logits=logits.clone(), k=k, p=None)

    # Top-p + top-k
    no_op_top_p = torch.tensor([1.0])
    result2 = apply_top_k_top_p(logits=logits.clone(), k=k, p=no_op_top_p)

    assert torch.allclose(result1, result2)


def test_tree_rejection_sampler():
    """
    This test verifies that the FlashInfer top-k and top-p sampling
    implementation produces the same results as the Python implementation.

    NOTE: FlashInfer does not directly expose an interface for fused top-k and
    top-p prob renorm (it does provide fused sampling, but sampling results
    cannot be compared due to randomness), so we compare the probabilities
    renormalized first by top-k and then by top-p in the FlashInfer
    implementation.
    """
    if not FLASHINFER_ENABLED:
        pytest.skip("FlashInfer not installed or not available on this platform.")

    torch.set_default_device(DEVICE)
    generator = Generator(device=DEVICE).manual_seed(42)

    # Generate random logits
    logits = torch.rand((BATCH_SIZE, VOCAB_SIZE), generator=generator)

    # Generate various top-k and top-p values
    k_values = torch.randint(1, 1000, (BATCH_SIZE,), generator=generator)
    p_values = (
        torch.rand((BATCH_SIZE,), generator=generator) * 0.5 + 0.5
    )  # range in [0.5, 1.0]

    # Sometimes disable top-k (k=vocab_size)
    k_values.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=torch.bool),
        VOCAB_SIZE,
    )

    # Sometimes disable top-p (p=1.0)
    p_values.masked_fill_(
        torch.randint(0, 2, (BATCH_SIZE,), generator=generator, dtype=torch.bool), 1.0
    )

    python_logits = apply_top_k_top_p(
        logits=logits.clone(),
        k=k_values,
        p=p_values,
    )
    python_probs = torch.softmax(python_logits, dim=-1)

    # FlashInfer only exposes renorm interfaces for probs, so convert first
    flashinfer_probs = torch.softmax(logits.clone(), dim=-1)
    flashinfer_probs = top_k_renorm_probs(
        probs=flashinfer_probs,
        top_k=k_values,
    )
    flashinfer_probs = top_p_renorm_probs(
        probs=flashinfer_probs,
        top_p=p_values,
    )

    # Compare the results
    assert torch.allclose(
        python_probs, flashinfer_probs, atol=2e-2
    ), "FlashInfer and Python sampling implementations do not match!"

vllm/config/__init__.py

Lines changed: 7 additions & 6 deletions
@@ -47,6 +47,7 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
+from vllm.tree_drafter_params import TreeDrafterParams
 # yapf conflicts with isort for this block
 # yapf: disable
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS,
@@ -2277,6 +2278,9 @@ class SpeculativeConfig:
         ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
+    # params generated in the post-init stage for tree drafting.
+    tree_drafter_params: SkipValidation[TreeDrafterParams] = None
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -2498,12 +2502,9 @@ def __post_init__(self):
                     (i + 1) * (0, )
                     for i in range(self.num_speculative_tokens)
                 ])
-            else:
-                # Sort the token tree breadth-first.
-                tree_choices = ast.literal_eval(
-                    self.speculative_token_tree)
-                self.speculative_token_tree = str(
-                    sorted(tree_choices, key=lambda t: (len(t), t)))
+            # Construct tree drafter params from the serialized token tree.
+            self.tree_drafter_params = TreeDrafterParams.from_spec_token_tree(
+                self.speculative_token_tree)
 
             self.draft_tensor_parallel_size = \
                 SpeculativeConfig._verify_and_get_draft_tp(
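
For readers following the config change: speculative_token_tree is a Python-literal string of node paths, and __post_init__ now hands it straight to TreeDrafterParams. A minimal sketch of what that produces, using an illustrative two-level tree (the tree shape is an assumption, not taken from the commit; running it requires a checkout containing this commit):

from vllm.tree_drafter_params import TreeDrafterParams

# Illustrative tree: two children under the root, one grandchild under each.
# Each tuple is a path of child indices from the root.
spec_token_tree = "[(0,), (1,), (0, 0), (1, 0)]"

params = TreeDrafterParams.from_spec_token_tree(spec_token_tree)
print(params.cu_drafts_per_level)     # [2, 4]  cumulative drafts per level
print(params.child_drafts_per_level)  # [2, 1]  children per parent at each level
print(params.first_branching_level)   # 0       the root already branches
print(len(params.attn_mask))          # 5       4 draft nodes + 1 root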

vllm/tree_drafter_params.py

Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Parameters describing the token tree used for tree drafting."""

import ast
from dataclasses import dataclass
from typing import Optional


@dataclass
class TreeDrafterParams:
    tree_choices: list[tuple[int, ...]]
    attn_mask: list[list[bool]]
    first_branching_level: Optional[int]
    cu_drafts_per_level: list[int]
    child_drafts_per_level: list[int]

    @staticmethod
    def from_spec_token_tree(spec_token_tree: str) -> "TreeDrafterParams":
        # Parse the speculative token tree.
        tree_choices: list[tuple[int, ...]] = ast.literal_eval(spec_token_tree)
        # Sort the tree breadth-first.
        tree_choices.sort(key=lambda t: (len(t), t))

        tree_depth = len(tree_choices[-1])
        # Precompute per-level properties of the tree.
        num_drafts_per_level = [0] * tree_depth
        for node in tree_choices:
            num_drafts_per_level[len(node) - 1] += 1
        cu_drafts_per_level = [num_drafts_per_level[0]]
        child_drafts_per_level = [num_drafts_per_level[0]]
        for level in range(1, tree_depth):
            cu_drafts_per_level.append(
                cu_drafts_per_level[-1] + num_drafts_per_level[level]
            )
            child_drafts_per_level.append(
                num_drafts_per_level[level] // num_drafts_per_level[level - 1]
            )
        # Find the first level where the tree branches off into more than one
        # child.
        first_branching_level = None
        for level in range(tree_depth):
            if child_drafts_per_level[level] > 1:
                first_branching_level = level
                break

        # Construct the tree attention bias.
        depth_counts = _get_depth_counts(tree_choices)
        attn_mask = _prepare_tree_attn_bias(
            tree_choices,
            depth_counts,
        )

        return TreeDrafterParams(
            tree_choices=tree_choices,
            attn_mask=attn_mask,
            first_branching_level=first_branching_level,
            cu_drafts_per_level=cu_drafts_per_level,
            child_drafts_per_level=child_drafts_per_level,
        )


def _get_depth_counts(sorted_tree_choices: list[tuple[int, ...]]) -> list[int]:
    # Count the number of choices at each depth of the tree.
    depth_counts = []
    prev_depth = 0
    for path in sorted_tree_choices:
        depth = len(path)
        if depth != prev_depth:
            depth_counts.append(0)
        depth_counts[depth - 1] += 1
        prev_depth = depth
    return depth_counts


def _prepare_tree_attn_bias(
    sorted_tree_choices: list[tuple[int, ...]],
    depth_counts: list[int],
) -> list[list[bool]]:
    # +1 comes from the additional root node.
    tree_len = len(sorted_tree_choices) + 1
    tree_attn_mask = [[False for _ in range(tree_len)] for _ in range(tree_len)]

    mask_val = True
    for i in range(tree_len):
        # Set diagonal to all True. Each token should attend to itself.
        tree_attn_mask[i][i] = mask_val
        # Set root column to all True. All tokens attend to it.
        tree_attn_mask[i][0] = mask_val

    # Set all ancestors to True.
    start = 0
    for i in range(len(depth_counts)):
        for j in range(depth_counts[i]):
            cur_tree_choice = sorted_tree_choices[start + j]
            if len(cur_tree_choice) == 1:
                continue

            for c in range(len(cur_tree_choice) - 1):
                ancestor_idx = sorted_tree_choices.index(cur_tree_choice[: c + 1]) + 1
                tree_attn_mask[j + start + 1][ancestor_idx] = mask_val
        start += depth_counts[i]
    return tree_attn_mask
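
A quick worked example of the attention bias _prepare_tree_attn_bias builds, again on an illustrative three-node tree (an assumption for demonstration, and it assumes the module above is importable, i.e. a checkout containing this commit):

from vllm.tree_drafter_params import TreeDrafterParams

params = TreeDrafterParams.from_spec_token_tree("[(0,), (1,), (0, 0)]")

# Node order after the breadth-first sort: root, (0,), (1,), (0, 0).
# Row i marks which nodes token i may attend to.
for row in params.attn_mask:
    print([int(v) for v in row])
# [1, 0, 0, 0]   root attends only to itself
# [1, 1, 0, 0]   (0,) attends to the root and itself
# [1, 0, 1, 0]   (1,) attends to the root and itself
# [1, 1, 0, 1]   (0, 0) attends to the root, its parent (0,), and itself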
