vllm-project
diff --git a/‎tests/v1/e2e/test_spec_decode.py
Lines changed: 9 additions & 7 deletions b/‎tests/v1/e2e/test_spec_decode.py
Lines changed: 9 additions & 7 deletions
diff --git a/‎tests/v1/sample/test_tree_rejection_sampler.py
Lines changed: 146 additions & 0 deletions b/‎tests/v1/sample/test_tree_rejection_sampler.py
Lines changed: 146 additions & 0 deletions
diff --git a/‎vllm/config/__init__.py
Lines changed: 14 additions & 13 deletions b/‎vllm/config/__init__.py
Lines changed: 14 additions & 13 deletions
diff --git a/‎vllm/tree_drafter_params.py
Lines changed: 92 additions & 0 deletions b/‎vllm/tree_drafter_params.py
Lines changed: 92 additions & 0 deletions
@@ -162,12 +162,6 @@ def test_eagle_correctness(
     mm_enabled: bool,
     attn_backend: str,
 ):
-    if attn_backend == "TREE_ATTN":
-        # TODO: Fix this flaky test
-        pytest.skip(
-            "TREE_ATTN is flaky in the test disable for now until it can be "
-            "reolved (see https://github.com/vllm-project/vllm/issues/22922)")
-
     # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
     '''
@@ -222,7 +216,15 @@ def test_eagle_correctness(
 
         # Heuristic: expect at least 66% of the prompts to match exactly
         # Upon failure, inspect the outputs to check for inaccuracy.
-        assert matches > int(0.66 * len(ref_outputs))
+        accuracy_threshold = 0.66
+
+        if attn_backend == "TREE_ATTN":
+            # Tree attention uses Triton kernels, which perform can perform
+            # non-deterministic floating arithmetic. Threshold needs to be
+            # reduced to 50% to prevent flaky tests.
+            accuracy_threshold = 0.50
+
+        assert matches > int(accuracy_threshold * len(ref_outputs))
         del spec_llm
         torch.cuda.empty_cache()
         cleanup_dist_env_and_memory()
@@ -0,0 +1,146 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+from typing import Any, Optional
+
+import pytest
+import torch
+import torch.nn.functional as F
+
+from vllm.platforms import current_platform
+from vllm.tree_drafter_params import TreeDrafterParams
+from vllm.v1.sample.logits_processor import LogitsProcessorManager
+from vllm.v1.sample.metadata import SamplingMetadata
+from vllm.v1.sample.sampler import Sampler
+from vllm.v1.sample.tree_rejection_sampler import (PLACEHOLDER_TOKEN_ID,
+                                              TreeRejectionSampler)
+from vllm.v1.spec_decode.metadata import SpecDecodeMetadata
+
+DEVICE = current_platform.device_type
+
+
+def create_logits_tensor(output_token_ids: list[list[int]],
+                         vocab_size: int = 100) -> torch.Tensor:
+    """Helper function to create logits tensor that
+       will produce desired token ids on argmax"""
+    token_ids = [tokens for tokens in output_token_ids]
+    num_total_tokens = sum(len(tokens) for tokens in token_ids)
+    logits = torch.full((num_total_tokens, vocab_size), -100.0, device=DEVICE)
+    start_loc = 0
+    for tokens in token_ids:
+        for j, token_id in enumerate(tokens):
+            logits[start_loc + j, token_id] = 100.0
+        start_loc += len(tokens)
+    return logits
+
+
+def create_sampling_metadata(
+    all_greedy: bool,
+    temperature: Optional[torch.Tensor] = None,
+    top_k: Optional[torch.Tensor] = None,
+    top_p: Optional[torch.Tensor] = None,
+    generators: Optional[dict[int, Any]] = None,
+) -> SamplingMetadata:
+    """Create a v1 sampling metadata object with all_greedy set
+        to the given value. Either all greedy or all random sampling
+        is used.
+    """
+    generators = generators or {}
+    if all_greedy:
+        temperature = None
+    else:
+        assert temperature is not None
+
+    return SamplingMetadata(
+        temperature=temperature,
+        all_greedy=all_greedy,
+        all_random=not all_greedy,
+        top_p=top_p,
+        top_k=top_k,
+        generators=generators,
+        max_num_logprobs=0,
+        no_penalties=True,
+        prompt_token_ids=None,
+        frequency_penalties=torch.tensor([]),
+        presence_penalties=torch.tensor([]),
+        repetition_penalties=torch.tensor([]),
+        output_token_ids=[],
+        allowed_token_ids_mask=None,
+        bad_words_token_ids={},
+        logitsprocs=LogitsProcessorManager(),
+    )
+
+
+########################### Tests for Greedy Sampling ###################
+
+def test_perfect_match():
+    """Test when output tokens perfectly match speculated tokens"""
+    tree_drafter_params = TreeDrafterParams.from_spec_token_tree(
+        "[(0, ), (0, 0), (0, 0, 0)]")
+    tree_rejection_sampler = TreeRejectionSampler(
+        tree_drafter_params=tree_drafter_params,
+        max_batch_size=1,
+        main_sampler=Sampler(),
+        device=None,
+    )
+
+    spec_tokens = [[1, 2, 3]]
+    output_tokens = [[1, 2, 3, 4]]  # 4 is the bonus token
+
+    metadata = create_sampling_metadata(all_greedy=True)
+    logits = create_logits_tensor(output_tokens)
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
+                                                         device=logits.device)
+
+    output = tree_rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        target_logits=logits,
+        bonus_token_ids=None,
+        sampling_metadata=metadata,
+    )
+    expected = torch.tensor(output_tokens,
+                            dtype=torch.int,
+                            device=logits.device)
+    assert torch.equal(output, expected)
+
+@pytest.mark.parametrize(
+    "spec_token_tree",
+    [
+        [(0, )],  # A single token
+        [(0, ), (0, 0), (0, 0, 0)],  # Chain
+        [(0, ), (1, ), (2, )],  # Parallel
+        [(0, ), (1, ), (2, ), (0, 0), (0, 1), (1, 0), (1, 1), (2, 0),
+         (2, 1)],  # Tree
+    ])
+def test_greedy_sampling(spec_token_tree: list[tuple[int]]):
+    """Test when output tokens perfectly match speculated tokens"""
+    tree_drafter_params = TreeDrafterParams.from_spec_token_tree(
+        str(spec_token_tree))
+    tree_rejection_sampler = TreeRejectionSampler(
+        tree_drafter_params=tree_drafter_params,
+        max_batch_size=1,
+        main_sampler=Sampler(),
+        device=None,
+    )
+
+    spec_tokens = [[i + 1 for i in range(len(spec_token_tree) + 1)]]
+    output_tokens = [[1, 2, 3, 4]]  # 4 is the bonus token
+    longest_path = find_longest_path()
+
+    metadata = create_sampling_metadata(all_greedy=True)
+    logits = create_logits_tensor(output_tokens)
+    spec_decode_metadata = SpecDecodeMetadata.make_dummy(spec_tokens,
+                                                         device=logits.device)
+
+    output = tree_rejection_sampler(
+        spec_decode_metadata,
+        draft_probs=None,
+        target_logits=logits,
+        bonus_token_ids=None,
+        sampling_metadata=metadata,
+    )
+    expected = torch.tensor(longest_path,
+                            dtype=torch.int,
+                            device=logits.device)
+
+    assert torch.equal(output, expected)
@@ -48,6 +48,7 @@
     try_get_tokenizer_config, uses_mrope)
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
+from vllm.tree_drafter_params import TreeDrafterParams
 from vllm.utils import (DEFAULT_MAX_NUM_BATCHED_TOKENS, LayerBlockType,
                         LazyLoader, common_broadcastable_dtype, random_uuid)
 
@@ -1980,6 +1981,9 @@ class SpeculativeConfig:
         ParallelConfig] = None  # type: ignore
     """The parallel configuration for the draft model initialized internal."""
 
+    # params generated in the post-init stage for tree drafting.
+    tree_drafter_params: SkipValidation[TreeDrafterParams] = None
+
     def compute_hash(self) -> str:
         """
         WARNING: Whenever a new field is added to this config,
@@ -2201,12 +2205,9 @@ def __post_init__(self):
                         (i + 1) * (0, )
                         for i in range(self.num_speculative_tokens)
                     ])
-                else:
-                    # Sort the token tree breadth-first.
-                    tree_choices = ast.literal_eval(
-                        self.speculative_token_tree)
-                    self.speculative_token_tree = str(
-                        sorted(tree_choices, key=lambda t: (len(t), t)))
+                # Construct tree drafter params from the serialized token tree.
+                self.tree_drafter_params = TreeDrafterParams.from_spec_token_tree(
+                    self.speculative_token_tree)
 
                 self.draft_tensor_parallel_size = \
                     SpeculativeConfig._verify_and_get_draft_tp(
@@ -2518,7 +2519,7 @@ class MultiModalConfig:
 
     skip_mm_profiling: bool = False
     """
-    When enabled, skips multimodal memory profiling and only profiles with 
+    When enabled, skips multimodal memory profiling and only profiles with
     language backbone model during engine initialization.
 
     This reduces engine startup time but shifts the responsibility to users for
@@ -2581,24 +2582,24 @@ class PoolerConfig:
     ## for embeddings models
     normalize: Optional[bool] = None
     """
-    Whether to normalize the embeddings outputs. 
+    Whether to normalize the embeddings outputs.
     """
     dimensions: Optional[int] = None
     """
-    Reduce the dimensions of embeddings if model 
+    Reduce the dimensions of embeddings if model
     support matryoshka representation.
     """
 
     ## for classification models
     activation: Optional[bool] = None
     """
-    Whether to apply activation function to the classification outputs. 
+    Whether to apply activation function to the classification outputs.
     """
 
     ## for reward models
     softmax: Optional[bool] = None
     """
-    Whether to apply softmax to the reward outputs. 
+    Whether to apply softmax to the reward outputs.
     """
     step_tag_id: Optional[int] = None
     """
@@ -2624,9 +2625,9 @@ class PoolerConfig:
 
     max_embed_len: Optional[int] = None
     """
-    Maximum input length allowed for embedding generation. When set, allows 
+    Maximum input length allowed for embedding generation. When set, allows
     inputs longer than max_embed_len to be accepted for embedding models.
-    This parameter enables accepting long inputs without requiring 
+    This parameter enables accepting long inputs without requiring
     VLLM_ALLOW_LONG_MAX_MODEL_LEN environment variable. When an input exceeds
     max_embed_len, it will be handled according to the original max_model_len
     validation logic. Defaults to None (i.e. set to max_model_len).
 
@@ -0,0 +1,92 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""Attention layer with TreeAttention."""
+
+import ast
+from dataclasses import dataclass
+
+@dataclass
+class TreeDrafterParams:
+    tree_choices: list[tuple[int, ...]]
+    attn_mask: list[list[bool]]
+    cu_drafts_per_level: list[int]
+    child_drafts_per_level: list[int]
+
+    @staticmethod
+    def from_spec_token_tree(spec_token_tree: str) -> "TreeDrafterParams":
+        # Parse the speculative token tree.
+        tree_choices: list[tuple[int, ...]] = ast.literal_eval(spec_token_tree)
+        # Sort the tree breadth-first.
+        tree_choices.sort(key=lambda t: (len(t), t))
+
+        tree_depth = len(tree_choices[-1])
+        # Precompute per-level properties of the tree.
+        num_drafts_per_level = [0] * tree_depth
+        for node in tree_choices:
+            num_drafts_per_level[len(node) - 1] += 1
+        cu_drafts_per_level = [num_drafts_per_level[0]]
+        child_drafts_per_level = [num_drafts_per_level[0]]
+        for level in range(1, tree_depth):
+            cu_drafts_per_level.append(
+                cu_drafts_per_level[-1] + num_drafts_per_level[level]
+            )
+            child_drafts_per_level.append(
+                num_drafts_per_level[level] // num_drafts_per_level[level - 1]
+            )
+
+        # Construct the tree attention bias.
+        depth_counts = _get_depth_counts(tree_choices)
+        attn_mask = _prepare_tree_attn_bias(
+            tree_choices,
+            depth_counts,
+        )
+
+        return TreeDrafterParams(
+            tree_choices=tree_choices,
+            attn_mask=attn_mask,
+            cu_drafts_per_level=cu_drafts_per_level,
+            child_drafts_per_level=child_drafts_per_level,
+        )
+
+
+def _get_depth_counts(sorted_tree_choices: list[tuple[int, ...]]) -> list[int]:
+    # Count the number of choices at each depth of the tree.
+    depth_counts = []
+    prev_depth = 0
+    for path in sorted_tree_choices:
+        depth = len(path)
+        if depth != prev_depth:
+            depth_counts.append(0)
+        depth_counts[depth - 1] += 1
+        prev_depth = depth
+    return depth_counts
+
+
+def _prepare_tree_attn_bias(
+    sorted_tree_choices: list[tuple[int, ...]],
+    depth_counts: list[int],
+) -> list[list[bool]]:
+    # +1 comes from the additional root node.
+    tree_len = len(sorted_tree_choices) + 1
+    tree_attn_mask = [[False for _ in range(tree_len)] for _ in range(tree_len)]
+
+    mask_val = True
+    for i in range(tree_len):
+        # Set diagonal to all True. Each token should attend to itself.
+        tree_attn_mask[i][i] = mask_val
+        # Set root column to all True. All tokens attend to it.
+        tree_attn_mask[i][0] = mask_val
+
+    # Set all ancestors to True.
+    start = 0
+    for i in range(len(depth_counts)):
+        for j in range(depth_counts[i]):
+            cur_tree_choice = sorted_tree_choices[start + j]
+            if len(cur_tree_choice) == 1:
+                continue
+
+            for c in range(len(cur_tree_choice) - 1):
+                ancestor_idx = sorted_tree_choices.index(cur_tree_choice[: c + 1]) + 1
+                tree_attn_mask[j + start + 1][ancestor_idx] = mask_val
+        start += depth_counts[i]
+    return tree_attn_mask