Commit 5cd6a5e

[V1] implement tree sampler for draft token acceptance
Signed-off-by: Giancarlo Delfin <[email protected]>
1 parent 3253ae7 commit 5cd6a5e

File tree

10 files changed: +808, -143 lines


tests/v1/e2e/test_spec_decode.py

Lines changed: 9 additions & 7 deletions
@@ -162,12 +162,6 @@ def test_eagle_correctness(
     mm_enabled: bool,
     attn_backend: str,
 ):
-    if attn_backend == "TREE_ATTN":
-        # TODO: Fix this flaky test
-        pytest.skip(
-            "TREE_ATTN is flaky in the test disable for now until it can be "
-            "reolved (see https://github.com/vllm-project/vllm/issues/22922)")
-
     # Generate test prompts inside the function instead of using fixture
     test_prompts = get_test_prompts(mm_enabled)
     '''
@@ -222,7 +216,15 @@ def test_eagle_correctness(
 
     # Heuristic: expect at least 66% of the prompts to match exactly
     # Upon failure, inspect the outputs to check for inaccuracy.
-    assert matches > int(0.66 * len(ref_outputs))
+    accuracy_threshold = 0.66
+
+    if attn_backend == "TREE_ATTN":
+        # The tree attention backend uses Triton kernels, which exhibit
+        # floating-point nondeterminism. Reducing the threshold to 50%
+        # to prevent flaky tests.
+        accuracy_threshold = 0.50
+
+    assert matches > int(accuracy_threshold * len(ref_outputs))
     del spec_llm
     torch.cuda.empty_cache()
     cleanup_dist_env_and_memory()
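
For context, the tree sampler named in the commit title verifies draft tokens in tree-based speculative decoding: the draft model proposes a tree of candidate continuations, the target model scores every node in a single forward pass, and the sampler accepts the longest root-to-leaf path whose tokens agree with the target's predictions. The following is a minimal, illustrative sketch of the greedy variant of that acceptance walk; the node layout and the names DraftNode and accept_longest_path are hypothetical, not vLLM's actual implementation.

# Illustrative sketch only -- NOT vLLM's tree sampler. DraftNode and
# accept_longest_path are hypothetical names for exposition.
from dataclasses import dataclass, field


@dataclass
class DraftNode:
    token_id: int          # token the draft model proposed at this node
    target_argmax: int     # target model's greedy next token at this node
    children: list["DraftNode"] = field(default_factory=list)


def accept_longest_path(root: DraftNode) -> list[int]:
    # Walk from the root, at each step accepting the child whose draft
    # token equals the target model's greedy prediction at that position.
    # When no child matches, emit the target's own token as a "bonus"
    # token and stop.
    accepted: list[int] = []
    node = root
    while True:
        match = next(
            (c for c in node.children if c.token_id == node.target_argmax),
            None,
        )
        if match is None:
            accepted.append(node.target_argmax)  # bonus token from target
            return accepted
        accepted.append(match.token_id)          # draft token accepted
        node = match


# Example: root is the last committed token; the draft proposed two
# branches, only one of which follows the target's greedy choices.
root = DraftNode(token_id=0, target_argmax=5, children=[
    DraftNode(token_id=5, target_argmax=9,
              children=[DraftNode(token_id=7, target_argmax=3)]),
    DraftNode(token_id=8, target_argmax=2),
])
print(accept_longest_path(root))  # [5, 9] -- one accepted draft token + bonus

Under purely greedy verification like this, speculative output should match the target model token for token; the test change above relaxes its exact-match threshold for TREE_ATTN because the Triton tree-attention kernels introduce floating-point nondeterminism rather than any sampler-level divergence.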
