2 changes: 2 additions & 0 deletions vllm/v1/core/sched/scheduler.py
@@ -532,6 +532,7 @@ def schedule(self) -> SchedulerOutput:
self._update_connector_prefix_cache_stats(
request, num_external_computed_tokens
)
request.num_external_computed_tokens += num_external_computed_tokens

# Request was already popped from self.waiting
# unless it was re-added above due to new_blocks being None.
@@ -1042,6 +1043,7 @@ def update_from_output(
kv_transfer_params=kv_transfer_params,
trace_headers=request.trace_headers,
num_cached_tokens=request.num_cached_tokens,
num_external_computed_tokens=request.num_external_computed_tokens,
)
)
else:
2 changes: 2 additions & 0 deletions vllm/v1/engine/__init__.py
@@ -121,6 +121,8 @@ class EngineCoreOutput(
trace_headers: Mapping[str, str] | None = None
# The number of tokens with prefix cache hits.
Member commented:

Yeah, this comment looks incorrect ... assuming "prefix cache" refers to the local cache?

        # Total computed tokens (local + external).
        num_computed_tokens = (
            num_new_local_computed_tokens + num_external_computed_tokens
        )
    ...
    # Count the number of prefix cached tokens.
    if request.num_cached_tokens < 0:
        request.num_cached_tokens = num_computed_tokens
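
For concreteness, a toy reading of the snippet above; the numbers below are made up for illustration:

    # Suppose, for one request during scheduling:
    num_new_local_computed_tokens = 40   # tokens hit in the local prefix cache
    num_external_computed_tokens = 60    # tokens whose KV was computed remotely

    num_computed_tokens = (
        num_new_local_computed_tokens + num_external_computed_tokens
    )                                    # 100

    # If request.num_cached_tokens is still negative (uninitialized), it is
    # then set to 100, i.e. local + external, not local prefix-cache hits
    # alone, which is what the comment above is flagging.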

Collaborator (Author) commented:

I am not familiar with it; cc @chaunceyjiang

num_cached_tokens: int = 0
# The number of tokens that have been computed remotely.
num_external_computed_tokens: int = 0
Member commented:

I'd be tempted to refactor these two into a PrefillStats object, and only include that in the ECO when the prefill completes, especially if we ever wanted to also send something like num_locally_cached_tokens too.
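
For illustration, a minimal sketch of what such a bundle might look like; PrefillStats and its exact fields here are hypothetical, not part of this PR:

    from dataclasses import dataclass


    @dataclass
    class PrefillStats:
        """Hypothetical container for prefill-time token accounting.

        The idea would be to attach this to EngineCoreOutput only once
        prefill completes, instead of carrying individual counters.
        """

        # Tokens reported as prefix cache hits today.
        num_cached_tokens: int = 0
        # Tokens whose KV was computed remotely.
        num_external_computed_tokens: int = 0
        # Possible future field mentioned above.
        num_locally_cached_tokens: int = 0

EngineCoreOutput could then carry a single optional prefill_stats field instead of the two counters.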

Collaborator (Author) commented:

I don't have a strong opinion on this, tbh; we can probably wait until we have a few more things to bundle before acting on the suggestion.


@property
def finished(self) -> bool:
2 changes: 1 addition & 1 deletion vllm/v1/metrics/loggers.py
@@ -113,7 +113,7 @@ def _reset(self, now):

def _track_iteration_stats(self, iteration_stats: IterationStats):
Member commented:

Presumably you want to update the Prometheus metric too?

Collaborator (Author) commented:

@markmc which one? I intentionally left self.counter_prompt_tokens unchanged to avoid replacing the actual prompt count.
Should I just make a new one for local tokens?
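
If a separate counter is the way to go, one possible shape is sketched below; the metric name vllm:local_prompt_tokens_total and the stand-in class are assumptions for illustration, not existing vLLM definitions:

    import prometheus_client


    class PrometheusLoggerSketch:
        """Stand-in for the Prometheus logger, showing only the new counter."""

        def __init__(self, labelnames: list[str], labelvalues: list[str]):
            # Hypothetical counter for prompt tokens actually computed locally,
            # kept separate from counter_prompt_tokens so the total prompt
            # count is not replaced.
            self.counter_local_prompt_tokens = prometheus_client.Counter(
                name="vllm:local_prompt_tokens_total",
                documentation="Number of prefill tokens computed locally.",
                labelnames=labelnames,
            ).labels(*labelvalues)

        def _track_iteration_stats(self, iteration_stats) -> None:
            self.counter_local_prompt_tokens.inc(
                iteration_stats.num_local_prompt_tokens
            )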

# Save tracked stats for token counters.
self.num_prompt_tokens += iteration_stats.num_prompt_tokens
self.num_prompt_tokens += iteration_stats.num_local_prompt_tokens
self.num_generation_tokens += iteration_stats.num_generation_tokens

def _get_throughput(self, tracked_stats: int, now: float) -> float:
5 changes: 5 additions & 0 deletions vllm/v1/metrics/stats.py
@@ -221,6 +221,8 @@ def __init__(self):
self.num_generation_tokens = 0
self.num_prompt_tokens = 0
self.num_preempted_reqs = 0
# Num of prompt tokens that have been computed locally.
Member commented:

Is the naming here a bit confusing? By "computed locally" here we mean both computed and locally cached?

If you just tracked num_external_computed_tokens and then subtracted it in _track_iteration_stats() would that be more clear?
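
A rough sketch of that alternative; it assumes IterationStats would expose num_prompt_tokens and num_external_computed_tokens, which is the suggestion above rather than what this PR implements:

    class LoggingStatLoggerSketch:
        """Stand-in for the logging stat logger, showing only the subtraction."""

        def __init__(self) -> None:
            self.num_prompt_tokens = 0
            self.num_generation_tokens = 0

        def _track_iteration_stats(self, iteration_stats) -> None:
            # Exclude remotely computed tokens so locally measured prefill
            # throughput is not overstated.
            self.num_prompt_tokens += (
                iteration_stats.num_prompt_tokens
                - iteration_stats.num_external_computed_tokens
            )
            self.num_generation_tokens += iteration_stats.num_generation_tokens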

Collaborator (Author) commented:

By "computed locally" here we mean both computed and locally cached?

Yes, the behavior is unchanged: cached tokens would still result in higher throughput, even in the regular aggregated setup.

> If you just tracked num_external_computed_tokens and then subtracted it in _track_iteration_stats() would that be more clear?

I think, looking at the diff

    self.num_prompt_tokens += iteration_stats.num_prompt_tokens
    -->
    self.num_prompt_tokens += iteration_stats.num_local_prompt_tokens

it is pretty clear that I just want to rule out the remote tokens, i.e. I assume this semantic was the intended one from the beginning; it's just that "local" used to be redundant.

self.num_local_prompt_tokens = 0
self.finished_requests: list[FinishedRequestStats] = []
self.max_num_generation_tokens_iter: list[int] = []
self.n_params_iter: list[int] = []
@@ -251,6 +253,9 @@ def update_from_output(
self.num_generation_tokens += num_new_generation_tokens
if is_prefilling:
self.num_prompt_tokens += prompt_len
self.num_local_prompt_tokens += (
prompt_len - output.num_external_computed_tokens
)

first_token_latency = self._time_since(req_stats.arrival_time)
self.time_to_first_tokens_iter.append(first_token_latency)
5 changes: 4 additions & 1 deletion vllm/v1/request.py
@@ -118,9 +118,12 @@ def __init__(
# indicates that the output is corrupted
self.num_nans_in_logits = 0

# The number of requests being preempted by the scheduler
# The number of requests being preempted by the scheduler.
self.num_preemptions = 0

# The number of tokens that have been computed remotely.
self.num_external_computed_tokens = 0

self.block_hashes: list[BlockHash] = []
self.get_hash_new_full_blocks: Callable[[], list[BlockHash]] | None = None
if block_hasher is not None: