remove num_token from EngineOutput (#4088)

lvhan028 · web-flow · commit b1765261d19a · 2025-10-31T15:23:03.000+08:00
diff --git a/benchmark/profile_throughput.py b/benchmark/profile_throughput.py
@@ -178,7 +178,7 @@ async def _inference(self, req_queue: Queue, session_id: int, temperature: float
                                                       stream_output=stream_output)
             try:
                 async for outputs in generator:
-                    n_token += outputs.num_token
+                    n_token += len(outputs.token_ids)
                     token_ids += outputs.token_ids
                     if not skip_detokenize:
                         _, state = self.tokenizer.detokenize_incrementally(token_ids, state)
diff --git a/lmdeploy/messages.py b/lmdeploy/messages.py
@@ -526,12 +526,11 @@ class RequestMetrics:
 
 @dataclass
 class EngineOutput:
-    """Engine output for turbomind/pytorch engine.
+    """Engine output from turbomind/pytorch engine.
 
     Args:
         status (ResponseType): the response type.
         token_ids (List[int]): the newly generated token ids in each iteration.
-        num_token (int): the newly generated token number, equal to `len(token_ids)`
         logprobs (List[Dict[int, float]]): the top logprobs for each output
             position.
         cache_block_ids (List[int]): send cache blocks back for migration in
@@ -540,7 +539,6 @@ class EngineOutput:
     """
     status: ResponseType
     token_ids: List[int]
-    num_token: int
     logprobs: List[Dict[int, float]] = None
     logits: torch.Tensor = None
     last_hidden_state: torch.Tensor = None
diff --git a/lmdeploy/metrics/stats.py b/lmdeploy/metrics/stats.py
@@ -198,7 +198,7 @@ def update_from_output(self, outputs: EngineOutput, req_state: RequestState):
             outputs (EngineOutput): The output from the engine containing information about the current iteration.
             req_state (RequestState): The state of the request, including timestamps and token counts.
         """
-        new_generation_tokens = outputs.num_token
+        new_generation_tokens = len(outputs.token_ids)
         if new_generation_tokens == 0:
             return
         self.new_generation_tokens = new_generation_tokens
@@ -213,7 +213,7 @@ def update_from_output(self, outputs: EngineOutput, req_state: RequestState):
         # update the latest token generation time
         req_state.lastest_token_time = outputs.req_metrics.token_timestamp
         # update the number of generated tokens
-        req_state.generation_tokens += outputs.num_token
+        req_state.generation_tokens += new_generation_tokens
 
         if outputs.status != ResponseType.SUCCESS:
             req_state.finish_reason = outputs.status
diff --git a/lmdeploy/pytorch/engine/engine_instance.py b/lmdeploy/pytorch/engine/engine_instance.py
@@ -126,7 +126,7 @@ async def async_stream_infer(self,
             int: The number of the output tokens.
         """
         if len(input_ids) > self.max_input_len:
-            yield EngineOutput(ResponseType.INPUT_LENGTH_ERROR, [], 0)
+            yield EngineOutput(ResponseType.INPUT_LENGTH_ERROR, [])
             return
         gen_config = gen_config or GenerationConfig()
         sampling_param = SamplingParam.from_gen_config(gen_config=gen_config)
@@ -158,7 +158,6 @@ async def async_stream_infer(self,
                 logger.debug(f'session[{session_id}] success: num_out_ids={num_ids}.')
                 yield EngineOutput(resp.type,
                                    token_ids[output_offset:],
-                                   num_ids,
                                    cache_block_ids=cache_block_ids,
                                    req_metrics=req_metrics,
                                    logprobs=logprobs)
@@ -171,15 +170,14 @@ async def async_stream_infer(self,
                 logger.debug(f'session[{session_id}] finish: num_out_ids={num_ids}.')
                 yield EngineOutput(resp.type,
                                    token_ids[output_offset:],
-                                   num_ids,
                                    logits=logits,
                                    cache_block_ids=cache_block_ids,
                                    req_metrics=req_metrics,
                                    logprobs=logprobs)
                 break
             else:
                 logger.debug(f'session[{session_id}] failed.')
-                yield EngineOutput(resp.type, [], 0)
+                yield EngineOutput(resp.type, [])
                 break
 
     async def async_infer(self,
diff --git a/lmdeploy/pytorch/engine/mp_engine/base_worker.py b/lmdeploy/pytorch/engine/mp_engine/base_worker.py
@@ -138,7 +138,7 @@ def __init__(self):
 
     def get(self, stream_id):
         if stream_id not in self._output:
-            self._output[stream_id] = EngineOutput(status=None, token_ids=[], num_token=0, logprobs=[])
+            self._output[stream_id] = EngineOutput(status=None, token_ids=[], logprobs=[])
         return self._output[stream_id]
 
     def add(self, stream_id, result):
@@ -154,5 +154,4 @@ def pop(self, stream_id, result):
         output = self._output.pop(stream_id)
         result.token_ids = output.token_ids or []
         result.logprobs = output.logprobs or None
-        result.num_token = len(output.token_ids)
         return result
diff --git a/lmdeploy/serve/async_engine.py b/lmdeploy/serve/async_engine.py
@@ -854,7 +854,7 @@ def is_error(status):
                     if is_error(outputs.status):
                         break
 
-                    output_len = outputs.num_token
+                    output_len = len(outputs.token_ids)
                     if hit_stop_token:
                         continue
 
diff --git a/lmdeploy/turbomind/turbomind.py b/lmdeploy/turbomind/turbomind.py
@@ -760,7 +760,6 @@ async def async_stream_infer(self,
         state = None
 
         output_ids = []
-        output_len = 0
         prev_len = step + input_len
         try:
             while True:
@@ -782,8 +781,7 @@ async def async_stream_infer(self,
                     continue
 
                 output_ids = output_ids_buf[prev_len:seq_len].tolist()
-                output_len = seq_len - prev_len
-                output = EngineOutput(ret_status, output_ids, output_len)
+                output = EngineOutput(ret_status, output_ids)
 
                 for f in extra_fs:
                     f(output, seq_len)
@@ -811,7 +809,7 @@ async def async_stream_infer(self,
             logger.info(f'[async_stream_infer] session {session_id} done')
 
     def _get_error_output(self, status):
-        return EngineOutput(status=self.errcode_map[status], token_ids=[], num_token=0)
+        return EngineOutput(status=self.errcode_map[status], token_ids=[])
 
     def _get_generation_config(self, cfg: GenerationConfig):
         c = _tm.GenerationConfig()