vllm-project
diff --git a/‎examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py‎
Lines changed: 1 addition & 46 deletions b/‎examples/disaggregated_prefill_v1/load_balance_proxy_layerwise_server_example.py‎
Lines changed: 1 addition & 46 deletions
diff --git a/‎tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py‎
Lines changed: 0 additions & 3 deletions b/‎tests/e2e/singlecard/spec_decode_v1/test_v1_mtp_correctness.py‎
Lines changed: 0 additions & 3 deletions
diff --git a/‎tests/ut/kv_connector/test_mooncake_connector.py‎
Lines changed: 1 addition & 1 deletion b/‎tests/ut/kv_connector/test_mooncake_connector.py‎
Lines changed: 1 addition & 1 deletion
@@ -361,16 +361,6 @@ async def send_request_to_service(client: httpx.AsyncClient,
     aborted_requests = proxy_state.aquire_aborted_prefiller_requests(
         prefiller_id)
     req_data = req_data.copy()
-    # req_data['kv_transfer_params'] = {
-    #     "do_remote_decode": True,
-    #     "do_remote_prefill": False,
-    #     "remote_engine_id": None,
-    #     "remote_block_ids": None,
-    #     "remote_host": None,
-    #     "remote_port": None,
-    #     "aborted_request": list(aborted_requests),
-    #     "metaserver": f"http://{global_args.host}:{global_args.port}/v1/metaserver"
-    # }
     req_data["stream"] = False
     req_data["max_tokens"] = 1
     if "stream_options" in req_data:
@@ -474,33 +464,10 @@ async def _handle_completions(api: str, request: Request):
         req_data = await request.json()
         req_body = await request.body()
         request_length = len(req_body)
-        # prefiller_score = proxy_state.calculate_prefill_scores(request_length)
-        # logger.debug(
-        #     f"Request length: {request_length}, Prefiller score: {prefiller_score}"
-        # )
         request_id = await proxy_state.next_req_id()
         request_id_api = get_api_request_id(api, request_id)
         proxy_state.req_data_dict[request_id_api] = (req_data, request_length,
                                                      api)
-        # # Select prefiller
-        # prefiller_idx = proxy_state.select_prefiller(prefiller_score)
-        # prefiller = proxy_state.prefillers[prefiller_idx]
-        # result_future = asyncio.Future()  # type: ignore
-        # proxy_state.req_id_future[request_id_api] = result_future
-        # # Send request to prefiller
-        # asyncio.get_running_loop().create_task(send_request_to_service(
-        #     prefiller.client,
-        #     prefiller_idx,
-        #     api,
-        #     req_data,
-        #     request_id,
-        #     max_retries=global_args.max_retries,
-        #     base_delay=global_args.retry_delay))
-        # proxy_state.release_prefiller(prefiller_idx, prefiller_score)
-
-        # response = await result_future
-        # del proxy_state.req_id_future[request_id_api]
-        # req_data["kv_transfer_params"] = response
         req_data['kv_transfer_params'] = {
             "do_remote_decode":
             False,
@@ -530,18 +497,11 @@ async def generate_stream():
                         request_id=request_id,
                         max_retries=global_args.max_retries,
                         base_delay=global_args.retry_delay):
-                    # if not released_kv and chunk:
-                    #     proxy_state.release_prefiller_kv(
-                    #         prefiller_idx, prefiller_score)
-                    #     released_kv = True
                     yield chunk
             except Exception as e:
                 logger.error(
                     f"Error during streaming from decoder {decoder.url}: {str(e)} the aborted request {request_id} will be routing to the target prefiller when new request is ready to dispatch to it"
                 )
-                # proxy_state.abort_prefiller_request(prefiller_idx, request_id)
-                # proxy_state.release_prefiller_kv(prefiller_idx,
-                #                                  prefiller_score)
 
             # After streaming done, release tokens
             proxy_state.release_decoder(decoder_idx, decoder_score)
@@ -587,9 +547,6 @@ async def metaserver(request: Request):
         request_id = kv_transfer_params["request_id"]
         assert request_id in proxy_state.req_data_dict
         req_data, request_length, api = proxy_state.req_data_dict[request_id]
-        # output_prompt = proxy_state.tokenizer.decode(kv_transfer_params["token_ids"])
-        # req_data["prompt"] = output_prompt
-        # del kv_transfer_params['token_ids']
         request_id = get_origin_request_id(api, request_id)
         req_data["kv_transfer_params"] = kv_transfer_params
         prefiller_score = proxy_state.calculate_prefill_scores(request_length)
@@ -612,8 +569,6 @@ async def metaserver(request: Request):
             base_delay=global_args.retry_delay)
         proxy_state.release_prefiller(prefiller_idx, prefiller_score)
 
-        # del req_data["prompt"]
-
     except Exception as e:
         logger.error(f"Post metaserver failed with: {str(e)}")
 
@@ -622,4 +577,4 @@ async def metaserver(request: Request):
     global global_args
     global_args = parse_args()
     import uvicorn
-    uvicorn.run(app, host=global_args.host, port=global_args.port)
+    uvicorn.run(app, host=global_args.host, port=global_args.port)
@@ -1,8 +1,5 @@
 from __future__ import annotations
 
-import os
-from unittest.mock import patch
-
 import pytest
 from vllm import SamplingParams
 from vllm.config import CompilationConfig, CUDAGraphMode
 
@@ -1136,4 +1136,4 @@ def test_device_id_selection_with_physical_devices(self):
 
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()
Original file line number	Diff line number	Diff line change
`@@ -1136,4 +1136,4 @@ def test_device_id_selection_with_physical_devices(self):`
`1136`	`1136`
`1137`	`1137`
`1138`	`1138`	`if __name__ == '__main__':`
`1139`		`- unittest.main()`
	`1139`	`+ unittest.main()`