NVIDIA · reasonsolo · Sep 3, 2025 · Aug 29, 2025 · Sep 3, 2025
diff --git a/tests/integration/defs/accuracy/test_disaggregated_serving.py b/tests/integration/defs/accuracy/test_disaggregated_serving.py
@@ -4,6 +4,7 @@
 # Please take a look at the existing test_llm_api_pytorch.py file for reference.
 import concurrent
 import contextlib
+import itertools
 import json
 import os
 import tempfile
@@ -47,6 +48,9 @@ def result(self):
 
 DuckLLM = namedtuple('DuckLLM', ['args', 'tokenizer', 'generate_async'])
 
+DEFAULT_TEST_TIMEOUT = 1800
+DEFAULT_SERVER_WAITING_TIMEOUT = 1200
+
 
 class MyThreadPoolExecutor(ThreadPoolExecutor):
 
@@ -67,13 +71,15 @@ def __exit__(self, exc_type, exc_val, exc_tb):
 
 
 @contextlib.contextmanager
-def launch_disaggregated_llm(disaggregated_server_config: Dict[str, Any],
-                             ctx_server_config: Dict[str, Any],
-                             gen_server_config: Dict[str, Any],
-                             model_name: str,
-                             tensor_parallel_size: int = 1,
-                             ctx_model: str = None,
-                             gen_model: str = None):
+def launch_disaggregated_llm(
+        disaggregated_server_config: Dict[str, Any],
+        ctx_server_config: Dict[str, Any],
+        gen_server_config: Dict[str, Any],
+        model_name: str,
+        tensor_parallel_size: int = 1,
+        ctx_model: str = None,
+        gen_model: str = None,
+        server_waiting_timeout: int = DEFAULT_SERVER_WAITING_TIMEOUT):
     temp_dir = tempfile.TemporaryDirectory()
     disaggregated_serving_config_path = os.path.join(
         temp_dir.name, "disaggregated_serving_config.yaml")
@@ -197,16 +203,22 @@ def multi_popen(server_configs):
             )
             raise
 
+    server_cmd = [
+        trtllm_serve_path, "disaggregated", "-c",
+        disaggregated_serving_config_path, "--server_start_timeout",
+        str(server_waiting_timeout)
+    ]
     with (MyThreadPoolExecutor(max_workers=16) as
-          thread_pool, temp_dir, multi_popen(ctx_servers + gen_servers),
-          popen([
-              trtllm_serve_path, "disaggregated", "-c",
-              disaggregated_serving_config_path, "--server_start_timeout",
-              "3600"
-          ])):
+          thread_pool, temp_dir, multi_popen(ctx_servers + gen_servers) as
+          worker_processes, popen(server_cmd) as server_process):
         start_time = time.time()
-        while time.time() - start_time < 3600:
-            time.sleep(1)
+        while time.time() - start_time < server_waiting_timeout:
+            time.sleep(5)
+            for process in itertools.chain(worker_processes, [server_process]):
+                if process.poll() is not None:
+                    raise Exception(
+                        f"process {process.pid} exited with code {process.returncode}"
+                    )
             try:
                 print("Checking health endpoint")
                 response = requests.get("http://localhost:8000/health")
@@ -339,7 +351,7 @@ def run_parallel_test(model_name: str,
             task.evaluate(llm)
 
 
-@pytest.mark.timeout(3600)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
 class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
@@ -639,7 +651,7 @@ def test_auto_dtype(self, overlap_scheduler):
             task.evaluate(llm)
 
 
-@pytest.mark.timeout(3600)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
 class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
@@ -724,7 +736,7 @@ def test_auto_dtype(self, overlap_scheduler, mtp_nextn):
             task.evaluate(llm)
 
 
-@pytest.mark.timeout(3600)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
 class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "google/gemma-3-1b-it"
     MODEL_PATH = f"{llm_models_root()}/gemma/gemma-3-1b-it/"
@@ -780,7 +792,7 @@ def test_auto_dtype(self, overlap_scheduler):
             task.evaluate(llm)
 
 
-@pytest.mark.timeout(3600)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
 class TestQwen3_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-8B"
     MODEL_PATH = f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8"
@@ -897,7 +909,7 @@ def test_chunked_prefill(self):
 
 
 @skip_pre_blackwell
-@pytest.mark.timeout(3600)
+@pytest.mark.timeout(DEFAULT_TEST_TIMEOUT)
 class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
     FP4_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf"
     FP8_MODEL = f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf"