3 changes: 2 additions & 1 deletion setup.py
@@ -283,7 +283,8 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str],
             'trtllm-refit=tensorrt_llm.commands.refit:main',
             'trtllm-bench=tensorrt_llm.commands.bench:main',
             'trtllm-serve=tensorrt_llm.commands.serve:main',
-            'trtllm-eval=tensorrt_llm.commands.eval:main'
+            'trtllm-eval=tensorrt_llm.commands.eval:main',
+            'trtllm-configure=tensorrt_llm.commands.configure:main'
         ],
     },
     scripts=['tensorrt_llm/llmapi/trtllm-llmapi-launch'],
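For reference, the new console_scripts entry means pip installs a `trtllm-configure` executable that imports `tensorrt_llm.commands.configure` and calls its `main`. A minimal sketch of the shape such a module needs, assuming nothing about the real command's options (the actual configure.py is not part of this excerpt):

import argparse


def main() -> None:
    # Hypothetical placeholder: the real trtllm-configure options are not shown
    # in this diff; a console_scripts target only needs an importable main().
    parser = argparse.ArgumentParser(prog="trtllm-configure")
    parser.parse_args()


if __name__ == "__main__":
    main()
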
20 changes: 19 additions & 1 deletion tensorrt_llm/bench/benchmark/low_latency.py
@@ -29,6 +29,8 @@
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                            initialize_tokenizer,
                                            update_metadata_for_multimodal)
+from tensorrt_llm.bench.utils.scenario import (
+    prepare_llm_api_config_for_recipe, process_recipe_scenario)
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -196,6 +198,19 @@ def latency_command(
     # Model, experiment, and engine params
     options = get_general_cli_options(params, bench_env)
 
+    # Process recipe scenario if present
+    cli_defaults = {
+        'concurrency': 1,  # Latency default is 1 (not -1 like throughput)
+        'target_input_len': None,
+        'target_output_len': None,
+        'num_requests': 0,
+        'tp': 1,
+        'pp': 1,
+        'ep': None,
+    }
+    params, options, scenario = process_recipe_scenario(params, options,
+                                                        bench_env, cli_defaults)
+
     # Speculative Decode Options
     medusa_choices = params.get("medusa_choices")
     # Initialize the HF tokenizer for the specified model.
@@ -274,7 +289,10 @@ def latency_command(
exec_settings["performance_options"]["cuda_graphs"] = True
exec_settings["performance_options"]["multi_block_mode"] = True

exec_settings["extra_llm_api_options"] = params.get("extra_llm_api_options")
# Process recipe format if detected - extract llm_api_config only
extra_llm_api_options_path = params.get("extra_llm_api_options")
exec_settings["extra_llm_api_options"] = prepare_llm_api_config_for_recipe(
extra_llm_api_options_path, scenario)

# Decoding Options
if medusa_choices is not None:
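The cli_defaults mapping passes each option's CLI default so the helper can tell user-supplied values apart from untouched ones. process_recipe_scenario itself lives in tensorrt_llm/bench/utils/scenario.py and is not part of this diff; the following is only a hedged sketch of the contract implied by the call sites (return the possibly-updated params/options plus the recipe scenario, or None for a plain extra-options file). Names other than those at the call site are invented for illustration:

from typing import Any, Optional, Tuple

import yaml


def sketch_process_recipe_scenario(
        params: dict, options: Any, bench_env: Any,
        cli_defaults: dict) -> Tuple[dict, Any, Optional[dict]]:
    """Overlay recipe 'scenario' values onto CLI options left at their defaults."""
    path = params.get("extra_llm_api_options")
    if not path:
        return params, options, None
    with open(path) as f:
        data = yaml.safe_load(f)
    if not (isinstance(data, dict) and 'scenario' in data
            and 'llm_api_config' in data):
        return params, options, None  # plain YAML options file, not a recipe
    scenario = data['scenario']
    for key, default in cli_defaults.items():
        # Only fill in values the user did not override on the command line.
        if params.get(key) == default and key in scenario:
            params[key] = scenario[key]
    return params, options, scenario
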
22 changes: 21 additions & 1 deletion tensorrt_llm/bench/benchmark/throughput.py
@@ -28,6 +28,8 @@
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                            initialize_tokenizer,
                                            update_metadata_for_multimodal)
+from tensorrt_llm.bench.utils.scenario import (
+    prepare_llm_api_config_for_recipe, process_recipe_scenario)
 from tensorrt_llm.llmapi import CapacitySchedulerPolicy
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
@@ -302,6 +304,20 @@ def throughput_command(
     options: GeneralExecSettings = get_general_cli_options(params, bench_env)
     tokenizer = initialize_tokenizer(options.checkpoint_path)
 
+    # Process recipe scenario if present
+    cli_defaults = {
+        'concurrency': -1,
+        'target_input_len': None,
+        'target_output_len': None,
+        'num_requests': 0,
+        'tp': 1,
+        'pp': 1,
+        'ep': None,
+        'streaming': False,
+    }
+    params, options, scenario = process_recipe_scenario(params, options,
+                                                        bench_env, cli_defaults)
+
     # Extract throughput-specific options not handled by GeneralExecSettings
     max_batch_size = params.get("max_batch_size")
     max_num_tokens = params.get("max_num_tokens")
@@ -397,7 +413,11 @@ def throughput_command(
exec_settings["settings_config"]["dynamic_max_batch_size"] = True

# LlmArgs
exec_settings["extra_llm_api_options"] = params.pop("extra_llm_api_options")
# Process recipe format if detected - extract llm_api_config only
extra_llm_api_options_path = params.pop("extra_llm_api_options")
exec_settings["extra_llm_api_options"] = prepare_llm_api_config_for_recipe(
extra_llm_api_options_path, scenario)

exec_settings["iteration_log"] = options.iteration_log

# Construct the runtime configuration dataclass.
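prepare_llm_api_config_for_recipe is likewise defined in tensorrt_llm/bench/utils/scenario.py, outside this diff. What the call sites require is that only the recipe's llm_api_config section reaches exec_settings["extra_llm_api_options"], while plain options files pass through untouched. A hedged sketch under that assumption (whether the real helper returns a path or a dict is not visible here; this version writes the extracted section to a temporary YAML file and returns its path):

import tempfile
from typing import Optional

import yaml


def sketch_prepare_llm_api_config_for_recipe(
        path: Optional[str], scenario: Optional[dict]) -> Optional[str]:
    # No extra options file, or not a recipe: hand the original path through.
    if path is None or scenario is None:
        return path
    with open(path) as f:
        llm_api_config = yaml.safe_load(f).get('llm_api_config', {})
    # Persist only the llm_api_config section for the downstream LLM-args loader.
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
    yaml.safe_dump(llm_api_config, tmp)
    tmp.close()
    return tmp.name
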
26 changes: 25 additions & 1 deletion tensorrt_llm/bench/benchmark/utils/general.py
@@ -84,7 +84,31 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     kv_cache_config = {}
     if extra_llm_api_options:
         with open(extra_llm_api_options, 'r') as f:
-            llm_args_dict = yaml.safe_load(f)
+            loaded_data = yaml.safe_load(f)
+
+        # Detect recipe format (has 'scenario' and 'llm_api_config' keys)
+        if isinstance(
+                loaded_data, dict
+        ) and 'scenario' in loaded_data and 'llm_api_config' in loaded_data:
+            # Recipe format - extract llm_api_config section for LLM args
+            llm_args_dict = loaded_data['llm_api_config']
+
+            # TODO: Add llm_api_config validation once PR #8331 merges
+            # (standardizes LlmArgs with Pydantic - validation will happen automatically)
+
+            # Set environment variables from 'env' section (if not already set)
+            import os
+            env_vars = loaded_data.get('env', {})
+            for key, value in env_vars.items():
+                if key not in os.environ:
+                    os.environ[key] = str(value)
+                    logger.info(
+                        f"Set environment variable from recipe: {key}={value}"
+                    )
+        else:
+            # Simple format - use loaded data directly
+            llm_args_dict = loaded_data
+
         kv_cache_config = llm_args_dict.get("kv_cache_config", {
             "dtype": "auto",
         })
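For context, the only recipe keys this code actually relies on are the top-level 'scenario' and 'llm_api_config' sections plus the optional 'env' section. An illustrative example, written as the dict yaml.safe_load would return for such a file; the nested field names and values are assumptions, not a documented schema:

example_recipe = {
    'scenario': {
        # presumably mirrors the benchmark CLI defaults (tp, pp, concurrency, ...)
        'tp': 4,
        'concurrency': 64,
    },
    'llm_api_config': {
        # ordinary extra LLM-API options, e.g. the kv_cache_config read just above
        'kv_cache_config': {
            'dtype': 'auto',
        },
    },
    'env': {
        # exported via os.environ only when not already set in the environment
        'TLLM_LOG_LEVEL': 'INFO',
    },
}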