3 changes: 2 additions & 1 deletion setup.py
@@ -283,7 +283,8 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str],
             'trtllm-refit=tensorrt_llm.commands.refit:main',
             'trtllm-bench=tensorrt_llm.commands.bench:main',
             'trtllm-serve=tensorrt_llm.commands.serve:main',
-            'trtllm-eval=tensorrt_llm.commands.eval:main'
+            'trtllm-eval=tensorrt_llm.commands.eval:main',
+            'trtllm-configure=tensorrt_llm.commands.configure:main'
         ],
     },
     scripts=['tensorrt_llm/llmapi/trtllm-llmapi-launch'],
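For reference, the new console_scripts entry means pip installs a `trtllm-configure` executable that imports `tensorrt_llm.commands.configure` and calls its `main`. A minimal sketch of the shape such a module needs, assuming nothing about the real command's options (the actual configure.py is not part of this excerpt):

import argparse


def main() -> None:
    # Hypothetical placeholder: the real trtllm-configure options are not shown
    # in this diff; a console_scripts target only needs an importable main().
    parser = argparse.ArgumentParser(prog="trtllm-configure")
    parser.parse_args()


if __name__ == "__main__":
    main()
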
20 changes: 19 additions & 1 deletion tensorrt_llm/bench/benchmark/low_latency.py
@@ -29,6 +29,8 @@
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                            initialize_tokenizer,
                                            update_metadata_for_multimodal)
+from tensorrt_llm.bench.utils.scenario import (
+    prepare_llm_api_config_for_recipe, process_recipe_scenario)
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
 
@@ -196,6 +198,19 @@ def latency_command(
     # Model, experiment, and engine params
     options = get_general_cli_options(params, bench_env)
 
+    # Process recipe scenario if present
+    cli_defaults = {
+        'concurrency': 1,  # Latency default is 1 (not -1 like throughput)
+        'target_input_len': None,
+        'target_output_len': None,
+        'num_requests': 0,
+        'tp': 1,
+        'pp': 1,
+        'ep': None,
+    }
+    params, options, scenario = process_recipe_scenario(params, options,
+                                                        bench_env, cli_defaults)
+
     # Speculative Decode Options
     medusa_choices = params.get("medusa_choices")
     # Initialize the HF tokenizer for the specified model.
@@ -274,7 +289,10 @@ def latency_command(
exec_settings["performance_options"]["cuda_graphs"] = True
exec_settings["performance_options"]["multi_block_mode"] = True

exec_settings["extra_llm_api_options"] = params.get("extra_llm_api_options")
# Process recipe format if detected - extract llm_api_config only
extra_llm_api_options_path = params.get("extra_llm_api_options")
exec_settings["extra_llm_api_options"] = prepare_llm_api_config_for_recipe(
extra_llm_api_options_path, scenario)

# Decoding Options
if medusa_choices is not None:
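The cli_defaults mapping passes each option's CLI default so the helper can tell user-supplied values apart from untouched ones. process_recipe_scenario itself lives in tensorrt_llm/bench/utils/scenario.py and is not part of this diff; the following is only a hedged sketch of the contract implied by the call sites (return the possibly-updated params/options plus the recipe scenario, or None for a plain extra-options file). Names other than those at the call site are invented for illustration:

from typing import Any, Optional, Tuple

import yaml


def sketch_process_recipe_scenario(
        params: dict, options: Any, bench_env: Any,
        cli_defaults: dict) -> Tuple[dict, Any, Optional[dict]]:
    """Overlay recipe 'scenario' values onto CLI options left at their defaults."""
    path = params.get("extra_llm_api_options")
    if not path:
        return params, options, None
    with open(path) as f:
        data = yaml.safe_load(f)
    if not (isinstance(data, dict) and 'scenario' in data
            and 'llm_api_config' in data):
        return params, options, None  # plain YAML options file, not a recipe
    scenario = data['scenario']
    for key, default in cli_defaults.items():
        # Only fill in values the user did not override on the command line.
        if params.get(key) == default and key in scenario:
            params[key] = scenario[key]
    return params, options, scenario
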
22 changes: 21 additions & 1 deletion tensorrt_llm/bench/benchmark/throughput.py
@@ -28,6 +28,8 @@
 from tensorrt_llm.bench.utils.data import (create_dataset_from_stream,
                                            initialize_tokenizer,
                                            update_metadata_for_multimodal)
+from tensorrt_llm.bench.utils.scenario import (
+    prepare_llm_api_config_for_recipe, process_recipe_scenario)
 from tensorrt_llm.llmapi import CapacitySchedulerPolicy
 from tensorrt_llm.logger import logger
 from tensorrt_llm.sampling_params import SamplingParams
@@ -302,6 +304,20 @@ def throughput_command(
     options: GeneralExecSettings = get_general_cli_options(params, bench_env)
     tokenizer = initialize_tokenizer(options.checkpoint_path)
 
+    # Process recipe scenario if present
+    cli_defaults = {
+        'concurrency': -1,
+        'target_input_len': None,
+        'target_output_len': None,
+        'num_requests': 0,
+        'tp': 1,
+        'pp': 1,
+        'ep': None,
+        'streaming': False,
+    }
+    params, options, scenario = process_recipe_scenario(params, options,
+                                                        bench_env, cli_defaults)
+
     # Extract throughput-specific options not handled by GeneralExecSettings
     max_batch_size = params.get("max_batch_size")
     max_num_tokens = params.get("max_num_tokens")
@@ -397,7 +413,11 @@ def throughput_command(
exec_settings["settings_config"]["dynamic_max_batch_size"] = True

# LlmArgs
exec_settings["extra_llm_api_options"] = params.pop("extra_llm_api_options")
# Process recipe format if detected - extract llm_api_config only
extra_llm_api_options_path = params.pop("extra_llm_api_options")
exec_settings["extra_llm_api_options"] = prepare_llm_api_config_for_recipe(
extra_llm_api_options_path, scenario)

exec_settings["iteration_log"] = options.iteration_log

# Construct the runtime configuration dataclass.
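prepare_llm_api_config_for_recipe is likewise defined in tensorrt_llm/bench/utils/scenario.py, outside this diff. What the call sites require is that only the recipe's llm_api_config section reaches exec_settings["extra_llm_api_options"], while plain options files pass through untouched. A hedged sketch under that assumption (whether the real helper returns a path or a dict is not visible here; this version writes the extracted section to a temporary YAML file and returns its path):

import tempfile
from typing import Optional

import yaml


def sketch_prepare_llm_api_config_for_recipe(
        path: Optional[str], scenario: Optional[dict]) -> Optional[str]:
    # No extra options file, or not a recipe: hand the original path through.
    if path is None or scenario is None:
        return path
    with open(path) as f:
        llm_api_config = yaml.safe_load(f).get('llm_api_config', {})
    # Persist only the llm_api_config section for the downstream LLM-args loader.
    tmp = tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False)
    yaml.safe_dump(llm_api_config, tmp)
    tmp.close()
    return tmp.name
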
26 changes: 25 additions & 1 deletion tensorrt_llm/bench/benchmark/utils/general.py
@@ -84,7 +84,31 @@ def get_settings(params: dict, dataset_metadata: DatasetMetadata, model: str,
     kv_cache_config = {}
     if extra_llm_api_options:
         with open(extra_llm_api_options, 'r') as f:
-            llm_args_dict = yaml.safe_load(f)
+            loaded_data = yaml.safe_load(f)
+
+        # Detect recipe format (has 'scenario' and 'llm_api_config' keys)
+        if isinstance(
+                loaded_data, dict
+        ) and 'scenario' in loaded_data and 'llm_api_config' in loaded_data:
+            # Recipe format - extract llm_api_config section for LLM args
+            llm_args_dict = loaded_data['llm_api_config']
+
+            # TODO: Add llm_api_config validation once PR #8331 merges
+            # (standardizes LlmArgs with Pydantic - validation will happen automatically)
+
+            # Set environment variables from 'env' section (if not already set)
+            import os
+            env_vars = loaded_data.get('env', {})
+            for key, value in env_vars.items():
+                if key not in os.environ:
+                    os.environ[key] = str(value)
+                    logger.info(
+                        f"Set environment variable from recipe: {key}={value}"
+                    )
+        else:
+            # Simple format - use loaded data directly
+            llm_args_dict = loaded_data
+
         kv_cache_config = llm_args_dict.get("kv_cache_config", {
             "dtype": "auto",
         })
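For context, the only recipe keys this code actually relies on are the top-level 'scenario' and 'llm_api_config' sections plus the optional 'env' section. An illustrative example, written as the dict yaml.safe_load would return for such a file; the nested field names and values are assumptions, not a documented schema:

example_recipe = {
    'scenario': {
        # presumably mirrors the benchmark CLI defaults (tp, pp, concurrency, ...)
        'tp': 4,
        'concurrency': 64,
    },
    'llm_api_config': {
        # ordinary extra LLM-API options, e.g. the kv_cache_config read just above
        'kv_cache_config': {
            'dtype': 'auto',
        },
    },
    'env': {
        # exported via os.environ only when not already set in the environment
        'TLLM_LOG_LEVEL': 'INFO',
    },
}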