Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 76 additions & 4 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4782,12 +4782,84 @@ gemma4-bf16-h100-vllm:
osl: 1024
benchmark-client: [aiperf]
search-space:
- { tp: 2, conc-start: 4, conc-end: 32 }
- isl: 8192
osl: 1024
- { tp: 2, conc-start: 1, conc-end: 1, max-num-batched-tokens: 4096 }

# Agentic-replay pair (vLLM vs SGLang) on google/gemma-4-31B-it, 2x H100 / TP=2 /
# fp8, both cache-ON, driven by AIPerf over the committed 64k agentic-coding trace
# (first 1000 records). A distinct model-prefix, `gemma4-agentic`, makes the launcher
# resolve the dedicated trace-replay scripts (gemma4-agentic_<fw>...) rather than the
# fixed-seq-len gemma4_*.sh. max-model-len 73728 covers the <=66,655-token session-
# cumulative max of the first-1000-record subset (AIPerf threads each mooncake
# session, so context ACCUMULATES across turns; sizing from the per-record
# input_length would truncate).
# The slight inequivalence between the two sides is the established gemma4 recipe:
# vLLM does on-the-fly fp8 over google/gemma-4-31B-it, while SGLang uses the
# pre-quantized RedHatAI/gemma-4-31B-it-FP8-dynamic (its on-the-fly fp8 crashes on
# the vision tower).
# vLLM half of the gemma4 agentic-replay pair. Runner, precision, trace,
# max-model-len and search space match the gemma4-agentic-fp8-h100-2x-sglang
# entry; only the image, model checkpoint and framework differ.
gemma4-agentic-fp8-h100-2x-vllm:
image: vllm/vllm-openai:v0.21.0
# Base checkpoint; precision fp8 is applied on the fly by vLLM.
model: google/gemma-4-31B-it
# Same prefix as the SGLang entry, distinct from the fixed-seq-len gemma4 configs.
model-prefix: gemma4-agentic
runner: h100-2x
precision: fp8
framework: vllm
multinode: false
scenarios:
agentic-replay:
# The `#1000` suffix limits the replay to the first 1000 trace records.
- input-file: benchmarks/single_node/agentic/datasets/agentic-coding-64k.jsonl#1000
custom-dataset-type: mooncake_trace
max-model-len: 73728
benchmark-client: [aiperf]
search-space:
# Single point: TP=2 at concurrency 16.
- { tp: 2, conc-list: [16] }

# SGLang half of the gemma4 agentic-replay pair. Runner, precision, trace,
# max-model-len and search space match the gemma4-agentic-fp8-h100-2x-vllm
# entry; only the image, model checkpoint and framework differ.
gemma4-agentic-fp8-h100-2x-sglang:
image: lmsysorg/sglang:v0.5.12-cu130
# Pre-quantized FP8 checkpoint (the vLLM entry points at google/gemma-4-31B-it).
model: RedHatAI/gemma-4-31B-it-FP8-dynamic
# Same prefix as the vLLM entry, distinct from the fixed-seq-len gemma4 configs.
model-prefix: gemma4-agentic
runner: h100-2x
precision: fp8
framework: sglang
multinode: false
scenarios:
agentic-replay:
# The `#1000` suffix limits the replay to the first 1000 trace records.
- input-file: benchmarks/single_node/agentic/datasets/agentic-coding-64k.jsonl#1000
custom-dataset-type: mooncake_trace
max-model-len: 73728
benchmark-client: [aiperf]
search-space:
# Single point: TP=2 at concurrency 16.
- { tp: 2, conc-list: [16] }

qwen3-4b-2507-bf16-h100-vllm:
image: vllm/vllm-openai:v0.21.0
model: Qwen/Qwen3-4B-Instruct-2507
model-prefix: qwen3-4b-2507
runner: h100-2x
precision: bf16
framework: vllm
multinode: false
# Qwen3-4B-Instruct-2507 is a dense Qwen3ForCausalLM transformer (256K native
# context, no rope-scaling), so vLLM V1 keeps automatic prefix caching ON by
# default. It replaces the prior Qwen/Qwen3.5-4B, a hybrid-Mamba model for which
# vLLM auto-DISABLES prefix caching (observed 0% hit rate on the agentic trace).
scenarios:
agentic-replay:
- input-file: benchmarks/single_node/agentic/datasets/qwen3.5-4b-smoke.jsonl
custom-dataset-type: mooncake_trace
max-model-len: 8192
benchmark-client: [aiperf]
search-space:
- { tp: 1, conc-list: [2] }
# 64k agentic-coding trace (committed in-repo): the first 2000 records are
# replayed at concurrency 32. AIPerf threads each mooncake session (session_id),
# so context ACCUMULATES across turns: the realized per-request total reaches
# ~65,847 tokens (the "64k" target), NOT the per-record input_length max of
# 37,818. max-model-len 73728 covers that with headroom; with max-model-len
# 40960, ~55% of requests were rejected with HTTP 400 (input+output > context
# window).
- input-file: benchmarks/single_node/agentic/datasets/agentic-coding-64k.jsonl#2000
custom-dataset-type: mooncake_trace
max-model-len: 73728
benchmark-client: [aiperf]
search-space:
- { tp: 2, conc-start: 4, conc-end: 32 }
- { tp: 1, conc-list: [32] }

minimaxm2.5-fp8-h100-vllm:
image: vllm/vllm-openai:v0.21.0
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,16 @@ on:
required: false
type: string
default: '1800'
# Optional agentic-replay inputs. Both default to the empty string and are
# exported to the job environment as INPUT_FILE and CUSTOM_DATASET_TYPE.
input-file:
description: "Repo-relative trace JSONL for agentic-replay (mooncake_trace)"
required: false
type: string
default: ''
custom-dataset-type:
description: "AIPerf --custom-dataset-type for agentic-replay (e.g. mooncake_trace)"
required: false
type: string
default: ''
env:
RANDOM_RANGE_RATIO: 0.8
HF_TOKEN: ${{ secrets.HF_TOKEN }}
Expand Down Expand Up @@ -131,6 +141,8 @@ env:
OFFLOADING: ${{ inputs.offloading }}
TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
DURATION: ${{ inputs.duration }}
INPUT_FILE: ${{ inputs.input-file }}
CUSTOM_DATASET_TYPE: ${{ inputs.custom-dataset-type }}
RESULT_DIR: /workspace/results
PYTHONDONTWRITEBYTECODE: '1'
PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache
Expand Down
46 changes: 42 additions & 4 deletions .github/workflows/e2e-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ jobs:
multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
agentic-replay-config: ${{ steps.get-jobs.outputs.agentic-replay-config }}
steps:
- name: Checkout code (ref)
if: ${{ inputs.ref && inputs.ref != '' }}
Expand All @@ -71,12 +72,14 @@ jobs:
${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
AGENTIC_REPLAY=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-replay' and 'prefill' not in x]))")
SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'agentic-replay') and not x.get('eval-only', False)]))")
MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'agentic-replay') and x.get('run-eval', False)]))")
MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
echo "agentic-replay-config=$AGENTIC_REPLAY" >> $GITHUB_OUTPUT
echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
Expand Down Expand Up @@ -269,6 +272,41 @@ jobs:
run-eval: false
ref: ${{ inputs.ref }}

# Runs the reusable benchmark-tmpl.yml workflow once per entry of the get-jobs
# `agentic-replay-config` output. The job is skipped when that output is the
# literal string '[]' (no agentic-replay entries were generated).
test-sweep-agentic-replay:
needs: get-jobs
if: ${{ needs.get-jobs.outputs.agentic-replay-config != '[]' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: agentic-replay /
strategy:
# Keep the remaining matrix entries running when one of them fails.
fail-fast: false
matrix:
config: ${{ fromJson(needs.get-jobs.outputs.agentic-replay-config) }}
secrets: inherit
with:
exp-name: ${{ matrix.config.exp-name }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
benchmark-client: ${{ matrix.config.benchmark-client }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
# Trace-replay inputs forwarded from the matrix entry to the template.
input-file: ${{ matrix.config.input-file }}
custom-dataset-type: ${{ matrix.config.custom-dataset-type }}
# A non-empty duration-override input wins over the per-config duration.
duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
# The remaining values are fixed for this job rather than matrix-driven.
spec-decoding: 'none'
disagg: 'false'
run-eval: false
scenario-type: agentic-replay
ref: ${{ inputs.ref }}

test-sweep-evals:
needs: get-jobs
if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
Expand Down Expand Up @@ -304,8 +342,8 @@ jobs:
ref: ${{ inputs.ref }}

collect-results:
needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic, test-sweep-agentic-replay]
if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped' || needs.test-sweep-agentic-replay.result != 'skipped') }}
uses: ./.github/workflows/collect-results.yml
secrets: inherit
with:
Expand Down
37 changes: 37 additions & 0 deletions .github/workflows/run-sweep.yml
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,40 @@ jobs:
run-eval: false
scenario-type: agentic-coding

# Runs the reusable benchmark-tmpl.yml workflow once per entry under
# single_node['agentic-replay'] in the setup job's search-space-config output.
# Skipped when reuse-enabled is 'true' or when that key is missing (it then
# serializes to 'null').
sweep-single-node-agentic-replay:
needs: setup
if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic-replay']) != 'null' }}
uses: ./.github/workflows/benchmark-tmpl.yml
name: agentic-replay /
strategy:
# Keep the remaining matrix entries running when one of them fails.
fail-fast: false
matrix:
config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['agentic-replay'] }}
secrets: inherit
with:
exp-name: ${{ matrix.config.exp-name }}
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
benchmark-client: ${{ matrix.config.benchmark-client }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
isl: ${{ matrix.config.isl }}
osl: ${{ matrix.config.osl }}
max-model-len: ${{ matrix.config.max-model-len }}
# Trace-replay inputs forwarded from the matrix entry to the template.
input-file: ${{ matrix.config.input-file }}
custom-dataset-type: ${{ matrix.config.custom-dataset-type }}
duration: ${{ matrix.config.duration }}
# The remaining values are fixed for this job rather than matrix-driven.
spec-decoding: 'none'
disagg: ${{ 'false' }}
run-eval: false
scenario-type: agentic-replay

sweep-multi-node-agentic:
needs: setup
if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }}
Expand Down Expand Up @@ -381,6 +415,7 @@ jobs:
sweep-single-node-1k1k,
sweep-single-node-8k1k,
sweep-agentic,
sweep-single-node-agentic-replay,
sweep-multi-node-1k1k,
sweep-multi-node-8k1k,
sweep-multi-node-agentic,
Expand All @@ -393,6 +428,7 @@ jobs:
(
needs.sweep-single-node-1k1k.result != 'skipped' ||
needs.sweep-single-node-8k1k.result != 'skipped' ||
needs.sweep-single-node-agentic-replay.result != 'skipped' ||
needs.sweep-multi-node-1k1k.result != 'skipped' ||
needs.sweep-multi-node-8k1k.result != 'skipped'
)
Expand Down Expand Up @@ -534,6 +570,7 @@ jobs:
always() &&
github.event_name == 'pull_request' &&
needs.collect-results.result == 'success'
continue-on-error: true
runs-on: ubuntu-latest

env:
Expand Down
49 changes: 42 additions & 7 deletions benchmarks/benchmark_lib.sh
Original file line number Diff line number Diff line change
Expand Up @@ -614,6 +614,12 @@ run_client_benchmark() {
local use_chat_template=false
local dsv4=false
local trust_remote_code=false
# Agentic-replay (trace) path: when --input-file is set, the benchmark
# replays a recorded mooncake_trace JSONL through AIPerf instead of a
# synthetic isl/osl workload. Only the aiperf client supports this.
local input_file=""
local custom_dataset_type=""
local request_count=""

while [[ $# -gt 0 ]]; do
case $1 in
Expand All @@ -630,6 +636,9 @@ run_client_benchmark() {
--bench-serving-dir) bench_serving_dir="$2"; shift 2 ;;
--server-pid) server_pid="$2"; shift 2 ;;
--random-seed) random_seed="$2"; shift 2 ;;
--input-file) input_file="$2"; shift 2 ;;
--custom-dataset-type) custom_dataset_type="$2"; shift 2 ;;
--request-count) request_count="$2"; shift 2 ;;
--use-chat-template) use_chat_template=true; shift ;;
--dsv4) dsv4=true; use_chat_template=true; shift ;;
--trust-remote-code) trust_remote_code=true; shift ;;
Expand All @@ -640,9 +649,13 @@ run_client_benchmark() {
if [[ -z "$model" ]]; then echo "Error: --model is required"; return 1; fi
if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi
if [[ -z "$backend" ]]; then echo "Error: --backend is required"; return 1; fi
if [[ -z "$isl" ]]; then echo "Error: --isl is required"; return 1; fi
if [[ -z "$osl" ]]; then echo "Error: --osl is required"; return 1; fi
if [[ -z "$random_range_ratio" ]]; then echo "Error: --random-range-ratio is required"; return 1; fi
# isl/osl/random-range-ratio describe a synthetic workload; they are not
# required when replaying a recorded trace via --input-file.
if [[ -z "$input_file" ]]; then
if [[ -z "$isl" ]]; then echo "Error: --isl is required"; return 1; fi
if [[ -z "$osl" ]]; then echo "Error: --osl is required"; return 1; fi
if [[ -z "$random_range_ratio" ]]; then echo "Error: --random-range-ratio is required"; return 1; fi
fi
if [[ -z "$concurrency" ]]; then echo "Error: --concurrency is required"; return 1; fi
if [[ -z "$result_filename" ]]; then echo "Error: --result-filename is required"; return 1; fi
if [[ -z "$result_dir" ]]; then echo "Error: --result-dir is required"; return 1; fi
Expand All @@ -658,20 +671,42 @@ run_client_benchmark() {
--url "http://0.0.0.0:$port"
--endpoint-type "$endpoint_type"
--concurrency "$concurrency"
--request-count "$((concurrency * 10))"
--warmup-request-count "$((concurrency * 2))"
--isl "$isl"
--osl "$osl"
--result-filename "$result_filename"
--result-dir "$result_dir"
--bench-serving-dir "$bench_serving_dir"
)
if [[ -n "$input_file" ]]; then
# Trace replay: replay the recorded dataset once. The caller must
# pass --request-count (intended to equal the dataset record count);
# isl/osl and warmup are not passed because the trace defines the
# per-request lengths.
if [[ -z "$request_count" ]]; then
echo "Error: --request-count is required when --input-file is set"; return 1
fi
aiperf_args+=(
--request-count "$request_count"
--input-file "$input_file"
)
if [[ -n "$custom_dataset_type" ]]; then
aiperf_args+=(--custom-dataset-type "$custom_dataset_type")
fi
else
aiperf_args+=(
--request-count "$((concurrency * 10))"
--warmup-request-count "$((concurrency * 2))"
--isl "$isl"
--osl "$osl"
)
fi
if [[ -n "$random_seed" ]]; then
aiperf_args+=(--random-seed "$random_seed")
fi
run_aiperf_benchmark "${aiperf_args[@]}"
;;
inferencex_native)
if [[ -n "$input_file" ]]; then
echo "Error: --input-file (trace replay) is only supported with BENCHMARK_CLIENT=aiperf"
return 1
fi
local native_args=(
--model "$model"
--port "$port"
Expand Down
Loading