vngcloud · thangquang09 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026 · Jun 3, 2026
diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
@@ -4782,12 +4782,84 @@ gemma4-bf16-h100-vllm:
       osl: 1024
       benchmark-client: [aiperf]
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 32 }
-    - isl: 8192
-      osl: 1024
+      - { tp: 2, conc-start: 1, conc-end: 1, max-num-batched-tokens: 4096 }
+
+# Agentic-replay pair (vLLM vs SGLang) on google/gemma-4-31B-it, 2x H100 / TP=2 /
+# fp8, both cache-ON, driven by AIPerf over the committed 64k agentic-coding trace
+# (first 1000 records). Distinct model-prefix `gemma4-agentic` so the launcher
+# resolves dedicated trace-replay scripts (gemma4-agentic_<fw>...) rather than the
+# fixed-seq-len gemma4_*.sh. max-model-len 73728 covers the <=66,655 session-
+# cumulative max of the #1000 subset (AIPerf threads each mooncake session, so
+# context ACCUMULATES across turns; sizing from per-record input_length truncates).
+# The slight inequivalence vs SGLang is the established gemma4 recipe: vLLM
+# does on-the-fly fp8 over google/gemma-4-31B-it; SGLang uses the pre-quantized
+# RedHatAI/gemma-4-31B-it-FP8-dynamic (its on-the-fly fp8 crashes on the vision tower).
+gemma4-agentic-fp8-h100-2x-vllm:
+  image: vllm/vllm-openai:v0.21.0
+  model: google/gemma-4-31B-it
+  model-prefix: gemma4-agentic
+  runner: h100-2x
+  precision: fp8
+  framework: vllm
+  multinode: false
+  scenarios:
+    agentic-replay:
+    - input-file: benchmarks/single_node/agentic/datasets/agentic-coding-64k.jsonl#1000
+      custom-dataset-type: mooncake_trace
+      max-model-len: 73728
+      benchmark-client: [aiperf]
+      search-space:
+      - { tp: 2, conc-list: [16] }
+
+gemma4-agentic-fp8-h100-2x-sglang:
+  image: lmsysorg/sglang:v0.5.12-cu130
+  model: RedHatAI/gemma-4-31B-it-FP8-dynamic
+  model-prefix: gemma4-agentic
+  runner: h100-2x
+  precision: fp8
+  framework: sglang
+  multinode: false
+  scenarios:
+    agentic-replay:
+    - input-file: benchmarks/single_node/agentic/datasets/agentic-coding-64k.jsonl#1000
+      custom-dataset-type: mooncake_trace
+      max-model-len: 73728
+      benchmark-client: [aiperf]
+      search-space:
+      - { tp: 2, conc-list: [16] }
+
+qwen3-4b-2507-bf16-h100-vllm:
+  image: vllm/vllm-openai:v0.21.0
+  model: Qwen/Qwen3-4B-Instruct-2507
+  model-prefix: qwen3-4b-2507
+  runner: h100-2x
+  precision: bf16
+  framework: vllm
+  multinode: false
+  # Qwen3-4B-Instruct-2507 is a dense Qwen3ForCausalLM transformer (256K native
+  # context, no rope-scaling), so vLLM V1 keeps automatic prefix caching ON by
+  # default. It replaces the prior Qwen/Qwen3.5-4B, a hybrid-Mamba model for which
+  # vLLM auto-DISABLES prefix caching (observed 0% hit rate on the agentic trace).
+  scenarios:
+    agentic-replay:
+    - input-file: benchmarks/single_node/agentic/datasets/qwen3.5-4b-smoke.jsonl
+      custom-dataset-type: mooncake_trace
+      max-model-len: 8192
+      benchmark-client: [aiperf]
+      search-space:
+      - { tp: 1, conc-list: [2] }
+    # 64k agentic-coding trace (committed in-repo), first 2000 records replayed
+    # at concurrency 32. AIPerf threads each mooncake session (session_id), so
+    # context ACCUMULATES across turns: the realized per-request total reaches
+    # ~65,847 tokens (the "64k" target), NOT the per-record input_length max of
+    # 37,818. max-model-len 73728 covers that with headroom; 40960 rejected ~55%
+    # of requests with HTTP 400 (input+output > context window).
+    - input-file: benchmarks/single_node/agentic/datasets/agentic-coding-64k.jsonl#2000
+      custom-dataset-type: mooncake_trace
+      max-model-len: 73728
       benchmark-client: [aiperf]
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 32 }
+      - { tp: 1, conc-list: [32] }
 
 minimaxm2.5-fp8-h100-vllm:
   image: vllm/vllm-openai:v0.21.0

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -101,6 +101,16 @@ on:
         required: false
         type: string
         default: '1800'
+      input-file:
+        description: "Repo-relative trace JSONL for agentic-replay (mooncake_trace)"
+        required: false
+        type: string
+        default: ''
+      custom-dataset-type:
+        description: "AIPerf --custom-dataset-type for agentic-replay (e.g. mooncake_trace)"
+        required: false
+        type: string
+        default: ''
 env:
   RANDOM_RANGE_RATIO: 0.8
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
@@ -131,6 +141,8 @@ env:
   OFFLOADING: ${{ inputs.offloading }}
   TOTAL_CPU_DRAM_GB: ${{ inputs.total-cpu-dram-gb }}
   DURATION: ${{ inputs.duration }}
+  INPUT_FILE: ${{ inputs.input-file }}
+  CUSTOM_DATASET_TYPE: ${{ inputs.custom-dataset-type }}
   RESULT_DIR: /workspace/results
   PYTHONDONTWRITEBYTECODE: '1'
   PYTHONPYCACHEPREFIX: /tmp/inferencex-pycache

diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml
@@ -51,6 +51,7 @@ jobs:
             multi-node-eval-config: ${{ steps.get-jobs.outputs.multi-node-eval-config }}
             agentic-config: ${{ steps.get-jobs.outputs.agentic-config }}
             multi-node-agentic-config: ${{ steps.get-jobs.outputs.multi-node-agentic-config }}
+            agentic-replay-config: ${{ steps.get-jobs.outputs.agentic-replay-config }}
         steps:
             - name: Checkout code (ref)
               if: ${{ inputs.ref && inputs.ref != '' }}
@@ -71,12 +72,14 @@ jobs:
                     ${{ inputs.generate-cli-command || github.event.inputs.generate-cli-command }})
                   AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' not in x]))")
                   MULTI_AGENTIC=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-coding' and 'prefill' in x]))")
-                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
+                  AGENTIC_REPLAY=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if x.get('scenario-type') == 'agentic-replay' and 'prefill' not in x]))")
+                  SINGLE=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'agentic-replay') and not x.get('eval-only', False)]))")
                   MULTI=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('scenario-type') != 'agentic-coding' and not x.get('eval-only', False)]))")
-                  EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') != 'agentic-coding' and x.get('run-eval', False)]))")
+                  EVALS=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' not in x and x.get('scenario-type') not in ('agentic-coding', 'agentic-replay') and x.get('run-eval', False)]))")
                   MULTI_EVAL=$(echo "$CONFIG_JSON" | python3 -c "import sys,json; d=json.load(sys.stdin); print(json.dumps([x for x in d if 'prefill' in x and x.get('run-eval', False)]))")
                   echo "agentic-config=$AGENTIC" >> $GITHUB_OUTPUT
                   echo "multi-node-agentic-config=$MULTI_AGENTIC" >> $GITHUB_OUTPUT
+                  echo "agentic-replay-config=$AGENTIC_REPLAY" >> $GITHUB_OUTPUT
                   echo "single-node-config=$SINGLE" >> $GITHUB_OUTPUT
                   echo "multi-node-config=$MULTI" >> $GITHUB_OUTPUT
                   echo "eval-config=$EVALS" >> $GITHUB_OUTPUT
@@ -269,6 +272,41 @@ jobs:
             run-eval: false
             ref: ${{ inputs.ref }}
 
+    test-sweep-agentic-replay:
+        needs: get-jobs
+        if: ${{ needs.get-jobs.outputs.agentic-replay-config != '[]' }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: agentic-replay /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.get-jobs.outputs.agentic-replay-config) }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            benchmark-client: ${{ matrix.config.benchmark-client }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            input-file: ${{ matrix.config.input-file }}
+            custom-dataset-type: ${{ matrix.config.custom-dataset-type }}
+            duration: ${{ inputs.duration-override != '' && inputs.duration-override || matrix.config.duration }}
+            spec-decoding: 'none'
+            disagg: 'false'
+            run-eval: false
+            scenario-type: agentic-replay
+            ref: ${{ inputs.ref }}
+
     test-sweep-evals:
         needs: get-jobs
         if: ${{ needs.get-jobs.outputs.eval-config != '[]' }}
@@ -304,8 +342,8 @@ jobs:
             ref: ${{ inputs.ref }}
 
     collect-results:
-        needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic]
-        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped') }}
+        needs: [test-sweep-multi-node, test-sweep-single-node, test-sweep-agentic, test-sweep-multi-node-agentic, test-sweep-agentic-replay]
+        if: ${{ always() && (needs.test-sweep-multi-node.result != 'skipped' || needs.test-sweep-single-node.result != 'skipped' || needs.test-sweep-agentic.result != 'skipped' || needs.test-sweep-multi-node-agentic.result != 'skipped' || needs.test-sweep-agentic-replay.result != 'skipped') }}
         uses: ./.github/workflows/collect-results.yml
         secrets: inherit
         with:

diff --git a/.github/workflows/run-sweep.yml b/.github/workflows/run-sweep.yml
@@ -265,6 +265,40 @@ jobs:
             run-eval: false
             scenario-type: agentic-coding
 
+    sweep-single-node-agentic-replay:
+        needs: setup
+        if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).single_node['agentic-replay']) != 'null' }}
+        uses: ./.github/workflows/benchmark-tmpl.yml
+        name: agentic-replay /
+        strategy:
+            fail-fast: false
+            matrix:
+                config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['agentic-replay'] }}
+        secrets: inherit
+        with:
+            exp-name: ${{ matrix.config.exp-name }}
+            runner: ${{ matrix.config.runner }}
+            image: ${{ matrix.config.image }}
+            model: ${{ matrix.config.model }}
+            model-prefix: ${{ matrix.config.model-prefix }}
+            framework: ${{ matrix.config.framework }}
+            benchmark-client: ${{ matrix.config.benchmark-client }}
+            precision: ${{ matrix.config.precision }}
+            tp: ${{ matrix.config.tp }}
+            ep: ${{ matrix.config.ep }}
+            dp-attn: ${{ matrix.config.dp-attn }}
+            conc: ${{ matrix.config.conc }}
+            isl: ${{ matrix.config.isl }}
+            osl: ${{ matrix.config.osl }}
+            max-model-len: ${{ matrix.config.max-model-len }}
+            input-file: ${{ matrix.config.input-file }}
+            custom-dataset-type: ${{ matrix.config.custom-dataset-type }}
+            duration: ${{ matrix.config.duration }}
+            spec-decoding: 'none'
+            disagg: ${{ 'false' }}
+            run-eval: false
+            scenario-type: agentic-replay
+
     sweep-multi-node-agentic:
         needs: setup
         if: ${{ needs.setup.outputs.reuse-enabled != 'true' && toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['agentic']) != 'null' }}
@@ -381,6 +415,7 @@ jobs:
                 sweep-single-node-1k1k,
                 sweep-single-node-8k1k,
                 sweep-agentic,
+                sweep-single-node-agentic-replay,
                 sweep-multi-node-1k1k,
                 sweep-multi-node-8k1k,
                 sweep-multi-node-agentic,
@@ -393,6 +428,7 @@ jobs:
               (
                 needs.sweep-single-node-1k1k.result != 'skipped' ||
                 needs.sweep-single-node-8k1k.result != 'skipped' ||
+                needs.sweep-single-node-agentic-replay.result != 'skipped' ||
                 needs.sweep-multi-node-1k1k.result != 'skipped' ||
                 needs.sweep-multi-node-8k1k.result != 'skipped'
               )
@@ -534,6 +570,7 @@ jobs:
             always() &&
             github.event_name == 'pull_request' &&
             needs.collect-results.result == 'success'
+        continue-on-error: true
         runs-on: ubuntu-latest
 
         env:

diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh
@@ -614,6 +614,12 @@ run_client_benchmark() {
     local use_chat_template=false
     local dsv4=false
     local trust_remote_code=false
+    # Agentic-replay (trace) path: when --input-file is set, the benchmark
+    # replays a recorded mooncake_trace JSONL through AIPerf instead of a
+    # synthetic isl/osl workload. Only the aiperf client supports this.
+    local input_file=""
+    local custom_dataset_type=""
+    local request_count=""
 
     while [[ $# -gt 0 ]]; do
         case $1 in
@@ -630,6 +636,9 @@ run_client_benchmark() {
             --bench-serving-dir) bench_serving_dir="$2"; shift 2 ;;
             --server-pid) server_pid="$2"; shift 2 ;;
             --random-seed) random_seed="$2"; shift 2 ;;
+            --input-file) input_file="$2"; shift 2 ;;
+            --custom-dataset-type) custom_dataset_type="$2"; shift 2 ;;
+            --request-count) request_count="$2"; shift 2 ;;
             --use-chat-template) use_chat_template=true; shift ;;
             --dsv4) dsv4=true; use_chat_template=true; shift ;;
             --trust-remote-code) trust_remote_code=true; shift ;;
@@ -640,9 +649,13 @@ run_client_benchmark() {
     if [[ -z "$model" ]]; then echo "Error: --model is required"; return 1; fi
     if [[ -z "$port" ]]; then echo "Error: --port is required"; return 1; fi
     if [[ -z "$backend" ]]; then echo "Error: --backend is required"; return 1; fi
-    if [[ -z "$isl" ]]; then echo "Error: --isl is required"; return 1; fi
-    if [[ -z "$osl" ]]; then echo "Error: --osl is required"; return 1; fi
-    if [[ -z "$random_range_ratio" ]]; then echo "Error: --random-range-ratio is required"; return 1; fi
+    # isl/osl/random-range-ratio describe a synthetic workload; they are not
+    # required when replaying a recorded trace via --input-file.
+    if [[ -z "$input_file" ]]; then
+        if [[ -z "$isl" ]]; then echo "Error: --isl is required"; return 1; fi
+        if [[ -z "$osl" ]]; then echo "Error: --osl is required"; return 1; fi
+        if [[ -z "$random_range_ratio" ]]; then echo "Error: --random-range-ratio is required"; return 1; fi
+    fi
     if [[ -z "$concurrency" ]]; then echo "Error: --concurrency is required"; return 1; fi
     if [[ -z "$result_filename" ]]; then echo "Error: --result-filename is required"; return 1; fi
     if [[ -z "$result_dir" ]]; then echo "Error: --result-dir is required"; return 1; fi
@@ -658,20 +671,42 @@ run_client_benchmark() {
                 --url "http://0.0.0.0:$port"
                 --endpoint-type "$endpoint_type"
                 --concurrency "$concurrency"
-                --request-count "$((concurrency * 10))"
-                --warmup-request-count "$((concurrency * 2))"
-                --isl "$isl"
-                --osl "$osl"
                 --result-filename "$result_filename"
                 --result-dir "$result_dir"
                 --bench-serving-dir "$bench_serving_dir"
             )
+            if [[ -n "$input_file" ]]; then
+                # Trace replay: replay the recorded dataset once. request-count
+                # equals the dataset record count; isl/osl and warmup do not
+                # apply (the trace defines per-request lengths).
+                if [[ -z "$request_count" ]]; then
+                    echo "Error: --request-count is required when --input-file is set"; return 1
+                fi
+                aiperf_args+=(
+                    --request-count "$request_count"
+                    --input-file "$input_file"
+                )
+                if [[ -n "$custom_dataset_type" ]]; then
+                    aiperf_args+=(--custom-dataset-type "$custom_dataset_type")
+                fi
+            else
+                aiperf_args+=(
+                    --request-count "$((concurrency * 10))"
+                    --warmup-request-count "$((concurrency * 2))"
+                    --isl "$isl"
+                    --osl "$osl"
+                )
+            fi
             if [[ -n "$random_seed" ]]; then
                 aiperf_args+=(--random-seed "$random_seed")
             fi
             run_aiperf_benchmark "${aiperf_args[@]}"
             ;;
         inferencex_native)
+            if [[ -n "$input_file" ]]; then
+                echo "Error: --input-file (trace replay) is only supported with BENCHMARK_CLIENT=aiperf"
+                return 1
+            fi
             local native_args=(
                 --model "$model"
                 --port "$port"