12 changes: 6 additions & 6 deletions language/deepseek-r1/README.md
@@ -190,19 +190,19 @@ PyTorch backend uses distributed execution with `torchrun` and `run_mlperf_mpi.p

| Option | Description | Default |
| -------------- | ------------------------------ | ---------------- |
| `--mode` | Scenario mode (offline/server) | `offline` |
| `--mode` | Scenario mode (offline/server/interactive) | `offline` |
| `--accuracy` | Run accuracy test | `False` |
| `--output-dir` | Output directory for results | `mlperf_results` |

### Backend Support Matrix

The following table shows which backends support different evaluation and MLPerf operations:

| Backend | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` |
| ----------- | ------------- | ------------------------------ | ----------------------------- |
| pytorch-fp8 | x | x | |
| vllm-fp8 | x | x | |
| sglang-fp8 | x | x | x |
| Backend | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` | `run_mlperf.py --mode=interactive` |
| ----------- | ------------- | ------------------------------ | ----------------------------- | ---------------------------------- |
| pytorch-fp8 | x | x | | |
| vllm-fp8 | x | x | | |
| sglang-fp8 | x | x | x | x |

> **Note**: For PyTorch backend, use the `_mpi` versions with `torchrun`. For vLLM and SGLang backends, use the single-process versions without `_mpi`.
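For orientation, the two launch styles look roughly like this (a sketch only: the `torchrun` process count is a placeholder, and `run_mlperf_mpi.py` is assumed to accept the same flags as `run_mlperf.py`):

```bash
# PyTorch backend: distributed execution via the _mpi entry point with torchrun
torchrun --nproc_per_node=8 run_mlperf_mpi.py --mode=offline --output-dir mlperf_results

# vLLM / SGLang backends: single-process entry point
# (per the matrix above, the new interactive mode is SGLang-only)
python run_mlperf.py --mode=interactive --accuracy --output-dir mlperf_results
```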

20 changes: 14 additions & 6 deletions language/deepseek-r1/backends/sglang_backend.py
@@ -84,6 +84,10 @@ def _setup_environment(self) -> None:
# Use the utility function to get cache directory
cache_base = get_cache_directory()

# Use models subdirectory to match user's example paths
self.cache_dir = cache_base.parent / 'models'
self.cache_dir.mkdir(parents=True, exist_ok=True)

# Set up HuggingFace cache environment variables
setup_huggingface_cache()

@@ -121,7 +125,14 @@ def _build_server_command(self) -> List[str]:
]

# Add optimization flags
if self.config['enable_torch_compile']:
if self.config['enable_speculative_decode']:
cmd.extend(['--speculative-algorithm', self.config['speculative_algorithm']])
cmd.extend(['--speculative-num-steps',
str(self.config['speculative_num_steps'])])
cmd.extend(['--speculative-eagle-topk',
str(self.config['speculative_topk'])])

elif self.config['enable_torch_compile']:
cmd.append('--enable-torch-compile')

if self.config['enable_flashinfer']:
@@ -134,7 +145,6 @@ def _build_server_command(self) -> List[str]:

# Add performance settings
cmd.extend([
'--cuda-graph-max-bs', str(self.config['cuda_graph_max_bs']),
'--max-running-requests', str(self.config['max_running_requests'])
])
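When speculative decoding is enabled, the assembled server command would surface these flags roughly as below (a sketch: the launcher module, model path, and numeric values are assumptions, not taken from this change):

```bash
python -m sglang.launch_server \
  --model-path /path/to/DeepSeek-R1 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 4 \
  --max-running-requests 64
# note: --cuda-graph-max-bs is no longer appended by this change
```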

@@ -490,8 +500,7 @@ def generate(self,

except Exception as e:
print(f"\nError generating completion: {e}")
raise RuntimeError(
f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...")
results.append({'error': str(e)})

return results

@@ -529,8 +538,7 @@ async def _async_generate_single(

except Exception as e:
print(f"\nError generating completion for prompt {idx}: {e}")
raise RuntimeError(
f"SGLang backend failed to generate tokens for prompt {idx}: {e}")
return idx, {'error': str(e)}

@require_initialized
def generate_async(self,
38 changes: 7 additions & 31 deletions language/deepseek-r1/docker/Dockerfile.sglang
@@ -1,41 +1,17 @@
# SGLang Backend Dockerfile
FROM nvidia/cuda:12.6.0-devel-ubuntu22.04
FROM lmsysorg/sglang:v0.5.2-cu129-b200

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_BREAK_SYSTEM_PACKAGES=1
ENV MLPERF_BACKEND=sglang

# Install Python and system dependencies
RUN apt-get update && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update && apt-get install -y \
python3.10 \
python3.10-dev \
python3.10-distutils \
python3-pip \
git \
git-lfs \
curl \
wget \
ca-certificates \
cmake \
build-essential \
ninja-build \
pybind11-dev \
pkg-config \
sudo \
libnuma-dev \
htop \
&& rm -rf /var/lib/apt/lists/*

# Set Python 3.10 as default
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1

# Install pip for Python 3.10
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
# Make /root accessible to non-root users so they can access /root/.cargo and /root/.rustup
# This is needed because flashinfer/tvm_ffi tries to search these paths
RUN chmod a+rX /root && \
if [ -d /root/.cargo ]; then chmod -R a+rX /root/.cargo 2>/dev/null || true; fi && \
if [ -d /root/.rustup ]; then chmod -R a+rX /root/.rustup 2>/dev/null || true; fi && \
if [ -d /root/.cargo/bin ]; then cp -a /root/.cargo/bin/* /usr/local/bin/ 2>/dev/null || true; fi

# Install UV package manager system-wide
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
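A minimal build sketch for the updated image (the tag and build context are assumptions; prefer the repo's own build tooling if it provides one):

```bash
# Build from the language/deepseek-r1 directory
docker build -f docker/Dockerfile.sglang -t deepseek-r1-sglang:dev .
```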
13 changes: 12 additions & 1 deletion language/deepseek-r1/docker/launch_scripts/launch_pytorch.sh
@@ -203,11 +203,22 @@ else
INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference"
fi

# Setup model cache directory mount
# If --model-cache-dir is provided, mount it to /raid/data/$USER/
# If not provided, mount /raid/data/$USER/ from host
if [ -n "$MODEL_CACHE_DIR" ]; then
MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}"
echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}"
else
MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}"
echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)"
fi

# Run the Docker container with all mounts (same as main docker setup)
docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
$GPU_OPTS \
-v /home/mlperf_inference_storage:/home/mlperf_inference_storage \
-v /raid/data:/raid/data \
$MODEL_CACHE_MOUNT \
-e HISTFILE="${WORK_DIR}/.bash_history" \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "USER=${USER_NAME}" \
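The new mount logic can be exercised like this (paths are placeholders; only the `--model-cache-dir` flag name comes from the script):

```bash
# Explicit cache: the host directory is mounted at /raid/data/$USER inside the container
./docker/launch_scripts/launch_pytorch.sh --model-cache-dir /mnt/fast/model_cache

# No flag: /raid/data/$USER on the host is mounted at the same path in the container
./docker/launch_scripts/launch_pytorch.sh
```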
23 changes: 19 additions & 4 deletions language/deepseek-r1/docker/launch_scripts/launch_sglang.sh
@@ -10,7 +10,7 @@ IMAGE_TAG=${IMAGE_TAG:-latest}
LOCAL_USER=${LOCAL_USER:-1}
WORK_DIR=${WORK_DIR:-$(dirname "$(realpath "$0")")/../..}
CONTAINER_NAME=${CONTAINER_NAME:-sglang}
RUN_CMD=${RUN_CMD:-}
RUN_CMD=""

# Get user information
USER_ID=$(id --user)
@@ -86,7 +86,11 @@ while [[ $# -gt 0 ]]; do
;;
*)
# Store remaining arguments for passing to container
RUN_CMD="$RUN_CMD $1"
if [ -z "$RUN_CMD" ]; then
RUN_CMD="$1"
else
RUN_CMD="$RUN_CMD $1"
fi
shift
;;
esac
@@ -203,11 +207,22 @@ else
INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference"
fi

# Setup model cache directory mount
# If --model-cache-dir is provided, mount it to /raid/data/$USER/
# If not provided, mount /raid/data/$USER/ from host
if [ -n "$MODEL_CACHE_DIR" ]; then
MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}"
echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}"
else
MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}"
echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)"
fi

# Run the Docker container with all mounts (same as main docker setup)
docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
$GPU_OPTS \
-v /home/mlperf_inference_storage:/home/mlperf_inference_storage \
-v /raid/data:/raid/data \
$MODEL_CACHE_MOUNT \
-e HISTFILE="${WORK_DIR}/.bash_history" \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "USER=${USER_NAME}" \
@@ -220,4 +235,4 @@ docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
--hostname "$(hostname)-docker" \
--name "${CONTAINER_NAME}-${RANDOM_NUM}-${USER_NAME}" \
--tmpfs /tmp:exec \
"$FINAL_IMAGE" $RUN_CMD
"$FINAL_IMAGE" ${RUN_CMD:-/bin/bash}
13 changes: 12 additions & 1 deletion language/deepseek-r1/docker/launch_scripts/launch_vllm.sh
@@ -203,11 +203,22 @@ else
INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference"
fi

# Setup model cache directory mount
# If --model-cache-dir is provided, mount it to /raid/data/$USER/
# If not provided, mount /raid/data/$USER/ from host
if [ -n "$MODEL_CACHE_DIR" ]; then
MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}"
echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}"
else
MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}"
echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)"
fi

# Run the Docker container with all mounts (same as main docker setup)
docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
$GPU_OPTS \
-v /home/mlperf_inference_storage:/home/mlperf_inference_storage \
-v /raid/data:/raid/data \
$MODEL_CACHE_MOUNT \
-e HISTFILE="${WORK_DIR}/.bash_history" \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "USER=${USER_NAME}" \
2 changes: 2 additions & 0 deletions language/deepseek-r1/docker/setup_scripts/common.sh
@@ -106,6 +106,8 @@ install_evaluation_requirements() {
echo "Installing evaluation requirements..."
if [ -f "/work/docker/evaluation_requirements.txt" ]; then
VIRTUAL_ENV=$VENV_DIR uv pip install -r /work/docker/evaluation_requirements.txt
echo "Override datasets==3.0.0 (LiveCodeBench/code-generation-lite is not updated for datasets 3.2.0)..."
VIRTUAL_ENV=$VENV_DIR uv pip install --upgrade "datasets==3.0.0"
echo "Evaluation requirements installed successfully!"
else
echo "Warning: evaluation_requirements.txt not found at /work/docker/evaluation_requirements.txt"
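A quick sanity check (run inside the activated environment; not part of the setup script) that the pin took effect:

```bash
python3 -c "import datasets; print(datasets.__version__)"  # expected: 3.0.0
```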
32 changes: 2 additions & 30 deletions language/deepseek-r1/docker/setup_scripts/setup_sglang.sh
@@ -68,26 +68,8 @@ install_evaluation_requirements
# Install MLPerf LoadGen
install_mlperf_loadgen "$FORCE_REBUILD" "$MLPERF_BACKEND"

# SGLang-specific setup
echo ""
echo "=== SGLang Backend-Specific Setup ==="

# Install core dependencies for SGLang
echo "Installing core dependencies for SGLang..."
VIRTUAL_ENV=$VENV_DIR uv pip install \
transformers \
pandas \
numpy \
tqdm \
huggingface_hub \
datasets \
accelerate \
openai \
httpx \
requests \
torch==2.6.0 \
sglang[all]==0.4.6.post5 \
sgl_kernel
# Override sglang version to 0.5.4
uv pip install sglang[all]==0.5.4 --prerelease=allow

# Verify SGLang installation
if python3 -c "import sglang" 2>/dev/null; then
@@ -98,13 +80,6 @@ else
exit 1
fi

# Verify sgl_kernel installation
if python3 -c "import sgl_kernel" 2>/dev/null; then
echo "sgl_kernel installed successfully"
else
echo "Warning: sgl_kernel import failed - this optimization may not be available"
fi

# Verify torch is available for SGLang
if python3 -c "import torch" 2>/dev/null; then
TORCH_VERSION=$(python3 -c "import torch; print(torch.__version__)")
@@ -132,9 +107,6 @@ python3 -c "import sglang; print('✓ Installed')" 2>/dev/null || echo "✗ Not
echo -n "FlashInfer: "
python3 -c "import flashinfer; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed"

echo -n "sgl_kernel: "
python3 -c "import sgl_kernel; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed"

echo -n "DeepGEMM: "
python3 -c "import deepgemm; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed"

19 changes: 19 additions & 0 deletions language/deepseek-r1/run_eval.py
@@ -231,6 +231,25 @@ def main():
backend, tokenized_prompts, text_prompts=prompts)

# Process raw results into standardized format using shared utility
# Filter out errors from raw_results and corresponding rows in
# df_output
valid_indices = []
valid_results = []
for i, res in enumerate(raw_results):
if 'error' not in res:
valid_indices.append(i)
valid_results.append(res)
else:
print(
f"Skipping prompt {i} due to error: {res.get('error', 'Unknown error')}")

if len(valid_results) < len(raw_results):
print(
f"Filtered out {len(raw_results) - len(valid_results)} failed prompts")
raw_results = valid_results
df_output = df_output.iloc[valid_indices].reset_index(
drop=True)

print("Processing results...")
standardized_results = process_inference_results(
raw_results, tokenizer
30 changes: 18 additions & 12 deletions language/deepseek-r1/run_mlperf.py
@@ -46,7 +46,7 @@ def create_argument_parser() -> argparse.ArgumentParser:

# Scenario selection (no backend argument, auto-detected)
parser.add_argument("--mode", type=str, default="offline",
choices=["offline", "server"],
choices=["offline", "server", "interactive"],
help="MLPerf scenario mode")

# MLPerf configuration
@@ -82,7 +82,7 @@ def configure_loadgen(scenario: str,
"""Configure LoadGen test settings.

Args:
scenario: MLPerf scenario ("offline" or "server")
scenario: MLPerf scenario ("offline", "server", or "interactive")
accuracy_mode: Whether to run in accuracy mode
mlperf_conf: Path to MLPerf config file
user_conf: Path to user config file
@@ -97,10 +97,17 @@
# Set scenario
if scenario.lower() == "offline":
settings.scenario = lg.TestScenario.Offline
elif scenario.lower() == "server":
config_scenario = "Offline"
elif scenario.lower() == "server" or scenario.lower() == "interactive":
settings.scenario = lg.TestScenario.Server
config_scenario = "Server"
else:
raise ValueError(f"Unknown scenario: {scenario}")

# Adjust model name for interactive mode to use separate config
# LoadGen will look for "deepseek-r1-interactive.Server" config section
if scenario.lower() == "interactive":
model_name = f"{model_name}-interactive"

# Set mode
if accuracy_mode:
@@ -110,9 +117,9 @@

# Load configurations if files exist
if mlperf_conf and Path(mlperf_conf).exists():
settings.FromConfig(mlperf_conf, model_name, scenario, 2)
settings.FromConfig(mlperf_conf, model_name, config_scenario, 2)
if user_conf and Path(user_conf).exists():
settings.FromConfig(user_conf, model_name, scenario, 1)
settings.FromConfig(user_conf, model_name, config_scenario, 1)

return settings
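Tying the interactive pieces together: LoadGen still runs the Server scenario, but the `-interactive` model-name suffix makes it read a `deepseek-r1-interactive.Server` config section. A sketch of what such an entry might look like (the key and value are illustrative, not from this repo):

```bash
cat >> user.conf <<'EOF'
deepseek-r1-interactive.Server.target_qps = 1.0
EOF
```

Running `run_mlperf.py --mode=interactive` then resolves this section instead of the plain `deepseek-r1.Server` entry.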

@@ -247,12 +254,12 @@ def main():
dataset_strings=strings_for_sut,
name=f"{backend_name}_offline_sut"
)
else: # server
else: # server or interactive
sut = ServerSUT(
backend=backend,
dataset=dataset_for_sut,
dataset_strings=strings_for_sut,
name=f"{backend_name}_server_sut"
name=f"{backend_name}_{args.mode}_sut"
)

# Create QSL
@@ -268,11 +275,10 @@
)

# Update settings with dataset info
# TODO(vir): these should be in mlperf.conf
settings.max_query_count = len(tokenized_prompts)
settings.min_query_count = len(tokenized_prompts)
settings.use_token_latencies = True
settings.server_coalesce_queries = True
# settings.max_query_count = len(tokenized_prompts)
# settings.min_query_count = len(tokenized_prompts)
# settings.use_token_latencies = True
# settings.server_coalesce_queries = True

# Configure logging
log_settings = lg.LogSettings()