From fc53b882c5fec14b682c77aa9b3dfdeac0c769ba Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Tue, 18 Nov 2025 23:14:35 -0800 Subject: [PATCH 1/7] feat: add MTP to ds-r1 ref. impl --- .../deepseek-r1/backends/sglang_backend.py | 12 +++++- language/deepseek-r1/docker/Dockerfile.sglang | 38 ++++--------------- .../docker/launch_scripts/launch_pytorch.sh | 13 ++++++- .../docker/launch_scripts/launch_sglang.sh | 23 +++++++++-- .../docker/launch_scripts/launch_vllm.sh | 13 ++++++- .../docker/setup_scripts/common.sh | 2 + .../docker/setup_scripts/setup_sglang.sh | 32 +--------------- .../deepseek-r1/utils/backend_registry.py | 11 ++++-- 8 files changed, 72 insertions(+), 72 deletions(-) diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 10be6e1dcd..3a2f5519cf 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -84,6 +84,10 @@ def _setup_environment(self) -> None: # Use the utility function to get cache directory cache_base = get_cache_directory() + # Use models subdirectory to match user's example paths + self.cache_dir = cache_base.parent / 'models' + self.cache_dir.mkdir(parents=True, exist_ok=True) + # Set up HuggingFace cache environment variables setup_huggingface_cache() @@ -121,7 +125,12 @@ def _build_server_command(self) -> List[str]: ] # Add optimization flags - if self.config['enable_torch_compile']: + if self.config['enable_speculative_decode']: + cmd.extend(['--speculative-algorithm', 'EAGLE']) + cmd.extend(['--speculative-num-steps', str(self.config['speculative_num_steps'])]) + cmd.extend(['--speculative-eagle-topk', str(self.config['speculative_topk'])]) + + elif self.config['enable_torch_compile']: cmd.append('--enable-torch-compile') if self.config['enable_flashinfer']: @@ -134,7 +143,6 @@ def _build_server_command(self) -> List[str]: # Add performance settings cmd.extend([ - '--cuda-graph-max-bs', str(self.config['cuda_graph_max_bs']), '--max-running-requests', str(self.config['max_running_requests']) ]) diff --git a/language/deepseek-r1/docker/Dockerfile.sglang b/language/deepseek-r1/docker/Dockerfile.sglang index 02703901d7..c0e690a6ec 100644 --- a/language/deepseek-r1/docker/Dockerfile.sglang +++ b/language/deepseek-r1/docker/Dockerfile.sglang @@ -1,41 +1,17 @@ # SGLang Backend Dockerfile -FROM nvidia/cuda:12.6.0-devel-ubuntu22.04 +FROM lmsysorg/sglang:v0.5.2-cu129-b200 # Set environment variables ENV DEBIAN_FRONTEND=noninteractive ENV PIP_BREAK_SYSTEM_PACKAGES=1 ENV MLPERF_BACKEND=sglang -# Install Python and system dependencies -RUN apt-get update && apt-get install -y \ - software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update && apt-get install -y \ - python3.10 \ - python3.10-dev \ - python3.10-distutils \ - python3-pip \ - git \ - git-lfs \ - curl \ - wget \ - ca-certificates \ - cmake \ - build-essential \ - ninja-build \ - pybind11-dev \ - pkg-config \ - sudo \ - libnuma-dev \ - htop \ - && rm -rf /var/lib/apt/lists/* - -# Set Python 3.10 as default -RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \ - update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1 - -# Install pip for Python 3.10 -RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10 +# Make /root accessible to non-root users so they can access /root/.cargo and /root/.rustup +# This is needed because flashinfer/tvm_ffi tries to search these paths +RUN chmod a+rX /root && \ + 
if [ -d /root/.cargo ]; then chmod -R a+rX /root/.cargo 2>/dev/null || true; fi && \ + if [ -d /root/.rustup ]; then chmod -R a+rX /root/.rustup 2>/dev/null || true; fi && \ + if [ -d /root/.cargo/bin ]; then cp -a /root/.cargo/bin/* /usr/local/bin/ 2>/dev/null || true; fi # Install UV package manager system-wide RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \ diff --git a/language/deepseek-r1/docker/launch_scripts/launch_pytorch.sh b/language/deepseek-r1/docker/launch_scripts/launch_pytorch.sh index 540ee1860f..2b4dc05d04 100755 --- a/language/deepseek-r1/docker/launch_scripts/launch_pytorch.sh +++ b/language/deepseek-r1/docker/launch_scripts/launch_pytorch.sh @@ -203,11 +203,22 @@ else INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference" fi +# Setup model cache directory mount +# If --model-cache-dir is provided, mount it to /raid/data/$USER/ +# If not provided, mount /raid/data/$USER/ from host +if [ -n "$MODEL_CACHE_DIR" ]; then + MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}" + echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}" +else + MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}" + echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)" +fi + # Run the Docker container with all mounts (same as main docker setup) docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \ $GPU_OPTS \ -v /home/mlperf_inference_storage:/home/mlperf_inference_storage \ - -v /raid/data:/raid/data \ + $MODEL_CACHE_MOUNT \ -e HISTFILE="${WORK_DIR}/.bash_history" \ --env "CCACHE_DIR=${CCACHE_DIR}" \ --env "USER=${USER_NAME}" \ diff --git a/language/deepseek-r1/docker/launch_scripts/launch_sglang.sh b/language/deepseek-r1/docker/launch_scripts/launch_sglang.sh index 2d21a0dcd8..a2bdd0a72c 100755 --- a/language/deepseek-r1/docker/launch_scripts/launch_sglang.sh +++ b/language/deepseek-r1/docker/launch_scripts/launch_sglang.sh @@ -10,7 +10,7 @@ IMAGE_TAG=${IMAGE_TAG:-latest} LOCAL_USER=${LOCAL_USER:-1} WORK_DIR=${WORK_DIR:-$(dirname "$(realpath "$0")")/../..} CONTAINER_NAME=${CONTAINER_NAME:-sglang} -RUN_CMD=${RUN_CMD:-} +RUN_CMD="" # Get user information USER_ID=$(id --user) @@ -86,7 +86,11 @@ while [[ $# -gt 0 ]]; do ;; *) # Store remaining arguments for passing to container - RUN_CMD="$RUN_CMD $1" + if [ -z "$RUN_CMD" ]; then + RUN_CMD="$1" + else + RUN_CMD="$RUN_CMD $1" + fi shift ;; esac @@ -203,11 +207,22 @@ else INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference" fi +# Setup model cache directory mount +# If --model-cache-dir is provided, mount it to /raid/data/$USER/ +# If not provided, mount /raid/data/$USER/ from host +if [ -n "$MODEL_CACHE_DIR" ]; then + MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}" + echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}" +else + MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}" + echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)" +fi + # Run the Docker container with all mounts (same as main docker setup) docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \ $GPU_OPTS \ -v /home/mlperf_inference_storage:/home/mlperf_inference_storage \ - -v /raid/data:/raid/data \ + $MODEL_CACHE_MOUNT \ -e HISTFILE="${WORK_DIR}/.bash_history" \ --env "CCACHE_DIR=${CCACHE_DIR}" \ --env "USER=${USER_NAME}" \ @@ -220,4 +235,4 @@ docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \ --hostname "$(hostname)-docker" \ --name "${CONTAINER_NAME}-${RANDOM_NUM}-${USER_NAME}" \ --tmpfs /tmp:exec \ 
- "$FINAL_IMAGE" $RUN_CMD \ No newline at end of file + "$FINAL_IMAGE" ${RUN_CMD:-/bin/bash} diff --git a/language/deepseek-r1/docker/launch_scripts/launch_vllm.sh b/language/deepseek-r1/docker/launch_scripts/launch_vllm.sh index 966c1ab516..bb112a49a6 100755 --- a/language/deepseek-r1/docker/launch_scripts/launch_vllm.sh +++ b/language/deepseek-r1/docker/launch_scripts/launch_vllm.sh @@ -203,11 +203,22 @@ else INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference" fi +# Setup model cache directory mount +# If --model-cache-dir is provided, mount it to /raid/data/$USER/ +# If not provided, mount /raid/data/$USER/ from host +if [ -n "$MODEL_CACHE_DIR" ]; then + MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}" + echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}" +else + MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}" + echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)" +fi + # Run the Docker container with all mounts (same as main docker setup) docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \ $GPU_OPTS \ -v /home/mlperf_inference_storage:/home/mlperf_inference_storage \ - -v /raid/data:/raid/data \ + $MODEL_CACHE_MOUNT \ -e HISTFILE="${WORK_DIR}/.bash_history" \ --env "CCACHE_DIR=${CCACHE_DIR}" \ --env "USER=${USER_NAME}" \ diff --git a/language/deepseek-r1/docker/setup_scripts/common.sh b/language/deepseek-r1/docker/setup_scripts/common.sh index 863e982de0..11dea848c1 100755 --- a/language/deepseek-r1/docker/setup_scripts/common.sh +++ b/language/deepseek-r1/docker/setup_scripts/common.sh @@ -106,6 +106,8 @@ install_evaluation_requirements() { echo "Installing evaluation requirements..." if [ -f "/work/docker/evaluation_requirements.txt" ]; then VIRTUAL_ENV=$VENV_DIR uv pip install -r /work/docker/evaluation_requirements.txt + echo "Override datasets==3.0.0 (LiveCodeBench/code-generation-lite is not updated for datasets 3.2.0)..." + VIRTUAL_ENV=$VENV_DIR uv pip install --upgrade "datasets==3.0.0" echo "Evaluation requirements installed successfully!" else echo "Warning: evaluation_requirements.txt not found at /work/docker/evaluation_requirements.txt" diff --git a/language/deepseek-r1/docker/setup_scripts/setup_sglang.sh b/language/deepseek-r1/docker/setup_scripts/setup_sglang.sh index 9bd2e7677e..de1253b4f7 100755 --- a/language/deepseek-r1/docker/setup_scripts/setup_sglang.sh +++ b/language/deepseek-r1/docker/setup_scripts/setup_sglang.sh @@ -68,26 +68,8 @@ install_evaluation_requirements # Install MLPerf LoadGen install_mlperf_loadgen "$FORCE_REBUILD" "$MLPERF_BACKEND" -# SGLang-specific setup -echo "" -echo "=== SGLang Backend-Specific Setup ===" - -# Install core dependencies for SGLang -echo "Installing core dependencies for SGLang..." 
-VIRTUAL_ENV=$VENV_DIR uv pip install \ - transformers \ - pandas \ - numpy \ - tqdm \ - huggingface_hub \ - datasets \ - accelerate \ - openai \ - httpx \ - requests \ - torch==2.6.0 \ - sglang[all]==0.4.6.post5 \ - sgl_kernel +# Override sglang version to 0.5.4 +uv pip install sglang[all]==0.5.4 --prerelease=allow # Verify SGLang installation if python3 -c "import sglang" 2>/dev/null; then @@ -98,13 +80,6 @@ else exit 1 fi -# Verify sgl_kernel installation -if python3 -c "import sgl_kernel" 2>/dev/null; then - echo "sgl_kernel installed successfully" -else - echo "Warning: sgl_kernel import failed - this optimization may not be available" -fi - # Verify torch is available for SGLang if python3 -c "import torch" 2>/dev/null; then TORCH_VERSION=$(python3 -c "import torch; print(torch.__version__)") @@ -132,9 +107,6 @@ python3 -c "import sglang; print('✓ Installed')" 2>/dev/null || echo "✗ Not echo -n "FlashInfer: " python3 -c "import flashinfer; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed" -echo -n "sgl_kernel: " -python3 -c "import sgl_kernel; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed" - echo -n "DeepGEMM: " python3 -c "import deepgemm; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed" diff --git a/language/deepseek-r1/utils/backend_registry.py b/language/deepseek-r1/utils/backend_registry.py index 5f33c3ddd1..e28b5d06bc 100644 --- a/language/deepseek-r1/utils/backend_registry.py +++ b/language/deepseek-r1/utils/backend_registry.py @@ -98,7 +98,7 @@ # NOTE(vir): sg-lang crash without +2 additional "context_length": MAX_ISL + MAX_OSL + MAX_TEMPLATE_TOKS + 2, "max_tokens": MAX_OSL, - "mem_fraction_static": 0.90, + "mem_fraction_static": 0.85, "random_seed": 42, "dtype": "auto", "trust_remote_code": True, @@ -106,14 +106,19 @@ "enable_flashinfer": True, "enable_dp_attention": True, "dp": 8, - "cuda_graph_max_bs": 512, "temperature": 0.0, "top_p": 1.0, "seed": 42, - "max_running_requests": 512, # concurrency + "max_running_requests": 128, # concurrency "request_timeout": None, "server_startup_timeout": 1800, "health_check_interval": 5, + + + # True: enable with DS-R1 interactive scenario MTP config + 'enable_speculative_decode': True, + 'speculative_num_steps': 3, + 'speculative_topk': 1, }, 'env_vars': { 'CUDA_MODULE_LOADING': 'LAZY', From cb2ec4f12c7376592193857e6d14227dbb2eeafc Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 19 Nov 2025 00:00:34 -0800 Subject: [PATCH 2/7] error handling bad samples --- language/deepseek-r1/backends/sglang_backend.py | 6 ++---- language/deepseek-r1/run_eval.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 3a2f5519cf..17d0e3d9e3 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -498,8 +498,7 @@ def generate(self, except Exception as e: print(f"\nError generating completion: {e}") - raise RuntimeError( - f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...") + results.append({'error': str(e)}) return results @@ -537,8 +536,7 @@ async def _async_generate_single( except Exception as e: print(f"\nError generating completion for prompt {idx}: {e}") - raise RuntimeError( - f"SGLang backend failed to generate tokens for prompt {idx}: {e}") + return idx, {'error': str(e)} @require_initialized def generate_async(self, diff --git a/language/deepseek-r1/run_eval.py 
b/language/deepseek-r1/run_eval.py index 8965101bd4..338e6761d4 100755 --- a/language/deepseek-r1/run_eval.py +++ b/language/deepseek-r1/run_eval.py @@ -231,6 +231,21 @@ def main(): backend, tokenized_prompts, text_prompts=prompts) # Process raw results into standardized format using shared utility + # Filter out errors from raw_results and corresponding rows in df_output + valid_indices = [] + valid_results = [] + for i, res in enumerate(raw_results): + if 'error' not in res: + valid_indices.append(i) + valid_results.append(res) + else: + print(f"Skipping prompt {i} due to error: {res.get('error', 'Unknown error')}") + + if len(valid_results) < len(raw_results): + print(f"Filtered out {len(raw_results) - len(valid_results)} failed prompts") + raw_results = valid_results + df_output = df_output.iloc[valid_indices].reset_index(drop=True) + print("Processing results...") standardized_results = process_inference_results( raw_results, tokenizer From 5ba5d8dfcbd51ac5359632401c88105683e2a9b8 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Wed, 19 Nov 2025 08:01:09 +0000 Subject: [PATCH 3/7] [Automated Commit] Format Codebase --- language/deepseek-r1/backends/sglang_backend.py | 6 ++++-- language/deepseek-r1/run_eval.py | 14 +++++++++----- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 17d0e3d9e3..92fbbb9a21 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -127,8 +127,10 @@ def _build_server_command(self) -> List[str]: # Add optimization flags if self.config['enable_speculative_decode']: cmd.extend(['--speculative-algorithm', 'EAGLE']) - cmd.extend(['--speculative-num-steps', str(self.config['speculative_num_steps'])]) - cmd.extend(['--speculative-eagle-topk', str(self.config['speculative_topk'])]) + cmd.extend(['--speculative-num-steps', + str(self.config['speculative_num_steps'])]) + cmd.extend(['--speculative-eagle-topk', + str(self.config['speculative_topk'])]) elif self.config['enable_torch_compile']: cmd.append('--enable-torch-compile') diff --git a/language/deepseek-r1/run_eval.py b/language/deepseek-r1/run_eval.py index 338e6761d4..73f7e94fb8 100755 --- a/language/deepseek-r1/run_eval.py +++ b/language/deepseek-r1/run_eval.py @@ -231,7 +231,8 @@ def main(): backend, tokenized_prompts, text_prompts=prompts) # Process raw results into standardized format using shared utility - # Filter out errors from raw_results and corresponding rows in df_output + # Filter out errors from raw_results and corresponding rows in + # df_output valid_indices = [] valid_results = [] for i, res in enumerate(raw_results): @@ -239,12 +240,15 @@ def main(): valid_indices.append(i) valid_results.append(res) else: - print(f"Skipping prompt {i} due to error: {res.get('error', 'Unknown error')}") - + print( + f"Skipping prompt {i} due to error: {res.get('error', 'Unknown error')}") + if len(valid_results) < len(raw_results): - print(f"Filtered out {len(raw_results) - len(valid_results)} failed prompts") + print( + f"Filtered out {len(raw_results) - len(valid_results)} failed prompts") raw_results = valid_results - df_output = df_output.iloc[valid_indices].reset_index(drop=True) + df_output = df_output.iloc[valid_indices].reset_index( + drop=True) print("Processing results...") standardized_results = process_inference_results( From 1a29be23aa90e6fed3bdcd3a927a91ab65afa375 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: 
Wed, 26 Nov 2025 01:13:01 -0800 Subject: [PATCH 4/7] misc fix --- language/deepseek-r1/utils/data_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/language/deepseek-r1/utils/data_utils.py b/language/deepseek-r1/utils/data_utils.py index 0eb4cd3dcd..1fdef80fd4 100644 --- a/language/deepseek-r1/utils/data_utils.py +++ b/language/deepseek-r1/utils/data_utils.py @@ -110,7 +110,9 @@ def save_results(df: pd.DataFrame, output_file = f"{base_name}_{timestamp_suffix}{ext}" # Ensure output directory exists - os.makedirs(os.path.dirname(output_file), exist_ok=True) + output_dir = os.path.dirname(output_file) + if output_dir: + os.makedirs(output_dir, exist_ok=True) print(f"Saving results to {output_file}...") From 8eefed20cb9536d1e3b7a9569d070a4c5def141f Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 26 Nov 2025 01:20:26 -0800 Subject: [PATCH 5/7] address comments --- language/deepseek-r1/backends/sglang_backend.py | 2 +- language/deepseek-r1/utils/backend_registry.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/language/deepseek-r1/backends/sglang_backend.py b/language/deepseek-r1/backends/sglang_backend.py index 92fbbb9a21..8efc0d76d2 100644 --- a/language/deepseek-r1/backends/sglang_backend.py +++ b/language/deepseek-r1/backends/sglang_backend.py @@ -126,7 +126,7 @@ def _build_server_command(self) -> List[str]: # Add optimization flags if self.config['enable_speculative_decode']: - cmd.extend(['--speculative-algorithm', 'EAGLE']) + cmd.extend(['--speculative-algorithm', self.config['speculative_algorithm']]) cmd.extend(['--speculative-num-steps', str(self.config['speculative_num_steps'])]) cmd.extend(['--speculative-eagle-topk', diff --git a/language/deepseek-r1/utils/backend_registry.py b/language/deepseek-r1/utils/backend_registry.py index e28b5d06bc..111327b9c5 100644 --- a/language/deepseek-r1/utils/backend_registry.py +++ b/language/deepseek-r1/utils/backend_registry.py @@ -117,8 +117,9 @@ # True: enable with DS-R1 interactive scenario MTP config 'enable_speculative_decode': True, + 'speculative_algorithm': 'EAGLE', # EAGLE1/2 style decoding with DS-R1 MTP Head 'speculative_num_steps': 3, - 'speculative_topk': 1, + 'speculative_topk': 1, # Linear (no draft-trees) }, 'env_vars': { 'CUDA_MODULE_LOADING': 'LAZY', From c3ea8bdac7d1fccc19eafbf2cd08ee5a0e1080d0 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 26 Nov 2025 01:36:21 -0800 Subject: [PATCH 6/7] add interactive configs --- loadgen/mlperf.conf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/loadgen/mlperf.conf b/loadgen/mlperf.conf index 1b825514bd..3b72da1b18 100644 --- a/loadgen/mlperf.conf +++ b/loadgen/mlperf.conf @@ -132,6 +132,11 @@ deepseek-r1.Server.target_latency = 0 deepseek-r1.Server.ttft_latency = 2000 deepseek-r1.Server.tpot_latency = 80 +# Target Latencies for interactive setting +deepseek-r1-interactive.Server.target_latency = 0 +deepseek-r1-interactive.Server.ttft_latency = +deepseek-r1-interactive.Server.tpot_latency = 15 + *.Offline.target_latency_percentile = 90 *.Offline.min_duration = 600000 From 510da46618928062f777d474ffa28c79436afb07 Mon Sep 17 00:00:00 2001 From: Viraat Chandra Date: Wed, 26 Nov 2025 01:48:27 -0800 Subject: [PATCH 7/7] add interactive scenario support --- language/deepseek-r1/README.md | 12 ++++++------ language/deepseek-r1/run_mlperf.py | 30 ++++++++++++++++++------------ 2 files changed, 24 insertions(+), 18 deletions(-) diff --git a/language/deepseek-r1/README.md b/language/deepseek-r1/README.md index 
a6c30a6155..d572c517e6 100644
--- a/language/deepseek-r1/README.md
+++ b/language/deepseek-r1/README.md
@@ -190,7 +190,7 @@ PyTorch backend uses distributed execution with `torchrun` and `run_mlperf_mpi.p
 
 | Option | Description | Default |
 | -------------- | ------------------------------ | ---------------- |
-| `--mode` | Scenario mode (offline/server) | `offline` |
+| `--mode` | Scenario mode (offline/server/interactive) | `offline` |
 | `--accuracy` | Run accuracy test | `False` |
 | `--output-dir` | Output directory for results | `mlperf_results` |
 
@@ -198,11 +198,11 @@
 
 The following table shows which backends support different evaluation and MLPerf operations:
 
-| Backend | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` |
-| ----------- | ------------- | ------------------------------ | ----------------------------- |
-| pytorch-fp8 | x | x | |
-| vllm-fp8 | x | x | |
-| sglang-fp8 | x | x | x |
+| Backend | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` | `run_mlperf.py --mode=interactive` |
+| ----------- | ------------- | ------------------------------ | ----------------------------- | ---------------------------------- |
+| pytorch-fp8 | x | x | | |
+| vllm-fp8 | x | x | | |
+| sglang-fp8 | x | x | x | x |
 
 > **Note**: For PyTorch backend, use the `_mpi` versions with `torchrun`. For vLLM and SGLang backends, use the single-process versions without `_mpi`.
 
diff --git a/language/deepseek-r1/run_mlperf.py b/language/deepseek-r1/run_mlperf.py
index 2345cf5b9b..442ebe6615 100755
--- a/language/deepseek-r1/run_mlperf.py
+++ b/language/deepseek-r1/run_mlperf.py
@@ -46,7 +46,7 @@ def create_argument_parser() -> argparse.ArgumentParser:
 
     # Scenario selection (no backend argument, auto-detected)
     parser.add_argument("--mode", type=str, default="offline",
-                        choices=["offline", "server"],
+                        choices=["offline", "server", "interactive"],
                         help="MLPerf scenario mode")
 
     # MLPerf configuration
@@ -82,7 +82,7 @@ def configure_loadgen(scenario: str,
     """Configure LoadGen test settings.
Args: - scenario: MLPerf scenario ("offline" or "server") + scenario: MLPerf scenario ("offline", "server", or "interactive") accuracy_mode: Whether to run in accuracy mode mlperf_conf: Path to MLPerf config file user_conf: Path to user config file @@ -97,10 +97,17 @@ def configure_loadgen(scenario: str, # Set scenario if scenario.lower() == "offline": settings.scenario = lg.TestScenario.Offline - elif scenario.lower() == "server": + config_scenario = "Offline" + elif scenario.lower() == "server" or scenario.lower() == "interactive": settings.scenario = lg.TestScenario.Server + config_scenario = "Server" else: raise ValueError(f"Unknown scenario: {scenario}") + + # Adjust model name for interactive mode to use separate config + # LoadGen will look for "deepseek-r1-interactive.Server" config section + if scenario.lower() == "interactive": + model_name = f"{model_name}-interactive" # Set mode if accuracy_mode: @@ -110,9 +117,9 @@ def configure_loadgen(scenario: str, # Load configurations if files exist if mlperf_conf and Path(mlperf_conf).exists(): - settings.FromConfig(mlperf_conf, model_name, scenario, 2) + settings.FromConfig(mlperf_conf, model_name, config_scenario, 2) if user_conf and Path(user_conf).exists(): - settings.FromConfig(user_conf, model_name, scenario, 1) + settings.FromConfig(user_conf, model_name, config_scenario, 1) return settings @@ -247,12 +254,12 @@ def main(): dataset_strings=strings_for_sut, name=f"{backend_name}_offline_sut" ) - else: # server + else: # server or interactive sut = ServerSUT( backend=backend, dataset=dataset_for_sut, dataset_strings=strings_for_sut, - name=f"{backend_name}_server_sut" + name=f"{backend_name}_{args.mode}_sut" ) # Create QSL @@ -268,11 +275,10 @@ def main(): ) # Update settings with dataset info - # TODO(vir): these should be in mlperf.conf - settings.max_query_count = len(tokenized_prompts) - settings.min_query_count = len(tokenized_prompts) - settings.use_token_latencies = True - settings.server_coalesce_queries = True + # settings.max_query_count = len(tokenized_prompts) + # settings.min_query_count = len(tokenized_prompts) + # settings.use_token_latencies = True + # settings.server_coalesce_queries = True # Configure logging log_settings = lg.LogSettings()
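
For context on patches 6 and 7: the new interactive mode reuses LoadGen's Server scenario and only switches the config lookup key, so the `deepseek-r1-interactive.Server.*` entries added to `mlperf.conf` take effect. Below is a minimal sketch of that resolution, assuming the `mlperf_loadgen` Python bindings already used by `run_mlperf.py`; the function name and default paths are illustrative, not part of the patch.

```python
# Minimal sketch (not part of the patch): how --mode=interactive resolves to
# LoadGen settings. Assumes the mlperf_loadgen Python bindings; the function
# name and default paths here are illustrative.
import mlperf_loadgen as lg


def build_settings(mode: str,
                   mlperf_conf: str = "loadgen/mlperf.conf",
                   model_name: str = "deepseek-r1") -> lg.TestSettings:
    settings = lg.TestSettings()

    if mode == "offline":
        settings.scenario = lg.TestScenario.Offline
        config_scenario = "Offline"
    elif mode in ("server", "interactive"):
        # Interactive runs reuse the Server scenario machinery ...
        settings.scenario = lg.TestScenario.Server
        config_scenario = "Server"
    else:
        raise ValueError(f"Unknown mode: {mode}")

    # ... but look up a separate config section, so the
    # deepseek-r1-interactive.Server.* entries in mlperf.conf apply.
    if mode == "interactive":
        model_name = f"{model_name}-interactive"

    settings.mode = lg.TestMode.PerformanceOnly
    settings.FromConfig(mlperf_conf, model_name, config_scenario, 2)
    return settings
```

With the entries from patch 6, an interactive run would then be judged against the 15 ms TPOT target, while plain server runs keep the existing 2000 ms TTFT / 80 ms TPOT targets.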