12 changes: 6 additions & 6 deletions language/deepseek-r1/README.md
@@ -190,19 +190,19 @@ PyTorch backend uses distributed execution with `torchrun` and `run_mlperf_mpi.p

| Option | Description | Default |
| -------------- | ------------------------------ | ---------------- |
| `--mode` | Scenario mode (offline/server) | `offline` |
| `--mode` | Scenario mode (offline/server/interactive) | `offline` |
| `--accuracy` | Run accuracy test | `False` |
| `--output-dir` | Output directory for results | `mlperf_results` |

### Backend Support Matrix

The following table shows which backends support different evaluation and MLPerf operations:

| Backend | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` |
| ----------- | ------------- | ------------------------------ | ----------------------------- |
| pytorch-fp8 | x | x | |
| vllm-fp8 | x | x | |
| sglang-fp8 | x | x | x |
| Backend | `run_eval.py` | `run_mlperf.py --mode=offline` | `run_mlperf.py --mode=server` | `run_mlperf.py --mode=interactive` |
| ----------- | ------------- | ------------------------------ | ----------------------------- | ---------------------------------- |
| pytorch-fp8 | x | x | | |
| vllm-fp8 | x | x | | |
| sglang-fp8 | x | x | x | x |

> **Note**: For PyTorch backend, use the `_mpi` versions with `torchrun`. For vLLM and SGLang backends, use the single-process versions without `_mpi`.
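For orientation, the two launch styles look roughly like this (a sketch only: the `torchrun` process count is a placeholder, and `run_mlperf_mpi.py` is assumed to accept the same flags as `run_mlperf.py`):

```bash
# PyTorch backend: distributed execution via the _mpi entry point with torchrun
torchrun --nproc_per_node=8 run_mlperf_mpi.py --mode=offline --output-dir mlperf_results

# vLLM / SGLang backends: single-process entry point
# (per the matrix above, the new interactive mode is SGLang-only)
python run_mlperf.py --mode=interactive --accuracy --output-dir mlperf_results
```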

20 changes: 14 additions & 6 deletions language/deepseek-r1/backends/sglang_backend.py
@@ -84,6 +84,10 @@ def _setup_environment(self) -> None:
# Use the utility function to get cache directory
cache_base = get_cache_directory()

# Use models subdirectory to match user's example paths
self.cache_dir = cache_base.parent / 'models'
self.cache_dir.mkdir(parents=True, exist_ok=True)

# Set up HuggingFace cache environment variables
setup_huggingface_cache()

@@ -121,7 +125,14 @@ def _build_server_command(self) -> List[str]:
]

# Add optimization flags
if self.config['enable_torch_compile']:
if self.config['enable_speculative_decode']:
cmd.extend(['--speculative-algorithm', self.config['speculative_algorithm']])
cmd.extend(['--speculative-num-steps',
str(self.config['speculative_num_steps'])])
cmd.extend(['--speculative-eagle-topk',
str(self.config['speculative_topk'])])

elif self.config['enable_torch_compile']:
cmd.append('--enable-torch-compile')

if self.config['enable_flashinfer']:
@@ -134,7 +145,6 @@ def _build_server_command(self) -> List[str]:

# Add performance settings
cmd.extend([
'--cuda-graph-max-bs', str(self.config['cuda_graph_max_bs']),
'--max-running-requests', str(self.config['max_running_requests'])
])
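When speculative decoding is enabled, the assembled server command would surface these flags roughly as below (a sketch: the launcher module, model path, and numeric values are assumptions, not taken from this change):

```bash
python -m sglang.launch_server \
  --model-path /path/to/DeepSeek-R1 \
  --speculative-algorithm EAGLE \
  --speculative-num-steps 3 \
  --speculative-eagle-topk 4 \
  --max-running-requests 64
# note: --cuda-graph-max-bs is no longer appended by this change
```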

@@ -490,8 +500,7 @@ def generate(self,

except Exception as e:
print(f"\nError generating completion: {e}")
raise RuntimeError(
f"SGLang backend failed to generate tokens for prompt: {prompt[:100]}...")
results.append({'error': str(e)})

return results

@@ -529,8 +538,7 @@ async def _async_generate_single(

except Exception as e:
print(f"\nError generating completion for prompt {idx}: {e}")
raise RuntimeError(
f"SGLang backend failed to generate tokens for prompt {idx}: {e}")
return idx, {'error': str(e)}

@require_initialized
def generate_async(self,
38 changes: 7 additions & 31 deletions language/deepseek-r1/docker/Dockerfile.sglang
@@ -1,41 +1,17 @@
# SGLang Backend Dockerfile
FROM nvidia/cuda:12.6.0-devel-ubuntu22.04
FROM lmsysorg/sglang:v0.5.2-cu129-b200

# Set environment variables
ENV DEBIAN_FRONTEND=noninteractive
ENV PIP_BREAK_SYSTEM_PACKAGES=1
ENV MLPERF_BACKEND=sglang

# Install Python and system dependencies
RUN apt-get update && apt-get install -y \
software-properties-common \
&& add-apt-repository ppa:deadsnakes/ppa \
&& apt-get update && apt-get install -y \
python3.10 \
python3.10-dev \
python3.10-distutils \
python3-pip \
git \
git-lfs \
curl \
wget \
ca-certificates \
cmake \
build-essential \
ninja-build \
pybind11-dev \
pkg-config \
sudo \
libnuma-dev \
htop \
&& rm -rf /var/lib/apt/lists/*

# Set Python 3.10 as default
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1 && \
update-alternatives --install /usr/bin/python python /usr/bin/python3.10 1

# Install pip for Python 3.10
RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
# Make /root accessible to non-root users so they can access /root/.cargo and /root/.rustup
# This is needed because flashinfer/tvm_ffi tries to search these paths
RUN chmod a+rX /root && \
if [ -d /root/.cargo ]; then chmod -R a+rX /root/.cargo 2>/dev/null || true; fi && \
if [ -d /root/.rustup ]; then chmod -R a+rX /root/.rustup 2>/dev/null || true; fi && \
if [ -d /root/.cargo/bin ]; then cp -a /root/.cargo/bin/* /usr/local/bin/ 2>/dev/null || true; fi

# Install UV package manager system-wide
RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
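A minimal build sketch for the updated image (the tag and build context are assumptions; prefer the repo's own build tooling if it provides one):

```bash
# Build from the language/deepseek-r1 directory
docker build -f docker/Dockerfile.sglang -t deepseek-r1-sglang:dev .
```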
13 changes: 12 additions & 1 deletion language/deepseek-r1/docker/launch_scripts/launch_pytorch.sh
@@ -203,11 +203,22 @@ else
INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference"
fi

# Setup model cache directory mount
# If --model-cache-dir is provided, mount it to /raid/data/$USER/
# If not provided, mount /raid/data/$USER/ from host
if [ -n "$MODEL_CACHE_DIR" ]; then
MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}"
echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}"
else
MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}"
echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)"
fi

# Run the Docker container with all mounts (same as main docker setup)
docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
$GPU_OPTS \
-v /home/mlperf_inference_storage:/home/mlperf_inference_storage \
-v /raid/data:/raid/data \
$MODEL_CACHE_MOUNT \
-e HISTFILE="${WORK_DIR}/.bash_history" \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "USER=${USER_NAME}" \
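The new mount logic can be exercised like this (paths are placeholders; only the `--model-cache-dir` flag name comes from the script):

```bash
# Explicit cache: the host directory is mounted at /raid/data/$USER inside the container
./docker/launch_scripts/launch_pytorch.sh --model-cache-dir /mnt/fast/model_cache

# No flag: /raid/data/$USER on the host is mounted at the same path in the container
./docker/launch_scripts/launch_pytorch.sh
```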
23 changes: 19 additions & 4 deletions language/deepseek-r1/docker/launch_scripts/launch_sglang.sh
@@ -10,7 +10,7 @@ IMAGE_TAG=${IMAGE_TAG:-latest}
LOCAL_USER=${LOCAL_USER:-1}
WORK_DIR=${WORK_DIR:-$(dirname "$(realpath "$0")")/../..}
CONTAINER_NAME=${CONTAINER_NAME:-sglang}
RUN_CMD=${RUN_CMD:-}
RUN_CMD=""

# Get user information
USER_ID=$(id --user)
@@ -86,7 +86,11 @@ while [[ $# -gt 0 ]]; do
;;
*)
# Store remaining arguments for passing to container
RUN_CMD="$RUN_CMD $1"
if [ -z "$RUN_CMD" ]; then
RUN_CMD="$1"
else
RUN_CMD="$RUN_CMD $1"
fi
shift
;;
esac
@@ -203,11 +207,22 @@ else
INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference"
fi

# Setup model cache directory mount
# If --model-cache-dir is provided, mount it to /raid/data/$USER/
# If not provided, mount /raid/data/$USER/ from host
if [ -n "$MODEL_CACHE_DIR" ]; then
MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}"
echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}"
else
MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}"
echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)"
fi

# Run the Docker container with all mounts (same as main docker setup)
docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
$GPU_OPTS \
-v /home/mlperf_inference_storage:/home/mlperf_inference_storage \
-v /raid/data:/raid/data \
$MODEL_CACHE_MOUNT \
-e HISTFILE="${WORK_DIR}/.bash_history" \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "USER=${USER_NAME}" \
@@ -220,4 +235,4 @@ docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
--hostname "$(hostname)-docker" \
--name "${CONTAINER_NAME}-${RANDOM_NUM}-${USER_NAME}" \
--tmpfs /tmp:exec \
"$FINAL_IMAGE" $RUN_CMD
"$FINAL_IMAGE" ${RUN_CMD:-/bin/bash}
13 changes: 12 additions & 1 deletion language/deepseek-r1/docker/launch_scripts/launch_vllm.sh
@@ -203,11 +203,22 @@ else
INFERENCE_MOUNT="-v ${INFERENCE_TMP}:/inference"
fi

# Setup model cache directory mount
# If --model-cache-dir is provided, mount it to /raid/data/$USER/
# If not provided, mount /raid/data/$USER/ from host
if [ -n "$MODEL_CACHE_DIR" ]; then
MODEL_CACHE_MOUNT="-v ${MODEL_CACHE_DIR}:/raid/data/${USER_NAME}"
echo "Model cache directory: ${MODEL_CACHE_DIR} -> /raid/data/${USER_NAME}"
else
MODEL_CACHE_MOUNT="-v /raid/data/${USER_NAME}:/raid/data/${USER_NAME}"
echo "Model cache directory: /raid/data/${USER_NAME} (host) -> /raid/data/${USER_NAME} (container)"
fi

# Run the Docker container with all mounts (same as main docker setup)
docker run $DOCKER_RUN_OPTS $DOCKER_RUN_ARGS \
$GPU_OPTS \
-v /home/mlperf_inference_storage:/home/mlperf_inference_storage \
-v /raid/data:/raid/data \
$MODEL_CACHE_MOUNT \
-e HISTFILE="${WORK_DIR}/.bash_history" \
--env "CCACHE_DIR=${CCACHE_DIR}" \
--env "USER=${USER_NAME}" \
2 changes: 2 additions & 0 deletions language/deepseek-r1/docker/setup_scripts/common.sh
@@ -106,6 +106,8 @@ install_evaluation_requirements() {
echo "Installing evaluation requirements..."
if [ -f "/work/docker/evaluation_requirements.txt" ]; then
VIRTUAL_ENV=$VENV_DIR uv pip install -r /work/docker/evaluation_requirements.txt
echo "Override datasets==3.0.0 (LiveCodeBench/code-generation-lite is not updated for datasets 3.2.0)..."
VIRTUAL_ENV=$VENV_DIR uv pip install --upgrade "datasets==3.0.0"
echo "Evaluation requirements installed successfully!"
else
echo "Warning: evaluation_requirements.txt not found at /work/docker/evaluation_requirements.txt"
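A quick sanity check (run inside the activated environment; not part of the setup script) that the pin took effect:

```bash
python3 -c "import datasets; print(datasets.__version__)"  # expected: 3.0.0
```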
32 changes: 2 additions & 30 deletions language/deepseek-r1/docker/setup_scripts/setup_sglang.sh
@@ -68,26 +68,8 @@ install_evaluation_requirements
# Install MLPerf LoadGen
install_mlperf_loadgen "$FORCE_REBUILD" "$MLPERF_BACKEND"

# SGLang-specific setup
echo ""
echo "=== SGLang Backend-Specific Setup ==="

# Install core dependencies for SGLang
echo "Installing core dependencies for SGLang..."
VIRTUAL_ENV=$VENV_DIR uv pip install \
transformers \
pandas \
numpy \
tqdm \
huggingface_hub \
datasets \
accelerate \
openai \
httpx \
requests \
torch==2.6.0 \
sglang[all]==0.4.6.post5 \
sgl_kernel
# Override sglang version to 0.5.4
uv pip install sglang[all]==0.5.4 --prerelease=allow

# Verify SGLang installation
if python3 -c "import sglang" 2>/dev/null; then
@@ -98,13 +80,6 @@ else
exit 1
fi

# Verify sgl_kernel installation
if python3 -c "import sgl_kernel" 2>/dev/null; then
echo "sgl_kernel installed successfully"
else
echo "Warning: sgl_kernel import failed - this optimization may not be available"
fi

# Verify torch is available for SGLang
if python3 -c "import torch" 2>/dev/null; then
TORCH_VERSION=$(python3 -c "import torch; print(torch.__version__)")
@@ -132,9 +107,6 @@ python3 -c "import sglang; print('✓ Installed')" 2>/dev/null || echo "✗ Not
echo -n "FlashInfer: "
python3 -c "import flashinfer; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed"

echo -n "sgl_kernel: "
python3 -c "import sgl_kernel; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed"

echo -n "DeepGEMM: "
python3 -c "import deepgemm; print('✓ Installed')" 2>/dev/null || echo "✗ Not installed"

19 changes: 19 additions & 0 deletions language/deepseek-r1/run_eval.py
@@ -231,6 +231,25 @@ def main():
backend, tokenized_prompts, text_prompts=prompts)

# Process raw results into standardized format using shared utility
# Filter out errors from raw_results and corresponding rows in
# df_output
valid_indices = []
valid_results = []
for i, res in enumerate(raw_results):
if 'error' not in res:
valid_indices.append(i)
valid_results.append(res)
else:
print(
f"Skipping prompt {i} due to error: {res.get('error', 'Unknown error')}")

if len(valid_results) < len(raw_results):
print(
f"Filtered out {len(raw_results) - len(valid_results)} failed prompts")
raw_results = valid_results
df_output = df_output.iloc[valid_indices].reset_index(
drop=True)

print("Processing results...")
standardized_results = process_inference_results(
raw_results, tokenizer
30 changes: 18 additions & 12 deletions language/deepseek-r1/run_mlperf.py
@@ -46,7 +46,7 @@ def create_argument_parser() -> argparse.ArgumentParser:

# Scenario selection (no backend argument, auto-detected)
parser.add_argument("--mode", type=str, default="offline",
choices=["offline", "server"],
choices=["offline", "server", "interactive"],
help="MLPerf scenario mode")

# MLPerf configuration
@@ -82,7 +82,7 @@ def configure_loadgen(scenario: str,
"""Configure LoadGen test settings.

Args:
scenario: MLPerf scenario ("offline" or "server")
scenario: MLPerf scenario ("offline", "server", or "interactive")
accuracy_mode: Whether to run in accuracy mode
mlperf_conf: Path to MLPerf config file
user_conf: Path to user config file
@@ -97,10 +97,17 @@
# Set scenario
if scenario.lower() == "offline":
settings.scenario = lg.TestScenario.Offline
elif scenario.lower() == "server":
config_scenario = "Offline"
elif scenario.lower() == "server" or scenario.lower() == "interactive":
settings.scenario = lg.TestScenario.Server
config_scenario = "Server"
else:
raise ValueError(f"Unknown scenario: {scenario}")

# Adjust model name for interactive mode to use separate config
# LoadGen will look for "deepseek-r1-interactive.Server" config section
if scenario.lower() == "interactive":
model_name = f"{model_name}-interactive"

# Set mode
if accuracy_mode:
@@ -110,9 +117,9 @@

# Load configurations if files exist
if mlperf_conf and Path(mlperf_conf).exists():
settings.FromConfig(mlperf_conf, model_name, scenario, 2)
settings.FromConfig(mlperf_conf, model_name, config_scenario, 2)
if user_conf and Path(user_conf).exists():
settings.FromConfig(user_conf, model_name, scenario, 1)
settings.FromConfig(user_conf, model_name, config_scenario, 1)

return settings
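Tying the interactive pieces together: LoadGen still runs the Server scenario, but the `-interactive` model-name suffix makes it read a `deepseek-r1-interactive.Server` config section. A sketch of what such an entry might look like (the key and value are illustrative, not from this repo):

```bash
cat >> user.conf <<'EOF'
deepseek-r1-interactive.Server.target_qps = 1.0
EOF
```

Running `run_mlperf.py --mode=interactive` then resolves this section instead of the plain `deepseek-r1.Server` entry.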

@@ -247,12 +254,12 @@ def main():
dataset_strings=strings_for_sut,
name=f"{backend_name}_offline_sut"
)
else: # server
else: # server or interactive
sut = ServerSUT(
backend=backend,
dataset=dataset_for_sut,
dataset_strings=strings_for_sut,
name=f"{backend_name}_server_sut"
name=f"{backend_name}_{args.mode}_sut"
)

# Create QSL
@@ -268,11 +275,10 @@
)

# Update settings with dataset info
# TODO(vir): these should be in mlperf.conf
settings.max_query_count = len(tokenized_prompts)
settings.min_query_count = len(tokenized_prompts)
settings.use_token_latencies = True
settings.server_coalesce_queries = True
# settings.max_query_count = len(tokenized_prompts)
# settings.min_query_count = len(tokenized_prompts)
# settings.use_token_latencies = True
# settings.server_coalesce_queries = True

# Configure logging
log_settings = lg.LogSettings()