Skip to content

Commit c094618

Browse files
authored
Add a workflow to run vLLM unit tests on H100 (#55)
* Add a workflow to run vLLM unit tests on H100
  Signed-off-by: Huy Do <[email protected]>
* Use the correct test path
  Signed-off-by: Huy Do <[email protected]>
* Find the right tests
  Signed-off-by: Huy Do <[email protected]>
* Run on linux.aws.h100.4
  Signed-off-by: Huy Do <[email protected]>
* [no ci] Just a comment update
  Signed-off-by: Huy Do <[email protected]>
* Update the script path
  Signed-off-by: Huy Do <[email protected]>
---------
Signed-off-by: Huy Do <[email protected]>
1 parent d42421c commit c094618

File tree

2 files changed

+144
-0
lines changed

2 files changed

+144
-0
lines changed

.github/scripts/run_vllm_tests.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
#!/bin/bash

set -eux

# Minimal runner for a few critical tests that need an H100 and therefore
# cannot run on vLLM CI. There is no sharding or caching yet.

echo 'Update me. This is an example'

# TEST_TARGET is resolved relative to TESTS_DIR
TESTS_DIR=/vllm-workspace/tests
TEST_TARGET=models/multimodal/generation/test_maverick.py

pushd "${TESTS_DIR}"
pytest -v "${TEST_TARGET}"
popd

.github/workflows/vllm-ci-test.yml

Lines changed: 132 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
name: Run vLLM tests

# Triggers: a 4-hourly schedule, a manual dispatch against a chosen vLLM ref,
# and pull requests that touch this workflow or the test script it runs.
on:
  schedule:
    # Run every 4 hours
    - cron: '0 */4 * * *'
  workflow_dispatch:
    inputs:
      vllm_branch:
        description: >-
          vLLM branch (main, releases/vERSION for release validation, or
          refs/pull/PR_NUMBER/head for pre-merge check on pull request)
        required: true
        type: string
        default: main
      vllm_commit:
        # When left empty, the test job walks back from the branch head until
        # it finds a commit whose Docker image has been published.
        description: >-
          vLLM commit (optional, defaults to the latest commit in the branch
          that has a CI Docker image available)
        required: false
        type: string
  pull_request:
    paths:
      - .github/workflows/vllm-ci-test.yml
      - .github/scripts/run_vllm_tests.sh
22+
23+
# One active run per workflow and per pull request (or per commit SHA when
# there is no pull request). The two trailing booleans put manual and scheduled
# runs into their own groups. A newer run in the same group cancels the one
# that is still in progress.
concurrency:
  cancel-in-progress: true
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}"
26+
27+
jobs:
  # Runs .github/scripts/run_vllm_tests.sh inside the vLLM CI Docker image that
  # matches the selected (or the most recent available) vLLM commit.
  test:
    name: Run vLLM tests
    strategy:
      fail-fast: false
      matrix:
        include:
          # TODO (huydhn): Figure out later if we need to scale this up to multiple runners
          - runs-on: linux.aws.h100.4
            device-name: cuda
    permissions:
      id-token: write
      contents: read
    runs-on: ${{ matrix.runs-on }}
    environment: pytorch-x-vllm
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Checkout vLLM repository
        uses: actions/checkout@v4
        with:
          repository: vllm-project/vllm
          path: vllm
          ref: ${{ inputs.vllm_branch || 'main' }}
          # Full history, so that the image lookup below can resolve HEAD~N
          fetch-depth: 0

      - name: Set Docker registry
        shell: bash
        env:
          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
          DEVICE_NAME: ${{ matrix.device-name }}
        run: |
          set -eux

          # Mimic the logic from vllm ci-infra test template
          if [[ "${HEAD_BRANCH}" == "main" ]]; then
            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo
          else
            DOCKER_IMAGE_PREFIX=public.ecr.aws/q9t5s3a7/vllm-ci-test-repo
          fi

          DOCKER_IMAGE_SUFFIX=""
          if [[ "${DEVICE_NAME}" == "rocm" ]]; then
            DOCKER_IMAGE_PREFIX=docker.io/rocm/vllm-ci
          elif [[ "${DEVICE_NAME}" == "cpu" ]]; then
            DOCKER_IMAGE_SUFFIX=-cpu
          fi

          # Export both values to the following steps
          echo "DOCKER_IMAGE_PREFIX=$DOCKER_IMAGE_PREFIX" >> "${GITHUB_ENV}"
          echo "DOCKER_IMAGE_SUFFIX=$DOCKER_IMAGE_SUFFIX" >> "${GITHUB_ENV}"

      - name: Check for available Docker image
        working-directory: vllm
        env:
          HEAD_BRANCH: ${{ inputs.vllm_branch || 'main' }}
          HEAD_SHA: ${{ inputs.vllm_commit || '' }}
        run: |
          set -eux

          # An explicitly requested commit is used as is. Otherwise search the
          # branch history for the newest commit that already has an image.
          if [[ -z "${HEAD_SHA}" ]]; then
            IMAGE_FOUND=""

            # Looking back the latest 100 commits is enough
            for i in {0..99}
            do
              HEAD_SHA=$(git rev-parse --verify "HEAD~${i}")
              DOCKER_IMAGE="${DOCKER_IMAGE_PREFIX}:${HEAD_SHA}${DOCKER_IMAGE_SUFFIX}"

              # No Docker image available yet means the commit is too recent,
              # so move on to an older one
              if docker manifest inspect "${DOCKER_IMAGE}"; then
                IMAGE_FOUND=1
                break
              fi
            done

            # Fail here with a clear message instead of letting docker run
            # fail later on an image that was never found
            if [[ -z "${IMAGE_FOUND}" ]]; then
              echo "No Docker image found in ${DOCKER_IMAGE_PREFIX} for the latest 100 commits" >&2
              exit 1
            fi
          fi

          echo "HEAD_SHA=$HEAD_SHA" >> "${GITHUB_ENV}"

      - name: Setup CUDA GPU_FLAG for docker run
        if: matrix.device-name == 'cuda'
        run: |
          echo "GPU_FLAG=--gpus all -e NVIDIA_DRIVER_CAPABILITIES=all" >> "${GITHUB_ENV}"

      - name: Setup ROCm
        if: matrix.device-name == 'rocm'
        uses: pytorch/pytorch/./.github/actions/setup-rocm@main

      - name: Run vLLM tests
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
          DOCKER_IMAGE: "${{ env.DOCKER_IMAGE_PREFIX }}:${{ env.HEAD_SHA }}${{ env.DOCKER_IMAGE_SUFFIX }}"
        run: |
          set -eux

          # GPU_FLAG is intentionally unquoted: it holds several docker options
          # that must be split into separate arguments. The repository checkout
          # is mounted at /tmp/workspace so the test script is visible inside.
          container_name=$(docker run \
            ${GPU_FLAG:-} \
            -e HF_TOKEN \
            --ipc=host \
            --tty \
            --detach \
            --security-opt seccomp=unconfined \
            --shm-size=4g \
            -v "${GITHUB_WORKSPACE}:/tmp/workspace" \
            -w /tmp/workspace \
            "${DOCKER_IMAGE}"
          )

          # The container is detached and would otherwise outlive this step,
          # so remove it whether the tests pass or fail
          trap 'docker rm -f "${container_name}"' EXIT

          docker exec -t "${container_name}" bash -c "bash .github/scripts/run_vllm_tests.sh"

0 commit comments

Comments
 (0)