Skip to content

Commit 2a68e74

Browse files
authored
Add Qwen 3.5 MoE E2E CI test with prequantized HQQ-INT4 checkpoint (#18445)
Add end-to-end CI testing for Qwen3.5-35B-A3B MoE model using a prequantized HQQ-INT4 checkpoint from HuggingFace. The CI flow: 1. Download prequantized model (SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4) 2. Run inference.py sanity check (verify correct output in eager mode) 3. Export to .pte/.ptd via export.py --prequantized 4. Run C++ runner, validate output contains "Paris" Remove the standalone runner build from unittest-cuda (now covered by the E2E test job).
1 parent 431032b commit 2a68e74

File tree

3 files changed

+90
-9
lines changed

3 files changed

+90
-9
lines changed

.ci/scripts/export_model_artifact.sh

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -184,9 +184,17 @@ case "$HF_MODEL" in
184184
PREPROCESSOR_FEATURE_SIZE=""
185185
PREPROCESSOR_OUTPUT=""
186186
;;
187+
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
188+
MODEL_NAME="qwen3_5_moe"
189+
TASK=""
190+
MAX_SEQ_LEN=""
191+
EXTRA_PIP=""
192+
PREPROCESSOR_FEATURE_SIZE=""
193+
PREPROCESSOR_OUTPUT=""
194+
;;
187195
*)
188196
echo "Error: Unsupported model '$HF_MODEL'"
189-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
197+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
190198
exit 1
191199
;;
192200
esac
@@ -380,6 +388,45 @@ if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
380388
exit 0
381389
fi
382390

391+
# Qwen 3.5 MoE uses a prequantized checkpoint and custom export script
392+
if [ "$MODEL_NAME" = "qwen3_5_moe" ]; then
393+
pip install safetensors huggingface_hub
394+
pip install -r examples/models/qwen3_5_moe/requirements.txt
395+
396+
# Download prequantized model outside OUTPUT_DIR to avoid uploading on failure
397+
LOCAL_MODEL_DIR=$(mktemp -d)
398+
INDUCTOR_CACHE=$(mktemp -d)
399+
trap 'rm -rf "$LOCAL_MODEL_DIR" "$INDUCTOR_CACHE"' EXIT
400+
401+
python -c "from huggingface_hub import snapshot_download; snapshot_download('${HF_MODEL}', local_dir='${LOCAL_MODEL_DIR}')"
402+
403+
# Sanity check: run inference on the prequantized model
404+
echo "::group::Inference sanity check"
405+
python -m executorch.examples.models.qwen3_5_moe.inference \
406+
--prequantized "$LOCAL_MODEL_DIR" \
407+
--prompt "What is the capital of France?" \
408+
--max-new-tokens 32 \
409+
--temperature 0 \
410+
--no-compile
411+
echo "::endgroup::"
412+
413+
# Copy tokenizer for the runner
414+
cp "$LOCAL_MODEL_DIR/tokenizer.json" "${OUTPUT_DIR}/tokenizer.json"
415+
416+
# Export to .pte/.ptd (short cache dir avoids objcopy symbol length issues)
417+
echo "::group::Export"
418+
TORCHINDUCTOR_CACHE_DIR="$INDUCTOR_CACHE" \
419+
python -m executorch.examples.models.qwen3_5_moe.export \
420+
--prequantized "$LOCAL_MODEL_DIR" \
421+
--output-dir "${OUTPUT_DIR}"
422+
echo "::endgroup::"
423+
424+
test -f "${OUTPUT_DIR}/model.pte"
425+
test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
426+
ls -al "${OUTPUT_DIR}"
427+
exit 0
428+
fi
429+
383430
MAX_SEQ_LEN_ARG=""
384431
if [ -n "$MAX_SEQ_LEN" ]; then
385432
MAX_SEQ_LEN_ARG="--max_seq_len $MAX_SEQ_LEN"

.ci/scripts/test_model_e2e.sh

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,9 +216,21 @@ case "$HF_MODEL" in
216216
AUDIO_FILE="test_audio.wav"
217217
IMAGE_PATH=""
218218
;;
219+
SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4)
220+
MODEL_NAME="qwen3_5_moe"
221+
RUNNER_TARGET="qwen3_5_moe_runner"
222+
RUNNER_PATH="qwen3_5_moe"
223+
EXPECTED_OUTPUT="Paris"
224+
PREPROCESSOR=""
225+
TOKENIZER_URL=""
226+
TOKENIZER_FILE="tokenizer.json"
227+
AUDIO_URL=""
228+
AUDIO_FILE=""
229+
IMAGE_PATH=""
230+
;;
219231
*)
220232
echo "Error: Unsupported model '$HF_MODEL'"
221-
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer"
233+
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt, facebook/dinov2-small-imagenet1k-1-layer, SocialLocalMobile/Qwen3.5-35B-A3B-HQQ-INT4"
222234
exit 1
223235
;;
224236
esac
@@ -232,7 +244,7 @@ echo "::group::Prepare $MODEL_NAME Artifacts"
232244

233245

234246
# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
235-
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ]; then
247+
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ] && [ "$MODEL_NAME" != "dinov2" ] && [ "$MODEL_NAME" != "qwen3_5_moe" ]; then
236248
if [ "$TOKENIZER_FILE" != "" ]; then
237249
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
238250
else
@@ -341,6 +353,9 @@ EOF
341353
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
342354
fi
343355
;;
356+
qwen3_5_moe)
357+
RUNNER_ARGS="$RUNNER_ARGS --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --prompt 'What is the capital of France?' --max_new_tokens 32"
358+
;;
344359
voxtral_realtime)
345360
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
346361
# Add CUDA data path if present
@@ -359,7 +374,7 @@ EOF
359374
;;
360375
esac
361376

362-
OUTPUT=$($RUNNER_BIN $RUNNER_ARGS 2>&1)
377+
OUTPUT=$(eval $RUNNER_BIN $RUNNER_ARGS 2>&1)
363378
EXIT_CODE=$?
364379
set -e
365380

.github/workflows/cuda.yml

Lines changed: 24 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -148,9 +148,6 @@ jobs:
148148
# Run quantize roundtrip tests (Qwen 3.5 MoE save/load prequantized)
149149
python -m pytest examples/models/qwen3_5_moe/test_quantize_roundtrip.py -v -o "addopts="
150150
151-
# Build Qwen3.5 MoE runner (ExecuTorch already built above)
152-
cd examples/models/qwen3_5_moe && cmake --workflow --preset qwen3-5-moe-cuda
153-
154151
export-model-cuda-artifact:
155152
name: export-model-cuda-artifact
156153
# Skip this job if the pull request is from a fork (HuggingFace secrets are not available)
@@ -182,6 +179,8 @@ jobs:
182179
name: "parakeet-tdt"
183180
- repo: "facebook"
184181
name: "dinov2-small-imagenet1k-1-layer"
182+
- repo: "SocialLocalMobile"
183+
name: "Qwen3.5-35B-A3B-HQQ-INT4"
185184
quant:
186185
- "non-quantized"
187186
- "quantized-int4-tile-packed"
@@ -192,6 +191,15 @@ jobs:
192191
repo: "google"
193192
name: "gemma-3-4b-it"
194193
quant: "quantized-int4-weight-only"
194+
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
195+
- model:
196+
repo: "SocialLocalMobile"
197+
name: "Qwen3.5-35B-A3B-HQQ-INT4"
198+
quant: "non-quantized"
199+
- model:
200+
repo: "SocialLocalMobile"
201+
name: "Qwen3.5-35B-A3B-HQQ-INT4"
202+
quant: "quantized-int4-weight-only"
195203
# Voxtral Realtime only supports int4-tile-packed on CUDA
196204
- model:
197205
repo: "mistralai"
@@ -246,7 +254,7 @@ jobs:
246254
with:
247255
timeout: 90
248256
secrets-env: EXECUTORCH_HF_TOKEN
249-
runner: linux.g5.4xlarge.nvidia.gpu
257+
runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
250258
gpu-arch-type: cuda
251259
gpu-arch-version: 12.6
252260
use-custom-docker-registry: false
@@ -301,6 +309,8 @@ jobs:
301309
name: "parakeet-tdt"
302310
- repo: "facebook"
303311
name: "dinov2-small-imagenet1k-1-layer"
312+
- repo: "SocialLocalMobile"
313+
name: "Qwen3.5-35B-A3B-HQQ-INT4"
304314
quant:
305315
- "non-quantized"
306316
- "quantized-int4-tile-packed"
@@ -311,6 +321,15 @@ jobs:
311321
repo: "google"
312322
name: "gemma-3-4b-it"
313323
quant: "quantized-int4-weight-only"
324+
# Qwen3.5 MoE uses a prequantized checkpoint, only tile-packed
325+
- model:
326+
repo: "SocialLocalMobile"
327+
name: "Qwen3.5-35B-A3B-HQQ-INT4"
328+
quant: "non-quantized"
329+
- model:
330+
repo: "SocialLocalMobile"
331+
name: "Qwen3.5-35B-A3B-HQQ-INT4"
332+
quant: "quantized-int4-weight-only"
314333
# Voxtral Realtime only supports int4-tile-packed on CUDA
315334
- model:
316335
repo: "mistralai"
@@ -359,7 +378,7 @@ jobs:
359378
quant: "non-quantized"
360379
with:
361380
timeout: 90
362-
runner: linux.g5.4xlarge.nvidia.gpu
381+
runner: ${{ matrix.model.name == 'Qwen3.5-35B-A3B-HQQ-INT4' && 'linux.aws.a100' || 'linux.g5.4xlarge.nvidia.gpu' }}
363382
gpu-arch-type: cuda
364383
gpu-arch-version: 12.6
365384
use-custom-docker-registry: false

0 commit comments

Comments (0)