Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
48 changes: 47 additions & 1 deletion .ci/scripts/export_model_artifact.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ Arguments:
- mistralai/Voxtral-Mini-4B-Realtime-2602
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
- google/gemma-3-4b-it
- nvidia/diar_streaming_sortformer_4spk-v2
- nvidia/parakeet-tdt

quant_name Quantization type (optional, default: non-quantized)
Expand All @@ -45,6 +46,7 @@ Examples:
export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal"
export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed"
export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output"
export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output"
export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output"
export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output"
Expand Down Expand Up @@ -157,6 +159,14 @@ case "$HF_MODEL" in
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
nvidia/diar_streaming_sortformer_4spk-v2)
MODEL_NAME="sortformer"
TASK=""
MAX_SEQ_LEN=""
EXTRA_PIP=""
PREPROCESSOR_FEATURE_SIZE=""
PREPROCESSOR_OUTPUT=""
;;
mistralai/Voxtral-Mini-4B-Realtime-2602)
MODEL_NAME="voxtral_realtime"
TASK=""
Expand All @@ -167,7 +177,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt"
exit 1
;;
esac
Expand Down Expand Up @@ -247,6 +257,42 @@ if [ "$MODEL_NAME" = "parakeet" ]; then
exit 0
fi

# Sortformer uses a custom export script (no tokenizer/preprocessor artifacts).
if [ "$MODEL_NAME" = "sortformer" ]; then
  if [ "$QUANT_NAME" != "non-quantized" ]; then
    echo "Error: Sortformer currently supports only non-quantized export"
    exit 1
  fi

  pip install -r examples/models/sortformer/install_requirements.txt

  # Map the requested device to an export backend. cuda, cuda-windows and
  # xnnpack pass through unchanged; any other device (e.g. metal) falls back
  # to the portable backend. (The previous if/elif chain re-assigned the
  # same value in three of its four branches.)
  case "$DEVICE" in
    cuda|cuda-windows|xnnpack)
      SORTFORMER_BACKEND="$DEVICE"
      ;;
    *)
      SORTFORMER_BACKEND="portable"
      ;;
  esac

  python -m executorch.examples.models.sortformer.export_sortformer \
    --hf-model "${HF_MODEL}" \
    --backend "${SORTFORMER_BACKEND}" \
    --output-dir "${OUTPUT_DIR}"

  # Normalize the artifact name expected by the e2e test scripts.
  test -f "${OUTPUT_DIR}/sortformer.pte"
  mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte"
  # CUDA saves named data to separate .ptd file, XNNPACK/portable do not.
  if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then
    test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd"
  fi
  ls -al "${OUTPUT_DIR}"
  echo "::endgroup::"
  exit 0
fi

# Voxtral Realtime uses a custom export script
if [ "$MODEL_NAME" = "voxtral_realtime" ]; then
pip install safetensors huggingface_hub
Expand Down
26 changes: 23 additions & 3 deletions .ci/scripts/test_model_e2e.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Arguments:
hf_model HuggingFace model ID (required)
Supported models:
- mistralai/Voxtral-Mini-3B-2507
- nvidia/diar_streaming_sortformer_4spk-v2
- openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo})
- google/gemma-3-4b-it
- Qwen/Qwen3-0.6B
Expand All @@ -44,6 +45,7 @@ Arguments:
Examples:
test_model_e2e.sh metal "openai/whisper-small" "non-quantized"
test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output"
test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output"
test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output"
test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output"
test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming"
Expand Down Expand Up @@ -176,6 +178,18 @@ case "$HF_MODEL" in
AUDIO_FILE="test_audio.wav"
IMAGE_PATH=""
;;
nvidia/diar_streaming_sortformer_4spk-v2)
MODEL_NAME="sortformer"
RUNNER_TARGET="sortformer_runner"
RUNNER_PATH="sortformer"
EXPECTED_OUTPUT="Speaker 1"
PREPROCESSOR=""
TOKENIZER_URL=""
TOKENIZER_FILE=""
AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
AUDIO_FILE="poem.wav"
IMAGE_PATH=""
;;
mistralai/Voxtral-Mini-4B-Realtime-2602)
MODEL_NAME="voxtral_realtime"
RUNNER_TARGET="voxtral_realtime_runner"
Expand All @@ -190,7 +204,7 @@ case "$HF_MODEL" in
;;
*)
echo "Error: Unsupported model '$HF_MODEL'"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt"
exit 1
;;
esac
Expand All @@ -203,8 +217,8 @@ echo "::endgroup::"
echo "::group::Prepare $MODEL_NAME Artifacts"


# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export)
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then
# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one)
if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ]; then
if [ "$TOKENIZER_FILE" != "" ]; then
curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE
else
Expand Down Expand Up @@ -296,6 +310,12 @@ EOF
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
fi
;;
sortformer)
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE"
if [ "$DEVICE" = "cuda" ]; then
RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd"
fi
;;
voxtral_realtime)
RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0"
# Add CUDA data path if present
Expand Down
20 changes: 19 additions & 1 deletion .ci/scripts/test_model_e2e_windows.ps1
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,17 @@ switch ($HfModel) {
$audioUrl = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav"
$audioFile = "test_audio.wav"
}
"nvidia/diar_streaming_sortformer_4spk-v2" {
$runnerTarget = "sortformer_runner"
$runnerPath = "sortformer"
$runnerPreset = "sortformer-cuda"
$expectedOutput = "Speaker 1"
$preprocessor = ""
$tokenizerUrl = ""
$tokenizerFile = ""
$audioUrl = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav"
$audioFile = "poem.wav"
}
"mistralai/Voxtral-Mini-4B-Realtime-2602" {
$runnerTarget = "voxtral_realtime_runner"
$runnerPath = "voxtral_realtime"
Expand All @@ -76,7 +87,7 @@ switch ($HfModel) {
$audioFile = "poem.wav"
}
default {
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/parakeet-tdt"
throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt"
}
}

Expand Down Expand Up @@ -182,6 +193,13 @@ try {
"--data_path", $cudaBlob
)
}
"nvidia/diar_streaming_sortformer_4spk-v2" {
$runnerArgs = @(
"--model_path", $modelPte,
"--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile),
"--data_path", $cudaBlob
)
}
"mistralai/Voxtral-Mini-4B-Realtime-2602" {
$runnerArgs += @(
"--temperature", "0",
Expand Down
6 changes: 6 additions & 0 deletions .github/workflows/cuda-windows.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,9 @@ jobs:
- model_repo: "nvidia"
model_name: "parakeet-tdt"
quant: "quantized-int4-weight-only"
- model_repo: "nvidia"
model_name: "diar_streaming_sortformer_4spk-v2"
quant: "non-quantized"
- model_repo: "mistralai"
model_name: "Voxtral-Mini-4B-Realtime-2602"
quant: "quantized-int4-tile-packed"
Expand Down Expand Up @@ -113,6 +116,9 @@ jobs:
- model_repo: "nvidia"
model_name: "parakeet-tdt"
quant: "quantized-int4-weight-only"
- model_repo: "nvidia"
model_name: "diar_streaming_sortformer_4spk-v2"
quant: "non-quantized"
- model_repo: "mistralai"
model_name: "Voxtral-Mini-4B-Realtime-2602"
quant: "quantized-int4-tile-packed"
Expand Down
22 changes: 22 additions & 0 deletions .github/workflows/cuda.yml
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,8 @@ jobs:
name: "Voxtral-Mini-3B-2507"
- repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
- repo: "nvidia"
name: "diar_streaming_sortformer_4spk-v2"
- repo: "openai"
name: "whisper-small"
- repo: "openai"
Expand Down Expand Up @@ -168,6 +170,15 @@ jobs:
repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
quant: "quantized-int4-weight-only"
# Sortformer currently supports only non-quantized export
- model:
repo: "nvidia"
name: "diar_streaming_sortformer_4spk-v2"
quant: "quantized-int4-tile-packed"
- model:
repo: "nvidia"
name: "diar_streaming_sortformer_4spk-v2"
quant: "quantized-int4-weight-only"
with:
timeout: 90
secrets-env: EXECUTORCH_HF_TOKEN
Expand Down Expand Up @@ -214,6 +225,8 @@ jobs:
name: "Voxtral-Mini-3B-2507"
- repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
- repo: "nvidia"
name: "diar_streaming_sortformer_4spk-v2"
- repo: "openai"
name: "whisper-small"
- repo: "openai"
Expand Down Expand Up @@ -241,6 +254,15 @@ jobs:
repo: "mistralai"
name: "Voxtral-Mini-4B-Realtime-2602"
quant: "quantized-int4-weight-only"
# Sortformer currently supports only non-quantized export
- model:
repo: "nvidia"
name: "diar_streaming_sortformer_4spk-v2"
quant: "quantized-int4-tile-packed"
- model:
repo: "nvidia"
name: "diar_streaming_sortformer_4spk-v2"
quant: "quantized-int4-weight-only"
with:
timeout: 90
runner: linux.g5.4xlarge.nvidia.gpu
Expand Down
14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
# - voxtral_realtime: Realtime speech-to-text model (CPU, CUDA, Metal)
# - whisper: Speech recognition model (CPU, CUDA, Metal)
# - parakeet: Speech recognition model (CPU, CUDA, Metal)
# - sortformer: Speaker diarization model (CPU)
# - sortformer: Speaker diarization model (CPU, CUDA)
# - silero_vad: Voice activity detection model (CPU)
# - llama: Text generation model (CPU)
# - llava: Vision + language model (CPU)
Expand Down Expand Up @@ -91,7 +91,7 @@
#
# ==============================================================================

.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help
.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help

help:
@echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make <target>\`. Available targets:"
Expand All @@ -109,6 +109,7 @@ help:
@echo " parakeet-cuda-debug - Build Parakeet runner with CUDA backend (debug mode)"
@echo " parakeet-cpu - Build Parakeet runner with CPU backend"
@echo " parakeet-metal - Build Parakeet runner with Metal backend (macOS only)"
@echo " sortformer-cuda - Build Sortformer runner with CUDA backend"
@echo " sortformer-cpu - Build Sortformer runner with CPU backend"
@echo " silero-vad-cpu - Build Silero VAD runner with CPU backend"
@echo " llama-cuda - Build Llama runner with CUDA backend"
Expand Down Expand Up @@ -218,6 +219,15 @@ parakeet-metal:
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/parakeet/parakeet_runner"

sortformer-cuda:
@echo "==> Building and installing ExecuTorch with CUDA..."
cmake --workflow --preset llm-release-cuda
@echo "==> Building Sortformer runner with CUDA..."
cd examples/models/sortformer && cmake --workflow --preset sortformer-cuda
@echo ""
@echo "✓ Build complete!"
@echo " Binary: cmake-out/examples/models/sortformer/sortformer_runner"

sortformer-cpu:
@echo "==> Building and installing ExecuTorch..."
cmake --workflow --preset llm-release
Expand Down
33 changes: 33 additions & 0 deletions examples/models/sortformer/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,19 @@
"name": "sortformer-cpu",
"displayName": "Sortformer runner (CPU)",
"inherits": ["sortformer-base"]
},
{
"name": "sortformer-cuda",
"displayName": "Sortformer runner (CUDA)",
"inherits": ["sortformer-base"],
"cacheVariables": {
"EXECUTORCH_BUILD_CUDA": "ON"
},
"condition": {
"type": "inList",
"string": "${hostSystemName}",
"list": ["Linux", "Windows"]
}
}
],
"buildPresets": [
Expand All @@ -23,6 +36,12 @@
"displayName": "Build Sortformer runner (CPU)",
"configurePreset": "sortformer-cpu",
"targets": ["sortformer_runner"]
},
{
"name": "sortformer-cuda",
"displayName": "Build Sortformer runner (CUDA)",
"configurePreset": "sortformer-cuda",
"targets": ["sortformer_runner"]
}
],
"workflowPresets": [
Expand All @@ -39,6 +58,20 @@
"name": "sortformer-cpu"
}
]
},
{
"name": "sortformer-cuda",
"displayName": "Configure and build Sortformer runner (CUDA)",
"steps": [
{
"type": "configure",
"name": "sortformer-cuda"
},
{
"type": "build",
"name": "sortformer-cuda"
}
]
}
]
}
Loading
Loading