diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 6a7343ece24..5c0fc969000 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -22,6 +22,7 @@ Arguments: - mistralai/Voxtral-Mini-4B-Realtime-2602 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}) - google/gemma-3-4b-it + - nvidia/diar_streaming_sortformer_4spk-v2 - nvidia/parakeet-tdt quant_name Quantization type (optional, default: non-quantized) @@ -45,6 +46,7 @@ Examples: export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal" export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming" export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" + export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output" export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output" export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output" export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output" @@ -157,6 +159,14 @@ case "$HF_MODEL" in PREPROCESSOR_FEATURE_SIZE="" PREPROCESSOR_OUTPUT="" ;; + nvidia/diar_streaming_sortformer_4spk-v2) + MODEL_NAME="sortformer" + TASK="" + MAX_SEQ_LEN="" + EXTRA_PIP="" + PREPROCESSOR_FEATURE_SIZE="" + PREPROCESSOR_OUTPUT="" + ;; mistralai/Voxtral-Mini-4B-Realtime-2602) MODEL_NAME="voxtral_realtime" TASK="" @@ -167,7 +177,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, 
openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt" exit 1 ;; esac @@ -247,6 +257,42 @@ if [ "$MODEL_NAME" = "parakeet" ]; then exit 0 fi +# Sortformer uses a custom export script +if [ "$MODEL_NAME" = "sortformer" ]; then + if [ "$QUANT_NAME" != "non-quantized" ]; then + echo "Error: Sortformer currently supports only non-quantized export" + exit 1 + fi + + pip install -r examples/models/sortformer/install_requirements.txt + + SORTFORMER_BACKEND="$DEVICE" + if [ "$DEVICE" = "cuda-windows" ]; then + SORTFORMER_BACKEND="cuda-windows" + elif [ "$DEVICE" = "cuda" ]; then + SORTFORMER_BACKEND="cuda" + elif [ "$DEVICE" = "xnnpack" ]; then + SORTFORMER_BACKEND="xnnpack" + else + SORTFORMER_BACKEND="portable" + fi + + python -m executorch.examples.models.sortformer.export_sortformer \ + --hf-model "${HF_MODEL}" \ + --backend "${SORTFORMER_BACKEND}" \ + --output-dir "${OUTPUT_DIR}" + + test -f "${OUTPUT_DIR}/sortformer.pte" + mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte" + # CUDA saves named data to separate .ptd file, XNNPACK/portable do not. 
+ if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then + test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd" + fi + ls -al "${OUTPUT_DIR}" + echo "::endgroup::" + exit 0 +fi + # Voxtral Realtime uses a custom export script if [ "$MODEL_NAME" = "voxtral_realtime" ]; then pip install safetensors huggingface_hub diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index b0d9a68c5b0..579d41b5b3f 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -19,6 +19,7 @@ Arguments: hf_model HuggingFace model ID (required) Supported models: - mistralai/Voxtral-Mini-3B-2507 + - nvidia/diar_streaming_sortformer_4spk-v2 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}) - google/gemma-3-4b-it - Qwen/Qwen3-0.6B @@ -44,6 +45,7 @@ Arguments: Examples: test_model_e2e.sh metal "openai/whisper-small" "non-quantized" test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output" + test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output" test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output" test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output" test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." 
"vr-streaming" @@ -176,6 +178,18 @@ case "$HF_MODEL" in AUDIO_FILE="test_audio.wav" IMAGE_PATH="" ;; + nvidia/diar_streaming_sortformer_4spk-v2) + MODEL_NAME="sortformer" + RUNNER_TARGET="sortformer_runner" + RUNNER_PATH="sortformer" + EXPECTED_OUTPUT="Speaker 1" + PREPROCESSOR="" + TOKENIZER_URL="" + TOKENIZER_FILE="" + AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + AUDIO_FILE="poem.wav" + IMAGE_PATH="" + ;; mistralai/Voxtral-Mini-4B-Realtime-2602) MODEL_NAME="voxtral_realtime" RUNNER_TARGET="voxtral_realtime_runner" @@ -190,7 +204,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt" exit 1 ;; esac @@ -203,8 +217,8 @@ echo "::endgroup::" echo "::group::Prepare $MODEL_NAME Artifacts" -# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export) -if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then +# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one) +if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ]; then if [ "$TOKENIZER_FILE" != "" ]; then curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE else @@ -296,6 +310,12 @@ EOF RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd" fi ;; + sortformer) + RUNNER_ARGS="--model_path 
${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE" + if [ "$DEVICE" = "cuda" ]; then + RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd" + fi + ;; voxtral_realtime) RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0" # Add CUDA data path if present diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index beb4d069ce7..430730dfb68 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -64,6 +64,17 @@ switch ($HfModel) { $audioUrl = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav" $audioFile = "test_audio.wav" } + "nvidia/diar_streaming_sortformer_4spk-v2" { + $runnerTarget = "sortformer_runner" + $runnerPath = "sortformer" + $runnerPreset = "sortformer-cuda" + $expectedOutput = "Speaker 1" + $preprocessor = "" + $tokenizerUrl = "" + $tokenizerFile = "" + $audioUrl = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + $audioFile = "poem.wav" + } "mistralai/Voxtral-Mini-4B-Realtime-2602" { $runnerTarget = "voxtral_realtime_runner" $runnerPath = "voxtral_realtime" @@ -76,7 +87,7 @@ switch ($HfModel) { $audioFile = "poem.wav" } default { - throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/parakeet-tdt" + throw "Unsupported model '$HfModel'. 
Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt" } } @@ -182,6 +193,13 @@ try { "--data_path", $cudaBlob ) } + "nvidia/diar_streaming_sortformer_4spk-v2" { + $runnerArgs = @( + "--model_path", $modelPte, + "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile), + "--data_path", $cudaBlob + ) + } "mistralai/Voxtral-Mini-4B-Realtime-2602" { $runnerArgs += @( "--temperature", "0", diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml index 9d74c831ed5..e125fe4cc8e 100644 --- a/.github/workflows/cuda-windows.yml +++ b/.github/workflows/cuda-windows.yml @@ -41,6 +41,9 @@ jobs: - model_repo: "nvidia" model_name: "parakeet-tdt" quant: "quantized-int4-weight-only" + - model_repo: "nvidia" + model_name: "diar_streaming_sortformer_4spk-v2" + quant: "non-quantized" - model_repo: "mistralai" model_name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-tile-packed" @@ -113,6 +116,9 @@ jobs: - model_repo: "nvidia" model_name: "parakeet-tdt" quant: "quantized-int4-weight-only" + - model_repo: "nvidia" + model_name: "diar_streaming_sortformer_4spk-v2" + quant: "non-quantized" - model_repo: "mistralai" model_name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-tile-packed" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 71f98c03196..acd157455eb 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -139,6 +139,8 @@ jobs: name: "Voxtral-Mini-3B-2507" - repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" + - repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" - repo: "openai" name: "whisper-small" - repo: "openai" @@ -168,6 +170,15 @@ jobs: repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-weight-only" + # Sortformer currently supports only non-quantized export + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + 
quant: "quantized-int4-tile-packed" + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + quant: "quantized-int4-weight-only" with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN @@ -214,6 +225,8 @@ jobs: name: "Voxtral-Mini-3B-2507" - repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" + - repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" - repo: "openai" name: "whisper-small" - repo: "openai" @@ -241,6 +254,15 @@ jobs: repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-weight-only" + # Sortformer currently supports only non-quantized export + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + quant: "quantized-int4-tile-packed" + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + quant: "quantized-int4-weight-only" with: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu diff --git a/Makefile b/Makefile index ad8544210f7..c4535adb7f7 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ # - voxtral_realtime: Realtime speech-to-text model (CPU, CUDA, Metal) # - whisper: Speech recognition model (CPU, CUDA, Metal) # - parakeet: Speech recognition model (CPU, CUDA, Metal) -# - sortformer: Speaker diarization model (CPU) +# - sortformer: Speaker diarization model (CPU, CUDA) # - silero_vad: Voice activity detection model (CPU) # - llama: Text generation model (CPU) # - llava: Vision + language model (CPU) @@ -91,7 +91,7 @@ # # ============================================================================== -.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help +.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda 
whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help help: @echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make \`. Available targets:" @@ -109,6 +109,7 @@ help: @echo " parakeet-cuda-debug - Build Parakeet runner with CUDA backend (debug mode)" @echo " parakeet-cpu - Build Parakeet runner with CPU backend" @echo " parakeet-metal - Build Parakeet runner with Metal backend (macOS only)" + @echo " sortformer-cuda - Build Sortformer runner with CUDA backend" @echo " sortformer-cpu - Build Sortformer runner with CPU backend" @echo " silero-vad-cpu - Build Silero VAD runner with CPU backend" @echo " llama-cuda - Build Llama runner with CUDA backend" @@ -218,6 +219,15 @@ parakeet-metal: @echo "✓ Build complete!" @echo " Binary: cmake-out/examples/models/parakeet/parakeet_runner" +sortformer-cuda: + @echo "==> Building and installing ExecuTorch with CUDA..." + cmake --workflow --preset llm-release-cuda + @echo "==> Building Sortformer runner with CUDA..." + cd examples/models/sortformer && cmake --workflow --preset sortformer-cuda + @echo "" + @echo "✓ Build complete!" + @echo " Binary: cmake-out/examples/models/sortformer/sortformer_runner" + sortformer-cpu: @echo "==> Building and installing ExecuTorch..." 
cmake --workflow --preset llm-release diff --git a/examples/models/sortformer/CMakePresets.json b/examples/models/sortformer/CMakePresets.json index 09a88237e31..59c31902248 100644 --- a/examples/models/sortformer/CMakePresets.json +++ b/examples/models/sortformer/CMakePresets.json @@ -15,6 +15,19 @@ "name": "sortformer-cpu", "displayName": "Sortformer runner (CPU)", "inherits": ["sortformer-base"] + }, + { + "name": "sortformer-cuda", + "displayName": "Sortformer runner (CUDA)", + "inherits": ["sortformer-base"], + "cacheVariables": { + "EXECUTORCH_BUILD_CUDA": "ON" + }, + "condition": { + "type": "inList", + "string": "${hostSystemName}", + "list": ["Linux", "Windows"] + } } ], "buildPresets": [ @@ -23,6 +36,12 @@ "displayName": "Build Sortformer runner (CPU)", "configurePreset": "sortformer-cpu", "targets": ["sortformer_runner"] + }, + { + "name": "sortformer-cuda", + "displayName": "Build Sortformer runner (CUDA)", + "configurePreset": "sortformer-cuda", + "targets": ["sortformer_runner"] } ], "workflowPresets": [ @@ -39,6 +58,20 @@ "name": "sortformer-cpu" } ] + }, + { + "name": "sortformer-cuda", + "displayName": "Configure and build Sortformer runner (CUDA)", + "steps": [ + { + "type": "configure", + "name": "sortformer-cuda" + }, + { + "type": "build", + "name": "sortformer-cuda" + } + ] } ] } diff --git a/examples/models/sortformer/README.md b/examples/models/sortformer/README.md index 95fae96f93b..1cdae4a068d 100644 --- a/examples/models/sortformer/README.md +++ b/examples/models/sortformer/README.md @@ -4,14 +4,19 @@ Export and run [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co Speaker diarization answers "who spoke when" — the model outputs per-frame activity probabilities for up to 4 speakers. This is not ASR; there is no text output. 
+## Requirements + +- Python `>= 3.11` (required for Sortformer export tooling) + ## Quick Start ```bash +cd examples/models/sortformer + # Install Python dependencies pip install -r install_requirements.txt # Export to .pte -cd examples/models/sortformer python export_sortformer.py --nemo-path /path/to/model.nemo --backend xnnpack # Build the C++ runner (from repo root) @@ -23,6 +28,27 @@ make sortformer-cpu --audio_path /path/to/audio.wav ``` +## CUDA Quick Start + +```bash +cd examples/models/sortformer + +# Install Python dependencies +pip install -r install_requirements.txt + +# Export to .pte + .ptd +python export_sortformer.py --nemo-path /path/to/model.nemo --backend cuda + +# Build the C++ runner (from repo root) +make sortformer-cuda + +# Run diarization +./cmake-out/examples/models/sortformer/sortformer_runner \ + --model_path examples/models/sortformer/sortformer_exports/sortformer.pte \ + --data_path examples/models/sortformer/sortformer_exports/aoti_cuda_blob.ptd \ + --audio_path /path/to/audio.wav +``` + Output: ``` @@ -50,10 +76,14 @@ python export_sortformer.py --nemo-path /path/to/model.nemo --backend xnnpack |----------|-------------| | `--nemo-path` | Path to `.nemo` model file | | `--hf-model` | HuggingFace model ID (default: `nvidia/diar_streaming_sortformer_4spk-v2`) | -| `--backend` | `portable` or `xnnpack` (default: `xnnpack`) | +| `--backend` | `portable`, `xnnpack`, `cuda`, or `cuda-windows` (default: `xnnpack`) | | `--output-dir` | Output directory (default: `./sortformer_exports`) | -Output: `sortformer_exports/sortformer.pte` (~470 MB unquantized). The preprocessor is always lowered with the portable backend regardless of `--backend`. +Output: +- `sortformer_exports/sortformer.pte` (~470 MB unquantized) +- `sortformer_exports/aoti_cuda_blob.ptd` (when backend is `cuda` or `cuda-windows`) + +The preprocessor is always lowered with the portable backend regardless of `--backend`. 
## Validate @@ -76,6 +106,8 @@ From the repository root: ```bash make sortformer-cpu +# or +make sortformer-cuda ``` Binary: `cmake-out/examples/models/sortformer/sortformer_runner` @@ -85,6 +117,7 @@ Binary: `cmake-out/examples/models/sortformer/sortformer_runner` | Argument | Description | |----------|-------------| | `--model_path` | Path to `.pte` file (default: `sortformer.pte`) | +| `--data_path` | Path to `.ptd` file for delegate data (required for CUDA) | | `--audio_path` | Path to input WAV file (16kHz mono, required) | | `--threshold` | Speaker activity threshold, 0.0–1.0 (default: `0.5`) | | `--chunk_len` | Encode chunk size in 80ms frames (default: `124`) | @@ -135,8 +168,8 @@ The `.pte` contains three methods, split along streaming boundaries so the calle | Method | Backend | Input | Output | |--------|---------|-------|--------| | `preprocessor` | portable | `audio` (N,) float, `length` (1,) int64 | `mel` (1, 128, T) float, `mel_len` (1,) int64 | -| `pre_encode` | XNNPACK | `chunk` (1, 4000, 128) float, `chunk_len` (1,) int64 | `embs` (1, 500, 512) float, `emb_len` (1,) int64 | -| `encode` | XNNPACK | `embs` (1, T, 512) float, `emb_len` (1,) int64 | `preds` (1, T, 4) float | +| `pre_encode` | XNNPACK or CUDA | `chunk` (1, 4000, 128) float, `chunk_len` (1,) int64 | `embs` (1, 500, 512) float, `emb_len` (1,) int64 | +| `encode` | XNNPACK or CUDA | `embs` (1, T, 512) float, `emb_len` (1,) int64 | `preds` (1, T, 4) float | - `preprocessor`: dynamic audio length (min=1600, max=1,920,000 samples). - `pre_encode`: static shapes (4000 mel frames). 
diff --git a/examples/models/sortformer/export_sortformer.py b/examples/models/sortformer/export_sortformer.py index 8905f7ec308..e8f25780607 100644 --- a/examples/models/sortformer/export_sortformer.py +++ b/examples/models/sortformer/export_sortformer.py @@ -309,6 +309,23 @@ def lower_to_executorch(programs, metadata=None, backend="portable"): partitioner[key] = [] else: partitioner[key] = [XnnpackPartitioner()] + elif backend in ("cuda", "cuda-windows"): + from executorch.backends.cuda.cuda_backend import CudaBackend + from executorch.backends.cuda.cuda_partitioner import CudaPartitioner + from executorch.exir.backend.compile_spec_schema import CompileSpec + + print( + f"\nLowering to ExecuTorch with CUDA{' (Windows)' if backend == 'cuda-windows' else ''}..." + ) + partitioner = {} + for key in programs.keys(): + if key == "preprocessor": + partitioner[key] = [] + continue + compile_specs = [CudaBackend.generate_method_name_compile_spec(key)] + if backend == "cuda-windows": + compile_specs.append(CompileSpec("platform", b"windows")) + partitioner[key] = [CudaPartitioner(compile_specs)] else: print("\nLowering to ExecuTorch...") partitioner = [] @@ -354,7 +371,7 @@ def main(): "--backend", type=str, default="xnnpack", - choices=["portable", "xnnpack"], + choices=["portable", "xnnpack", "cuda", "cuda-windows"], help="Backend for acceleration (default: xnnpack)", ) @@ -373,6 +390,15 @@ def main(): print(f"\nSaving ExecuTorch program to: {pte_path}") with open(pte_path, "wb") as f: et.write_to_file(f) + + if args.backend in ("cuda", "cuda-windows"): + cuda_blob_path = os.path.join(args.output_dir, "aoti_cuda_blob.ptd") + print(f"Writing CUDA named-data blob to: {cuda_blob_path}") + et.write_tensor_data_to_file(args.output_dir) + if not os.path.isfile(cuda_blob_path): + raise FileNotFoundError( + f"Expected CUDA named-data blob at {cuda_blob_path}, but it was not created." 
+ ) print(f"Saved {os.path.getsize(pte_path) / (1024 * 1024):.1f} MB") print("\nDone!") diff --git a/examples/models/sortformer/main.cpp b/examples/models/sortformer/main.cpp index af3bd061cfe..d3cbd2fc796 100644 --- a/examples/models/sortformer/main.cpp +++ b/examples/models/sortformer/main.cpp @@ -25,6 +25,10 @@ #include "sortformer_runner.h" DEFINE_string(model_path, "sortformer.pte", "Path to Sortformer model (.pte)."); +DEFINE_string( + data_path, + "", + "Path to data file (.ptd) for delegate data (required for CUDA)."); DEFINE_string(audio_path, "", "Path to input audio file (.wav)."); DEFINE_double(threshold, 0.5, "Speaker activity threshold (0.0 - 1.0)."); DEFINE_int32(chunk_len, 124, "Streaming chunk length in 80ms frames."); @@ -41,7 +45,7 @@ int main(int argc, char** argv) { ::executorch::extension::llm::Stats stats; stats.model_load_start_ms = ::executorch::extension::llm::time_in_ms(); - sortformer::SortformerRunner runner(FLAGS_model_path); + sortformer::SortformerRunner runner(FLAGS_model_path, FLAGS_data_path); stats.model_load_end_ms = ::executorch::extension::llm::time_in_ms(); stats.inference_start_ms = ::executorch::extension::llm::time_in_ms(); diff --git a/examples/models/sortformer/sortformer_runner.cpp b/examples/models/sortformer/sortformer_runner.cpp index afd34b35dcc..5b8e1bdd176 100644 --- a/examples/models/sortformer/sortformer_runner.cpp +++ b/examples/models/sortformer/sortformer_runner.cpp @@ -92,9 +92,17 @@ void compress_cache( // Read model parameters from .pte constant_methods. These are baked into the // exported model by export_sortformer.py and describe the preprocessing config // and architecture dimensions needed to set up the streaming pipeline. 
-SortformerRunner::SortformerRunner(const std::string& model_path) { +SortformerRunner::SortformerRunner( + const std::string& model_path, + const std::string& data_path) { ET_LOG(Info, "Loading model from: %s", model_path.c_str()); - model_ = std::make_unique<Module>(model_path, Module::LoadMode::Mmap); + if (!data_path.empty()) { + ET_LOG(Info, "Loading data from: %s", data_path.c_str()); + model_ = + std::make_unique<Module>(model_path, data_path, Module::LoadMode::Mmap); + } else { + model_ = std::make_unique<Module>(model_path, Module::LoadMode::Mmap); + } auto load_error = model_->load(); if (load_error != Error::Ok) { ET_LOG(Error, "Failed to load model."); diff --git a/examples/models/sortformer/sortformer_runner.h b/examples/models/sortformer/sortformer_runner.h index f9549eac0df..d16b94986f3 100644 --- a/examples/models/sortformer/sortformer_runner.h +++ b/examples/models/sortformer/sortformer_runner.h @@ -53,7 +53,9 @@ using SegmentCallback = std::function; class SortformerRunner { public: - explicit SortformerRunner(const std::string& model_path); + explicit SortformerRunner( + const std::string& model_path, + const std::string& data_path = ""); struct Result { int64_t num_frames;