diff --git a/.ci/scripts/export_model_artifact.sh b/.ci/scripts/export_model_artifact.sh index 6a7343ece24..5c0fc969000 100755 --- a/.ci/scripts/export_model_artifact.sh +++ b/.ci/scripts/export_model_artifact.sh @@ -22,6 +22,7 @@ Arguments: - mistralai/Voxtral-Mini-4B-Realtime-2602 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}) - google/gemma-3-4b-it + - nvidia/diar_streaming_sortformer_4spk-v2 - nvidia/parakeet-tdt quant_name Quantization type (optional, default: non-quantized) @@ -45,6 +46,7 @@ Examples: export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "quantized-int4-metal" export_model_artifact.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." "vr-streaming" export_model_artifact.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" + export_model_artifact.sh cuda-windows "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./output" export_model_artifact.sh cuda "google/gemma-3-4b-it" "non-quantized" "./output" export_model_artifact.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./output" export_model_artifact.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./output" @@ -157,6 +159,14 @@ case "$HF_MODEL" in PREPROCESSOR_FEATURE_SIZE="" PREPROCESSOR_OUTPUT="" ;; + nvidia/diar_streaming_sortformer_4spk-v2) + MODEL_NAME="sortformer" + TASK="" + MAX_SEQ_LEN="" + EXTRA_PIP="" + PREPROCESSOR_FEATURE_SIZE="" + PREPROCESSOR_OUTPUT="" + ;; mistralai/Voxtral-Mini-4B-Realtime-2602) MODEL_NAME="voxtral_realtime" TASK="" @@ -167,7 +177,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, 
openai/whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}, google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt" exit 1 ;; esac @@ -247,6 +257,42 @@ if [ "$MODEL_NAME" = "parakeet" ]; then exit 0 fi +# Sortformer uses a custom export script +if [ "$MODEL_NAME" = "sortformer" ]; then + if [ "$QUANT_NAME" != "non-quantized" ]; then + echo "Error: Sortformer currently supports only non-quantized export" + exit 1 + fi + + pip install -r examples/models/sortformer/install_requirements.txt + + SORTFORMER_BACKEND="$DEVICE" + if [ "$DEVICE" = "cuda-windows" ]; then + SORTFORMER_BACKEND="cuda-windows" + elif [ "$DEVICE" = "cuda" ]; then + SORTFORMER_BACKEND="cuda" + elif [ "$DEVICE" = "xnnpack" ]; then + SORTFORMER_BACKEND="xnnpack" + else + SORTFORMER_BACKEND="portable" + fi + + python -m executorch.examples.models.sortformer.export_sortformer \ + --hf-model "${HF_MODEL}" \ + --backend "${SORTFORMER_BACKEND}" \ + --output-dir "${OUTPUT_DIR}" + + test -f "${OUTPUT_DIR}/sortformer.pte" + mv "${OUTPUT_DIR}/sortformer.pte" "${OUTPUT_DIR}/model.pte" + # CUDA saves named data to separate .ptd file, XNNPACK/portable do not. 
+ if [ "$DEVICE" = "cuda" ] || [ "$DEVICE" = "cuda-windows" ]; then + test -f "${OUTPUT_DIR}/aoti_cuda_blob.ptd" + fi + ls -al "${OUTPUT_DIR}" + echo "::endgroup::" + exit 0 +fi + # Voxtral Realtime uses a custom export script if [ "$MODEL_NAME" = "voxtral_realtime" ]; then pip install safetensors huggingface_hub diff --git a/.ci/scripts/test_model_e2e.sh b/.ci/scripts/test_model_e2e.sh index b0d9a68c5b0..579d41b5b3f 100755 --- a/.ci/scripts/test_model_e2e.sh +++ b/.ci/scripts/test_model_e2e.sh @@ -19,6 +19,7 @@ Arguments: hf_model HuggingFace model ID (required) Supported models: - mistralai/Voxtral-Mini-3B-2507 + - nvidia/diar_streaming_sortformer_4spk-v2 - openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}) - google/gemma-3-4b-it - Qwen/Qwen3-0.6B @@ -44,6 +45,7 @@ Arguments: Examples: test_model_e2e.sh metal "openai/whisper-small" "non-quantized" test_model_e2e.sh cuda "mistralai/Voxtral-Mini-3B-2507" "quantized-int4-tile-packed" "./model_output" + test_model_e2e.sh cuda "nvidia/diar_streaming_sortformer_4spk-v2" "non-quantized" "./model_output" test_model_e2e.sh cuda "nvidia/parakeet-tdt" "non-quantized" "./model_output" test_model_e2e.sh xnnpack "nvidia/parakeet-tdt" "quantized-8da4w" "./model_output" test_model_e2e.sh metal "mistralai/Voxtral-Mini-4B-Realtime-2602" "non-quantized" "." 
"vr-streaming" @@ -176,6 +178,18 @@ case "$HF_MODEL" in AUDIO_FILE="test_audio.wav" IMAGE_PATH="" ;; + nvidia/diar_streaming_sortformer_4spk-v2) + MODEL_NAME="sortformer" + RUNNER_TARGET="sortformer_runner" + RUNNER_PATH="sortformer" + EXPECTED_OUTPUT="Speaker 1" + PREPROCESSOR="" + TOKENIZER_URL="" + TOKENIZER_FILE="" + AUDIO_URL="https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + AUDIO_FILE="poem.wav" + IMAGE_PATH="" + ;; mistralai/Voxtral-Mini-4B-Realtime-2602) MODEL_NAME="voxtral_realtime" RUNNER_TARGET="voxtral_realtime_runner" @@ -190,7 +204,7 @@ case "$HF_MODEL" in ;; *) echo "Error: Unsupported model '$HF_MODEL'" - echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt" + echo "Supported models: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, openai/whisper series (whisper-{small, medium, large, large-v2, large-v3, large-v3-turbo}), google/gemma-3-4b-it, Qwen/Qwen3-0.6B, nvidia/parakeet-tdt" exit 1 ;; esac @@ -203,8 +217,8 @@ echo "::endgroup::" echo "::group::Prepare $MODEL_NAME Artifacts" -# Download tokenizer files (skip for parakeet and voxtral_realtime which bundle tokenizer in export) -if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ]; then +# Download tokenizer files (skip for models that bundle tokenizer in export or do not use one) +if [ "$MODEL_NAME" != "parakeet" ] && [ "$MODEL_NAME" != "voxtral_realtime" ] && [ "$MODEL_NAME" != "sortformer" ]; then if [ "$TOKENIZER_FILE" != "" ]; then curl -L $TOKENIZER_URL/$TOKENIZER_FILE -o $MODEL_DIR/$TOKENIZER_FILE else @@ -296,6 +310,12 @@ EOF RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd" fi ;; + sortformer) + RUNNER_ARGS="--model_path 
${MODEL_DIR}/model.pte --audio_path ${MODEL_DIR}/$AUDIO_FILE" + if [ "$DEVICE" = "cuda" ]; then + RUNNER_ARGS="$RUNNER_ARGS --data_path ${MODEL_DIR}/aoti_cuda_blob.ptd" + fi + ;; voxtral_realtime) RUNNER_ARGS="--model_path ${MODEL_DIR}/model.pte --tokenizer_path ${MODEL_DIR}/$TOKENIZER_FILE --preprocessor_path ${MODEL_DIR}/$PREPROCESSOR --audio_path ${MODEL_DIR}/$AUDIO_FILE --temperature 0" # Add CUDA data path if present diff --git a/.ci/scripts/test_model_e2e_windows.ps1 b/.ci/scripts/test_model_e2e_windows.ps1 index beb4d069ce7..430730dfb68 100644 --- a/.ci/scripts/test_model_e2e_windows.ps1 +++ b/.ci/scripts/test_model_e2e_windows.ps1 @@ -64,6 +64,17 @@ switch ($HfModel) { $audioUrl = "https://dldata-public.s3.us-east-2.amazonaws.com/2086-149220-0033.wav" $audioFile = "test_audio.wav" } + "nvidia/diar_streaming_sortformer_4spk-v2" { + $runnerTarget = "sortformer_runner" + $runnerPath = "sortformer" + $runnerPreset = "sortformer-cuda" + $expectedOutput = "Speaker 1" + $preprocessor = "" + $tokenizerUrl = "" + $tokenizerFile = "" + $audioUrl = "https://github.com/voxserv/audio_quality_testing_samples/raw/refs/heads/master/testaudio/16000/test01_20s.wav" + $audioFile = "poem.wav" + } "mistralai/Voxtral-Mini-4B-Realtime-2602" { $runnerTarget = "voxtral_realtime_runner" $runnerPath = "voxtral_realtime" @@ -76,7 +87,7 @@ switch ($HfModel) { $audioFile = "poem.wav" } default { - throw "Unsupported model '$HfModel'. Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/parakeet-tdt" + throw "Unsupported model '$HfModel'. 
Supported: mistralai/Voxtral-Mini-3B-2507, mistralai/Voxtral-Mini-4B-Realtime-2602, nvidia/diar_streaming_sortformer_4spk-v2, nvidia/parakeet-tdt" } } @@ -182,6 +193,13 @@ try { "--data_path", $cudaBlob ) } + "nvidia/diar_streaming_sortformer_4spk-v2" { + $runnerArgs = @( + "--model_path", $modelPte, + "--audio_path", (Join-Path -Path $resolvedModelDir -ChildPath $audioFile), + "--data_path", $cudaBlob + ) + } "mistralai/Voxtral-Mini-4B-Realtime-2602" { $runnerArgs += @( "--temperature", "0", diff --git a/.github/workflows/cuda-windows.yml b/.github/workflows/cuda-windows.yml index 9d74c831ed5..e125fe4cc8e 100644 --- a/.github/workflows/cuda-windows.yml +++ b/.github/workflows/cuda-windows.yml @@ -41,6 +41,9 @@ jobs: - model_repo: "nvidia" model_name: "parakeet-tdt" quant: "quantized-int4-weight-only" + - model_repo: "nvidia" + model_name: "diar_streaming_sortformer_4spk-v2" + quant: "non-quantized" - model_repo: "mistralai" model_name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-tile-packed" @@ -113,6 +116,9 @@ jobs: - model_repo: "nvidia" model_name: "parakeet-tdt" quant: "quantized-int4-weight-only" + - model_repo: "nvidia" + model_name: "diar_streaming_sortformer_4spk-v2" + quant: "non-quantized" - model_repo: "mistralai" model_name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-tile-packed" diff --git a/.github/workflows/cuda.yml b/.github/workflows/cuda.yml index 71f98c03196..acd157455eb 100644 --- a/.github/workflows/cuda.yml +++ b/.github/workflows/cuda.yml @@ -139,6 +139,8 @@ jobs: name: "Voxtral-Mini-3B-2507" - repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" + - repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" - repo: "openai" name: "whisper-small" - repo: "openai" @@ -168,6 +170,15 @@ jobs: repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-weight-only" + # Sortformer currently supports only non-quantized export + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + 
quant: "quantized-int4-tile-packed" + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + quant: "quantized-int4-weight-only" with: timeout: 90 secrets-env: EXECUTORCH_HF_TOKEN @@ -214,6 +225,8 @@ jobs: name: "Voxtral-Mini-3B-2507" - repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" + - repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" - repo: "openai" name: "whisper-small" - repo: "openai" @@ -241,6 +254,15 @@ jobs: repo: "mistralai" name: "Voxtral-Mini-4B-Realtime-2602" quant: "quantized-int4-weight-only" + # Sortformer currently supports only non-quantized export + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + quant: "quantized-int4-tile-packed" + - model: + repo: "nvidia" + name: "diar_streaming_sortformer_4spk-v2" + quant: "quantized-int4-weight-only" with: timeout: 90 runner: linux.g5.4xlarge.nvidia.gpu diff --git a/Makefile b/Makefile index ad8544210f7..c4535adb7f7 100644 --- a/Makefile +++ b/Makefile @@ -18,7 +18,7 @@ # - voxtral_realtime: Realtime speech-to-text model (CPU, CUDA, Metal) # - whisper: Speech recognition model (CPU, CUDA, Metal) # - parakeet: Speech recognition model (CPU, CUDA, Metal) -# - sortformer: Speaker diarization model (CPU) +# - sortformer: Speaker diarization model (CPU, CUDA) # - silero_vad: Voice activity detection model (CPU) # - llama: Text generation model (CPU) # - llava: Vision + language model (CPU) @@ -91,7 +91,7 @@ # # ============================================================================== -.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help +.PHONY: voxtral-cuda voxtral-cpu voxtral-metal voxtral_realtime-cuda voxtral_realtime-cpu voxtral_realtime-metal whisper-cuda 
whisper-cuda-debug whisper-cpu whisper-metal parakeet-cuda parakeet-cuda-debug parakeet-cpu parakeet-metal sortformer-cuda sortformer-cpu silero-vad-cpu llama-cuda llama-cuda-debug llama-cpu llava-cpu gemma3-cuda gemma3-cpu clean help help: @echo "This Makefile adds targets to build runners for various models on various backends. Run using \`make \`. Available targets:" @@ -109,6 +109,7 @@ help: @echo " parakeet-cuda-debug - Build Parakeet runner with CUDA backend (debug mode)" @echo " parakeet-cpu - Build Parakeet runner with CPU backend" @echo " parakeet-metal - Build Parakeet runner with Metal backend (macOS only)" + @echo " sortformer-cuda - Build Sortformer runner with CUDA backend" @echo " sortformer-cpu - Build Sortformer runner with CPU backend" @echo " silero-vad-cpu - Build Silero VAD runner with CPU backend" @echo " llama-cuda - Build Llama runner with CUDA backend" @@ -218,6 +219,15 @@ parakeet-metal: @echo "✓ Build complete!" @echo " Binary: cmake-out/examples/models/parakeet/parakeet_runner" +sortformer-cuda: + @echo "==> Building and installing ExecuTorch with CUDA..." + cmake --workflow --preset llm-release-cuda + @echo "==> Building Sortformer runner with CUDA..." + cd examples/models/sortformer && cmake --workflow --preset sortformer-cuda + @echo "" + @echo "✓ Build complete!" + @echo " Binary: cmake-out/examples/models/sortformer/sortformer_runner" + sortformer-cpu: @echo "==> Building and installing ExecuTorch..." 
cmake --workflow --preset llm-release diff --git a/examples/models/sortformer/CMakePresets.json b/examples/models/sortformer/CMakePresets.json index 09a88237e31..59c31902248 100644 --- a/examples/models/sortformer/CMakePresets.json +++ b/examples/models/sortformer/CMakePresets.json @@ -15,6 +15,19 @@ "name": "sortformer-cpu", "displayName": "Sortformer runner (CPU)", "inherits": ["sortformer-base"] + }, + { + "name": "sortformer-cuda", + "displayName": "Sortformer runner (CUDA)", + "inherits": ["sortformer-base"], + "cacheVariables": { + "EXECUTORCH_BUILD_CUDA": "ON" + }, + "condition": { + "type": "inList", + "string": "${hostSystemName}", + "list": ["Linux", "Windows"] + } } ], "buildPresets": [ @@ -23,6 +36,12 @@ "displayName": "Build Sortformer runner (CPU)", "configurePreset": "sortformer-cpu", "targets": ["sortformer_runner"] + }, + { + "name": "sortformer-cuda", + "displayName": "Build Sortformer runner (CUDA)", + "configurePreset": "sortformer-cuda", + "targets": ["sortformer_runner"] } ], "workflowPresets": [ @@ -39,6 +58,20 @@ "name": "sortformer-cpu" } ] + }, + { + "name": "sortformer-cuda", + "displayName": "Configure and build Sortformer runner (CUDA)", + "steps": [ + { + "type": "configure", + "name": "sortformer-cuda" + }, + { + "type": "build", + "name": "sortformer-cuda" + } + ] } ] } diff --git a/examples/models/sortformer/README.md b/examples/models/sortformer/README.md index 95fae96f93b..1cdae4a068d 100644 --- a/examples/models/sortformer/README.md +++ b/examples/models/sortformer/README.md @@ -4,14 +4,19 @@ Export and run [nvidia/diar_streaming_sortformer_4spk-v2](https://huggingface.co Speaker diarization answers "who spoke when" — the model outputs per-frame activity probabilities for up to 4 speakers. This is not ASR; there is no text output. 
+## Requirements + +- Python `>= 3.11` (required for Sortformer export tooling) + ## Quick Start ```bash +cd examples/models/sortformer + # Install Python dependencies pip install -r install_requirements.txt # Export to .pte -cd examples/models/sortformer python export_sortformer.py --nemo-path /path/to/model.nemo --backend xnnpack # Build the C++ runner (from repo root) @@ -23,6 +28,27 @@ make sortformer-cpu --audio_path /path/to/audio.wav ``` +## CUDA Quick Start + +```bash +cd examples/models/sortformer + +# Install Python dependencies +pip install -r install_requirements.txt + +# Export to .pte + .ptd +python export_sortformer.py --nemo-path /path/to/model.nemo --backend cuda + +# Build the C++ runner (from repo root) +make sortformer-cuda + +# Run diarization +./cmake-out/examples/models/sortformer/sortformer_runner \ + --model_path examples/models/sortformer/sortformer_exports/sortformer.pte \ + --data_path examples/models/sortformer/sortformer_exports/aoti_cuda_blob.ptd \ + --audio_path /path/to/audio.wav +``` + Output: ``` @@ -50,10 +76,14 @@ python export_sortformer.py --nemo-path /path/to/model.nemo --backend xnnpack |----------|-------------| | `--nemo-path` | Path to `.nemo` model file | | `--hf-model` | HuggingFace model ID (default: `nvidia/diar_streaming_sortformer_4spk-v2`) | -| `--backend` | `portable` or `xnnpack` (default: `xnnpack`) | +| `--backend` | `portable`, `xnnpack`, `cuda`, or `cuda-windows` (default: `xnnpack`) | | `--output-dir` | Output directory (default: `./sortformer_exports`) | -Output: `sortformer_exports/sortformer.pte` (~470 MB unquantized). The preprocessor is always lowered with the portable backend regardless of `--backend`. +Output: +- `sortformer_exports/sortformer.pte` (~470 MB unquantized) +- `sortformer_exports/aoti_cuda_blob.ptd` (when backend is `cuda` or `cuda-windows`) + +The preprocessor is always lowered with the portable backend regardless of `--backend`. 
## Validate @@ -76,6 +106,8 @@ From the repository root: ```bash make sortformer-cpu +# or +make sortformer-cuda ``` Binary: `cmake-out/examples/models/sortformer/sortformer_runner` @@ -85,6 +117,7 @@ Binary: `cmake-out/examples/models/sortformer/sortformer_runner` | Argument | Description | |----------|-------------| | `--model_path` | Path to `.pte` file (default: `sortformer.pte`) | +| `--data_path` | Path to `.ptd` file for delegate data (required for CUDA) | | `--audio_path` | Path to input WAV file (16kHz mono, required) | | `--threshold` | Speaker activity threshold, 0.0–1.0 (default: `0.5`) | | `--chunk_len` | Encode chunk size in 80ms frames (default: `124`) | @@ -135,8 +168,8 @@ The `.pte` contains three methods, split along streaming boundaries so the calle | Method | Backend | Input | Output | |--------|---------|-------|--------| | `preprocessor` | portable | `audio` (N,) float, `length` (1,) int64 | `mel` (1, 128, T) float, `mel_len` (1,) int64 | -| `pre_encode` | XNNPACK | `chunk` (1, 4000, 128) float, `chunk_len` (1,) int64 | `embs` (1, 500, 512) float, `emb_len` (1,) int64 | -| `encode` | XNNPACK | `embs` (1, T, 512) float, `emb_len` (1,) int64 | `preds` (1, T, 4) float | +| `pre_encode` | XNNPACK or CUDA | `chunk` (1, 4000, 128) float, `chunk_len` (1,) int64 | `embs` (1, 500, 512) float, `emb_len` (1,) int64 | +| `encode` | XNNPACK or CUDA | `embs` (1, T, 512) float, `emb_len` (1,) int64 | `preds` (1, T, 4) float | - `preprocessor`: dynamic audio length (min=1600, max=1,920,000 samples). - `pre_encode`: static shapes (4000 mel frames). 
diff --git a/examples/models/sortformer/export_sortformer.py b/examples/models/sortformer/export_sortformer.py index 8905f7ec308..e8f25780607 100644 --- a/examples/models/sortformer/export_sortformer.py +++ b/examples/models/sortformer/export_sortformer.py @@ -309,6 +309,23 @@ def lower_to_executorch(programs, metadata=None, backend="portable"): partitioner[key] = [] else: partitioner[key] = [XnnpackPartitioner()] + elif backend in ("cuda", "cuda-windows"): + from executorch.backends.cuda.cuda_backend import CudaBackend + from executorch.backends.cuda.cuda_partitioner import CudaPartitioner + from executorch.exir.backend.compile_spec_schema import CompileSpec + + print( + f"\nLowering to ExecuTorch with CUDA{' (Windows)' if backend == 'cuda-windows' else ''}..." + ) + partitioner = {} + for key in programs.keys(): + if key == "preprocessor": + partitioner[key] = [] + continue + compile_specs = [CudaBackend.generate_method_name_compile_spec(key)] + if backend == "cuda-windows": + compile_specs.append(CompileSpec("platform", b"windows")) + partitioner[key] = [CudaPartitioner(compile_specs)] else: print("\nLowering to ExecuTorch...") partitioner = [] @@ -354,7 +371,7 @@ def main(): "--backend", type=str, default="xnnpack", - choices=["portable", "xnnpack"], + choices=["portable", "xnnpack", "cuda", "cuda-windows"], help="Backend for acceleration (default: xnnpack)", ) @@ -373,6 +390,15 @@ def main(): print(f"\nSaving ExecuTorch program to: {pte_path}") with open(pte_path, "wb") as f: et.write_to_file(f) + + if args.backend in ("cuda", "cuda-windows"): + cuda_blob_path = os.path.join(args.output_dir, "aoti_cuda_blob.ptd") + print(f"Writing CUDA named-data blob to: {cuda_blob_path}") + et.write_tensor_data_to_file(args.output_dir) + if not os.path.isfile(cuda_blob_path): + raise FileNotFoundError( + f"Expected CUDA named-data blob at {cuda_blob_path}, but it was not created." 
+ ) print(f"Saved {os.path.getsize(pte_path) / (1024 * 1024):.1f} MB") print("\nDone!") diff --git a/examples/models/sortformer/main.cpp b/examples/models/sortformer/main.cpp index af3bd061cfe..d3cbd2fc796 100644 --- a/examples/models/sortformer/main.cpp +++ b/examples/models/sortformer/main.cpp @@ -25,6 +25,10 @@ #include "sortformer_runner.h" DEFINE_string(model_path, "sortformer.pte", "Path to Sortformer model (.pte)."); +DEFINE_string( + data_path, + "", + "Path to data file (.ptd) for delegate data (required for CUDA)."); DEFINE_string(audio_path, "", "Path to input audio file (.wav)."); DEFINE_double(threshold, 0.5, "Speaker activity threshold (0.0 - 1.0)."); DEFINE_int32(chunk_len, 124, "Streaming chunk length in 80ms frames."); @@ -41,7 +45,7 @@ int main(int argc, char** argv) { ::executorch::extension::llm::Stats stats; stats.model_load_start_ms = ::executorch::extension::llm::time_in_ms(); - sortformer::SortformerRunner runner(FLAGS_model_path); + sortformer::SortformerRunner runner(FLAGS_model_path, FLAGS_data_path); stats.model_load_end_ms = ::executorch::extension::llm::time_in_ms(); stats.inference_start_ms = ::executorch::extension::llm::time_in_ms(); diff --git a/examples/models/sortformer/sortformer_runner.cpp b/examples/models/sortformer/sortformer_runner.cpp index afd34b35dcc..5b8e1bdd176 100644 --- a/examples/models/sortformer/sortformer_runner.cpp +++ b/examples/models/sortformer/sortformer_runner.cpp @@ -92,9 +92,17 @@ void compress_cache( // Read model parameters from .pte constant_methods. These are baked into the // exported model by export_sortformer.py and describe the preprocessing config // and architecture dimensions needed to set up the streaming pipeline. 
-SortformerRunner::SortformerRunner(const std::string& model_path) { +SortformerRunner::SortformerRunner( + const std::string& model_path, + const std::string& data_path) { ET_LOG(Info, "Loading model from: %s", model_path.c_str()); - model_ = std::make_unique<Module>(model_path, Module::LoadMode::Mmap); + if (!data_path.empty()) { + ET_LOG(Info, "Loading data from: %s", data_path.c_str()); + model_ = + std::make_unique<Module>(model_path, data_path, Module::LoadMode::Mmap); + } else { + model_ = std::make_unique<Module>(model_path, Module::LoadMode::Mmap); + } auto load_error = model_->load(); if (load_error != Error::Ok) { ET_LOG(Error, "Failed to load model."); diff --git a/examples/models/sortformer/sortformer_runner.h b/examples/models/sortformer/sortformer_runner.h index f9549eac0df..d16b94986f3 100644 --- a/examples/models/sortformer/sortformer_runner.h +++ b/examples/models/sortformer/sortformer_runner.h @@ -53,7 +53,9 @@ using SegmentCallback = std::function; class SortformerRunner { public: - explicit SortformerRunner(const std::string& model_path); + explicit SortformerRunner( + const std::string& model_path, + const std::string& data_path = ""); struct Result { int64_t num_frames;