Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 56 additions & 24 deletions src/ai/backend_onnx.c
Original file line number Diff line number Diff line change
Expand Up @@ -78,40 +78,72 @@ static void _stderr_suppress_end(int saved)
}
#endif

// Load ORT API from a dynamically loaded module. Returns NULL on failure.
// Resolve the ONNX Runtime API table from an already-opened module.
// `label` is only used for log messages (library path or env-var value).
// Returns NULL if the module does not export OrtGetApiBase.
static const OrtApi *_ort_api_from_module(GModule *mod, const char *label)
{
  typedef const OrtApiBase *(*OrtGetApiBaseFn)(void);
  OrtGetApiBaseFn get_api_base = NULL;
  const gboolean found = g_module_symbol(mod, "OrtGetApiBase", (gpointer *)&get_api_base);
  if(!found || get_api_base == NULL)
  {
    dt_print(DT_DEBUG_AI, "[darktable_ai] OrtGetApiBase symbol not found in '%s'", label);
    return NULL;
  }
  // OrtGetApiBase returns a process-wide singleton; fetch it once for both uses.
  const OrtApiBase *base = get_api_base();
  dt_print(DT_DEBUG_AI, "[darktable_ai] loaded ORT %s from '%s'",
           base->GetVersionString(), label);
  return base->GetApi(ORT_API_VERSION);
}

static gpointer _init_ort_api(gpointer data)
{
(void)data;
const OrtApi *api = NULL;

#ifdef ORT_LAZY_LOAD
// Ubuntu/Debian's system ORT links against libonnx, causing harmless but noisy
// "already registered" ONNX schema warnings when the library is first loaded.
// suppress them by loading ORT explicitly, with stderr temporarily redirected.
// G_MODULE_BIND_LAZY = RTLD_LAZY; default (no BIND_LOCAL) = RTLD_GLOBAL so
// provider symbols remain visible to the rest of the process via dlsym(NULL).
const int saved = _stderr_suppress_begin();
// the handle is intentionally not stored: ORT must stay loaded for the process
// lifetime and g_module_close is never called, so the library stays resident.
GModule *ort_mod = g_module_open(ORT_LIBRARY_PATH, G_MODULE_BIND_LAZY);
_stderr_suppress_end(saved);
// DT_ORT_LIBRARY allows users to point to a GPU-enabled ORT build
// (e.g. CUDA or ROCm) without rebuilding darktable. On Linux this
// overrides the compile-time default; on Windows/macOS it dynamically
// loads a user-supplied library instead of the bundled DirectML/CoreML one.
const char *ort_override = g_getenv("DT_ORT_LIBRARY");

if(!ort_mod)
if(ort_override && ort_override[0])
{
dt_print(DT_DEBUG_AI,
"[darktable_ai] failed to load ORT library '%s': %s",
ORT_LIBRARY_PATH, g_module_error());
return NULL;
GModule *ort_mod = g_module_open(ort_override, G_MODULE_BIND_LAZY);
if(!ort_mod)
{
dt_print(DT_DEBUG_AI,
"[darktable_ai] failed to load ORT library '%s': %s",
ort_override, g_module_error());
return NULL;
}
api = _ort_api_from_module(ort_mod, ort_override);
}
typedef const OrtApiBase *(*OrtGetApiBaseFn)(void);
OrtGetApiBaseFn get_api_base = NULL;
if(!g_module_symbol(ort_mod, "OrtGetApiBase", (gpointer *)&get_api_base) || !get_api_base)
#ifdef ORT_LAZY_LOAD
else
{
dt_print(DT_DEBUG_AI, "[darktable_ai] OrtGetApiBase symbol not found");
return NULL;
// Linux default: lazy-load the bundled or system ORT library.
// Suppress stderr during load - Ubuntu/Debian's system ORT links against
// libonnx, causing harmless "already registered" ONNX schema warnings.
const int saved = _stderr_suppress_begin();
GModule *ort_mod = g_module_open(ORT_LIBRARY_PATH, G_MODULE_BIND_LAZY);
_stderr_suppress_end(saved);

if(!ort_mod)
{
dt_print(DT_DEBUG_AI,
"[darktable_ai] failed to load ORT library '%s': %s",
ORT_LIBRARY_PATH, g_module_error());
return NULL;
}
api = _ort_api_from_module(ort_mod, ORT_LIBRARY_PATH);
}
api = get_api_base()->GetApi(ORT_API_VERSION);
#else
api = OrtGetApiBase()->GetApi(ORT_API_VERSION);
else
{
// Windows/macOS: use the directly linked ORT library (DirectML/CoreML).
const OrtApiBase *base = OrtGetApiBase();
dt_print(DT_DEBUG_AI, "[darktable_ai] loaded ORT %s (bundled)",
base->GetVersionString());
api = base->GetApi(ORT_API_VERSION);
}
#endif

if(!api)
Expand Down Expand Up @@ -876,7 +908,7 @@ dt_ai_onnx_load_ext(const char *model_dir, const char *model_file,
{
ctx->dynamic_outputs = TRUE;
dt_print(DT_DEBUG_AI,
"[darktable_ai] output[%zu] has dynamic dims using ORT-allocated outputs",
"[darktable_ai] output[%zu] has dynamic dims - using ORT-allocated outputs",
i);
break;
}
Expand Down
172 changes: 172 additions & 0 deletions tools/ai/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
# GPU-Accelerated ONNX Runtime for darktable

darktable bundles a CPU-only ONNX Runtime by default on Linux, DirectML on
Windows, and CoreML on macOS. These scripts install a GPU-enabled ORT build
to accelerate AI features (denoise, upscale, segmentation).

## What's bundled by default

| Platform | Bundled ORT | GPU support |
|----------|------------|-------------|
| Linux | CPU only | None – use scripts below |
| Windows | DirectML | AMD, NVIDIA, Intel via DirectX 12 |
| macOS | CoreML | Apple Silicon Neural Engine |

## Installing GPU-accelerated ORT

### NVIDIA (CUDA) – Linux & Windows

**Requirements:**

- NVIDIA GPU with compute capability 6.0+ (GeForce GTX 1000 "Pascal" or newer)
- NVIDIA driver 525 or later
- CUDA 12.x runtime – included with the driver on Windows; on Linux install
the CUDA toolkit (`nvidia-cuda-toolkit` on Ubuntu/Debian, `cuda` on Arch)
- cuDNN 9.x – download from https://developer.nvidia.com/cudnn-downloads or
install via package manager (`libcudnn9-cuda-12` on Ubuntu/Debian, `cudnn`
on Arch)

Linux:
```bash
./tools/ai/install-ort-nvidia.sh
```

Windows (PowerShell):
```powershell
.\tools\ai\install-ort-nvidia.ps1
```

Downloads a prebuilt ORT with CUDA EP from GitHub (~200 MB, ~30 sec).
On Windows, use this instead of the bundled DirectML for potentially
better NVIDIA performance.

### AMD (MIGraphX) – Linux

**Requirements:**

- AMD GPU supported by ROCm:
- Consumer: Radeon RX 6000 series (RDNA2) or newer
- Data center: Instinct MI100 (CDNA) or newer
- ROCm 6.0 or later – install from AMD's repo:
https://rocm.docs.amd.com/projects/install-on-linux/en/latest/
- Ubuntu/Debian: `sudo apt install rocm`
- Arch: `sudo pacman -S rocm-hip-sdk`
- Fedora: `sudo dnf install rocm`
- MIGraphX (included in ROCm, or install separately):
- Ubuntu/Debian: `sudo apt install migraphx migraphx-dev`
- Arch: `sudo pacman -S migraphx`
- For building from source: cmake 3.26+, gcc/g++, python3, git

Prebuilt (fast, ~30 sec):
```bash
./tools/ai/install-ort-amd.sh
```

Build from source (fallback if prebuilt doesn't work, 10-20 min):
```bash
./tools/ai/install-ort-amd-build.sh
```

The prebuilt script downloads a wheel from AMD's package repository. The
build script compiles ORT against your installed ROCm headers and
libraries – use it if the prebuilt version has ABI compatibility issues.
Both auto-detect your ROCm version and select the matching ORT release:

| ROCm | ORT version |
|------|-------------|
| 7.2 | 1.23.2 |
| 7.1 | 1.23.1 |
| 7.0 | 1.22.1 |
| 6.4 | 1.21.0 |
| 6.3 | 1.19.0 |
| 6.2 | 1.18.0 |
| 6.1 | 1.17.0 |
| 6.0 | 1.16.0 |

### Intel (OpenVINO) – Linux

**Requirements:**

- Intel GPU or any x86_64 CPU:
- Integrated: HD Graphics, UHD Graphics, Iris Xe (Gen9+)
- Discrete: Intel Arc A-series (A770, A750, A580, etc.)
- CPU-only mode works on any x86_64 processor (Intel or AMD)
- For GPU acceleration: Intel compute runtime with Level Zero
- Ubuntu/Debian: `sudo apt install intel-opencl-icd level-zero`
- Arch: `sudo pacman -S intel-compute-runtime level-zero-loader`
- For Arc GPUs: kernel 6.2 or later recommended
- pip3 (for downloading the wheel)
- OpenVINO runtime is bundled in the package – no separate install needed

```bash
./tools/ai/install-ort-intel.sh
```

Downloads a prebuilt ORT with OpenVINO EP from PyPI (~60 MB, ~30 sec).
Includes all OpenVINO runtime libraries.

## Using the custom ORT

All scripts install to `~/.local/lib/onnxruntime-<provider>/` and print
the path to use. Set the `DT_ORT_LIBRARY` environment variable to point
darktable to the custom build:

```bash
DT_ORT_LIBRARY=~/.local/lib/onnxruntime-cuda/libonnxruntime.so.1.24.4 darktable
```

Or add to `~/.bashrc` for persistence:
```bash
export DT_ORT_LIBRARY=~/.local/lib/onnxruntime-cuda/libonnxruntime.so.1.24.4
```

On Windows (PowerShell):
```powershell
$env:DT_ORT_LIBRARY="C:\Users\you\AppData\Local\onnxruntime-cuda\onnxruntime.dll"
darktable
```

Or set permanently via System → Environment Variables.

If `DT_ORT_LIBRARY` is not set, darktable uses the bundled ORT (CPU on
Linux, DirectML on Windows, CoreML on macOS).

## Manual installation (without scripts)

If you prefer to install manually or the scripts don't work for your setup:

1. **Get an ORT shared library with your desired EP compiled in:**
- NVIDIA CUDA: download `onnxruntime-linux-x64-gpu-VERSION.tgz` (Linux)
or `onnxruntime-win-x64-gpu-VERSION.zip` (Windows) from
https://github.com/microsoft/onnxruntime/releases
- AMD MIGraphX: download `onnxruntime_rocm` wheel from
https://repo.radeon.com/rocm/manylinux/ (match your ROCm version)
or build from source: `./build.sh --config Release --build_shared_lib --use_migraphx --migraphx_home /opt/rocm`
- Intel OpenVINO: `pip download --no-deps onnxruntime-openvino`

2. **Extract the shared library:**
- `.tgz`/`.zip`: extract `lib/libonnxruntime.so*` (or `lib/onnxruntime.dll`)
- `.whl`: rename to `.zip` and extract `onnxruntime/capi/libonnxruntime.so*`
and any `libonnxruntime_providers_*.so` files

3. **Point darktable to it:**
```bash
export DT_ORT_LIBRARY=/path/to/libonnxruntime.so.X.Y.Z
```

## Verifying

Run darktable with AI debug output to confirm which ORT is loaded:

```bash
DT_ORT_LIBRARY=... darktable -d ai
```

Look for:
```
[darktable_ai] loaded ORT 1.24.4 from '/home/user/.local/lib/onnxruntime-cuda/libonnxruntime.so.1.24.4'
```

Then check Preferences → Processing → AI execution provider to select
your GPU provider (CUDA, MIGraphX, OpenVINO).
Loading
Loading