Skip to content

Commit a04070a

Browse files
authored
Optimize gradient rendering on CPU (#1301)
Resolves #1300. This gets rid of all the interleave/deinterleave ops, and just performs a bunch of straight array copies. There's some nuance around handling NaNs. Previously, there were two select operations done: one at the beginning, to set the LUT indices to be in bounds if the corresponding *t* value was NaN, and one for each channel to make the color value completely transparent if the corresponding non-extended *t* value was NaN. The second select operation is a bit awkward since the channel values are now packed together, so I've added an extra transparent color value to the end of each gradient LUT. If a *t* value is NaN, we set the LUT index to that transparent entry's index instead. This only requires one select. It's a bit ugly that we do this for all gradients in `vello_common` even though it's only necessary for radial gradients on the CPU path; I'm very open to ideas on how to make it better. (EDIT: I've gone ahead and applied the same trick to the GPU path as well.) I've also wrapped all the gradient paint operations in `vectorize` calls, since they were missing before and not actually being vectorized as a result. On my machine (Ryzen 7 7700X), this takes `fine/gradient/linear/opaque_u8_avx2` from 830ns/iter to 620ns/iter, and `fine/gradient/linear/opaque_f32_avx2` from 850ns/iter to 660ns/iter. `fine/gradient/linear/opaque_f32_scalar` regresses from 890ns/iter to 1.1µs/iter, and I'm not sure why. The `fine/gradient/linear/opaque_u8` benches appear to use the `lowp` module's gradient painter, but the *tests* use the `common` module's gradient painter. I don't know how to benchmark the latter, so the `fine/gradient/linear/opaque_u8_avx2` gain seems to be solely from adding the `vectorize` wrapper.
1 parent 9c38a4a commit a04070a

File tree

6 files changed

+140
-132
lines changed

6 files changed

+140
-132
lines changed

sparse_strips/vello_common/src/encode.rs

Lines changed: 50 additions & 16 deletions
Original file line number · Diff line number · Diff line change
@@ -707,6 +707,16 @@ pub enum EncodedKind {
707707
Sweep(SweepKind),
708708
}
709709

710+
impl EncodedKind {
711+
/// Whether the gradient is undefined at any location.
712+
fn has_undefined(&self) -> bool {
713+
match self {
714+
Self::Radial(radial_kind) => radial_kind.has_undefined(),
715+
_ => false,
716+
}
717+
}
718+
}
719+
710720
/// An encoded gradient.
711721
#[derive(Debug)]
712722
pub struct EncodedGradient {
@@ -734,13 +744,13 @@ impl EncodedGradient {
734744
/// Get the lookup table for sampling u8-based gradient values.
735745
pub fn u8_lut<S: Simd>(&self, simd: S) -> &GradientLut<u8> {
736746
self.u8_lut
737-
.get_or_init(|| GradientLut::new(simd, &self.ranges))
747+
.get_or_init(|| GradientLut::new(simd, &self.ranges, self.kind.has_undefined()))
738748
}
739749

740750
/// Get the lookup table for sampling f32-based gradient values.
741751
pub fn f32_lut<S: Simd>(&self, simd: S) -> &GradientLut<f32> {
742752
self.f32_lut
743-
.get_or_init(|| GradientLut::new(simd, &self.ranges))
753+
.get_or_init(|| GradientLut::new(simd, &self.ranges, self.kind.has_undefined()))
744754
}
745755
}
746756

@@ -960,16 +970,19 @@ impl FromF32Color for u8 {
960970
pub struct GradientLut<T: FromF32Color> {
961971
lut: Vec<[T; 4]>,
962972
scale: f32,
973+
has_undefined: bool,
963974
}
964975

965976
impl<T: FromF32Color> GradientLut<T> {
966977
/// Create a new lookup table.
967-
fn new<S: Simd>(simd: S, ranges: &[GradientRange]) -> Self {
978+
fn new<S: Simd>(simd: S, ranges: &[GradientRange], has_undefined: bool) -> Self {
968979
let lut_size = determine_lut_size(ranges);
969980

970-
// Add a bit of padding since we always process in blocks of 4, even though less might be
971-
// needed.
972-
let mut lut = vec![[T::ZERO, T::ZERO, T::ZERO, T::ZERO]; lut_size + 3];
981+
// If the gradient's t value is undefined at some pixels, we store an extra transparent
982+
// color at the end. An undefined t value can later be mapped to that LUT index, allowing
983+
// for uniform control flow.
984+
let padded_lut_size = lut_size + has_undefined as usize;
985+
let mut lut = vec![[T::ZERO; 4]; padded_lut_size];
973986

974987
// Calculate how many indices are covered by each range.
975988
let ramps = {
@@ -995,7 +1008,7 @@ impl<T: FromF32Color> GradientLut<T> {
9951008
let biases = f32x16::block_splat(f32x4::from_slice(simd, &range.bias));
9961009
let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale));
9971010

998-
ramp_range.step_by(4).for_each(|idx| {
1011+
ramp_range.clone().step_by(4).for_each(|idx| {
9991012
let t_vals = f32x4::splat(simd, idx as f32).madd(inv_lut_scale, add_factor);
10001013

10011014
let t_vals = element_wise_splat(simd, t_vals);
@@ -1019,19 +1032,20 @@ impl<T: FromF32Color> GradientLut<T> {
10191032
let (im1, im2) = simd.split_f32x16(result);
10201033
let (r1, r2) = simd.split_f32x8(im1);
10211034
let (r3, r4) = simd.split_f32x8(im2);
1035+
let rs = [r1, r2, r3, r4].map(T::from_f32);
10221036

1023-
let lut = &mut lut[idx..][..4];
1024-
lut[0] = T::from_f32(r1);
1025-
lut[1] = T::from_f32(r2);
1026-
lut[2] = T::from_f32(r3);
1027-
lut[3] = T::from_f32(r4);
1037+
// Make sure not to overwrite any extra transparent color at the end (it's not
1038+
// counted in `lut_size`)
1039+
let lut = &mut lut[idx..(idx + 4).min(lut_size)];
1040+
lut.copy_from_slice(&rs[..lut.len()]);
10281041
});
10291042
}
10301043

1031-
// Due to SIMD we worked in blocks of 4, so we need to truncate to the actual length.
1032-
lut.truncate(lut_size);
1033-
1034-
Self { lut, scale }
1044+
Self {
1045+
lut,
1046+
scale,
1047+
has_undefined,
1048+
}
10351049
}
10361050

10371051
/// Get the sample value at a specific index.
@@ -1046,6 +1060,26 @@ impl<T: FromF32Color> GradientLut<T> {
10461060
&self.lut
10471061
}
10481062

1063+
/// Return the index of the transparent color stored at the end of the table, used if a
1064+
/// gradient's t value is undefined. Only exists if the gradient is of a type that can have
1065+
/// undefined t values.
1066+
#[inline(always)]
1067+
pub fn transparent_index(&self) -> Option<usize> {
1068+
self.has_undefined.then(|| self.lut.len() - 1)
1069+
}
1070+
1071+
/// Return the number of normal entries in the lookup table. This does not include any potential
1072+
/// transparent color stored at the end of the table, which is used for gradients that can have
1073+
/// undefined t values.
1074+
#[inline(always)]
1075+
pub fn width(&self) -> usize {
1076+
if self.has_undefined {
1077+
self.lut.len() - 1
1078+
} else {
1079+
self.lut.len()
1080+
}
1081+
}
1082+
10491083
/// Get the scale factor by which to scale the parametric value to
10501084
/// compute the correct lookup index.
10511085
#[inline(always)]

sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs

Lines changed: 59 additions & 91 deletions
Original file line number · Diff line number · Diff line change
@@ -1,7 +1,7 @@
11
// Copyright 2025 the Vello Authors
22
// SPDX-License-Identifier: Apache-2.0 OR MIT
33

4-
use crate::fine::{NumericVec, PosExt, ShaderResultF32};
4+
use crate::fine::{NumericVec, PosExt};
55
use crate::kurbo::Point;
66
use crate::peniko;
77
use core::slice::ChunksExact;
@@ -45,25 +45,18 @@ pub(crate) struct GradientPainter<'a, S: Simd> {
4545
gradient: &'a EncodedGradient,
4646
lut: &'a GradientLut<f32>,
4747
t_vals: ChunksExact<'a, f32>,
48-
has_undefined: bool,
4948
scale_factor: f32x8<S>,
5049
simd: S,
5150
}
5251

5352
impl<'a, S: Simd> GradientPainter<'a, S> {
54-
pub(crate) fn new(
55-
simd: S,
56-
gradient: &'a EncodedGradient,
57-
has_undefined: bool,
58-
t_vals: &'a [f32],
59-
) -> Self {
53+
pub(crate) fn new(simd: S, gradient: &'a EncodedGradient, t_vals: &'a [f32]) -> Self {
6054
let lut = gradient.f32_lut(simd);
6155
let scale_factor = f32x8::splat(simd, lut.scale_factor());
6256

6357
Self {
6458
gradient,
6559
scale_factor,
66-
has_undefined,
6760
lut,
6861
t_vals: t_vals.chunks_exact(8),
6962
simd,
@@ -72,105 +65,80 @@ impl<'a, S: Simd> GradientPainter<'a, S> {
7265
}
7366

7467
impl<S: Simd> Iterator for GradientPainter<'_, S> {
75-
type Item = ShaderResultF32<S>;
68+
type Item = u32x8<S>;
7669

7770
#[inline(always)]
7871
fn next(&mut self) -> Option<Self::Item> {
7972
let extend = self.gradient.extend;
8073
let pos = f32x8::from_slice(self.simd, self.t_vals.next()?);
8174
let t_vals = apply_extend(pos, extend);
8275

83-
let indices = {
84-
// Clear NaNs.
85-
let cleared_t_vals = self.simd.select_f32x8(
86-
t_vals.simd_eq(t_vals),
87-
t_vals,
88-
f32x8::splat(self.simd, 0.0),
89-
);
90-
91-
(cleared_t_vals * self.scale_factor).cvt_u32()
76+
let indices = (t_vals * self.scale_factor).cvt_u32();
77+
78+
// Clear NaNs.
79+
let indices = if let Some(transparent_index) = self.lut.transparent_index() {
80+
self.simd.select_u32x8(
81+
pos.simd_eq(pos),
82+
indices,
83+
u32x8::splat(self.simd, transparent_index as u32),
84+
)
85+
} else {
86+
indices
9287
};
9388

94-
let mut r = [0.0_f32; 8];
95-
let mut g = [0.0_f32; 8];
96-
let mut b = [0.0_f32; 8];
97-
let mut a = [0.0_f32; 8];
98-
99-
// TODO: Investigate whether we can use a loop without performance hit.
100-
macro_rules! gather {
101-
($idx:expr) => {
102-
let sample = self.lut.get(indices[$idx] as usize);
103-
r[$idx] = sample[0];
104-
g[$idx] = sample[1];
105-
b[$idx] = sample[2];
106-
a[$idx] = sample[3];
107-
};
108-
}
109-
110-
gather!(0);
111-
gather!(1);
112-
gather!(2);
113-
gather!(3);
114-
gather!(4);
115-
gather!(5);
116-
gather!(6);
117-
gather!(7);
118-
119-
let mut r = f32x8::from_slice(self.simd, &r);
120-
let mut g = f32x8::from_slice(self.simd, &g);
121-
let mut b = f32x8::from_slice(self.simd, &b);
122-
let mut a = f32x8::from_slice(self.simd, &a);
123-
124-
if self.has_undefined {
125-
macro_rules! mask_nan {
126-
($channel:expr) => {
127-
$channel = self.simd.select_f32x8(
128-
// On some architectures, the NaNs of `t_vals` might have been cleared already by
129-
// the `extend` function, so use the original variable as the mask.
130-
// Mask out NaNs with 0.
131-
self.simd.simd_eq_f32x8(pos, pos),
132-
$channel,
133-
f32x8::splat(self.simd, 0.0),
134-
);
135-
};
136-
}
137-
138-
mask_nan!(r);
139-
mask_nan!(g);
140-
mask_nan!(b);
141-
mask_nan!(a);
142-
}
143-
144-
Some(ShaderResultF32 { r, g, b, a })
89+
Some(indices)
14590
}
14691
}
14792

14893
impl<S: Simd> crate::fine::Painter for GradientPainter<'_, S> {
14994
fn paint_u8(&mut self, buf: &mut [u8]) {
150-
for chunk in buf.chunks_exact_mut(64) {
151-
let first = self.next().unwrap();
152-
let simd = first.r.simd;
153-
let second = self.next().unwrap();
154-
155-
let r = u8x16::from_f32(simd, simd.combine_f32x8(first.r, second.r));
156-
let g = u8x16::from_f32(simd, simd.combine_f32x8(first.g, second.g));
157-
let b = u8x16::from_f32(simd, simd.combine_f32x8(first.b, second.b));
158-
let a = u8x16::from_f32(simd, simd.combine_f32x8(first.a, second.a));
159-
160-
let combined = simd.combine_u8x32(simd.combine_u8x16(r, g), simd.combine_u8x16(b, a));
161-
162-
simd.store_interleaved_128_u8x64(combined, (&mut chunk[..]).try_into().unwrap());
163-
}
95+
self.simd.vectorize(
96+
#[inline(always)]
97+
|| {
98+
for chunk in buf.chunks_exact_mut(32) {
99+
let indices = self.next().unwrap();
100+
101+
let rgbas_1: [f32x4<S>; 4] = core::array::from_fn(|i| {
102+
f32x4::from_slice(self.simd, &self.lut.get(indices[i] as usize))
103+
});
104+
let rgbas_1 = self.simd.combine_f32x8(
105+
self.simd.combine_f32x4(rgbas_1[0], rgbas_1[1]),
106+
self.simd.combine_f32x4(rgbas_1[2], rgbas_1[3]),
107+
);
108+
let rgbas_1 = u8x16::from_f32(self.simd, rgbas_1);
109+
chunk[..16].copy_from_slice(rgbas_1.as_slice());
110+
111+
let rgbas_2: [f32x4<S>; 4] = core::array::from_fn(|i| {
112+
f32x4::from_slice(self.simd, &self.lut.get(indices[i + 4] as usize))
113+
});
114+
let rgbas_2 = self.simd.combine_f32x8(
115+
self.simd.combine_f32x4(rgbas_2[0], rgbas_2[1]),
116+
self.simd.combine_f32x4(rgbas_2[2], rgbas_2[3]),
117+
);
118+
let rgbas_2 = u8x16::from_f32(self.simd, rgbas_2);
119+
chunk[16..].copy_from_slice(rgbas_2.as_slice());
120+
}
121+
},
122+
);
164123
}
165124

166125
fn paint_f32(&mut self, buf: &mut [f32]) {
167-
for chunk in buf.chunks_exact_mut(32) {
168-
let (c1, c2) = self.next().unwrap().get();
169-
c1.simd
170-
.store_interleaved_128_f32x16(c1, (&mut chunk[..16]).try_into().unwrap());
171-
c2.simd
172-
.store_interleaved_128_f32x16(c2, (&mut chunk[16..]).try_into().unwrap());
173-
}
126+
self.simd.vectorize(
127+
#[inline(always)]
128+
|| {
129+
for chunk in buf.chunks_exact_mut(32) {
130+
let indices = self.next().unwrap();
131+
chunk[0..4].copy_from_slice(&self.lut.get(indices[0] as usize));
132+
chunk[4..8].copy_from_slice(&self.lut.get(indices[1] as usize));
133+
chunk[8..12].copy_from_slice(&self.lut.get(indices[2] as usize));
134+
chunk[12..16].copy_from_slice(&self.lut.get(indices[3] as usize));
135+
chunk[16..20].copy_from_slice(&self.lut.get(indices[4] as usize));
136+
chunk[20..24].copy_from_slice(&self.lut.get(indices[5] as usize));
137+
chunk[24..28].copy_from_slice(&self.lut.get(indices[6] as usize));
138+
chunk[28..32].copy_from_slice(&self.lut.get(indices[7] as usize));
139+
}
140+
},
141+
);
174142
}
175143
}
176144

sparse_strips/vello_cpu/src/fine/lowp/gradient.rs

Lines changed: 8 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -54,9 +54,14 @@ impl<S: Simd> Iterator for GradientPainter<'_, S> {
5454

5555
impl<S: Simd> crate::fine::Painter for GradientPainter<'_, S> {
5656
fn paint_u8(&mut self, buf: &mut [u8]) {
57-
for chunk in buf.chunks_exact_mut(64) {
58-
chunk.copy_from_slice(&self.next().unwrap().val);
59-
}
57+
self.simd.vectorize(
58+
#[inline(always)]
59+
|| {
60+
for chunk in buf.chunks_exact_mut(64) {
61+
chunk.copy_from_slice(&self.next().unwrap().val);
62+
}
63+
},
64+
);
6065
}
6166

6267
fn paint_f32(&mut self, _: &mut [f32]) {

sparse_strips/vello_cpu/src/fine/mod.rs

Lines changed: 5 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -260,22 +260,25 @@ pub trait FineKernel<S: Simd>: Send + Sync + 'static {
260260
) -> impl Painter + 'a {
261261
simd.vectorize(
262262
#[inline(always)]
263-
|| GradientPainter::new(simd, gradient, false, t_vals),
263+
|| GradientPainter::new(simd, gradient, t_vals),
264264
)
265265
}
266266

267267
/// Create a painter for rendering gradients with undefined region support.
268268
///
269269
/// Similar to `gradient_painter`, but with support for masking undefined locations
270270
/// (used for radial gradients that may have mathematically undefined regions).
271+
///
272+
/// This is intentionally a duplicate of the default [`FineKernel::gradient_painter`]
273+
/// implementation--the `U8Kernel` overrides that method, but not this one.
271274
fn gradient_painter_with_undefined<'a>(
272275
simd: S,
273276
gradient: &'a EncodedGradient,
274277
t_vals: &'a [f32],
275278
) -> impl Painter + 'a {
276279
simd.vectorize(
277280
#[inline(always)]
278-
|| GradientPainter::new(simd, gradient, true, t_vals),
281+
|| GradientPainter::new(simd, gradient, t_vals),
279282
)
280283
}
281284
/// Create a painter for rendering axis-aligned nearest-neighbor images.

sparse_strips/vello_hybrid/src/gradient_cache.rs

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -81,9 +81,8 @@ impl GradientRampCache {
8181

8282
// Generate new gradient LUT.
8383
let lut_start = self.luts.len() as u32 / BYTES_PER_TEXEL;
84-
dispatch!(self.level, simd => generate_gradient_lut_impl(simd, gradient, &mut self.luts));
85-
let lut_end = self.luts.len() as u32 / BYTES_PER_TEXEL;
86-
let width = lut_end - lut_start;
84+
let width = dispatch!(self.level, simd => generate_gradient_lut_impl(simd, gradient, &mut self.luts))
85+
as u32;
8786
let cached_ramp = CachedRamp { width, lut_start };
8887
self.has_changed = true;
8988
self.cache
@@ -272,11 +271,12 @@ fn generate_gradient_lut_impl<S: Simd>(
272271
simd: S,
273272
gradient: &vello_common::encode::EncodedGradient,
274273
output: &mut Vec<u8>,
275-
) {
274+
) -> usize {
276275
let lut = gradient.u8_lut(simd);
277276
let bytes: &[u8] = bytemuck::cast_slice(lut.lut());
278277
output.reserve(bytes.len());
279278
output.extend_from_slice(bytes);
279+
lut.width()
280280
}
281281

282282
#[cfg(test)]

0 commit comments

Comments (0)