Skip to content

Commit a04070a

Browse files
authored
Optimize gradient rendering on CPU (#1301)
Resolves #1300. This gets rid of all the interleave/deinterleave ops, and just performs a bunch of straight array copies. There's some nuance around handling NaNs. Previously, there were two select operations done: one at the beginning, to set the LUT indices to be in bounds if the corresponding *t* value was NaN, and one for each channel to make the color value completely transparent if the corresponding non-extended *t* value was NaN. The second select operation is a bit awkward since the channel values are now packed together, so I've added an extra transparent color value to the end of each gradient LUT. If a *t* value is NaN, we set the LUT index to that transparent entry's index instead. This only requires one select. It's a bit ugly that we do this for all gradients in `vello_common` even though it's only necessary for radial gradients on the CPU path; I'm very open to ideas on how to make it better. (EDIT: I've gone ahead and applied the same trick to the GPU path as well.) I've also wrapped all the gradient paint operations in `vectorize` calls, since they were missing before and not actually being vectorized as a result. On my machine (Ryzen 7 7700X), this takes `fine/gradient/linear/opaque_u8_avx2` from 830ns/iter to 620ns/iter, and `fine/gradient/linear/opaque_f32_avx2` from 850ns/iter to 660ns/iter. `fine/gradient/linear/opaque_f32_scalar` regresses from 890ns/iter to 1.1µs/iter, and I'm not sure why. The `fine/gradient/linear/opaque_u8` benches appear to use the `lowp` module's gradient painter, but the *tests* use the `common` module's gradient painter. I don't know how to benchmark the latter, so the `fine/gradient/linear/opaque_u8_avx2` gain seems to be solely from adding the `vectorize` wrapper.
1 parent 9c38a4a commit a04070a

File tree

6 files changed

+140
-132
lines changed

6 files changed

+140
-132
lines changed

sparse_strips/vello_common/src/encode.rs

Lines changed: 50 additions & 16 deletions
Original file line number · Diff line number · Diff line change
@@ -707,6 +707,16 @@ pub enum EncodedKind {
707707
Sweep(SweepKind),
708708
}
709709

710+
impl EncodedKind {
711+
/// Whether the gradient is undefined at any location.
712+
fn has_undefined(&self) -> bool {
713+
match self {
714+
Self::Radial(radial_kind) => radial_kind.has_undefined(),
715+
_ => false,
716+
}
717+
}
718+
}
719+
710720
/// An encoded gradient.
711721
#[derive(Debug)]
712722
pub struct EncodedGradient {
@@ -734,13 +744,13 @@ impl EncodedGradient {
734744
/// Get the lookup table for sampling u8-based gradient values.
735745
pub fn u8_lut<S: Simd>(&self, simd: S) -> &GradientLut<u8> {
736746
self.u8_lut
737-
.get_or_init(|| GradientLut::new(simd, &self.ranges))
747+
.get_or_init(|| GradientLut::new(simd, &self.ranges, self.kind.has_undefined()))
738748
}
739749

740750
/// Get the lookup table for sampling f32-based gradient values.
741751
pub fn f32_lut<S: Simd>(&self, simd: S) -> &GradientLut<f32> {
742752
self.f32_lut
743-
.get_or_init(|| GradientLut::new(simd, &self.ranges))
753+
.get_or_init(|| GradientLut::new(simd, &self.ranges, self.kind.has_undefined()))
744754
}
745755
}
746756

@@ -960,16 +970,19 @@ impl FromF32Color for u8 {
960970
pub struct GradientLut<T: FromF32Color> {
961971
lut: Vec<[T; 4]>,
962972
scale: f32,
973+
has_undefined: bool,
963974
}
964975

965976
impl<T: FromF32Color> GradientLut<T> {
966977
/// Create a new lookup table.
967-
fn new<S: Simd>(simd: S, ranges: &[GradientRange]) -> Self {
978+
fn new<S: Simd>(simd: S, ranges: &[GradientRange], has_undefined: bool) -> Self {
968979
let lut_size = determine_lut_size(ranges);
969980

970-
// Add a bit of padding since we always process in blocks of 4, even though less might be
971-
// needed.
972-
let mut lut = vec![[T::ZERO, T::ZERO, T::ZERO, T::ZERO]; lut_size + 3];
981+
// If the gradient's t value is undefined at some pixels, we store an extra transparent
982+
// color at the end. An undefined t value can later be mapped to that LUT index, allowing
983+
// for uniform control flow.
984+
let padded_lut_size = lut_size + has_undefined as usize;
985+
let mut lut = vec![[T::ZERO; 4]; padded_lut_size];
973986

974987
// Calculate how many indices are covered by each range.
975988
let ramps = {
@@ -995,7 +1008,7 @@ impl<T: FromF32Color> GradientLut<T> {
9951008
let biases = f32x16::block_splat(f32x4::from_slice(simd, &range.bias));
9961009
let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale));
9971010

998-
ramp_range.step_by(4).for_each(|idx| {
1011+
ramp_range.clone().step_by(4).for_each(|idx| {
9991012
let t_vals = f32x4::splat(simd, idx as f32).madd(inv_lut_scale, add_factor);
10001013

10011014
let t_vals = element_wise_splat(simd, t_vals);
@@ -1019,19 +1032,20 @@ impl<T: FromF32Color> GradientLut<T> {
10191032
let (im1, im2) = simd.split_f32x16(result);
10201033
let (r1, r2) = simd.split_f32x8(im1);
10211034
let (r3, r4) = simd.split_f32x8(im2);
1035+
let rs = [r1, r2, r3, r4].map(T::from_f32);
10221036

1023-
let lut = &mut lut[idx..][..4];
1024-
lut[0] = T::from_f32(r1);
1025-
lut[1] = T::from_f32(r2);
1026-
lut[2] = T::from_f32(r3);
1027-
lut[3] = T::from_f32(r4);
1037+
// Make sure not to overwrite any extra transparent color at the end (it's not
1038+
// counted in `lut_size`)
1039+
let lut = &mut lut[idx..(idx + 4).min(lut_size)];
1040+
lut.copy_from_slice(&rs[..lut.len()]);
10281041
});
10291042
}
10301043

1031-
// Due to SIMD we worked in blocks of 4, so we need to truncate to the actual length.
1032-
lut.truncate(lut_size);
1033-
1034-
Self { lut, scale }
1044+
Self {
1045+
lut,
1046+
scale,
1047+
has_undefined,
1048+
}
10351049
}
10361050

10371051
/// Get the sample value at a specific index.
@@ -1046,6 +1060,26 @@ impl<T: FromF32Color> GradientLut<T> {
10461060
&self.lut
10471061
}
10481062

1063+
/// Return the index of the transparent color stored at the end of the table, used if a
1064+
/// gradient's t value is undefined. Only exists if the gradient is of a type that can have
1065+
/// undefined t values.
1066+
#[inline(always)]
1067+
pub fn transparent_index(&self) -> Option<usize> {
1068+
self.has_undefined.then(|| self.lut.len() - 1)
1069+
}
1070+
1071+
/// Return the number of normal entries in the lookup table. This does not include any potential
1072+
/// transparent color stored at the end of the table, which is used for gradients that can have
1073+
/// undefined t values.
1074+
#[inline(always)]
1075+
pub fn width(&self) -> usize {
1076+
if self.has_undefined {
1077+
self.lut.len() - 1
1078+
} else {
1079+
self.lut.len()
1080+
}
1081+
}
1082+
10491083
/// Get the scale factor by which to scale the parametric value to
10501084
/// compute the correct lookup index.
10511085
#[inline(always)]

sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs

Lines changed: 59 additions & 91 deletions
Original file line number · Diff line number · Diff line change
@@ -1,7 +1,7 @@
11
// Copyright 2025 the Vello Authors
22
// SPDX-License-Identifier: Apache-2.0 OR MIT
33

4-
use crate::fine::{NumericVec, PosExt, ShaderResultF32};
4+
use crate::fine::{NumericVec, PosExt};
55
use crate::kurbo::Point;
66
use crate::peniko;
77
use core::slice::ChunksExact;
@@ -45,25 +45,18 @@ pub(crate) struct GradientPainter<'a, S: Simd> {
4545
gradient: &'a EncodedGradient,
4646
lut: &'a GradientLut<f32>,
4747
t_vals: ChunksExact<'a, f32>,
48-
has_undefined: bool,
4948
scale_factor: f32x8<S>,
5049
simd: S,
5150
}
5251

5352
impl<'a, S: Simd> GradientPainter<'a, S> {
54-
pub(crate) fn new(
55-
simd: S,
56-
gradient: &'a EncodedGradient,
57-
has_undefined: bool,
58-
t_vals: &'a [f32],
59-
) -> Self {
53+
pub(crate) fn new(simd: S, gradient: &'a EncodedGradient, t_vals: &'a [f32]) -> Self {
6054
let lut = gradient.f32_lut(simd);
6155
let scale_factor = f32x8::splat(simd, lut.scale_factor());
6256

6357
Self {
6458
gradient,
6559
scale_factor,
66-
has_undefined,
6760
lut,
6861
t_vals: t_vals.chunks_exact(8),
6962
simd,
@@ -72,105 +65,80 @@ impl<'a, S: Simd> GradientPainter<'a, S> {
7265
}
7366

7467
impl<S: Simd> Iterator for GradientPainter<'_, S> {
75-
type Item = ShaderResultF32<S>;
68+
type Item = u32x8<S>;
7669

7770
#[inline(always)]
7871
fn next(&mut self) -> Option<Self::Item> {
7972
let extend = self.gradient.extend;
8073
let pos = f32x8::from_slice(self.simd, self.t_vals.next()?);
8174
let t_vals = apply_extend(pos, extend);
8275

83-
let indices = {
84-
// Clear NaNs.
85-
let cleared_t_vals = self.simd.select_f32x8(
86-
t_vals.simd_eq(t_vals),
87-
t_vals,
88-
f32x8::splat(self.simd, 0.0),
89-
);
90-
91-
(cleared_t_vals * self.scale_factor).cvt_u32()
76+
let indices = (t_vals * self.scale_factor).cvt_u32();
77+
78+
// Clear NaNs.
79+
let indices = if let Some(transparent_index) = self.lut.transparent_index() {
80+
self.simd.select_u32x8(
81+
pos.simd_eq(pos),
82+
indices,
83+
u32x8::splat(self.simd, transparent_index as u32),
84+
)
85+
} else {
86+
indices
9287
};
9388

94-
let mut r = [0.0_f32; 8];
95-
let mut g = [0.0_f32; 8];
96-
let mut b = [0.0_f32; 8];
97-
let mut a = [0.0_f32; 8];
98-
99-
// TODO: Investigate whether we can use a loop without performance hit.
100-
macro_rules! gather {
101-
($idx:expr) => {
102-
let sample = self.lut.get(indices[$idx] as usize);
103-
r[$idx] = sample[0];
104-
g[$idx] = sample[1];
105-
b[$idx] = sample[2];
106-
a[$idx] = sample[3];
107-
};
108-
}
109-
110-
gather!(0);
111-
gather!(1);
112-
gather!(2);
113-
gather!(3);
114-
gather!(4);
115-
gather!(5);
116-
gather!(6);
117-
gather!(7);
118-
119-
let mut r = f32x8::from_slice(self.simd, &r);
120-
let mut g = f32x8::from_slice(self.simd, &g);
121-
let mut b = f32x8::from_slice(self.simd, &b);
122-
let mut a = f32x8::from_slice(self.simd, &a);
123-
124-
if self.has_undefined {
125-
macro_rules! mask_nan {
126-
($channel:expr) => {
127-
$channel = self.simd.select_f32x8(
128-
// On some architectures, the NaNs of `t_vals` might have been cleared already by
129-
// the `extend` function, so use the original variable as the mask.
130-
// Mask out NaNs with 0.
131-
self.simd.simd_eq_f32x8(pos, pos),
132-
$channel,
133-
f32x8::splat(self.simd, 0.0),
134-
);
135-
};
136-
}
137-
138-
mask_nan!(r);
139-
mask_nan!(g);
140-
mask_nan!(b);
141-
mask_nan!(a);
142-
}
143-
144-
Some(ShaderResultF32 { r, g, b, a })
89+
Some(indices)
14590
}
14691
}
14792

14893
impl<S: Simd> crate::fine::Painter for GradientPainter<'_, S> {
14994
fn paint_u8(&mut self, buf: &mut [u8]) {
150-
for chunk in buf.chunks_exact_mut(64) {
151-
let first = self.next().unwrap();
152-
let simd = first.r.simd;
153-
let second = self.next().unwrap();
154-
155-
let r = u8x16::from_f32(simd, simd.combine_f32x8(first.r, second.r));
156-
let g = u8x16::from_f32(simd, simd.combine_f32x8(first.g, second.g));
157-
let b = u8x16::from_f32(simd, simd.combine_f32x8(first.b, second.b));
158-
let a = u8x16::from_f32(simd, simd.combine_f32x8(first.a, second.a));
159-
160-
let combined = simd.combine_u8x32(simd.combine_u8x16(r, g), simd.combine_u8x16(b, a));
161-
162-
simd.store_interleaved_128_u8x64(combined, (&mut chunk[..]).try_into().unwrap());
163-
}
95+
self.simd.vectorize(
96+
#[inline(always)]
97+
|| {
98+
for chunk in buf.chunks_exact_mut(32) {
99+
let indices = self.next().unwrap();
100+
101+
let rgbas_1: [f32x4<S>; 4] = core::array::from_fn(|i| {
102+
f32x4::from_slice(self.simd, &self.lut.get(indices[i] as usize))
103+
});
104+
let rgbas_1 = self.simd.combine_f32x8(
105+
self.simd.combine_f32x4(rgbas_1[0], rgbas_1[1]),
106+
self.simd.combine_f32x4(rgbas_1[2], rgbas_1[3]),
107+
);
108+
let rgbas_1 = u8x16::from_f32(self.simd, rgbas_1);
109+
chunk[..16].copy_from_slice(rgbas_1.as_slice());
110+
111+
let rgbas_2: [f32x4<S>; 4] = core::array::from_fn(|i| {
112+
f32x4::from_slice(self.simd, &self.lut.get(indices[i + 4] as usize))
113+
});
114+
let rgbas_2 = self.simd.combine_f32x8(
115+
self.simd.combine_f32x4(rgbas_2[0], rgbas_2[1]),
116+
self.simd.combine_f32x4(rgbas_2[2], rgbas_2[3]),
117+
);
118+
let rgbas_2 = u8x16::from_f32(self.simd, rgbas_2);
119+
chunk[16..].copy_from_slice(rgbas_2.as_slice());
120+
}
121+
},
122+
);
164123
}
165124

166125
fn paint_f32(&mut self, buf: &mut [f32]) {
167-
for chunk in buf.chunks_exact_mut(32) {
168-
let (c1, c2) = self.next().unwrap().get();
169-
c1.simd
170-
.store_interleaved_128_f32x16(c1, (&mut chunk[..16]).try_into().unwrap());
171-
c2.simd
172-
.store_interleaved_128_f32x16(c2, (&mut chunk[16..]).try_into().unwrap());
173-
}
126+
self.simd.vectorize(
127+
#[inline(always)]
128+
|| {
129+
for chunk in buf.chunks_exact_mut(32) {
130+
let indices = self.next().unwrap();
131+
chunk[0..4].copy_from_slice(&self.lut.get(indices[0] as usize));
132+
chunk[4..8].copy_from_slice(&self.lut.get(indices[1] as usize));
133+
chunk[8..12].copy_from_slice(&self.lut.get(indices[2] as usize));
134+
chunk[12..16].copy_from_slice(&self.lut.get(indices[3] as usize));
135+
chunk[16..20].copy_from_slice(&self.lut.get(indices[4] as usize));
136+
chunk[20..24].copy_from_slice(&self.lut.get(indices[5] as usize));
137+
chunk[24..28].copy_from_slice(&self.lut.get(indices[6] as usize));
138+
chunk[28..32].copy_from_slice(&self.lut.get(indices[7] as usize));
139+
}
140+
},
141+
);
174142
}
175143
}
176144

sparse_strips/vello_cpu/src/fine/lowp/gradient.rs

Lines changed: 8 additions & 3 deletions
Original file line number · Diff line number · Diff line change
@@ -54,9 +54,14 @@ impl<S: Simd> Iterator for GradientPainter<'_, S> {
5454

5555
impl<S: Simd> crate::fine::Painter for GradientPainter<'_, S> {
5656
fn paint_u8(&mut self, buf: &mut [u8]) {
57-
for chunk in buf.chunks_exact_mut(64) {
58-
chunk.copy_from_slice(&self.next().unwrap().val);
59-
}
57+
self.simd.vectorize(
58+
#[inline(always)]
59+
|| {
60+
for chunk in buf.chunks_exact_mut(64) {
61+
chunk.copy_from_slice(&self.next().unwrap().val);
62+
}
63+
},
64+
);
6065
}
6166

6267
fn paint_f32(&mut self, _: &mut [f32]) {

sparse_strips/vello_cpu/src/fine/mod.rs

Lines changed: 5 additions & 2 deletions
Original file line number · Diff line number · Diff line change
@@ -260,22 +260,25 @@ pub trait FineKernel<S: Simd>: Send + Sync + 'static {
260260
) -> impl Painter + 'a {
261261
simd.vectorize(
262262
#[inline(always)]
263-
|| GradientPainter::new(simd, gradient, false, t_vals),
263+
|| GradientPainter::new(simd, gradient, t_vals),
264264
)
265265
}
266266

267267
/// Create a painter for rendering gradients with undefined region support.
268268
///
269269
/// Similar to `gradient_painter`, but with support for masking undefined locations
270270
/// (used for radial gradients that may have mathematically undefined regions).
271+
///
272+
/// This is intentionally a duplicate of the default [`FineKernel::gradient_painter`]
273+
/// implementation--the `U8Kernel` overrides that method, but not this one.
271274
fn gradient_painter_with_undefined<'a>(
272275
simd: S,
273276
gradient: &'a EncodedGradient,
274277
t_vals: &'a [f32],
275278
) -> impl Painter + 'a {
276279
simd.vectorize(
277280
#[inline(always)]
278-
|| GradientPainter::new(simd, gradient, true, t_vals),
281+
|| GradientPainter::new(simd, gradient, t_vals),
279282
)
280283
}
281284
/// Create a painter for rendering axis-aligned nearest-neighbor images.

sparse_strips/vello_hybrid/src/gradient_cache.rs

Lines changed: 4 additions & 4 deletions
Original file line number · Diff line number · Diff line change
@@ -81,9 +81,8 @@ impl GradientRampCache {
8181

8282
// Generate new gradient LUT.
8383
let lut_start = self.luts.len() as u32 / BYTES_PER_TEXEL;
84-
dispatch!(self.level, simd => generate_gradient_lut_impl(simd, gradient, &mut self.luts));
85-
let lut_end = self.luts.len() as u32 / BYTES_PER_TEXEL;
86-
let width = lut_end - lut_start;
84+
let width = dispatch!(self.level, simd => generate_gradient_lut_impl(simd, gradient, &mut self.luts))
85+
as u32;
8786
let cached_ramp = CachedRamp { width, lut_start };
8887
self.has_changed = true;
8988
self.cache
@@ -272,11 +271,12 @@ fn generate_gradient_lut_impl<S: Simd>(
272271
simd: S,
273272
gradient: &vello_common::encode::EncodedGradient,
274273
output: &mut Vec<u8>,
275-
) {
274+
) -> usize {
276275
let lut = gradient.u8_lut(simd);
277276
let bytes: &[u8] = bytemuck::cast_slice(lut.lut());
278277
output.reserve(bytes.len());
279278
output.extend_from_slice(bytes);
279+
lut.width()
280280
}
281281

282282
#[cfg(test)]

0 commit comments

Comments (0)