diff --git a/sparse_strips/vello_bench/src/fine/fill.rs b/sparse_strips/vello_bench/src/fine/fill.rs
index 850548b4a..e2d6d9c66 100644
--- a/sparse_strips/vello_bench/src/fine/fill.rs
+++ b/sparse_strips/vello_bench/src/fine/fill.rs
@@ -59,7 +59,7 @@ pub(crate) fn fill_single<S: Simd, N: FineKernel<S>>(
     fine: &mut Fine<S, N>,
 ) {
     b.iter(|| {
-        fine.fill(0, width, paint, blend_mode, encoded_paints, None);
+        fine.fill(0, width, paint, blend_mode, encoded_paints, None, None);
 
         std::hint::black_box(&fine);
     });
diff --git a/sparse_strips/vello_bench/src/fine/strip.rs b/sparse_strips/vello_bench/src/fine/strip.rs
index 569381839..cbf04b466 100644
--- a/sparse_strips/vello_bench/src/fine/strip.rs
+++ b/sparse_strips/vello_bench/src/fine/strip.rs
@@ -58,6 +58,7 @@ fn strip_single<S: Simd, N: FineKernel<S>>(
             default_blend(),
             encoded_paints,
             Some(&alphas),
+            None,
         );
 
         std::hint::black_box(&fine);
diff --git a/sparse_strips/vello_common/src/coarse.rs b/sparse_strips/vello_common/src/coarse.rs
index 90def7b5a..5de80ee6c 100644
--- a/sparse_strips/vello_common/src/coarse.rs
+++ b/sparse_strips/vello_common/src/coarse.rs
@@ -235,7 +235,13 @@ impl<const MODE: u8> Wide<MODE> {
     ///    - Generate alpha fill commands for the intersected wide tiles
     /// 2. For active fill regions (determined by fill rule):
     ///    - Generate solid fill commands for the regions between strips
-    pub fn generate(&mut self, strip_buf: &[Strip], paint: Paint, thread_idx: u8) {
+    pub fn generate(
+        &mut self,
+        strip_buf: &[Strip],
+        paint: Paint,
+        thread_idx: u8,
+        mask: Option<Mask>,
+    ) {
         if strip_buf.is_empty() {
             return;
         }
@@ -307,6 +313,7 @@ impl<const MODE: u8> Wide<MODE> {
                     thread_idx,
                     paint: paint.clone(),
                     blend_mode: None,
+                    mask: mask.clone(),
                 };
                 x += width;
                 col += u32::from(width);
@@ -332,8 +339,12 @@ impl<const MODE: u8> Wide<MODE> {
                     let x_wtile_rel = x % WideTile::WIDTH;
                     let width = x2.min((wtile_x + 1) * WideTile::WIDTH) - x;
                     x += width;
-                    self.get_mut(wtile_x, strip_y)
-                        .fill(x_wtile_rel, width, paint.clone());
+                    self.get_mut(wtile_x, strip_y).fill(
+                        x_wtile_rel,
+                        width,
+                        paint.clone(),
+                        mask.clone(),
+                    );
                 }
             }
         }
@@ -818,7 +829,7 @@ impl<const MODE: u8> WideTile<MODE> {
         }
     }
 
-    pub(crate) fn fill(&mut self, x: u16, width: u16, paint: Paint) {
+    pub(crate) fn fill(&mut self, x: u16, width: u16, paint: Paint, mask: Option<Mask>) {
         if !self.is_zero_clip() {
             match MODE {
                 MODE_CPU => {
@@ -836,6 +847,7 @@ impl<const MODE: u8> WideTile<MODE> {
                         let can_override = x == 0
                             && width == WideTile::WIDTH
                             && s.is_opaque()
+                            && mask.is_none()
                             && self.n_clip == 0
                             && self.n_bufs == 0;
                         can_override.then_some(*s)
@@ -853,6 +865,7 @@ impl<const MODE: u8> WideTile<MODE> {
                             width,
                             paint,
                             blend_mode: None,
+                            mask,
                         }));
                     }
                 }
@@ -862,6 +875,7 @@ impl<const MODE: u8> WideTile<MODE> {
                         width,
                         paint,
                         blend_mode: None,
+                        mask,
                     }));
                 }
                 _ => unreachable!(),
@@ -1004,6 +1018,8 @@ pub struct CmdFill {
     pub paint: Paint,
     /// The blend mode to apply before drawing the contents.
     pub blend_mode: Option<BlendMode>,
+    /// An optional mask applied to this fill; `None` draws the fill unmasked.
+    pub mask: Option<Mask>,
 }
 
 /// Fill a consecutive region of a wide tile with an alpha mask.
@@ -1023,6 +1039,8 @@ pub struct CmdAlphaFill {
     pub paint: Paint,
     /// A blend mode to apply before drawing the contents.
     pub blend_mode: Option<BlendMode>,
+    /// An optional mask applied to this alpha fill; `None` draws it unmasked.
+    pub mask: Option<Mask>,
 }
 
 /// Same as fill, but copies top of clip stack to next on stack.
@@ -1097,11 +1115,13 @@ mod tests {
             0,
             10,
             Paint::Solid(PremulColor::from_alpha_color(TRANSPARENT)),
+            None,
         );
         wide.fill(
             10,
             10,
             Paint::Solid(PremulColor::from_alpha_color(TRANSPARENT)),
+            None,
         );
         wide.pop_buf();
 
@@ -1117,8 +1137,8 @@ mod tests {
 
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf();
-        wide.fill(0, 10, paint.clone());
-        wide.fill(10, 10, paint.clone());
+        wide.fill(0, 10, paint.clone(), None);
+        wide.fill(10, 10, paint.clone(), None);
         wide.blend(blend_mode);
         wide.pop_buf();
 
@@ -1134,7 +1154,7 @@ mod tests {
 
         let mut wide = WideTile::<MODE_CPU>::new(0, 0);
         wide.push_buf();
-        wide.fill(0, 10, paint.clone());
+        wide.fill(0, 10, paint.clone(), None);
         wide.blend(blend_mode);
         wide.pop_buf();
 
diff --git a/sparse_strips/vello_common/src/mask.rs b/sparse_strips/vello_common/src/mask.rs
index d7b1a4b9e..e45b3efd8 100644
--- a/sparse_strips/vello_common/src/mask.rs
+++ b/sparse_strips/vello_common/src/mask.rs
@@ -5,15 +5,19 @@
 
 use crate::pixmap::Pixmap;
 use alloc::sync::Arc;
+use alloc::vec::Vec;
 
-/// A mask.
-#[derive(Debug, Clone, PartialEq, Eq)]
-pub struct Mask {
-    data: Arc<[u8]>,
+#[derive(Debug, PartialEq, Eq)]
+struct MaskRepr {
+    data: Vec<u8>,
     width: u16,
     height: u16,
 }
 
+/// A mask.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct Mask(Arc<MaskRepr>);
+
 impl Mask {
     /// Create a new alpha mask from the pixmap.
     pub fn new_alpha(pixmap: &Pixmap) -> Self {
@@ -25,55 +29,85 @@ impl Mask {
         Self::new_with(pixmap, false)
     }
 
+    /// Create a new mask from the given alpha data.
+    ///
+    /// The `data` vector must be of length `width * height` exactly.
+    ///
+    /// Each byte is the mask value of one pixel, stored in row-major order.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the `data` vector is not of length `width * height`.
+    pub fn from_parts(data: Vec<u8>, width: u16, height: u16) -> Self {
+        assert_eq!(
+            data.len(),
+            usize::from(width) * usize::from(height),
+            "Expected `data` to have length of exactly `width * height`"
+        );
+
+        Self(Arc::new(MaskRepr {
+            data,
+            width,
+            height,
+        }))
+    }
+
     fn new_with(pixmap: &Pixmap, alpha_mask: bool) -> Self {
-        let data = Arc::from_iter(pixmap.data().iter().map(|pixel| {
-            if alpha_mask {
-                pixel.a
-            } else {
-                let r = f32::from(pixel.r) / 255.;
-                let g = f32::from(pixel.g) / 255.;
-                let b = f32::from(pixel.b) / 255.;
+        let data = pixmap
+            .data()
+            .iter()
+            .map(|pixel| {
+                if alpha_mask {
+                    pixel.a
+                } else {
+                    let r = f32::from(pixel.r) / 255.;
+                    let g = f32::from(pixel.g) / 255.;
+                    let b = f32::from(pixel.b) / 255.;
 
-                // See CSS Masking Module Level 1 § 7.10.1
-                // <https://www.w3.org/TR/css-masking-1/#MaskValues>
-                // and Filter Effects Module Level 1 § 9.6
-                // <https://www.w3.org/TR/filter-effects-1/#elementdef-fecolormatrix>.
-                // Note r, g and b are premultiplied by alpha.
-                let luma = r * 0.2126 + g * 0.7152 + b * 0.0722;
-                #[expect(clippy::cast_possible_truncation, reason = "This cannot overflow")]
-                {
-                    (luma * 255.0 + 0.5) as u8
+                    // See CSS Masking Module Level 1 § 7.10.1
+                    // <https://www.w3.org/TR/css-masking-1/#MaskValues>
+                    // and Filter Effects Module Level 1 § 9.6
+                    // <https://www.w3.org/TR/filter-effects-1/#elementdef-fecolormatrix>.
+                    // Note r, g and b are premultiplied by alpha.
+                    let luma = r * 0.2126 + g * 0.7152 + b * 0.0722;
+                    #[expect(clippy::cast_possible_truncation, reason = "This cannot overflow")]
+                    {
+                        (luma * 255.0 + 0.5) as u8
+                    }
                 }
-            }
-        }));
+            })
+            .collect::<Vec<u8>>();
 
-        Self {
+        Self(Arc::new(MaskRepr {
             data,
             width: pixmap.width(),
             height: pixmap.height(),
-        }
+        }))
     }
 
     /// Return the width of the mask.
+    #[inline]
     pub fn width(&self) -> u16 {
-        self.width
+        self.0.width
     }
 
     /// Return the height of the mask.
+    #[inline]
     pub fn height(&self) -> u16 {
-        self.height
+        self.0.height
     }
 
     /// Sample the value at a specific location.
     ///
     /// This function might panic or yield a wrong result if the location
     /// is out-of-bounds.
+    #[inline(always)]
     pub fn sample(&self, x: u16, y: u16) -> u8 {
         debug_assert!(
-            x < self.width && y < self.height,
+            x < self.0.width && y < self.0.height,
             "cannot sample mask outside of its range"
         );
 
-        self.data[y as usize * self.width as usize + x as usize]
+        self.0.data[y as usize * self.0.width as usize + x as usize]
     }
 }
diff --git a/sparse_strips/vello_cpu/src/dispatch/mod.rs b/sparse_strips/vello_cpu/src/dispatch/mod.rs
index 8c7dd49c3..6680d84b7 100644
--- a/sparse_strips/vello_cpu/src/dispatch/mod.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/mod.rs
@@ -26,6 +26,7 @@ pub(crate) trait Dispatcher: Debug + Send + Sync {
         transform: Affine,
         paint: Paint,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     );
     fn stroke_path(
         &mut self,
@@ -34,6 +35,7 @@ pub(crate) trait Dispatcher: Debug + Send + Sync {
         transform: Affine,
         paint: Paint,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     );
     fn push_layer(
         &mut self,
diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
index 925ab07f1..72a058106 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
@@ -281,17 +281,20 @@ impl MultiThreadedDispatcher {
                                 strips: strip_range,
                                 paint,
                                 thread_id,
+                                mask,
                             } => self.wide.generate(
                                 &task.allocation_group.strips
                                     [strip_range.start as usize..strip_range.end as usize],
                                 paint.clone(),
                                 thread_id,
+                                mask,
                             ),
                             CoarseTaskType::RenderWideCommand {
                                 strips,
                                 paint,
                                 thread_id,
-                            } => self.wide.generate(&strips, paint.clone(), thread_id),
+                                mask,
+                            } => self.wide.generate(&strips, paint.clone(), thread_id, mask),
                             CoarseTaskType::PushLayer {
                                 thread_id,
                                 clip_path,
@@ -387,6 +390,7 @@ impl Dispatcher for MultiThreadedDispatcher {
         transform: Affine,
         paint: Paint,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     ) {
         let start = self.allocation_group.path.len() as u32;
         self.allocation_group.path.extend(path);
@@ -397,6 +401,7 @@ impl Dispatcher for MultiThreadedDispatcher {
             paint,
             fill_rule,
             aliasing_threshold,
+            mask,
         });
     }
 
@@ -407,6 +412,7 @@ impl Dispatcher for MultiThreadedDispatcher {
         transform: Affine,
         paint: Paint,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     ) {
         let start = self.allocation_group.path.len() as u32;
         self.allocation_group.path.extend(path);
@@ -417,6 +423,7 @@ impl Dispatcher for MultiThreadedDispatcher {
             paint,
             stroke: stroke.clone(),
             aliasing_threshold,
+            mask,
         });
     }
 
@@ -696,6 +703,7 @@ pub(crate) enum RenderTaskType {
         paint: Paint,
         fill_rule: Fill,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     },
     WideCommand {
         strip_buf: Box<[Strip]>,
@@ -708,6 +716,7 @@ pub(crate) enum RenderTaskType {
         paint: Paint,
         stroke: Stroke,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     },
     PushLayer {
         clip_path: Option<(Range<u32>, Affine)>,
@@ -730,11 +739,13 @@ pub(crate) enum CoarseTaskType {
         thread_id: u8,
         strips: Range<u32>,
         paint: Paint,
+        mask: Option<Mask>,
     },
     RenderWideCommand {
         thread_id: u8,
         strips: Box<[Strip]>,
         paint: Paint,
+        mask: Option<Mask>,
     },
     PushLayer {
         thread_id: u8,
@@ -811,6 +822,7 @@ mod tests {
                 Affine::IDENTITY,
                 Paint::Solid(PremulColor::from_alpha_color(BLUE)),
                 None,
+                None,
             );
             dispatcher.flush();
         }
diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded/worker.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded/worker.rs
index 9d1ca1f71..f8c7051b9 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded/worker.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded/worker.rs
@@ -62,6 +62,7 @@ impl Worker {
                     paint,
                     fill_rule,
                     aliasing_threshold,
+                    mask,
                 } => {
                     let start = self.strip_storage.strips.len() as u32;
                     let path = &render_task.allocation_group.path
@@ -80,6 +81,7 @@ impl Worker {
                         thread_id: self.thread_id,
                         strips: start..end,
                         paint,
+                        mask,
                     };
 
                     render_task
@@ -93,6 +95,7 @@ impl Worker {
                     paint,
                     stroke,
                     aliasing_threshold,
+                    mask,
                 } => {
                     let start = self.strip_storage.strips.len() as u32;
                     let path = &render_task.allocation_group.path
@@ -111,6 +114,7 @@ impl Worker {
                         thread_id: self.thread_id,
                         strips: start..end,
                         paint,
+                        mask,
                     };
 
                     render_task
@@ -174,6 +178,7 @@ impl Worker {
                         thread_id: thread_idx,
                         strips: strip_buf,
                         paint,
+                        mask: None,
                     };
 
                     render_task
diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
index 09b8cc6c6..ba7b1d1d8 100644
--- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
@@ -97,6 +97,7 @@ impl Dispatcher for SingleThreadedDispatcher {
         transform: Affine,
         paint: Paint,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     ) {
         let wide = &mut self.wide;
 
@@ -108,7 +109,7 @@ impl Dispatcher for SingleThreadedDispatcher {
             &mut self.strip_storage,
         );
 
-        wide.generate(&self.strip_storage.strips, paint, 0);
+        wide.generate(&self.strip_storage.strips, paint, 0, mask);
     }
 
     fn stroke_path(
@@ -118,6 +119,7 @@ impl Dispatcher for SingleThreadedDispatcher {
         transform: Affine,
         paint: Paint,
         aliasing_threshold: Option<u8>,
+        mask: Option<Mask>,
     ) {
         let wide = &mut self.wide;
 
@@ -129,7 +131,7 @@ impl Dispatcher for SingleThreadedDispatcher {
             &mut self.strip_storage,
         );
 
-        wide.generate(&self.strip_storage.strips, paint, 0);
+        wide.generate(&self.strip_storage.strips, paint, 0, mask);
     }
 
     fn push_layer(
@@ -188,7 +190,8 @@ impl Dispatcher for SingleThreadedDispatcher {
     }
 
     fn generate_wide_cmd(&mut self, strip_buf: &[Strip], paint: Paint) {
-        self.wide.generate(strip_buf, paint, 0);
+        // Masks are not supported in recordings, so just pass `None` for now.
+        self.wide.generate(strip_buf, paint, 0, None);
     }
 
     fn strip_storage_mut(&mut self) -> &mut StripStorage {
@@ -258,6 +261,7 @@ mod tests {
             Affine::IDENTITY,
             Paint::Solid(PremulColor::from_alpha_color(BLUE)),
             None,
+            None,
         );
 
         // Ensure there is data to clear.
diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
index 2c44268ff..7d335f475 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
@@ -6,6 +6,7 @@ use crate::fine::{COLOR_COMPONENTS, Painter};
 use crate::peniko::BlendMode;
 use crate::region::Region;
 use vello_common::fearless_simd::*;
+use vello_common::mask::Mask;
 use vello_common::paint::PremulColor;
 use vello_common::tile::Tile;
 
@@ -99,7 +100,12 @@ impl<S: Simd> FineKernel<S> for F32Kernel {
         alphas: Option<&[u8]>,
     ) {
         if let Some(alphas) = alphas {
-            alpha_fill::alpha_composite_solid(simd, dest, src, alphas);
+            alpha_fill::alpha_composite_solid(
+                simd,
+                dest,
+                src,
+                alphas.chunks_exact(4).map(|c| [c[0], c[1], c[2], c[3]]),
+            );
         } else {
             fill::alpha_composite_solid(simd, dest, src);
         }
@@ -116,7 +122,7 @@ impl<S: Simd> FineKernel<S> for F32Kernel {
                 simd,
                 dest,
                 src.chunks_exact(16).map(|el| f32x16::from_slice(simd, el)),
-                alphas,
+                alphas.chunks_exact(4).map(|c| [c[0], c[1], c[2], c[3]]),
             );
         } else {
             fill::alpha_composite_arbitrary(
@@ -130,14 +136,56 @@ impl<S: Simd> FineKernel<S> for F32Kernel {
     fn blend(
         simd: S,
         dest: &mut [Self::Numeric],
+        mut start_x: u16,
+        start_y: u16,
         src: impl Iterator<Item = Self::Composite>,
         blend_mode: BlendMode,
         alphas: Option<&[u8]>,
+        mask: Option<&Mask>,
     ) {
-        if let Some(alphas) = alphas {
-            alpha_fill::blend(simd, dest, src, alphas, blend_mode);
-        } else {
-            fill::blend(simd, dest, src, blend_mode);
+        let alpha_iter = alphas.map(|a| a.chunks_exact(4).map(|d| [d[0], d[1], d[2], d[3]]));
+
+        let mask_iter = mask.map(|m| {
+            core::iter::from_fn(|| {
+                let sample = |x: u16, y: u16| {
+                    if x < m.width() && y < m.height() {
+                        m.sample(x, y)
+                    } else {
+                        255
+                    }
+                };
+
+                let samples = [
+                    sample(start_x, start_y),
+                    sample(start_x, start_y + 1),
+                    sample(start_x, start_y + 2),
+                    sample(start_x, start_y + 3),
+                ];
+
+                start_x += 1;
+
+                Some(samples)
+            })
+        });
+
+        match (alpha_iter, mask_iter) {
+            (Some(alpha_iter), Some(mut mask_iter)) => {
+                let iter = alpha_iter.map(|a1| {
+                    let a2 = mask_iter.next().unwrap();
+                    [
+                        ((a1[0] as u16 * a2[0] as u16) / 255) as u8,
+                        ((a1[1] as u16 * a2[1] as u16) / 255) as u8,
+                        ((a1[2] as u16 * a2[2] as u16) / 255) as u8,
+                        ((a1[3] as u16 * a2[3] as u16) / 255) as u8,
+                    ]
+                });
+                alpha_fill::blend(simd, dest, src, iter, blend_mode);
+            }
+            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, mask_iter, blend_mode),
+            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, alpha_iter, blend_mode),
+            (None, None) => {
+                fill::blend(simd, dest, src, blend_mode);
+            }
         }
     }
 }
@@ -225,7 +273,7 @@ mod alpha_fill {
         s: S,
         dest: &mut [f32],
         src: [f32; 4],
-        alphas: &[u8],
+        alphas: impl Iterator<Item = [u8; 4]>,
     ) {
         s.vectorize(
             #[inline(always)]
@@ -234,33 +282,29 @@ mod alpha_fill {
                 let src_c = f32x16::block_splat(src.simd_into(s));
                 let one = f32x16::splat(s, 1.0);
 
-                for (next_dest, next_mask) in dest.chunks_exact_mut(16).zip(alphas.chunks_exact(4))
-                {
-                    alpha_composite_inner(s, next_dest, next_mask, src_c, src_a, one);
+                for (next_dest, next_mask) in dest.chunks_exact_mut(16).zip(alphas) {
+                    alpha_composite_inner(s, next_dest, &next_mask, src_c, src_a, one);
                 }
             },
         );
     }
 
-    #[inline(always)]
     pub(super) fn alpha_composite_arbitrary<S: Simd, T: Iterator<Item = f32x16<S>>>(
         simd: S,
         dest: &mut [f32],
         src: T,
-        alphas: &[u8],
+        alphas: impl Iterator<Item = [u8; 4]>,
     ) {
         simd.vectorize(
             #[inline(always)]
             || {
                 let one = f32x16::splat(simd, 1.0);
 
-                for ((next_dest, next_mask), next_src) in dest
-                    .chunks_exact_mut(16)
-                    .zip(alphas.chunks_exact(4))
-                    .zip(src)
+                for ((next_dest, next_mask), next_src) in
+                    dest.chunks_exact_mut(16).zip(alphas).zip(src)
                 {
                     let src_a = next_src.splat_4th();
-                    alpha_composite_inner(simd, next_dest, next_mask, next_src, src_a, one);
+                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
                 }
             },
         );
@@ -270,18 +314,16 @@ mod alpha_fill {
         simd: S,
         dest: &mut [f32],
         src: T,
-        alphas: &[u8],
+        alphas: impl Iterator<Item = [u8; 4]>,
         blend_mode: BlendMode,
     ) {
         simd.vectorize(
             #[inline(always)]
             || {
-                for ((next_dest, next_mask), next_src) in dest
-                    .chunks_exact_mut(16)
-                    .zip(alphas.chunks_exact(4))
-                    .zip(src)
+                for ((next_dest, next_mask), next_src) in
+                    dest.chunks_exact_mut(16).zip(alphas).zip(src)
                 {
-                    let masks = extract_masks(simd, next_mask);
+                    let masks = extract_masks(simd, &next_mask);
                     let bg = f32x16::from_slice(simd, next_dest);
                     let src_c = blend::mix(next_src, bg, blend_mode);
                     let res = blend_mode.compose(simd, src_c, bg, masks);
@@ -295,7 +337,7 @@ mod alpha_fill {
     fn alpha_composite_inner<S: Simd>(
         s: S,
         dest: &mut [f32],
-        masks: &[u8],
+        masks: &[u8; 4],
         src_c: f32x16<S>,
         src_a: f32x16<S>,
         one: f32x16<S>,
@@ -310,7 +352,7 @@ mod alpha_fill {
 }
 
 #[inline(always)]
-fn extract_masks<S: Simd>(simd: S, masks: &[u8]) -> f32x16<S> {
+fn extract_masks<S: Simd>(simd: S, masks: &[u8; 4]) -> f32x16<S> {
     let mut base_mask = [
         masks[0] as f32,
         masks[1] as f32,
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
index e4b74456c..302213827 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
@@ -11,10 +11,13 @@ use crate::fine::{FineKernel, highp, u8_to_f32};
 use crate::peniko::BlendMode;
 use crate::region::Region;
 use crate::util::Div255Ext;
+use crate::util::scalar::div_255;
 use bytemuck::cast_slice;
+use core::iter;
 use vello_common::coarse::WideTile;
 use vello_common::encode::{EncodedGradient, EncodedImage};
 use vello_common::fearless_simd::*;
+use vello_common::mask::Mask;
 use vello_common::paint::PremulColor;
 use vello_common::pixmap::Pixmap;
 use vello_common::tile::Tile;
@@ -122,7 +125,14 @@ impl<S: Simd> FineKernel<S> for U8Kernel {
         alphas: Option<&[u8]>,
     ) {
         if let Some(alphas) = alphas {
-            alpha_fill::alpha_composite_solid(simd, dest, src, alphas);
+            alpha_fill::alpha_composite_solid(
+                simd,
+                dest,
+                src,
+                alphas
+                    .chunks_exact(8)
+                    .map(|d| [d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]]),
+            );
         } else {
             fill::alpha_composite_solid(simd, dest, src);
         }
@@ -137,7 +147,14 @@ impl<S: Simd> FineKernel<S> for U8Kernel {
         let src_iter = src.chunks_exact(32).map(|el| u8x32::from_slice(simd, el));
 
         if let Some(alphas) = alphas {
-            alpha_fill::alpha_composite(simd, dest, src_iter, alphas);
+            alpha_fill::alpha_composite(
+                simd,
+                dest,
+                src_iter,
+                alphas
+                    .chunks_exact(8)
+                    .map(|d| [d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]]),
+            );
         } else {
             fill::alpha_composite(simd, dest, src_iter);
         }
@@ -146,14 +163,67 @@ impl<S: Simd> FineKernel<S> for U8Kernel {
     fn blend(
         simd: S,
         dest: &mut [Self::Numeric],
+        mut start_x: u16,
+        start_y: u16,
         src: impl Iterator<Item = Self::Composite>,
         blend_mode: BlendMode,
         alphas: Option<&[u8]>,
+        mask: Option<&Mask>,
     ) {
-        if let Some(alphas) = alphas {
-            alpha_fill::blend(simd, dest, src, blend_mode, alphas);
-        } else {
-            fill::blend(simd, dest, src, blend_mode);
+        let alpha_iter = alphas.map(|a| {
+            a.chunks_exact(8)
+                .map(|d| [d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]])
+        });
+
+        let mask_iter = mask.map(|m| {
+            iter::from_fn(|| {
+                let sample = |x: u16, y: u16| {
+                    if x < m.width() && y < m.height() {
+                        m.sample(x, y)
+                    } else {
+                        255
+                    }
+                };
+
+                let samples = [
+                    sample(start_x, start_y),
+                    sample(start_x, start_y + 1),
+                    sample(start_x, start_y + 2),
+                    sample(start_x, start_y + 3),
+                    sample(start_x + 1, start_y),
+                    sample(start_x + 1, start_y + 1),
+                    sample(start_x + 1, start_y + 2),
+                    sample(start_x + 1, start_y + 3),
+                ];
+
+                start_x += 2;
+
+                Some(samples)
+            })
+        });
+
+        match (alpha_iter, mask_iter) {
+            (Some(alpha_iter), Some(mut mask_iter)) => {
+                let iter = alpha_iter.map(|a1| {
+                    let a2 = mask_iter.next().unwrap();
+                    [
+                        div_255(a1[0] as u16 * a2[0] as u16) as u8,
+                        div_255(a1[1] as u16 * a2[1] as u16) as u8,
+                        div_255(a1[2] as u16 * a2[2] as u16) as u8,
+                        div_255(a1[3] as u16 * a2[3] as u16) as u8,
+                        div_255(a1[4] as u16 * a2[4] as u16) as u8,
+                        div_255(a1[5] as u16 * a2[5] as u16) as u8,
+                        div_255(a1[6] as u16 * a2[6] as u16) as u8,
+                        div_255(a1[7] as u16 * a2[7] as u16) as u8,
+                    ]
+                });
+                alpha_fill::blend(simd, dest, src, blend_mode, iter);
+            }
+            (None, Some(mask_iter)) => alpha_fill::blend(simd, dest, src, blend_mode, mask_iter),
+            (Some(alpha_iter), None) => alpha_fill::blend(simd, dest, src, blend_mode, alpha_iter),
+            (None, None) => {
+                fill::blend(simd, dest, src, blend_mode);
+            }
         }
     }
 }
@@ -255,7 +325,7 @@ mod alpha_fill {
         dest: &mut [u8],
         src: T,
         blend_mode: BlendMode,
-        alphas: &[u8],
+        alphas: impl Iterator<Item = [u8; 8]>,
     ) {
         simd.vectorize(
             #[inline(always)]
@@ -263,10 +333,8 @@ mod alpha_fill {
                 #[expect(deprecated, reason = "Provided by the user, need to handle correctly.")]
                 let default_mix = matches!(blend_mode.mix, Mix::Normal | Mix::Clip);
 
-                for ((next_bg, next_mask), next_src) in dest
-                    .chunks_exact_mut(32)
-                    .zip(alphas.chunks_exact(8))
-                    .zip(src)
+                for ((next_bg, next_mask), next_src) in
+                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                 {
                     let bg_v = u8x32::from_slice(simd, next_bg);
                     let src_c = if default_mix {
@@ -274,7 +342,7 @@ mod alpha_fill {
                     } else {
                         mix(next_src, bg_v, blend_mode)
                     };
-                    let masks = extract_masks(simd, next_mask);
+                    let masks = extract_masks(simd, &next_mask);
                     let res = blend_mode.compose(simd, src_c, bg_v, masks);
 
                     next_bg.copy_from_slice(&res.val);
@@ -288,7 +356,7 @@ mod alpha_fill {
         s: S,
         dest: &mut [u8],
         src: [u8; 4],
-        alphas: &[u8],
+        alphas: impl Iterator<Item = [u8; 8]>,
     ) {
         s.vectorize(
             #[inline(always)]
@@ -297,8 +365,8 @@ mod alpha_fill {
                 let src_c = u32x8::splat(s, u32::from_ne_bytes(src)).reinterpret_u8();
                 let one = u8x32::splat(s, 255);
 
-                for (next_bg, next_mask) in dest.chunks_exact_mut(32).zip(alphas.chunks_exact(8)) {
-                    alpha_composite_inner(s, next_bg, next_mask, src_c, src_a, one);
+                for (next_bg, next_mask) in dest.chunks_exact_mut(32).zip(alphas) {
+                    alpha_composite_inner(s, next_bg, &next_mask, src_c, src_a, one);
                 }
             },
         );
@@ -309,20 +377,18 @@ mod alpha_fill {
         simd: S,
         dest: &mut [u8],
         src: T,
-        alphas: &[u8],
+        alphas: impl Iterator<Item = [u8; 8]>,
     ) {
         simd.vectorize(
             #[inline(always)]
             || {
                 let one = u8x32::splat(simd, 255);
 
-                for ((next_dest, next_mask), next_src) in dest
-                    .chunks_exact_mut(32)
-                    .zip(alphas.chunks_exact(8))
-                    .zip(src)
+                for ((next_dest, next_mask), next_src) in
+                    dest.chunks_exact_mut(32).zip(alphas).zip(src)
                 {
                     let src_a = next_src.splat_4th();
-                    alpha_composite_inner(simd, next_dest, next_mask, next_src, src_a, one);
+                    alpha_composite_inner(simd, next_dest, &next_mask, next_src, src_a, one);
                 }
             },
         );
@@ -332,7 +398,7 @@ mod alpha_fill {
     fn alpha_composite_inner<S: Simd>(
         s: S,
         dest: &mut [u8],
-        masks: &[u8],
+        masks: &[u8; 8],
         src_c: u8x32<S>,
         src_a: u8x32<S>,
         one: u8x32<S>,
@@ -385,7 +451,7 @@ fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32
 }
 
 #[inline(always)]
-fn extract_masks<S: Simd>(simd: S, masks: &[u8]) -> u8x32<S> {
+fn extract_masks<S: Simd>(simd: S, masks: &[u8; 8]) -> u8x32<S> {
     let m1 =
         u32x4::splat(simd, u32::from_ne_bytes(masks[0..4].try_into().unwrap())).reinterpret_u8();
     let m2 =
diff --git a/sparse_strips/vello_cpu/src/fine/mod.rs b/sparse_strips/vello_cpu/src/fine/mod.rs
index 0b6326dad..60f8e2a21 100644
--- a/sparse_strips/vello_cpu/src/fine/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/mod.rs
@@ -35,6 +35,7 @@ pub use lowp::U8Kernel;
 use vello_common::fearless_simd::{
     Simd, SimdBase, SimdFloat, SimdInto, f32x4, f32x8, f32x16, u8x16, u8x32, u32x4, u32x8,
 };
+use vello_common::mask::Mask;
 use vello_common::pixmap::Pixmap;
 use vello_common::simd::Splat4thExt;
 use vello_common::util::f32_to_u8;
@@ -275,9 +276,12 @@ pub trait FineKernel<S: Simd>: Send + Sync + 'static {
     fn blend(
         simd: S,
         dest: &mut [Self::Numeric],
+        start_x: u16,
+        start_y: u16,
         src: impl Iterator<Item = Self::Composite>,
         blend_mode: BlendMode,
         alphas: Option<&[u8]>,
+        mask: Option<&Mask>,
     );
 }
 
@@ -334,6 +338,7 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
                         .unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
                     paints,
                     None,
+                    f.mask.as_ref(),
                 );
             }
             Cmd::AlphaFill(s) => {
@@ -345,6 +350,7 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
                         .unwrap_or(BlendMode::new(Mix::Normal, Compose::SrcOver)),
                     paints,
                     Some(&alphas[s.alpha_idx..]),
+                    s.mask.as_ref(),
                 );
             }
             Cmd::PushBuf => {
@@ -430,6 +436,7 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
         blend_mode: BlendMode,
         encoded_paints: &[EncodedPaint],
         alphas: Option<&[u8]>,
+        mask: Option<&Mask>,
     ) {
         let blend_buf = &mut self.blend_buf.last_mut().unwrap()[x * TILE_HEIGHT_COMPONENTS..]
             [..TILE_HEIGHT_COMPONENTS * width];
@@ -441,21 +448,31 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
 
                 // If color is completely opaque, we can just directly override
                 // the blend buffer.
-                if color[3] == T::Numeric::ONE && default_blend && alphas.is_none() {
+                if color[3] == T::Numeric::ONE
+                    && default_blend
+                    && alphas.is_none()
+                    && mask.is_none()
+                {
                     T::copy_solid(self.simd, blend_buf, color);
 
                     return;
                 }
 
-                if default_blend {
+                if default_blend && mask.is_none() {
                     T::alpha_composite_solid(self.simd, blend_buf, color, alphas);
                 } else {
+                    let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
+                    let start_y = self.wide_coords.1 * Tile::HEIGHT;
+
                     T::blend(
                         self.simd,
                         blend_buf,
+                        start_x,
+                        start_y,
                         iter::repeat(T::Composite::from_color(self.simd, color)),
                         blend_mode,
                         alphas,
+                        mask,
                     );
                 }
             }
@@ -475,17 +492,20 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
                         if $has_opacities || alphas.is_some() {
                             T::apply_painter(self.simd, color_buf, $filler);
 
-                            if default_blend {
+                            if default_blend && mask.is_none() {
                                 T::alpha_composite_buffer(self.simd, blend_buf, color_buf, alphas);
                             } else {
                                 T::blend(
                                     self.simd,
                                     blend_buf,
+                                    start_x,
+                                    start_y,
                                     color_buf
                                         .chunks_exact(T::Composite::LENGTH)
                                         .map(|s| T::Composite::from_slice(self.simd, s)),
                                     blend_mode,
                                     alphas,
+                                    mask,
                                 );
                             }
                         } else {
@@ -620,11 +640,16 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
             T::blend(
                 self.simd,
                 target_buffer,
+                // `start_x` and `start_y` are only needed to sample the correct position of
+                // a mask. No mask is passed to this call, so dummy values are fine here.
+                0,
+                0,
                 source_buffer
                     .chunks_exact(T::Composite::LENGTH)
                     .map(|s| T::Composite::from_slice(self.simd, s)),
                 blend_mode,
                 None,
+                None,
             );
         }
     }
diff --git a/sparse_strips/vello_cpu/src/render.rs b/sparse_strips/vello_cpu/src/render.rs
index ebcb5ef37..1989492ac 100644
--- a/sparse_strips/vello_cpu/src/render.rs
+++ b/sparse_strips/vello_cpu/src/render.rs
@@ -43,6 +43,7 @@ pub struct RenderContext {
     pub(crate) height: u16,
     pub(crate) paint: PaintType,
     pub(crate) paint_transform: Affine,
+    pub(crate) mask: Option<Mask>,
     pub(crate) stroke: Stroke,
     pub(crate) transform: Affine,
     pub(crate) fill_rule: Fill,
@@ -142,6 +143,7 @@ impl RenderContext {
             aliasing_threshold,
             paint,
             render_settings: settings,
+            mask: None,
             paint_transform,
             fill_rule,
             stroke,
@@ -178,6 +180,7 @@ impl RenderContext {
             self.transform,
             paint,
             self.aliasing_threshold,
+            self.mask.clone(),
         );
     }
 
@@ -190,6 +193,7 @@ impl RenderContext {
             self.transform,
             paint,
             self.aliasing_threshold,
+            self.mask.clone(),
         );
     }
 
@@ -203,6 +207,7 @@ impl RenderContext {
             self.transform,
             paint,
             self.aliasing_threshold,
+            self.mask.clone(),
         );
     }
 
@@ -254,6 +259,7 @@ impl RenderContext {
             self.transform,
             paint,
             self.aliasing_threshold,
+            self.mask.clone(),
         );
     }
 
@@ -267,6 +273,7 @@ impl RenderContext {
             self.transform,
             paint,
             self.aliasing_threshold,
+            self.mask.clone(),
         );
     }
 
@@ -398,6 +405,11 @@ impl RenderContext {
         self.fill_rule = fill_rule;
     }
 
+    /// Set the mask to use for subsequent path-painting operations, or `None` to clear it.
+    pub fn set_mask(&mut self, mask: Option<Mask>) {
+        self.mask = mask;
+    }
+
     /// Get the current fill rule.
     pub fn fill_rule(&self) -> &Fill {
         &self.fill_rule
@@ -422,6 +434,7 @@ impl RenderContext {
     pub fn reset(&mut self) {
         self.dispatcher.reset();
         self.encoded_paints.clear();
+        self.mask = None;
         self.reset_transform();
         self.reset_paint_transform();
         #[cfg(feature = "text")]
@@ -502,6 +515,7 @@ impl GlyphRenderer for RenderContext {
                     prepared_glyph.transform,
                     paint,
                     self.aliasing_threshold,
+                    self.mask.clone(),
                 );
             }
             GlyphType::Bitmap(glyph) => {
@@ -607,6 +621,7 @@ impl GlyphRenderer for RenderContext {
                     prepared_glyph.transform,
                     paint,
                     self.aliasing_threshold,
+                    self.mask.clone(),
                 );
             }
             GlyphType::Bitmap(_) | GlyphType::Colr(_) => {
diff --git a/sparse_strips/vello_cpu/src/util.rs b/sparse_strips/vello_cpu/src/util.rs
index 6acd24bde..14264e14e 100644
--- a/sparse_strips/vello_cpu/src/util.rs
+++ b/sparse_strips/vello_cpu/src/util.rs
@@ -6,11 +6,6 @@ use vello_common::encode::EncodedImage;
 use vello_common::fearless_simd::{Simd, SimdBase, f32x4, u8x32, u16x16, u16x32};
 use vello_common::math::FloatExt;
 
-#[allow(
-    dead_code,
-    reason = "this is not used because the division by 255 is now done with SIMD, but\
-we still keep it around to document its properties."
-)]
 pub(crate) mod scalar {
     /// Perform an approximate division by 255.
     ///
diff --git a/sparse_strips/vello_hybrid/src/scene.rs b/sparse_strips/vello_hybrid/src/scene.rs
index 40234d8c9..4f827d835 100644
--- a/sparse_strips/vello_hybrid/src/scene.rs
+++ b/sparse_strips/vello_hybrid/src/scene.rs
@@ -186,7 +186,7 @@ impl Scene {
             aliasing_threshold,
             &mut self.strip_storage,
         );
-        wide.generate(&self.strip_storage.strips, paint, 0);
+        wide.generate(&self.strip_storage.strips, paint, 0, None);
     }
 
     /// Stroke a path with the current paint and stroke settings.
@@ -217,7 +217,7 @@ impl Scene {
             &mut self.strip_storage,
         );
 
-        wide.generate(&self.strip_storage.strips, paint, 0);
+        wide.generate(&self.strip_storage.strips, paint, 0, None);
     }
 
     /// Set the aliasing threshold.
@@ -626,7 +626,8 @@ impl Scene {
             "Invalid strip range: start={start}, end={end}, count={count}"
         );
         let paint = self.encode_current_paint();
-        self.wide.generate(&adjusted_strips[start..end], paint, 0);
+        self.wide
+            .generate(&adjusted_strips[start..end], paint, 0, None);
     }
 
     /// Prepare cached strips for rendering by adjusting alpha indices and extending alpha buffer.
diff --git a/sparse_strips/vello_sparse_tests/snapshots/mask_non_isolated.png b/sparse_strips/vello_sparse_tests/snapshots/mask_non_isolated.png
new file mode 100644
index 000000000..2da6ace59
--- /dev/null
+++ b/sparse_strips/vello_sparse_tests/snapshots/mask_non_isolated.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fffe5139b6ad63b2b7251c97f19851607a9617d84c3ba733f991e8d5c94c2f32
+size 263
diff --git a/sparse_strips/vello_sparse_tests/tests/mask.rs b/sparse_strips/vello_sparse_tests/tests/mask.rs
index 39dd07eeb..c089f3018 100644
--- a/sparse_strips/vello_sparse_tests/tests/mask.rs
+++ b/sparse_strips/vello_sparse_tests/tests/mask.rs
@@ -77,3 +77,14 @@ fn mask_alpha(ctx: &mut impl Renderer) {
 fn mask_luminance(ctx: &mut impl Renderer) {
     mask(ctx, false);
 }
+
+#[vello_test(skip_hybrid)]
+fn mask_non_isolated(ctx: &mut impl Renderer) {
+    let mask = example_mask(false);
+
+    ctx.set_paint(BLACK);
+    ctx.fill_rect(&Rect::new(10.0, 10.0, 90.0, 90.0));
+    ctx.set_mask(Some(mask));
+    ctx.set_paint(RED);
+    ctx.fill_rect(&Rect::new(10.0, 10.0, 90.0, 90.0));
+}
diff --git a/sparse_strips/vello_sparse_tests/tests/renderer.rs b/sparse_strips/vello_sparse_tests/tests/renderer.rs
index e2ce9a185..f04a5e037 100644
--- a/sparse_strips/vello_sparse_tests/tests/renderer.rs
+++ b/sparse_strips/vello_sparse_tests/tests/renderer.rs
@@ -46,6 +46,7 @@ pub(crate) trait Renderer: Sized {
     fn push_mask_layer(&mut self, mask: Mask);
     fn pop_layer(&mut self);
     fn set_stroke(&mut self, stroke: Stroke);
+    fn set_mask(&mut self, mask: Option<Mask>);
     fn set_paint(&mut self, paint: impl Into<PaintType>);
     fn set_paint_transform(&mut self, affine: Affine);
     fn set_fill_rule(&mut self, fill_rule: Fill);
@@ -141,6 +142,10 @@ impl Renderer for RenderContext {
         Self::set_stroke(self, stroke);
     }
 
+    fn set_mask(&mut self, mask: Option<Mask>) {
+        Self::set_mask(self, mask);
+    }
+
     fn set_paint(&mut self, paint: impl Into<PaintType>) {
         Self::set_paint(self, paint);
     }
@@ -327,6 +332,10 @@ impl Renderer for HybridRenderer {
         self.scene.set_stroke(stroke);
     }
 
+    fn set_mask(&mut self, _: Option<Mask>) {
+        unimplemented!()
+    }
+
     fn set_paint(&mut self, paint: impl Into<PaintType>) {
         let paint_type: PaintType = paint.into();
         match paint_type {
@@ -611,6 +620,10 @@ impl Renderer for HybridRenderer {
         self.scene.set_stroke(stroke);
     }
 
+    fn set_mask(&mut self, _: Option<Mask>) {
+        unimplemented!()
+    }
+
     fn set_paint(&mut self, paint: impl Into<PaintType>) {
         let paint_type: PaintType = paint.into();
         match paint_type {
diff --git a/sparse_strips/vello_toy/src/debug.rs b/sparse_strips/vello_toy/src/debug.rs
index 01732a8f6..d9331848e 100644
--- a/sparse_strips/vello_toy/src/debug.rs
+++ b/sparse_strips/vello_toy/src/debug.rs
@@ -87,7 +87,7 @@ fn main() {
     }
 
     if stages.iter().any(|s| s.requires_wide_tiles()) {
-        wide.generate(&strip_buf, BLACK.into(), 0);
+        wide.generate(&strip_buf, BLACK.into(), 0, None);
     }
 
     draw_grid(&mut document, args.width, args.height);