diff --git a/src/decoder.rs b/src/decoder.rs
index de2c64b..b36275b 100644
--- a/src/decoder.rs
+++ b/src/decoder.rs
@@ -271,6 +271,43 @@ pub enum LoopCount {
     Times(NonZeroU16),
 }
 
+/// WebP decoder configuration options
+#[derive(Clone)]
+#[non_exhaustive]
+pub struct WebPDecodeOptions {
+    /// The upsampling method used in conversion from lossy yuv to rgb
+    ///
+    /// Defaults to `Bilinear`.
+    pub lossy_upsampling: UpsamplingMethod,
+}
+
+impl Default for WebPDecodeOptions {
+    fn default() -> Self {
+        Self {
+            lossy_upsampling: UpsamplingMethod::Bilinear,
+        }
+    }
+}
+
+/// Methods for upsampling the chroma values in lossy decoding
+///
+/// The chroma red and blue planes are encoded in VP8 at half the width and height of the luma plane.
+/// Therefore we need to upsample these values so that every pixel in the image gets one.
+#[derive(Clone, Copy, Default)]
+pub enum UpsamplingMethod {
+    /// Fancy upsampling
+    ///
+    /// Does bilinear interpolation using the 4 values nearest to the pixel, weighting based on the distance
+    /// from the pixel.
+    #[default]
+    Bilinear,
+    /// Simple upsampling, just uses the closest u/v value to the pixel when upsampling
+    ///
+    /// Matches the -nofancy option in dwebp.
+    /// Should be faster but may lead to slightly jagged edges.
+    Simple,
+}
+
 /// WebP image format decoder.
 pub struct WebPDecoder<R> {
     r: R,
@@ -289,12 +326,23 @@ pub struct WebPDecoder<R> {
     loop_duration: u64,
 
     chunks: HashMap<WebPRiffChunk, Range<u64>>,
+
+    webp_decode_options: WebPDecodeOptions,
 }
 
 impl<R: BufRead + Seek> WebPDecoder<R> {
     /// Create a new `WebPDecoder` from the reader `r`. The decoder performs many small reads, so the
     /// reader should be buffered.
     pub fn new(r: R) -> Result<Self, DecodingError> {
+        Self::new_with_options(r, WebPDecodeOptions::default())
+    }
+
+    /// Create a new `WebPDecoder` from the reader `r` with the options `WebPDecodeOptions`. The decoder
+    /// performs many small reads, so the reader should be buffered.
+    pub fn new_with_options(
+        r: R,
+        webp_decode_options: WebPDecodeOptions,
+    ) -> Result<Self, DecodingError> {
         let mut decoder = Self {
             r,
             width: 0,
@@ -308,6 +356,7 @@ impl<R: BufRead + Seek> WebPDecoder<R> {
             has_alpha: false,
             loop_count: LoopCount::Times(NonZeroU16::new(1).unwrap()),
             loop_duration: 0,
+            webp_decode_options,
         };
         decoder.read_data()?;
         Ok(decoder)
@@ -653,7 +702,7 @@ impl<R: BufRead + Seek> WebPDecoder<R> {
             }
 
             if self.has_alpha() {
-                frame.fill_rgba(buf);
+                frame.fill_rgba(buf, self.webp_decode_options.lossy_upsampling);
 
                 let range = self
                     .chunks
@@ -684,7 +733,7 @@ impl<R: BufRead + Seek> WebPDecoder<R> {
                     }
                 }
             } else {
-                frame.fill_rgb(buf);
+                frame.fill_rgb(buf, self.webp_decode_options.lossy_upsampling);
             }
         }
 
@@ -758,7 +807,7 @@ impl<R: BufRead + Seek> WebPDecoder<R> {
                     return Err(DecodingError::InconsistentImageSizes);
                 }
                 let mut rgb_frame = vec![0; frame_width as usize * frame_height as usize * 3];
-                raw_frame.fill_rgb(&mut rgb_frame);
+                raw_frame.fill_rgb(&mut rgb_frame, self.webp_decode_options.lossy_upsampling);
                 (rgb_frame, false)
             }
             WebPRiffChunk::VP8L => {
@@ -789,7 +838,7 @@ impl<R: BufRead + Seek> WebPDecoder<R> {
                 let frame = Vp8Decoder::decode_frame((&mut self.r).take(next_chunk_size))?;
 
                 let mut rgba_frame = vec![0; frame_width as usize * frame_height as usize * 4];
-                frame.fill_rgba(&mut rgba_frame);
+                frame.fill_rgba(&mut rgba_frame, self.webp_decode_options.lossy_upsampling);
 
                 for y in 0..frame.height {
                     for x in 0..frame.width {
@@ -880,6 +929,11 @@ impl<R: BufRead + Seek> WebPDecoder<R> {
         self.animation.next_frame_start = self.chunks.get(&WebPRiffChunk::ANMF).unwrap().start - 8;
         self.animation.dispose_next_frame = true;
     }
+
+    /// Sets the upsampling method that is used in lossy decoding
+    pub fn set_lossy_upsampling(&mut self, upsampling_method: UpsamplingMethod) {
+        self.webp_decode_options.lossy_upsampling = upsampling_method;
+    }
 }
 
 pub(crate) fn range_reader<R: BufRead + Seek>(
diff --git a/src/lib.rs b/src/lib.rs
index dedc9f8..fe29eae 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -9,7 +9,9 @@
 #[cfg(all(test, feature = "_benchmarks"))]
 extern crate test;
 
-pub use self::decoder::{DecodingError, LoopCount, WebPDecoder};
+pub use self::decoder::{
+    DecodingError, LoopCount, UpsamplingMethod, WebPDecodeOptions, WebPDecoder,
+};
 pub use self::encoder::{ColorType, EncoderParams, EncodingError, WebPEncoder};
 
 mod alpha_blending;
@@ -22,5 +24,6 @@ mod lossless;
 mod lossless_transform;
 mod transform;
 mod vp8_arithmetic_decoder;
+mod yuv;
 
 pub mod vp8;
diff --git a/src/vp8.rs b/src/vp8.rs
index 47ec1bd..ac14b93 100644
--- a/src/vp8.rs
+++ b/src/vp8.rs
@@ -14,7 +14,8 @@ use byteorder_lite::{LittleEndian, ReadBytesExt};
 use std::default::Default;
 use std::io::Read;
 
-use crate::decoder::DecodingError;
+use crate::decoder::{DecodingError, UpsamplingMethod};
+use crate::yuv;
 
 use super::vp8_arithmetic_decoder::ArithmeticDecoder;
 use super::{loop_filter, transform};
@@ -802,7 +803,6 @@ pub struct Frame {
 }
 
 impl Frame {
-    /// Chroma plane is half the size of the Luma plane
     const fn chroma_width(&self) -> u16 {
         self.width.div_ceil(2)
     }
@@ -816,168 +816,65 @@ impl Frame {
         }
     }
 
-    /// Fills an rgb buffer with the image
-    pub(crate) fn fill_rgb(&self, buf: &mut [u8]) {
+    /// Fills an rgb buffer from the YUV buffers
+    pub(crate) fn fill_rgb(&self, buf: &mut [u8], upsampling_method: UpsamplingMethod) {
         const BPP: usize = 3;
 
-        let buffer_width = usize::from(self.buffer_width());
-
-        let u_row_twice_iter = self
-            .ubuf
-            .chunks_exact(buffer_width / 2)
-            .flat_map(|n| std::iter::repeat(n).take(2));
-        let v_row_twice_iter = self
-            .vbuf
-            .chunks_exact(buffer_width / 2)
-            .flat_map(|n| std::iter::repeat(n).take(2));
-
-        for (((row, y_row), u_row), v_row) in buf
-            .chunks_exact_mut(usize::from(self.width) * BPP)
-            .zip(self.ybuf.chunks_exact(buffer_width))
-            .zip(u_row_twice_iter)
-            .zip(v_row_twice_iter)
-        {
-            Self::fill_rgb_row(
-                &y_row[..usize::from(self.width)],
-                &u_row[..usize::from(self.chroma_width())],
-                &v_row[..usize::from(self.chroma_width())],
-                row,
-            );
-        }
-    }
-
-    fn fill_rgb_row(y_vec: &[u8], u_vec: &[u8], v_vec: &[u8], rgb: &mut [u8]) {
-        // Fill 2 pixels per iteration: these pixels share `u` and `v` components
-        let mut rgb_chunks = rgb.chunks_exact_mut(6);
-        let mut y_chunks = y_vec.chunks_exact(2);
-        let mut u_iter = u_vec.iter();
-        let mut v_iter = v_vec.iter();
-
-        for (((rgb, y), &u), &v) in (&mut rgb_chunks)
-            .zip(&mut y_chunks)
-            .zip(&mut u_iter)
-            .zip(&mut v_iter)
-        {
-            let coeffs = [
-                mulhi(v, 26149),
-                mulhi(u, 6419),
-                mulhi(v, 13320),
-                mulhi(u, 33050),
-            ];
-
-            rgb[0] = clip(mulhi(y[0], 19077) + coeffs[0] - 14234);
-            rgb[1] = clip(mulhi(y[0], 19077) - coeffs[1] - coeffs[2] + 8708);
-            rgb[2] = clip(mulhi(y[0], 19077) + coeffs[3] - 17685);
-
-            rgb[3] = clip(mulhi(y[1], 19077) + coeffs[0] - 14234);
-            rgb[4] = clip(mulhi(y[1], 19077) - coeffs[1] - coeffs[2] + 8708);
-            rgb[5] = clip(mulhi(y[1], 19077) + coeffs[3] - 17685);
-        }
-
-        let remainder = rgb_chunks.into_remainder();
-        if remainder.len() >= 3 {
-            if let (Some(&y), Some(&u), Some(&v)) = (
-                y_chunks.remainder().iter().next(),
-                u_iter.next(),
-                v_iter.next(),
-            ) {
-                let coeffs = [
-                    mulhi(v, 26149),
-                    mulhi(u, 6419),
-                    mulhi(v, 13320),
-                    mulhi(u, 33050),
-                ];
-
-                remainder[0] = clip(mulhi(y, 19077) + coeffs[0] - 14234);
-                remainder[1] = clip(mulhi(y, 19077) - coeffs[1] - coeffs[2] + 8708);
-                remainder[2] = clip(mulhi(y, 19077) + coeffs[3] - 17685);
+        match upsampling_method {
+            UpsamplingMethod::Bilinear => {
+                yuv::fill_rgb_buffer_fancy::<BPP>(
+                    buf,
+                    &self.ybuf,
+                    &self.ubuf,
+                    &self.vbuf,
+                    usize::from(self.width),
+                    usize::from(self.height),
+                    usize::from(self.buffer_width()),
+                );
+            }
+            UpsamplingMethod::Simple => {
+                yuv::fill_rgb_buffer_simple::<BPP>(
+                    buf,
+                    &self.ybuf,
+                    &self.ubuf,
+                    &self.vbuf,
+                    usize::from(self.width),
+                    usize::from(self.chroma_width()),
+                    usize::from(self.buffer_width()),
+                );
             }
         }
     }
 
-    /// Fills an rgba buffer by skipping the alpha values
-    pub(crate) fn fill_rgba(&self, buf: &mut [u8]) {
+    /// Fills the rgb channels of an rgba buffer from the YUV buffers, leaving alpha untouched
+    pub(crate) fn fill_rgba(&self, buf: &mut [u8], upsampling_method: UpsamplingMethod) {
         const BPP: usize = 4;
 
-        let buffer_width = usize::from(self.buffer_width());
-
-        let u_row_twice_iter = self
-            .ubuf
-            .chunks_exact(buffer_width / 2)
-            .flat_map(|n| std::iter::repeat(n).take(2));
-        let v_row_twice_iter = self
-            .vbuf
-            .chunks_exact(buffer_width / 2)
-            .flat_map(|n| std::iter::repeat(n).take(2));
-
-        for (((row, y_row), u_row), v_row) in buf
-            .chunks_exact_mut(usize::from(self.width) * BPP)
-            .zip(self.ybuf.chunks_exact(buffer_width))
-            .zip(u_row_twice_iter)
-            .zip(v_row_twice_iter)
-        {
-            Self::fill_rgba_row(
-                &y_row[..usize::from(self.width)],
-                &u_row[..usize::from(self.chroma_width())],
-                &v_row[..usize::from(self.chroma_width())],
-                row,
-            );
-        }
-    }
-
-    fn fill_rgba_row(y_vec: &[u8], u_vec: &[u8], v_vec: &[u8], rgba: &mut [u8]) {
-        // Fill 2 pixels per iteration: these pixels share `u` and `v` components
-        let mut rgb_chunks = rgba.chunks_exact_mut(8);
-        let mut y_chunks = y_vec.chunks_exact(2);
-        let mut u_iter = u_vec.iter();
-        let mut v_iter = v_vec.iter();
-
-        for (((rgb, y), &u), &v) in (&mut rgb_chunks)
-            .zip(&mut y_chunks)
-            .zip(&mut u_iter)
-            .zip(&mut v_iter)
-        {
-            let coeffs = [
-                mulhi(v, 26149),
-                mulhi(u, 6419),
-                mulhi(v, 13320),
-                mulhi(u, 33050),
-            ];
-
-            let to_copy = [
-                clip(mulhi(y[0], 19077) + coeffs[0] - 14234),
-                clip(mulhi(y[0], 19077) - coeffs[1] - coeffs[2] + 8708),
-                clip(mulhi(y[0], 19077) + coeffs[3] - 17685),
-                rgb[3],
-                clip(mulhi(y[1], 19077) + coeffs[0] - 14234),
-                clip(mulhi(y[1], 19077) - coeffs[1] - coeffs[2] + 8708),
-                clip(mulhi(y[1], 19077) + coeffs[3] - 17685),
-                rgb[7],
-            ];
-            rgb.copy_from_slice(&to_copy);
-        }
-
-        let remainder = rgb_chunks.into_remainder();
-        if remainder.len() >= 4 {
-            if let (Some(&y), Some(&u), Some(&v)) = (
-                y_chunks.remainder().iter().next(),
-                u_iter.next(),
-                v_iter.next(),
-            ) {
-                let coeffs = [
-                    mulhi(v, 26149),
-                    mulhi(u, 6419),
-                    mulhi(v, 13320),
-                    mulhi(u, 33050),
-                ];
-
-                remainder[0] = clip(mulhi(y, 19077) + coeffs[0] - 14234);
-                remainder[1] = clip(mulhi(y, 19077) - coeffs[1] - coeffs[2] + 8708);
-                remainder[2] = clip(mulhi(y, 19077) + coeffs[3] - 17685);
+        match upsampling_method {
+            UpsamplingMethod::Bilinear => {
+                yuv::fill_rgb_buffer_fancy::<BPP>(
+                    buf,
+                    &self.ybuf,
+                    &self.ubuf,
+                    &self.vbuf,
+                    usize::from(self.width),
+                    usize::from(self.height),
+                    usize::from(self.buffer_width()),
+                );
+            }
+            UpsamplingMethod::Simple => {
+                yuv::fill_rgb_buffer_simple::<BPP>(
+                    buf,
+                    &self.ybuf,
+                    &self.ubuf,
+                    &self.vbuf,
+                    usize::from(self.width),
+                    usize::from(self.chroma_width()),
+                    usize::from(self.buffer_width()),
+                );
             }
         }
     }
-
     /// Gets the buffer size
     #[must_use]
     pub fn get_buf_size(&self) -> usize {
@@ -985,36 +882,6 @@ impl Frame {
     }
 }
 
-/// `_mm_mulhi_epu16` emulation used in `Frame::fill_rgb` and `Frame::fill_rgba`.
-fn mulhi(v: u8, coeff: u16) -> i32 {
-    ((u32::from(v) * u32::from(coeff)) >> 8) as i32
-}
-
-/// Used in `Frame::fill_rgb` and `Frame::fill_rgba`.
-/// This function has been rewritten to encourage auto-vectorization.
-///
-/// Based on [src/dsp/yuv.h](https://github.com/webmproject/libwebp/blob/8534f53960befac04c9631e6e50d21dcb42dfeaf/src/dsp/yuv.h#L79)
-/// from the libwebp source.
-/// ```text
-/// const YUV_FIX2: i32 = 6;
-/// const YUV_MASK2: i32 = (256 << YUV_FIX2) - 1;
-/// fn clip(v: i32) -> u8 {
-///     if (v & !YUV_MASK2) == 0 {
-///         (v >> YUV_FIX2) as u8
-///     } else if v < 0 {
-///         0
-///     } else {
-///         255
-///     }
-/// }
-/// ```
-// Clippy suggests the clamp method, but it seems to optimize worse as of rustc 1.82.0 nightly.
-#[allow(clippy::manual_clamp)]
-fn clip(v: i32) -> u8 {
-    const YUV_FIX2: i32 = 6;
-    (v >> YUV_FIX2).max(0).min(255) as u8
-}
-
 #[derive(Clone, Copy, Default)]
 struct Segment {
     ydc: i16,
diff --git a/src/yuv.rs b/src/yuv.rs
new file mode 100644
index 0000000..3594f04
--- /dev/null
+++ b/src/yuv.rs
@@ -0,0 +1,452 @@
+//! Utilities for doing the YUV -> RGB conversion.
+//! The images are encoded in the Y'CbCr format as detailed here: <https://en.wikipedia.org/wiki/YCbCr>
+//! so they need to be converted to RGB to be displayed.
+//! To do the YUV -> RGB conversion we first need to decide how to map the yuv values to the pixels.
+//! The y buffer is the same size as the pixel buffer so that maps 1-1, but the
+//! u and v buffers are half the size of the pixel buffer so we need to scale them up.
+//! The simple way to upscale is just to take each u/v value and associate it with the 4
+//! pixels around it, e.g. for a 4x4 image:
+//!
+//! ||||||
+//! |yyyy|
+//! |yyyy|
+//! |yyyy|
+//! |yyyy|
+//! ||||||
+//!
+//! |||||||
+//! |uu|vv|
+//! |uu|vv|
+//! |||||||
+//!
+//! Then each of the 2x2 pixels would match the u/v from the same quadrant.
+//!
+//! However, fancy upsampling is the default for libwebp, which does a little more work to make the values smoother.
+//! It interpolates u and v so that e.g. for the pixel 1 down and 1 from the left the u value
+//! would be (9*u0 + 3*u1 + 3*u2 + u3 + 8) / 16, and similarly for the other pixels.
+//! The edges are mirrored, so for the pixel 1 down and 0 from the left it uses (9*u0 + 3*u2 + 3*u0 + u2 + 8) / 16.
+
+/// `_mm_mulhi_epu16` emulation
+fn mulhi(v: u8, coeff: u16) -> i32 {
+    ((u32::from(v) * u32::from(coeff)) >> 8) as i32
+}
+
+/// This function has been rewritten to encourage auto-vectorization.
+///
+/// Based on [src/dsp/yuv.h](https://github.com/webmproject/libwebp/blob/8534f53960befac04c9631e6e50d21dcb42dfeaf/src/dsp/yuv.h#L79)
+/// from the libwebp source.
+/// ```text
+/// const YUV_FIX2: i32 = 6;
+/// const YUV_MASK2: i32 = (256 << YUV_FIX2) - 1;
+/// fn clip(v: i32) -> u8 {
+///     if (v & !YUV_MASK2) == 0 {
+///         (v >> YUV_FIX2) as u8
+///     } else if v < 0 {
+///         0
+///     } else {
+///         255
+///     }
+/// }
+/// ```
+// Clippy suggests the clamp method, but it seems to optimize worse as of rustc 1.82.0 nightly.
+#[allow(clippy::manual_clamp)]
+fn clip(v: i32) -> u8 {
+    const YUV_FIX2: i32 = 6;
+    (v >> YUV_FIX2).max(0).min(255) as u8
+}
+
+#[inline(always)]
+fn yuv_to_r(y: u8, v: u8) -> u8 {
+    clip(mulhi(y, 19077) + mulhi(v, 26149) - 14234)
+}
+
+#[inline(always)]
+fn yuv_to_g(y: u8, u: u8, v: u8) -> u8 {
+    clip(mulhi(y, 19077) - mulhi(u, 6419) - mulhi(v, 13320) + 8708)
+}
+
+#[inline(always)]
+fn yuv_to_b(y: u8, u: u8) -> u8 {
+    clip(mulhi(y, 19077) + mulhi(u, 33050) - 17685)
+}
+
+/// Fills an rgb buffer with the image from the yuv buffers
+/// Size of the buffer is assumed to be correct
+/// BPP is short for bytes per pixel, allows both rgb and rgba to be decoded
+pub(crate) fn fill_rgb_buffer_fancy<const BPP: usize>(
+    buffer: &mut [u8],
+    y_buffer: &[u8],
+    u_buffer: &[u8],
+    v_buffer: &[u8],
+    width: usize,
+    height: usize,
+    buffer_width: usize,
+) {
+    // buffer width is always even so don't need to do div_ceil
+    let chroma_buffer_width = buffer_width / 2;
+    let chroma_width = width.div_ceil(2);
+
+    // fill top row first since it only uses the top u/v row
+    let top_row_y = &y_buffer[..width];
+    let top_row_u = &u_buffer[..chroma_width];
+    let top_row_v = &v_buffer[..chroma_width];
+    let top_row_buffer = &mut buffer[..width * BPP];
+    fill_row_fancy_with_1_uv_row::<BPP>(top_row_buffer, top_row_y, top_row_u, top_row_v);
+
+    let mut main_row_chunks = buffer[width * BPP..].chunks_exact_mut(width * BPP * 2);
+    // the y buffer iterator limits the end of the row iterator so we need this end index
+    let end_y_index = height * buffer_width;
+    let mut main_y_chunks = y_buffer[buffer_width..end_y_index].chunks_exact(buffer_width * 2);
+    let mut main_u_windows = u_buffer
+        .windows(chroma_buffer_width * 2)
+        .step_by(chroma_buffer_width);
+    let mut main_v_windows = v_buffer
+        .windows(chroma_buffer_width * 2)
+        .step_by(chroma_buffer_width);
+
+    for (((row_buffer, y_rows), u_rows), v_rows) in (&mut main_row_chunks)
+        .zip(&mut main_y_chunks)
+        .zip(&mut main_u_windows)
+        .zip(&mut main_v_windows)
+    {
+        let (u_row_1, u_row_2) = u_rows.split_at(chroma_buffer_width);
+        let (v_row_1, v_row_2) = v_rows.split_at(chroma_buffer_width);
+        let (row_buf_1, row_buf_2) = row_buffer.split_at_mut(width * BPP);
+        let (y_row_1, y_row_2) = y_rows.split_at(buffer_width);
+        fill_row_fancy_with_2_uv_rows::<BPP>(
+            row_buf_1,
+            &y_row_1[..width],
+            &u_row_1[..chroma_width],
+            &u_row_2[..chroma_width],
+            &v_row_1[..chroma_width],
+            &v_row_2[..chroma_width],
+        );
+        fill_row_fancy_with_2_uv_rows::<BPP>(
+            row_buf_2,
+            &y_row_2[..width],
+            &u_row_2[..chroma_width],
+            &u_row_1[..chroma_width],
+            &v_row_2[..chroma_width],
+            &v_row_1[..chroma_width],
+        );
+    }
+
+    let final_row_buffer = main_row_chunks.into_remainder();
+
+    // if the image has even height there will be one final row with only one u/v row matching it
+    if !final_row_buffer.is_empty() {
+        let final_y_row = main_y_chunks.remainder();
+
+        let chroma_height = height.div_ceil(2);
+        let start_chroma_index = (chroma_height - 1) * chroma_buffer_width;
+
+        let final_u_row = &u_buffer[start_chroma_index..];
+        let final_v_row = &v_buffer[start_chroma_index..];
+        fill_row_fancy_with_1_uv_row::<BPP>(
+            final_row_buffer,
+            &final_y_row[..width],
+            &final_u_row[..chroma_width],
+            &final_v_row[..chroma_width],
+        );
+    }
+}
+
+/// Fills a row with the fancy interpolation detailed in the module docs; the `*_row_1` values are weighted above `*_row_2`
+fn fill_row_fancy_with_2_uv_rows<const BPP: usize>(
+    row_buffer: &mut [u8],
+    y_row: &[u8],
+    u_row_1: &[u8],
+    u_row_2: &[u8],
+    v_row_1: &[u8],
+    v_row_2: &[u8],
+) {
+    // need to do left pixel separately since it will only have one u/v value
+    {
+        let rgb1 = &mut row_buffer[0..3];
+        let y_value = y_row[0];
+        // first pixel uses the first u/v as the main one
+        let u_value = get_fancy_chroma_value(u_row_1[0], u_row_1[0], u_row_2[0], u_row_2[0]);
+        let v_value = get_fancy_chroma_value(v_row_1[0], v_row_1[0], v_row_2[0], v_row_2[0]);
+        set_pixel(rgb1, y_value, u_value, v_value);
+    }
+
+    let rest_row_buffer = &mut row_buffer[BPP..];
+    let rest_y_row = &y_row[1..];
+
+    // we do two pixels at a time since they share the same u/v values
+    let mut main_row_chunks = rest_row_buffer.chunks_exact_mut(BPP * 2);
+    let mut main_y_chunks = rest_y_row.chunks_exact(2);
+
+    for (((((rgb, y_val), u_val_1), u_val_2), v_val_1), v_val_2) in (&mut main_row_chunks)
+        .zip(&mut main_y_chunks)
+        .zip(u_row_1.windows(2))
+        .zip(u_row_2.windows(2))
+        .zip(v_row_1.windows(2))
+        .zip(v_row_2.windows(2))
+    {
+        {
+            let rgb1 = &mut rgb[0..3];
+            let y_value = y_val[0];
+            // first pixel uses the first u/v as the main one
+            let u_value = get_fancy_chroma_value(u_val_1[0], u_val_1[1], u_val_2[0], u_val_2[1]);
+            let v_value = get_fancy_chroma_value(v_val_1[0], v_val_1[1], v_val_2[0], v_val_2[1]);
+            set_pixel(rgb1, y_value, u_value, v_value);
+        }
+        {
+            let rgb2 = &mut rgb[BPP..];
+            let y_value = y_val[1];
+            let u_value = get_fancy_chroma_value(u_val_1[1], u_val_1[0], u_val_2[1], u_val_2[0]);
+            let v_value = get_fancy_chroma_value(v_val_1[1], v_val_1[0], v_val_2[1], v_val_2[0]);
+            set_pixel(rgb2, y_value, u_value, v_value);
+        }
+    }
+
+    let final_pixel = main_row_chunks.into_remainder();
+    let final_y = main_y_chunks.remainder();
+
+    if let (rgb, [y_value]) = (final_pixel, final_y) {
+        let final_u_1 = *u_row_1.last().unwrap();
+        let final_u_2 = *u_row_2.last().unwrap();
+
+        let final_v_1 = *v_row_1.last().unwrap();
+        let final_v_2 = *v_row_2.last().unwrap();
+
+        let rgb1 = &mut rgb[0..3];
+        // last pixel has no further u/v column, so the final u/v column is mirrored
+        let u_value = get_fancy_chroma_value(final_u_1, final_u_1, final_u_2, final_u_2);
+        let v_value = get_fancy_chroma_value(final_v_1, final_v_1, final_v_2, final_v_2);
+        set_pixel(rgb1, *y_value, u_value, v_value);
+    }
+}
+
+fn fill_row_fancy_with_1_uv_row<const BPP: usize>(
+    row_buffer: &mut [u8],
+    y_row: &[u8],
+    u_row: &[u8],
+    v_row: &[u8],
+) {
+    // doing left pixel first
+    {
+        let rgb1 = &mut row_buffer[0..3];
+        let y_value = y_row[0];
+
+        let u_value = u_row[0];
+        let v_value = v_row[0];
+        set_pixel(rgb1, y_value, u_value, v_value);
+    }
+
+    // two pixels at a time since they share the same u/v value
+    let mut main_row_chunks = row_buffer[BPP..].chunks_exact_mut(BPP * 2);
+    let mut main_y_row_chunks = y_row[1..].chunks_exact(2);
+
+    for (((rgb, y_val), u_val), v_val) in (&mut main_row_chunks)
+        .zip(&mut main_y_row_chunks)
+        .zip(u_row.windows(2))
+        .zip(v_row.windows(2))
+    {
+        {
+            let rgb1 = &mut rgb[0..3];
+            let y_value = y_val[0];
+            // first pixel uses the first u/v as the main one
+            let u_value = get_fancy_chroma_value(u_val[0], u_val[1], u_val[0], u_val[1]);
+            let v_value = get_fancy_chroma_value(v_val[0], v_val[1], v_val[0], v_val[1]);
+            set_pixel(rgb1, y_value, u_value, v_value);
+        }
+        {
+            let rgb2 = &mut rgb[BPP..];
+            let y_value = y_val[1];
+            let u_value = get_fancy_chroma_value(u_val[1], u_val[0], u_val[1], u_val[0]);
+            let v_value = get_fancy_chroma_value(v_val[1], v_val[0], v_val[1], v_val[0]);
+            set_pixel(rgb2, y_value, u_value, v_value);
+        }
+    }
+
+    let final_pixel = main_row_chunks.into_remainder();
+    let final_y = main_y_row_chunks.remainder();
+
+    if let (rgb, [final_y]) = (final_pixel, final_y) {
+        let final_u = *u_row.last().unwrap();
+        let final_v = *v_row.last().unwrap();
+
+        set_pixel(rgb, *final_y, final_u, final_v);
+    }
+}
+
+#[inline]
+fn get_fancy_chroma_value(main: u8, secondary1: u8, secondary2: u8, tertiary: u8) -> u8 {
+    let val0 = u16::from(main);
+    let val1 = u16::from(secondary1);
+    let val2 = u16::from(secondary2);
+    let val3 = u16::from(tertiary);
+    ((9 * val0 + 3 * val1 + 3 * val2 + val3 + 8) / 16) as u8
+}
+
+#[inline]
+fn set_pixel(rgb: &mut [u8], y: u8, u: u8, v: u8) {
+    rgb[0] = yuv_to_r(y, v);
+    rgb[1] = yuv_to_g(y, u, v);
+    rgb[2] = yuv_to_b(y, u);
+}
+
+/// Simple conversion: each u/v value is shared by a 2x2 block of pixels without interpolation
+#[allow(unused)]
+pub(crate) fn fill_rgb_buffer_simple<const BPP: usize>(
+    buffer: &mut [u8],
+    y_buffer: &[u8],
+    u_buffer: &[u8],
+    v_buffer: &[u8],
+    width: usize,
+    chroma_width: usize,
+    buffer_width: usize,
+) {
+    let u_row_twice_iter = u_buffer
+        .chunks_exact(buffer_width / 2)
+        .flat_map(|n| std::iter::repeat(n).take(2));
+    let v_row_twice_iter = v_buffer
+        .chunks_exact(buffer_width / 2)
+        .flat_map(|n| std::iter::repeat(n).take(2));
+
+    for (((row, y_row), u_row), v_row) in buffer
+        .chunks_exact_mut(width * BPP)
+        .zip(y_buffer.chunks_exact(buffer_width))
+        .zip(u_row_twice_iter)
+        .zip(v_row_twice_iter)
+    {
+        fill_rgba_row_simple::<BPP>(
+            &y_row[..width],
+            &u_row[..chroma_width],
+            &v_row[..chroma_width],
+            row,
+        );
+    }
+}
+
+fn fill_rgba_row_simple<const BPP: usize>(
+    y_vec: &[u8],
+    u_vec: &[u8],
+    v_vec: &[u8],
+    rgba: &mut [u8],
+) {
+    // Fill 2 pixels per iteration: these pixels share `u` and `v` components
+    let mut rgb_chunks = rgba.chunks_exact_mut(BPP * 2);
+    let mut y_chunks = y_vec.chunks_exact(2);
+    let mut u_iter = u_vec.iter();
+    let mut v_iter = v_vec.iter();
+
+    for (((rgb, y), &u), &v) in (&mut rgb_chunks)
+        .zip(&mut y_chunks)
+        .zip(&mut u_iter)
+        .zip(&mut v_iter)
+    {
+        let coeffs = [
+            mulhi(v, 26149),
+            mulhi(u, 6419),
+            mulhi(v, 13320),
+            mulhi(u, 33050),
+        ];
+
+        let get_r = |y: u8| clip(mulhi(y, 19077) + coeffs[0] - 14234);
+        let get_g = |y: u8| clip(mulhi(y, 19077) - coeffs[1] - coeffs[2] + 8708);
+        let get_b = |y: u8| clip(mulhi(y, 19077) + coeffs[3] - 17685);
+
+        let rgb1 = &mut rgb[0..3];
+        rgb1[0] = get_r(y[0]);
+        rgb1[1] = get_g(y[0]);
+        rgb1[2] = get_b(y[0]);
+
+        let rgb2 = &mut rgb[BPP..];
+        rgb2[0] = get_r(y[1]);
+        rgb2[1] = get_g(y[1]);
+        rgb2[2] = get_b(y[1]);
+    }
+
+    let remainder = rgb_chunks.into_remainder();
+    if remainder.len() >= 3 {
+        if let (Some(&y), Some(&u), Some(&v)) = (
+            y_chunks.remainder().iter().next(),
+            u_iter.next(),
+            v_iter.next(),
+        ) {
+            let coeffs = [
+                mulhi(v, 26149),
+                mulhi(u, 6419),
+                mulhi(v, 13320),
+                mulhi(u, 33050),
+            ];
+
+            remainder[0] = clip(mulhi(y, 19077) + coeffs[0] - 14234);
+            remainder[1] = clip(mulhi(y, 19077) - coeffs[1] - coeffs[2] + 8708);
+            remainder[2] = clip(mulhi(y, 19077) + coeffs[3] - 17685);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_fancy_grid() {
+        #[rustfmt::skip]
+        let y_buffer = [
+            77, 162, 202, 185,
+            28, 13, 199, 182,
+            135, 147, 164, 135, 
+            66, 27, 171, 130,
+        ];
+
+        #[rustfmt::skip]
+        let u_buffer = [
+            34, 101, 
+            123, 163
+        ];
+
+        #[rustfmt::skip]
+        let v_buffer = [
+            97, 167,
+            149, 23,
+        ];
+
+        let mut rgb_buffer = [0u8; 16 * 3];
+        fill_rgb_buffer_fancy::<3>(&mut rgb_buffer, &y_buffer, &u_buffer, &v_buffer, 4, 4, 4);
+
+        #[rustfmt::skip]
+        let upsampled_u_buffer = [
+            34, 51, 84, 101,
+            56, 71, 101, 117,
+            101, 112, 136, 148,
+            123, 133, 153, 163,
+        ];
+
+        #[rustfmt::skip]
+        let upsampled_v_buffer = [
+            97, 115, 150, 167,
+            110, 115, 126, 131,
+            136, 117, 78, 59,
+            149, 118, 55, 23,
+        ];
+
+        let mut upsampled_rgb_buffer = [0u8; 16 * 3];
+        for (((rgb_val, y), u), v) in upsampled_rgb_buffer
+            .chunks_exact_mut(3)
+            .zip(y_buffer)
+            .zip(upsampled_u_buffer)
+            .zip(upsampled_v_buffer)
+        {
+            rgb_val[0] = yuv_to_r(y, v);
+            rgb_val[1] = yuv_to_g(y, u, v);
+            rgb_val[2] = yuv_to_b(y, u);
+        }
+
+        assert_eq!(rgb_buffer, upsampled_rgb_buffer);
+    }
+
+    #[test]
+    fn test_yuv_conversions() {
+        let (y, u, v) = (203, 40, 42);
+
+        assert_eq!(yuv_to_r(y, v), 80);
+        assert_eq!(yuv_to_g(y, u, v), 255);
+        assert_eq!(yuv_to_b(y, u), 40);
+    }
+}
diff --git a/tests/CREDITS.md b/tests/CREDITS.md
index c0c21f1..421a84f 100644
--- a/tests/CREDITS.md
+++ b/tests/CREDITS.md
@@ -38,7 +38,11 @@ These files are all PNGs with contents that should exactly match the associated
 
 ## reference/gallery1 and reference/gallery2
 
-These files were all produced by running dwebp with the `-nofancy` option.
+These files were all produced by running dwebp with the default settings.
+
+## reference/gallery1_nofancy
+
+These files were produced by running dwebp with the `-nofancy` option.
 
 ## reference/animated
 
@@ -51,7 +55,7 @@ for i in {1..3}; do webpmux -get frame ${i} ../../images/animated/random_lossles
 random-lossy-N.png:
 
 ```
-for i in {1..4}; do webpmux -get frame ${i} ../../images/animated/random_lossy.webp -o random_lossy-${i}.png && dwebp random_lossy-${i}.png -nofancy -o random_lossy-${i}.png; done
+for i in {1..4}; do webpmux -get frame ${i} ../../images/animated/random_lossy.webp -o random_lossy-${i}.png && dwebp random_lossy-${i}.png -o random_lossy-${i}.png; done
 ```
 
 ## reference/regression
diff --git a/tests/decode.rs b/tests/decode.rs
index 0e8e8fe..8674ee0 100644
--- a/tests/decode.rs
+++ b/tests/decode.rs
@@ -37,16 +37,26 @@ fn save_image(data: &[u8], file: &str, i: Option<u32>, has_alpha: bool, width: u
 }
 
 fn reference_test(file: &str) {
+    reference_test_with_options(file, image_webp::WebPDecodeOptions::default(), None);
+}
+
+fn reference_test_with_options(
+    file: &str,
+    options: image_webp::WebPDecodeOptions,
+    custom_reference_file: Option<&str>,
+) {
     // Prepare WebP decoder
     let contents = std::fs::read(format!("tests/images/{file}.webp")).unwrap();
-    let mut decoder = image_webp::WebPDecoder::new(Cursor::new(contents)).unwrap();
+    let mut decoder =
+        image_webp::WebPDecoder::new_with_options(Cursor::new(contents), options).unwrap();
     let (width, height) = decoder.dimensions();
 
     // Decode reference PNG
+    let reference_file = custom_reference_file.unwrap_or(file);
     let reference_path = if decoder.is_animated() {
-        format!("tests/reference/{file}-1.png")
+        format!("tests/reference/{reference_file}-1.png")
     } else {
-        format!("tests/reference/{file}.png")
+        format!("tests/reference/{reference_file}.png")
     };
     let reference_contents = std::fs::read(reference_path).unwrap();
     let mut reference_decoder = png::Decoder::new(Cursor::new(reference_contents))
@@ -85,7 +95,7 @@ fn reference_test(file: &str) {
         }
     } else {
         // NOTE: WebP lossy images are stored in YUV format. The conversion to RGB is not precisely
-        // defined, but we currently attempt to match the dwebp's "-nofancy" conversion option.
+        // defined, but we attempt to match dwebp's output for the selected upsampling method.
         let num_bytes_different = data
             .iter()
             .zip(reference_data.iter())
@@ -156,7 +166,29 @@ macro_rules! reftest {
     }
 }
 
+// One `#[test]` per name, decoded with simple upsampling vs. `<basename>_nofancy` references.
+macro_rules! reftest_nofancy {
+    ($basename:expr, $($name:expr),+) => {
+        paste::paste! {
+            $(
+                #[test]
+                fn [<reftest_nofancy_ $basename _ $name>]() {
+                    // `#[non_exhaustive]` struct: build from `default()` and then mutate.
+                    let mut nofancy_options = image_webp::WebPDecodeOptions::default();
+                    nofancy_options.lossy_upsampling = image_webp::UpsamplingMethod::Simple;
+                    reference_test_with_options(
+                        concat!(stringify!($basename), "/", stringify!($name)),
+                        nofancy_options,
+                        Some(concat!(stringify!($basename), "_nofancy", "/", stringify!($name))),
+                    );
+                }
+            )+
+        }
+    };
+}
+
 reftest!(gallery1, 1, 2, 3, 4, 5);
+reftest_nofancy!(gallery1, 1, 2, 3, 4, 5);
 reftest!(gallery2, 1_webp_ll, 2_webp_ll, 3_webp_ll, 4_webp_ll, 5_webp_ll);
 reftest!(gallery2, 1_webp_a, 2_webp_a, 3_webp_a, 4_webp_a, 5_webp_a);
 reftest!(animated, random_lossless, random_lossy);
diff --git a/tests/reference/animated/random_lossy-1.png b/tests/reference/animated/random_lossy-1.png
index d555097..c0b05d2 100644
Binary files a/tests/reference/animated/random_lossy-1.png and b/tests/reference/animated/random_lossy-1.png differ
diff --git a/tests/reference/animated/random_lossy-2.png b/tests/reference/animated/random_lossy-2.png
index 41ca056..1272574 100644
Binary files a/tests/reference/animated/random_lossy-2.png and b/tests/reference/animated/random_lossy-2.png differ
diff --git a/tests/reference/animated/random_lossy-3.png b/tests/reference/animated/random_lossy-3.png
index bee4cbf..5227737 100644
Binary files a/tests/reference/animated/random_lossy-3.png and b/tests/reference/animated/random_lossy-3.png differ
diff --git a/tests/reference/animated/random_lossy-4.png b/tests/reference/animated/random_lossy-4.png
index ca504e0..e2ee179 100644
Binary files a/tests/reference/animated/random_lossy-4.png and b/tests/reference/animated/random_lossy-4.png differ
diff --git a/tests/reference/gallery1/1.png b/tests/reference/gallery1/1.png
index ceb01a8..c50e3cd 100644
Binary files a/tests/reference/gallery1/1.png and b/tests/reference/gallery1/1.png differ
diff --git a/tests/reference/gallery1/2.png b/tests/reference/gallery1/2.png
index a9df76a..3cc4f75 100644
Binary files a/tests/reference/gallery1/2.png and b/tests/reference/gallery1/2.png differ
diff --git a/tests/reference/gallery1/3.png b/tests/reference/gallery1/3.png
index 3197655..b998a8c 100644
Binary files a/tests/reference/gallery1/3.png and b/tests/reference/gallery1/3.png differ
diff --git a/tests/reference/gallery1/4.png b/tests/reference/gallery1/4.png
index 3c9b404..dd216c4 100644
Binary files a/tests/reference/gallery1/4.png and b/tests/reference/gallery1/4.png differ
diff --git a/tests/reference/gallery1/5.png b/tests/reference/gallery1/5.png
index 1b5ac55..4ed511a 100644
Binary files a/tests/reference/gallery1/5.png and b/tests/reference/gallery1/5.png differ
diff --git a/tests/reference/gallery1_nofancy/1.png b/tests/reference/gallery1_nofancy/1.png
new file mode 100644
index 0000000..ceb01a8
Binary files /dev/null and b/tests/reference/gallery1_nofancy/1.png differ
diff --git a/tests/reference/gallery1_nofancy/2.png b/tests/reference/gallery1_nofancy/2.png
new file mode 100644
index 0000000..a9df76a
Binary files /dev/null and b/tests/reference/gallery1_nofancy/2.png differ
diff --git a/tests/reference/gallery1_nofancy/3.png b/tests/reference/gallery1_nofancy/3.png
new file mode 100644
index 0000000..3197655
Binary files /dev/null and b/tests/reference/gallery1_nofancy/3.png differ
diff --git a/tests/reference/gallery1_nofancy/4.png b/tests/reference/gallery1_nofancy/4.png
new file mode 100644
index 0000000..3c9b404
Binary files /dev/null and b/tests/reference/gallery1_nofancy/4.png differ
diff --git a/tests/reference/gallery1_nofancy/5.png b/tests/reference/gallery1_nofancy/5.png
new file mode 100644
index 0000000..1b5ac55
Binary files /dev/null and b/tests/reference/gallery1_nofancy/5.png differ
diff --git a/tests/reference/gallery2/1_webp_a.png b/tests/reference/gallery2/1_webp_a.png
index 401c7c4..72f61f5 100644
Binary files a/tests/reference/gallery2/1_webp_a.png and b/tests/reference/gallery2/1_webp_a.png differ
diff --git a/tests/reference/gallery2/2_webp_a.png b/tests/reference/gallery2/2_webp_a.png
index 36aa663..6868995 100644
Binary files a/tests/reference/gallery2/2_webp_a.png and b/tests/reference/gallery2/2_webp_a.png differ
diff --git a/tests/reference/gallery2/3_webp_a.png b/tests/reference/gallery2/3_webp_a.png
index a9f939e..f3251e7 100644
Binary files a/tests/reference/gallery2/3_webp_a.png and b/tests/reference/gallery2/3_webp_a.png differ
diff --git a/tests/reference/gallery2/4_webp_a.png b/tests/reference/gallery2/4_webp_a.png
index b341ef1..6a141e6 100644
Binary files a/tests/reference/gallery2/4_webp_a.png and b/tests/reference/gallery2/4_webp_a.png differ
diff --git a/tests/reference/gallery2/5_webp_a.png b/tests/reference/gallery2/5_webp_a.png
index c613bae..2b658af 100644
Binary files a/tests/reference/gallery2/5_webp_a.png and b/tests/reference/gallery2/5_webp_a.png differ