Implement 8-bit multiplication in x86

valadaptive · valadaptive · commit d0bff932a8b5 · 2025-11-11T21:55:10.000-05:00
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
@@ -3,11 +3,6 @@
 
 // This file is autogenerated by fearless_simd_gen
 
-#![expect(
-    unused_variables,
-    clippy::todo,
-    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
-)]
 use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
@@ -238,7 +233,16 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { // 8-bit lane multiply built from two 16-bit multiplies
-        todo!()
+        unsafe { // NOTE(review): presumably sound because this level implies the needed CPU features — confirm
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the low (even) bytes, mod 256
+            let dst_odd = // shift the high (odd) byte of every 16-bit lane down, then multiply those
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd products back into the high byte
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -378,7 +382,16 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> { // 8-bit lane multiply built from two 16-bit multiplies
-        todo!()
+        unsafe { // NOTE(review): presumably sound because this level implies the needed CPU features — confirm
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the low (even) bytes, mod 256
+            let dst_odd = // shift the high (odd) byte of every 16-bit lane down, then multiply those
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd products back into the high byte
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
@@ -1495,7 +1508,18 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> { // 8-bit lane multiply built from two 16-bit multiplies
-        todo!()
+        unsafe { // NOTE(review): presumably sound because this level implies the needed CPU features — confirm
+            let dst_even = _mm256_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the low (even) bytes, mod 256
+            let dst_odd = _mm256_mullo_epi16( // multiply the high (odd) bytes after shifting them down
+                _mm256_srli_epi16::<8>(a.into()),
+                _mm256_srli_epi16::<8>(b.into()),
+            );
+            _mm256_or_si256(
+                _mm256_slli_epi16(dst_odd, 8), // move the odd products back into the high byte
+                _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), // keep only the low byte of the even products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -1669,7 +1693,18 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> { // 8-bit lane multiply built from two 16-bit multiplies
-        todo!()
+        unsafe { // NOTE(review): presumably sound because this level implies the needed CPU features — confirm
+            let dst_even = _mm256_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the low (even) bytes, mod 256
+            let dst_odd = _mm256_mullo_epi16( // multiply the high (odd) bytes after shifting them down
+                _mm256_srli_epi16::<8>(a.into()),
+                _mm256_srli_epi16::<8>(b.into()),
+            );
+            _mm256_or_si256(
+                _mm256_slli_epi16(dst_odd, 8), // move the odd products back into the high byte
+                _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), // keep only the low byte of the even products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
@@ -3,11 +3,6 @@
 
 // This file is autogenerated by fearless_simd_gen
 
-#![expect(
-    unused_variables,
-    clippy::todo,
-    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
-)]
 use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
 use crate::{
     f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
@@ -246,7 +241,16 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> { // 8-bit lane multiply built from two 16-bit multiplies
-        todo!()
+        unsafe { // NOTE(review): presumably sound because this level implies the needed CPU features — confirm
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the low (even) bytes, mod 256
+            let dst_odd = // shift the high (odd) byte of every 16-bit lane down, then multiply those
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd products back into the high byte
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -389,7 +393,16 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> { // 8-bit lane multiply built from two 16-bit multiplies
-        todo!()
+        unsafe { // NOTE(review): presumably sound because this level implies the needed CPU features — confirm
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of each 16-bit product = product of the low (even) bytes, mod 256
+            let dst_odd = // shift the high (odd) byte of every 16-bit lane down, then multiply those
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd products back into the high byte
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
diff --git a/fearless_simd_gen/src/mk_avx2.rs b/fearless_simd_gen/src/mk_avx2.rs
@@ -33,13 +33,6 @@ pub(crate) fn mk_avx2_impl() -> TokenStream {
     let ty_impl = mk_type_impl();
 
     quote! {
-        // Until we have implemented all functions.
-        #![expect(
-            unused_variables,
-            clippy::todo,
-            reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
-        )]
-
         #[cfg(target_arch = "x86")]
         use core::arch::x86::*;
         #[cfg(target_arch = "x86_64")]
diff --git a/fearless_simd_gen/src/mk_sse4_2.rs b/fearless_simd_gen/src/mk_sse4_2.rs
@@ -33,13 +33,6 @@ pub(crate) fn mk_sse4_2_impl() -> TokenStream {
     let ty_impl = mk_type_impl();
 
     quote! {
-        // Until we have implemented all functions.
-        #![expect(
-            unused_variables,
-            clippy::todo,
-            reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
-        )]
-
         #[cfg(target_arch = "x86")]
         use core::arch::x86::*;
         #[cfg(target_arch = "x86_64")]
@@ -429,9 +422,21 @@ pub(crate) fn handle_binary(
     arch: impl Arch,
 ) -> TokenStream {
     if method == "mul" && vec_ty.scalar_bits == 8 {
+        // https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t
+        let mullo = intrinsic_ident("mullo", "epi16", vec_ty.n_bits());
+        let set1 = intrinsic_ident("set1", "epi16", vec_ty.n_bits());
+        let and = intrinsic_ident("and", coarse_type(*vec_ty), vec_ty.n_bits());
+        let or = intrinsic_ident("or", coarse_type(*vec_ty), vec_ty.n_bits());
+        let slli = intrinsic_ident("slli", "epi16", vec_ty.n_bits());
+        let srli = intrinsic_ident("srli", "epi16", vec_ty.n_bits());
         quote! {
             #method_sig {
-                todo!()
+                unsafe {
+                    let dst_even = #mullo(a.into(), b.into());
+                    let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into()));
+
+                    #or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self)
+                }
             }
         }
     } else {
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
@@ -2447,6 +2447,46 @@ fn trunc_f64x2<S: Simd>(simd: S) {
     assert_eq!(a.trunc().val, [1.0, -2.0]);
 }
 
+#[simd_test]
+fn mul_u8x16<S: Simd>(simd: S) { // checks `a * b` on u8x16; expected values are the products modulo 256
+    let a = u8x16::from_slice( // lanes 10..=14 yield products above 255 (e.g. 30 * 11 = 330 -> 74)
+        simd,
+        &[0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 100],
+    );
+    let b = u8x16::from_slice(
+        simd,
+        &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2],
+    );
+
+    assert_eq!(
+        (a * b).val,
+        [
+            0, 2, 6, 12, 20, 30, 70, 120, 180, 250, 74, 164, 8, 188, 132, 200
+        ]
+    );
+}
+
+#[simd_test]
+fn mul_i8x16<S: Simd>(simd: S) { // checks `a * b` on i8x16; expected values are the products wrapped into the i8 range
+    let a = i8x16::from_slice( // lanes 8..=15 yield products outside -128..=127 (e.g. 20 * 9 = 180 -> -76)
+        simd,
+        &[
+            0, 1, -2, 3, -4, 5, 10, -15, 20, -25, 30, 35, -40, 50, -60, 100,
+        ],
+    );
+    let b = i8x16::from_slice(
+        simd,
+        &[1, 2, 3, -4, 5, -6, 7, 8, 9, 10, -11, 12, 13, -14, 15, 2],
+    );
+
+    assert_eq!(
+        (a * b).val,
+        [
+            0, 2, -6, -12, -20, -30, 70, -120, -76, 6, -74, -92, -8, 68, 124, -56
+        ]
+    );
+}
+
 #[simd_test]
 fn mul_u16x8<S: Simd>(simd: S) {
     let a = u16x8::from_slice(simd, &[0, 5, 10, 30, 500, 0, 0, 0]);