Implement 8-bit integer multiplication on x86 (SSE4.2 and AVX2)

valadaptive · valadaptive · commit 429a30453bd9 · 2025-11-11T21:24:22.000-05:00
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
@@ -238,7 +238,16 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        todo!()
+        unsafe { // 8-bit lane multiply built from 16-bit multiplies; each lane keeps only the low 8 bits of its product
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of every 16-bit lane = product of the even-indexed bytes
+            let dst_odd = // logical shift is fine for signed input: the low 8 bits of a product ignore the operands' upper bits
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd-byte products back into the high byte of each 16-bit lane
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even-byte products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -378,7 +387,16 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        todo!()
+        unsafe { // 8-bit lane multiply built from 16-bit multiplies; each lane keeps only the low 8 bits of its product
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of every 16-bit lane = product of the even-indexed bytes
+            let dst_odd = // shift the odd-indexed bytes down (zero-filling), then multiply them the same way
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd-byte products back into the high byte of each 16-bit lane
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even-byte products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
@@ -1495,7 +1513,18 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        todo!()
+        unsafe { // 8-bit lane multiply built from 16-bit multiplies; each lane keeps only the low 8 bits of its product
+            let dst_even = _mm256_mullo_epi16(a.into(), b.into()); // low byte of every 16-bit lane = product of the even-indexed bytes
+            let dst_odd = _mm256_mullo_epi16(
+                _mm256_srli_epi16::<8>(a.into()), // bring the odd-indexed bytes of `a` down into the low byte (zero-filling)
+                _mm256_srli_epi16::<8>(b.into()), // same for `b`; sign is irrelevant since only the low 8 product bits are kept
+            );
+            _mm256_or_si256(
+                _mm256_slli_epi16(dst_odd, 8), // move the odd-byte products back into the high byte of each 16-bit lane
+                _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), // keep only the low byte of the even-byte products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -1669,7 +1698,18 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        todo!()
+        unsafe { // 8-bit lane multiply built from 16-bit multiplies; each lane keeps only the low 8 bits of its product
+            let dst_even = _mm256_mullo_epi16(a.into(), b.into()); // low byte of every 16-bit lane = product of the even-indexed bytes
+            let dst_odd = _mm256_mullo_epi16(
+                _mm256_srli_epi16::<8>(a.into()), // bring the odd-indexed bytes of `a` down into the low byte (zero-filling)
+                _mm256_srli_epi16::<8>(b.into()), // same for `b`
+            );
+            _mm256_or_si256(
+                _mm256_slli_epi16(dst_odd, 8), // move the odd-byte products back into the high byte of each 16-bit lane
+                _mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)), // keep only the low byte of the even-byte products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
@@ -246,7 +246,16 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        todo!()
+        unsafe { // 8-bit lane multiply built from 16-bit multiplies; each lane keeps only the low 8 bits of its product
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of every 16-bit lane = product of the even-indexed bytes
+            let dst_odd = // logical shift is fine for signed input: the low 8 bits of a product ignore the operands' upper bits
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd-byte products back into the high byte of each 16-bit lane
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even-byte products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -389,7 +398,16 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        todo!()
+        unsafe { // 8-bit lane multiply built from 16-bit multiplies; each lane keeps only the low 8 bits of its product
+            let dst_even = _mm_mullo_epi16(a.into(), b.into()); // low byte of every 16-bit lane = product of the even-indexed bytes
+            let dst_odd = // shift the odd-indexed bytes down (zero-filling), then multiply them the same way
+                _mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
+            _mm_or_si128(
+                _mm_slli_epi16(dst_odd, 8), // move the odd-byte products back into the high byte of each 16-bit lane
+                _mm_and_si128(dst_even, _mm_set1_epi16(0xFF)), // keep only the low byte of the even-byte products
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
diff --git a/fearless_simd_gen/src/mk_sse4_2.rs b/fearless_simd_gen/src/mk_sse4_2.rs
@@ -429,9 +429,21 @@ pub(crate) fn handle_binary(
     arch: impl Arch,
 ) -> TokenStream {
     if method == "mul" && vec_ty.scalar_bits == 8 {
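+        // SSE/AVX2 have no packed 8-bit multiply, so emulate it with two 16-bit multiplies (even and odd bytes) and recombine the low bytes; see https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t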
+        let mullo = intrinsic_ident("mullo", "epi16", vec_ty.n_bits());
+        let set1 = intrinsic_ident("set1", "epi16", vec_ty.n_bits());
+        let and = intrinsic_ident("and", coarse_type(*vec_ty), vec_ty.n_bits());
+        let or = intrinsic_ident("or", coarse_type(*vec_ty), vec_ty.n_bits());
+        let slli = intrinsic_ident("slli", "epi16", vec_ty.n_bits());
+        let srli = intrinsic_ident("srli", "epi16", vec_ty.n_bits());
         quote! {
             #method_sig {
-                todo!()
+                unsafe {
+                    let dst_even = #mullo(a.into(), b.into());
+                    let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into()));
+
+                    #or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self)
+                }
             }
         }
     } else {
diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
@@ -2447,6 +2447,46 @@ fn trunc_f64x2<S: Simd>(simd: S) {
     assert_eq!(a.trunc().val, [1.0, -2.0]);
 }
 
+#[simd_test]
+fn mul_u8x16<S: Simd>(simd: S) { // lane-wise u8 multiply, including products that do not fit in a u8
+    let a = u8x16::from_slice(
+        simd,
+        &[0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 100],
+    );
+    let b = u8x16::from_slice(
+        simd,
+        &[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2],
+    );
+
+    assert_eq!(
+        (a * b).val,
+        [
+            0, 2, 6, 12, 20, 30, 70, 120, 180, 250, 74, 164, 8, 188, 132, 200 // lanes 10..=14 exceed 255 and wrap mod 256, e.g. 30 * 11 = 330 -> 74
+        ]
+    );
+}
+
+#[simd_test]
+fn mul_i8x16<S: Simd>(simd: S) { // lane-wise i8 multiply with mixed signs, including products outside -128..=127
+    let a = i8x16::from_slice(
+        simd,
+        &[
+            0, 1, -2, 3, -4, 5, 10, -15, 20, -25, 30, 35, -40, 50, -60, 100,
+        ],
+    );
+    let b = i8x16::from_slice(
+        simd,
+        &[1, 2, 3, -4, 5, -6, 7, 8, 9, 10, -11, 12, 13, -14, 15, 2],
+    );
+
+    assert_eq!(
+        (a * b).val,
+        [
+            0, 2, -6, -12, -20, -30, 70, -120, -76, 6, -74, -92, -8, 68, 124, -56 // lanes 8..=15 overflow i8 and wrap (two's complement), e.g. 20 * 9 = 180 -> -76
+        ]
+    );
+}
+
 #[simd_test]
 fn mul_u16x8<S: Simd>(simd: S) {
     let a = u16x8::from_slice(simd, &[0, 5, 10, 30, 500, 0, 0, 0]);