Skip to content

Commit d0bff93

Browse files
committed
Implement 8-bit multiplication in x86
1 parent 843fc66 commit d0bff93

File tree

5 files changed

+117
-31
lines changed

5 files changed

+117
-31
lines changed

fearless_simd/src/generated/avx2.rs

Lines changed: 44 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,6 @@
33

44
// This file is autogenerated by fearless_simd_gen
55

6-
#![expect(
7-
unused_variables,
8-
clippy::todo,
9-
reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
10-
)]
116
use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
127
use crate::{
138
f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
@@ -238,7 +233,16 @@ impl Simd for Avx2 {
238233
}
239234
#[inline(always)]
240235
fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
241-
todo!()
236+
unsafe {
237+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
238+
let dst_odd =
239+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
240+
_mm_or_si128(
241+
_mm_slli_epi16(dst_odd, 8),
242+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
243+
)
244+
.simd_into(self)
245+
}
242246
}
243247
#[inline(always)]
244248
fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -378,7 +382,16 @@ impl Simd for Avx2 {
378382
}
379383
#[inline(always)]
380384
fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
381-
todo!()
385+
unsafe {
386+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
387+
let dst_odd =
388+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
389+
_mm_or_si128(
390+
_mm_slli_epi16(dst_odd, 8),
391+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
392+
)
393+
.simd_into(self)
394+
}
382395
}
383396
#[inline(always)]
384397
fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
@@ -1495,7 +1508,18 @@ impl Simd for Avx2 {
14951508
}
14961509
#[inline(always)]
14971510
fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1498-
todo!()
1511+
unsafe {
1512+
let dst_even = _mm256_mullo_epi16(a.into(), b.into());
1513+
let dst_odd = _mm256_mullo_epi16(
1514+
_mm256_srli_epi16::<8>(a.into()),
1515+
_mm256_srli_epi16::<8>(b.into()),
1516+
);
1517+
_mm256_or_si256(
1518+
_mm256_slli_epi16(dst_odd, 8),
1519+
_mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
1520+
)
1521+
.simd_into(self)
1522+
}
14991523
}
15001524
#[inline(always)]
15011525
fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -1669,7 +1693,18 @@ impl Simd for Avx2 {
16691693
}
16701694
#[inline(always)]
16711695
fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1672-
todo!()
1696+
unsafe {
1697+
let dst_even = _mm256_mullo_epi16(a.into(), b.into());
1698+
let dst_odd = _mm256_mullo_epi16(
1699+
_mm256_srli_epi16::<8>(a.into()),
1700+
_mm256_srli_epi16::<8>(b.into()),
1701+
);
1702+
_mm256_or_si256(
1703+
_mm256_slli_epi16(dst_odd, 8),
1704+
_mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
1705+
)
1706+
.simd_into(self)
1707+
}
16731708
}
16741709
#[inline(always)]
16751710
fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {

fearless_simd/src/generated/sse4_2.rs

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,6 @@
33

44
// This file is autogenerated by fearless_simd_gen
55

6-
#![expect(
7-
unused_variables,
8-
clippy::todo,
9-
reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
10-
)]
116
use crate::{Level, Simd, SimdFrom, SimdInto, seal::Seal};
127
use crate::{
138
f32x4, f32x8, f32x16, f64x2, f64x4, f64x8, i8x16, i8x32, i8x64, i16x8, i16x16, i16x32, i32x4,
@@ -246,7 +241,16 @@ impl Simd for Sse4_2 {
246241
}
247242
#[inline(always)]
248243
fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
249-
todo!()
244+
unsafe {
245+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
246+
let dst_odd =
247+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
248+
_mm_or_si128(
249+
_mm_slli_epi16(dst_odd, 8),
250+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
251+
)
252+
.simd_into(self)
253+
}
250254
}
251255
#[inline(always)]
252256
fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -389,7 +393,16 @@ impl Simd for Sse4_2 {
389393
}
390394
#[inline(always)]
391395
fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
392-
todo!()
396+
unsafe {
397+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
398+
let dst_odd =
399+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
400+
_mm_or_si128(
401+
_mm_slli_epi16(dst_odd, 8),
402+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
403+
)
404+
.simd_into(self)
405+
}
393406
}
394407
#[inline(always)]
395408
fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {

fearless_simd_gen/src/mk_avx2.rs

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,6 @@ pub(crate) fn mk_avx2_impl() -> TokenStream {
3333
let ty_impl = mk_type_impl();
3434

3535
quote! {
36-
// Until we have implemented all functions.
37-
#![expect(
38-
unused_variables,
39-
clippy::todo,
40-
reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
41-
)]
42-
4336
#[cfg(target_arch = "x86")]
4437
use core::arch::x86::*;
4538
#[cfg(target_arch = "x86_64")]

fearless_simd_gen/src/mk_sse4_2.rs

Lines changed: 13 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,13 +33,6 @@ pub(crate) fn mk_sse4_2_impl() -> TokenStream {
3333
let ty_impl = mk_type_impl();
3434

3535
quote! {
36-
// Until we have implemented all functions.
37-
#![expect(
38-
unused_variables,
39-
clippy::todo,
40-
reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
41-
)]
42-
4336
#[cfg(target_arch = "x86")]
4437
use core::arch::x86::*;
4538
#[cfg(target_arch = "x86_64")]
@@ -429,9 +422,21 @@ pub(crate) fn handle_binary(
429422
arch: impl Arch,
430423
) -> TokenStream {
431424
if method == "mul" && vec_ty.scalar_bits == 8 {
425+
// https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t
426+
let mullo = intrinsic_ident("mullo", "epi16", vec_ty.n_bits());
427+
let set1 = intrinsic_ident("set1", "epi16", vec_ty.n_bits());
428+
let and = intrinsic_ident("and", coarse_type(*vec_ty), vec_ty.n_bits());
429+
let or = intrinsic_ident("or", coarse_type(*vec_ty), vec_ty.n_bits());
430+
let slli = intrinsic_ident("slli", "epi16", vec_ty.n_bits());
431+
let srli = intrinsic_ident("srli", "epi16", vec_ty.n_bits());
432432
quote! {
433433
#method_sig {
434-
todo!()
434+
unsafe {
435+
let dst_even = #mullo(a.into(), b.into());
436+
let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into()));
437+
438+
#or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self)
439+
}
435440
}
436441
}
437442
} else {

fearless_simd_tests/tests/harness/mod.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2447,6 +2447,46 @@ fn trunc_f64x2<S: Simd>(simd: S) {
24472447
assert_eq!(a.trunc().val, [1.0, -2.0]);
24482448
}
24492449

2450+
#[simd_test]
2451+
fn mul_u8x16<S: Simd>(simd: S) {
2452+
let a = u8x16::from_slice(
2453+
simd,
2454+
&[0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 100],
2455+
);
2456+
let b = u8x16::from_slice(
2457+
simd,
2458+
&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2],
2459+
);
2460+
2461+
assert_eq!(
2462+
(a * b).val,
2463+
[
2464+
0, 2, 6, 12, 20, 30, 70, 120, 180, 250, 74, 164, 8, 188, 132, 200
2465+
]
2466+
);
2467+
}
2468+
2469+
#[simd_test]
2470+
fn mul_i8x16<S: Simd>(simd: S) {
2471+
let a = i8x16::from_slice(
2472+
simd,
2473+
&[
2474+
0, 1, -2, 3, -4, 5, 10, -15, 20, -25, 30, 35, -40, 50, -60, 100,
2475+
],
2476+
);
2477+
let b = i8x16::from_slice(
2478+
simd,
2479+
&[1, 2, 3, -4, 5, -6, 7, 8, 9, 10, -11, 12, 13, -14, 15, 2],
2480+
);
2481+
2482+
assert_eq!(
2483+
(a * b).val,
2484+
[
2485+
0, 2, -6, -12, -20, -30, 70, -120, -76, 6, -74, -92, -8, 68, 124, -56
2486+
]
2487+
);
2488+
}
2489+
24502490
#[simd_test]
24512491
fn mul_u16x8<S: Simd>(simd: S) {
24522492
let a = u16x8::from_slice(simd, &[0, 5, 10, 30, 500, 0, 0, 0]);

0 commit comments

Comments
 (0)