Skip to content

Commit 429a304

Browse files
committed
Implement 8-bit multiplication in x86
1 parent 93e636d commit 429a304

File tree

4 files changed

+117
-7
lines changed

4 files changed

+117
-7
lines changed

fearless_simd/src/generated/avx2.rs

Lines changed: 44 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -238,7 +238,16 @@ impl Simd for Avx2 {
238238
}
239239
#[inline(always)]
240240
fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
241-
todo!()
241+
unsafe {
242+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
243+
let dst_odd =
244+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
245+
_mm_or_si128(
246+
_mm_slli_epi16(dst_odd, 8),
247+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
248+
)
249+
.simd_into(self)
250+
}
242251
}
243252
#[inline(always)]
244253
fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -378,7 +387,16 @@ impl Simd for Avx2 {
378387
}
379388
#[inline(always)]
380389
fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
381-
todo!()
390+
unsafe {
391+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
392+
let dst_odd =
393+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
394+
_mm_or_si128(
395+
_mm_slli_epi16(dst_odd, 8),
396+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
397+
)
398+
.simd_into(self)
399+
}
382400
}
383401
#[inline(always)]
384402
fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
@@ -1495,7 +1513,18 @@ impl Simd for Avx2 {
14951513
}
14961514
#[inline(always)]
14971515
fn mul_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
1498-
todo!()
1516+
unsafe {
1517+
let dst_even = _mm256_mullo_epi16(a.into(), b.into());
1518+
let dst_odd = _mm256_mullo_epi16(
1519+
_mm256_srli_epi16::<8>(a.into()),
1520+
_mm256_srli_epi16::<8>(b.into()),
1521+
);
1522+
_mm256_or_si256(
1523+
_mm256_slli_epi16(dst_odd, 8),
1524+
_mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
1525+
)
1526+
.simd_into(self)
1527+
}
14991528
}
15001529
#[inline(always)]
15011530
fn and_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -1669,7 +1698,18 @@ impl Simd for Avx2 {
16691698
}
16701699
#[inline(always)]
16711700
fn mul_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
1672-
todo!()
1701+
unsafe {
1702+
let dst_even = _mm256_mullo_epi16(a.into(), b.into());
1703+
let dst_odd = _mm256_mullo_epi16(
1704+
_mm256_srli_epi16::<8>(a.into()),
1705+
_mm256_srli_epi16::<8>(b.into()),
1706+
);
1707+
_mm256_or_si256(
1708+
_mm256_slli_epi16(dst_odd, 8),
1709+
_mm256_and_si256(dst_even, _mm256_set1_epi16(0xFF)),
1710+
)
1711+
.simd_into(self)
1712+
}
16731713
}
16741714
#[inline(always)]
16751715
fn and_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {

fearless_simd/src/generated/sse4_2.rs

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,16 @@ impl Simd for Sse4_2 {
246246
}
247247
#[inline(always)]
248248
fn mul_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
249-
todo!()
249+
unsafe {
250+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
251+
let dst_odd =
252+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
253+
_mm_or_si128(
254+
_mm_slli_epi16(dst_odd, 8),
255+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
256+
)
257+
.simd_into(self)
258+
}
250259
}
251260
#[inline(always)]
252261
fn and_i8x16(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
@@ -389,7 +398,16 @@ impl Simd for Sse4_2 {
389398
}
390399
#[inline(always)]
391400
fn mul_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
392-
todo!()
401+
unsafe {
402+
let dst_even = _mm_mullo_epi16(a.into(), b.into());
403+
let dst_odd =
404+
_mm_mullo_epi16(_mm_srli_epi16::<8>(a.into()), _mm_srli_epi16::<8>(b.into()));
405+
_mm_or_si128(
406+
_mm_slli_epi16(dst_odd, 8),
407+
_mm_and_si128(dst_even, _mm_set1_epi16(0xFF)),
408+
)
409+
.simd_into(self)
410+
}
393411
}
394412
#[inline(always)]
395413
fn and_u8x16(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {

fearless_simd_gen/src/mk_sse4_2.rs

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -429,9 +429,21 @@ pub(crate) fn handle_binary(
429429
arch: impl Arch,
430430
) -> TokenStream {
431431
if method == "mul" && vec_ty.scalar_bits == 8 {
432+
// https://stackoverflow.com/questions/8193601/sse-multiplication-16-x-uint8-t
433+
let mullo = intrinsic_ident("mullo", "epi16", vec_ty.n_bits());
434+
let set1 = intrinsic_ident("set1", "epi16", vec_ty.n_bits());
435+
let and = intrinsic_ident("and", coarse_type(*vec_ty), vec_ty.n_bits());
436+
let or = intrinsic_ident("or", coarse_type(*vec_ty), vec_ty.n_bits());
437+
let slli = intrinsic_ident("slli", "epi16", vec_ty.n_bits());
438+
let srli = intrinsic_ident("srli", "epi16", vec_ty.n_bits());
432439
quote! {
433440
#method_sig {
434-
todo!()
441+
unsafe {
442+
let dst_even = #mullo(a.into(), b.into());
443+
let dst_odd = #mullo(#srli::<8>(a.into()), #srli::<8>(b.into()));
444+
445+
#or(#slli(dst_odd, 8), #and(dst_even, #set1(0xFF))).simd_into(self)
446+
}
435447
}
436448
}
437449
} else {

fearless_simd_tests/tests/harness/mod.rs

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2447,6 +2447,46 @@ fn trunc_f64x2<S: Simd>(simd: S) {
24472447
assert_eq!(a.trunc().val, [1.0, -2.0]);
24482448
}
24492449

2450+
#[simd_test]
2451+
fn mul_u8x16<S: Simd>(simd: S) {
2452+
let a = u8x16::from_slice(
2453+
simd,
2454+
&[0, 1, 2, 3, 4, 5, 10, 15, 20, 25, 30, 35, 40, 50, 60, 100],
2455+
);
2456+
let b = u8x16::from_slice(
2457+
simd,
2458+
&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 2],
2459+
);
2460+
2461+
assert_eq!(
2462+
(a * b).val,
2463+
[
2464+
0, 2, 6, 12, 20, 30, 70, 120, 180, 250, 74, 164, 8, 188, 132, 200
2465+
]
2466+
);
2467+
}
2468+
2469+
#[simd_test]
2470+
fn mul_i8x16<S: Simd>(simd: S) {
2471+
let a = i8x16::from_slice(
2472+
simd,
2473+
&[
2474+
0, 1, -2, 3, -4, 5, 10, -15, 20, -25, 30, 35, -40, 50, -60, 100,
2475+
],
2476+
);
2477+
let b = i8x16::from_slice(
2478+
simd,
2479+
&[1, 2, 3, -4, 5, -6, 7, 8, 9, 10, -11, 12, 13, -14, 15, 2],
2480+
);
2481+
2482+
assert_eq!(
2483+
(a * b).val,
2484+
[
2485+
0, 2, -6, -12, -20, -30, 70, -120, -76, 6, -74, -92, -8, 68, 124, -56
2486+
]
2487+
);
2488+
}
2489+
24502490
#[simd_test]
24512491
fn mul_u16x8<S: Simd>(simd: S) {
24522492
let a = u16x8::from_slice(simd, &[0, 5, 10, 30, 500, 0, 0, 0]);

0 commit comments

Comments
 (0)