Skip to content
10 changes: 10 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,13 @@ This release has an [MSRV][] of 1.88.
### Added

- All vector types now implement `Index` and `IndexMut`. ([#112][] by [@Ralith][])
- 256-bit vector types now use native AVX2 intrinsics on supported platforms. ([#115][] by [@valadaptive][])
- 8-bit integer multiplication is now implemented on x86. ([#115][] by [@valadaptive][])

### Fixed

- Integer equality comparisons now function properly on x86. Previously, they performed "greater than" comparisons.
([#115][] by [@valadaptive][])

### Changed

Expand All @@ -27,6 +34,7 @@ This release has an [MSRV][] of 1.88.
A consequence of this is that the available variants on `Level` are now dependent on the target features you are compiling with.
The fallback level can be restored with the `force_support_fallback` cargo feature. We don't expect this to be necessary outside
of tests.
- Code generation for `select` and `unzip` operations on x86 has been improved. ([#115][] by [@valadaptive][])

### Removed

Expand Down Expand Up @@ -86,6 +94,7 @@ No changelog was kept for this release.

[@Ralith]: https://github.com/Ralith
[@DJMcNab]: https://github.com/DJMcNab
[@valadaptive]: https://github.com/valadaptive

[#75]: https://github.com/linebender/fearless_simd/pull/75
[#76]: https://github.com/linebender/fearless_simd/pull/76
Expand All @@ -103,6 +112,7 @@ No changelog was kept for this release.
[#96]: https://github.com/linebender/fearless_simd/pull/96
[#99]: https://github.com/linebender/fearless_simd/pull/99
[#105]: https://github.com/linebender/fearless_simd/pull/105
[#115]: https://github.com/linebender/fearless_simd/pull/115

[Unreleased]: https://github.com/linebender/fearless_simd/compare/v0.3.0...HEAD
[0.3.0]: https://github.com/linebender/fearless_simd/compare/v0.3.0...v0.2.0
Expand Down
1,820 changes: 809 additions & 1,011 deletions fearless_simd/src/generated/avx2.rs

Large diffs are not rendered by default.

176 changes: 54 additions & 122 deletions fearless_simd/src/generated/sse4_2.rs

Large diffs are not rendered by default.

18 changes: 0 additions & 18 deletions fearless_simd_gen/src/arch/avx2.rs

This file was deleted.

5 changes: 1 addition & 4 deletions fearless_simd_gen/src/arch/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,8 @@

pub(crate) mod fallback;
pub(crate) mod neon;

pub(crate) mod avx2;
pub(crate) mod sse4_2;
pub(crate) mod wasm;
pub(crate) mod x86_common;
pub(crate) mod x86;

use proc_macro2::TokenStream;

Expand Down
18 changes: 0 additions & 18 deletions fearless_simd_gen/src/arch/sse4_2.rs

This file was deleted.

275 changes: 275 additions & 0 deletions fearless_simd_gen/src/arch/x86.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,275 @@
// Copyright 2025 the Fearless_SIMD Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

#![expect(
unreachable_pub,
reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
)]

use crate::arch::Arch;
use crate::types::{ScalarType, VecType};
use proc_macro2::{Ident, Span, TokenStream};
use quote::{format_ident, quote};

pub struct X86;

pub(crate) fn translate_op(op: &str) -> Option<&'static str> {
Some(match op {
"floor" => "floor",
"sqrt" => "sqrt",
"add" => "add",
"sub" => "sub",
"div" => "div",
"and" => "and",
"simd_eq" => "cmpeq",
"simd_lt" => "cmplt",
"simd_le" => "cmple",
"simd_ge" => "cmpge",
"simd_gt" => "cmpgt",
"or" => "or",
"xor" => "xor",
"shl" => "shl",
"shr" => "shr",
"max" => "max",
"min" => "min",
"max_precise" => "max",
"min_precise" => "min",
"select" => "blendv",
_ => return None,
})
}

impl Arch for X86 {
fn arch_ty(&self, ty: &VecType) -> TokenStream {
let suffix = match (ty.scalar, ty.scalar_bits) {
(ScalarType::Float, 32) => "",
(ScalarType::Float, 64) => "d",
(ScalarType::Float, _) => unimplemented!(),
(ScalarType::Unsigned | ScalarType::Int | ScalarType::Mask, _) => "i",
};
let name = format!("__m{}{}", ty.scalar_bits * ty.len, suffix);
let ident = Ident::new(&name, Span::call_site());
quote! { #ident }
}

fn expr(&self, op: &str, ty: &VecType, args: &[TokenStream]) -> TokenStream {
if let Some(op_name) = translate_op(op) {
let sign_aware = matches!(op, "max" | "min");

let suffix = match op_name {
"and" | "or" | "xor" => coarse_type(*ty),
"blendv" if ty.scalar != ScalarType::Float => "epi8",
_ => op_suffix(ty.scalar, ty.scalar_bits, sign_aware),
};
let intrinsic = intrinsic_ident(op_name, suffix, ty.n_bits());
quote! { #intrinsic ( #( #args ),* ) }
} else {
let suffix = op_suffix(ty.scalar, ty.scalar_bits, true);
match op {
"trunc" => {
let intrinsic = intrinsic_ident("round", suffix, ty.n_bits());
quote! { #intrinsic ( #( #args, )* _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC) }
}
"neg" => match ty.scalar {
ScalarType::Float => {
let set1 = set1_intrinsic(ty.scalar, ty.scalar_bits, ty.n_bits());
let xor =
simple_intrinsic("xor", ScalarType::Float, ty.scalar_bits, ty.n_bits());
quote! {
#( #xor(#args, #set1(-0.0)) )*
}
}
ScalarType::Int => {
let set0 = intrinsic_ident("setzero", coarse_type(*ty), ty.n_bits());
let sub = simple_intrinsic("sub", ty.scalar, ty.scalar_bits, ty.n_bits());
let arg = &args[0];
quote! {
#sub(#set0(), #arg)
}
}
_ => unreachable!(),
},
"abs" => {
let set1 = set1_intrinsic(ty.scalar, ty.scalar_bits, ty.n_bits());
let andnot =
simple_intrinsic("andnot", ScalarType::Float, ty.scalar_bits, ty.n_bits());
quote! {
#( #andnot(#set1(-0.0), #args) )*
}
}
"copysign" => {
let a = &args[0];
let b = &args[1];
let set1 = set1_intrinsic(ty.scalar, ty.scalar_bits, ty.n_bits());
let and =
simple_intrinsic("and", ScalarType::Float, ty.scalar_bits, ty.n_bits());
let andnot =
simple_intrinsic("andnot", ScalarType::Float, ty.scalar_bits, ty.n_bits());
let or = simple_intrinsic("or", ScalarType::Float, ty.scalar_bits, ty.n_bits());
quote! {
let mask = #set1(-0.0);
#or(#and(mask, #b), #andnot(mask, #a))
}
}
"mul" => {
let suffix = op_suffix(ty.scalar, ty.scalar_bits, false);
let intrinsic = if matches!(ty.scalar, ScalarType::Int | ScalarType::Unsigned) {
intrinsic_ident("mullo", suffix, ty.n_bits())
} else {
intrinsic_ident("mul", suffix, ty.n_bits())
};

quote! { #intrinsic ( #( #args ),* ) }
}
"shrv" if ty.scalar_bits > 16 => {
let suffix = op_suffix(ty.scalar, ty.scalar_bits, false);
let name = match ty.scalar {
ScalarType::Int => "srav",
_ => "srlv",
};
let intrinsic = intrinsic_ident(name, suffix, ty.n_bits());
quote! { #intrinsic ( #( #args ),* ) }
}
_ => unimplemented!("{}", op),
}
}
}
}

pub(crate) fn op_suffix(mut ty: ScalarType, bits: usize, sign_aware: bool) -> &'static str {
use ScalarType::*;
if !sign_aware && ty == Unsigned {
ty = Int;
}
match (ty, bits) {
(Float, 32) => "ps",
(Float, 64) => "pd",
(Float, _) => unimplemented!("{bits} bit floats"),
(Int | Mask, 8) => "epi8",
(Int | Mask, 16) => "epi16",
(Int | Mask, 32) => "epi32",
(Int | Mask, 64) => "epi64",
(Unsigned, 8) => "epu8",
(Unsigned, 16) => "epu16",
(Unsigned, 32) => "epu32",
(Unsigned, 64) => "epu64",
_ => unreachable!(),
}
}

/// Intrinsic name for the "int, float, or double" type (not as fine-grained as [`op_suffix`]).
pub(crate) fn coarse_type(vec_ty: VecType) -> &'static str {
use ScalarType::*;
match (vec_ty.scalar, vec_ty.n_bits()) {
(Int | Unsigned | Mask, 128) => "si128",
(Int | Unsigned | Mask, 256) => "si256",
(Int | Unsigned | Mask, 512) => "si512",
_ => op_suffix(vec_ty.scalar, vec_ty.scalar_bits, false),
}
}

pub(crate) fn set1_intrinsic(ty: ScalarType, bits: usize, ty_bits: usize) -> Ident {
use ScalarType::*;
let suffix = match (ty, bits) {
(Int | Unsigned | Mask, 64) => "epi64x",
_ => op_suffix(ty, bits, false),
};

intrinsic_ident("set1", suffix, ty_bits)
}

pub(crate) fn simple_intrinsic(name: &str, ty: ScalarType, bits: usize, ty_bits: usize) -> Ident {
let suffix = op_suffix(ty, bits, true);

intrinsic_ident(name, suffix, ty_bits)
}

pub(crate) fn simple_sign_unaware_intrinsic(
name: &str,
ty: ScalarType,
bits: usize,
ty_bits: usize,
) -> Ident {
let suffix = op_suffix(ty, bits, false);

intrinsic_ident(name, suffix, ty_bits)
}

pub(crate) fn extend_intrinsic(
ty: ScalarType,
from_bits: usize,
to_bits: usize,
ty_bits: usize,
) -> Ident {
let from_suffix = op_suffix(ty, from_bits, true);
let to_suffix = op_suffix(ty, to_bits, false);

intrinsic_ident(&format!("cvt{from_suffix}"), to_suffix, ty_bits)
}

pub(crate) fn cvt_intrinsic(from: VecType, to: VecType) -> Ident {
let from_suffix = op_suffix(from.scalar, from.scalar_bits, false);
let to_suffix = op_suffix(to.scalar, to.scalar_bits, false);

intrinsic_ident(&format!("cvt{from_suffix}"), to_suffix, from.n_bits())
}

pub(crate) fn pack_intrinsic(from_bits: usize, signed: bool, ty_bits: usize) -> Ident {
let unsigned = match signed {
true => "",
false => "u",
};
let suffix = op_suffix(ScalarType::Int, from_bits, false);

intrinsic_ident(&format!("pack{unsigned}s"), suffix, ty_bits)
}

pub(crate) fn unpack_intrinsic(
scalar_type: ScalarType,
scalar_bits: usize,
low: bool,
ty_bits: usize,
) -> Ident {
let suffix = op_suffix(scalar_type, scalar_bits, false);

let low_pref = if low { "lo" } else { "hi" };

intrinsic_ident(&format!("unpack{low_pref}"), suffix, ty_bits)
}

pub(crate) fn intrinsic_ident(name: &str, suffix: &str, ty_bits: usize) -> Ident {
let prefix = match ty_bits {
128 => "",
256 => "256",
512 => "512",
_ => unreachable!(),
};

format_ident!("_mm{prefix}_{name}_{suffix}")
}

pub(crate) fn cast_ident(
src_scalar_ty: ScalarType,
dst_scalar_ty: ScalarType,
scalar_bits: usize,
ty_bits: usize,
) -> Ident {
let prefix = match ty_bits {
128 => "",
256 => "256",
512 => "512",
_ => unreachable!(),
};
let src_name = coarse_type(VecType::new(
src_scalar_ty,
scalar_bits,
ty_bits / scalar_bits,
));
let dst_name = coarse_type(VecType::new(
dst_scalar_ty,
scalar_bits,
ty_bits / scalar_bits,
));

format_ident!("_mm{prefix}_cast{src_name}_{dst_name}")
}
Loading
Loading