Commit ee275a8

Switch to using Wasmtime-style builtins for ceil, floor, etc. (#10657)
* Switch to using Wasmtime-style builtins for ceil, floor, etc.

  With this patch, all emitted calls are Wasmtime-style builtins, rather than
  Cranelift-style libcalls. This ensures that all calls from Cranelift-generated
  code into Wasmtime host code use the same mechanism, and eliminates the
  relocation handling code for the libcall mechanism.

* Update tests.
* Avoid using x86_pshufb on non-x86 platforms.
* Revert unneeded f32/f64 changes in Pulley.
* Define i8x16 as an unconstructible type if sse isn't available.
* Delete the setters too.
* Fix f32/f64 setters.
* Test with prtest:full.

  prtest:full

* Support fma.
* Return true for `has_native_fma` on pulley.

  This works because pulley already has code implementing fma. This avoids
  needing to marshal f32x4 values into builtin function calls on pulley.

* Update tests.
1 parent 7a66c39 commit ee275a8
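
Every helper this commit adds to the Wasm-to-Cranelift translator follows the same shape: use the native Cranelift instruction when the target ISA can lower it, otherwise call a Wasmtime builtin and pass the vmctx pointer as the first argument. The sketch below condenses that pattern; it mirrors `ceil_f32` from the crates/cranelift/src/func_environ.rs diff further down and is not an additional API.

    // Condensed restatement of the pattern used by the new helpers below.
    pub fn ceil_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
        if self.isa.has_round() {
            // The ISA has rounding instructions; let Cranelift lower `ceil` directly.
            builder.ins().ceil(value)
        } else {
            // Otherwise call the Wasmtime-style builtin, passing vmctx first.
            let ceil = self.builtin_functions.ceil_f32(builder.func);
            let vmctx = self.vmctx_val(&mut builder.cursor());
            let call = builder.ins().call(ceil, &[vmctx, value]);
            *builder.func.dfg.inst_results(call).first().unwrap()
        }
    }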

File tree

38 files changed: +693 -580 lines changed

cranelift/codegen/src/isa/aarch64/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -223,6 +223,10 @@ impl TargetIsa for AArch64Backend {
         true
     }
 
+    fn has_round(&self) -> bool {
+        true
+    }
+
     fn has_x86_blendv_lowering(&self, _: Type) -> bool {
         false
     }

cranelift/codegen/src/isa/mod.rs

Lines changed: 3 additions & 0 deletions
@@ -385,6 +385,9 @@ pub trait TargetIsa: fmt::Display + Send + Sync {
     /// not detected.
     fn has_native_fma(&self) -> bool;
 
+    /// Returns whether this ISA has instructions for `ceil`, `floor`, etc.
+    fn has_round(&self) -> bool;
+
     /// Returns whether the CLIF `x86_blendv` instruction is implemented for
     /// this ISA for the specified type.
     fn has_x86_blendv_lowering(&self, ty: Type) -> bool;

cranelift/codegen/src/isa/pulley_shared/mod.rs

Lines changed: 7 additions & 1 deletion
@@ -225,7 +225,13 @@ where
     }
 
     fn has_native_fma(&self) -> bool {
-        false
+        // The pulley interpreter does have fma opcodes.
+        true
+    }
+
+    fn has_round(&self) -> bool {
+        // The pulley interpreter does have rounding opcodes.
+        true
     }
 
     fn has_x86_blendv_lowering(&self, _ty: ir::Type) -> bool {

cranelift/codegen/src/isa/riscv64/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -201,6 +201,10 @@ impl TargetIsa for Riscv64Backend {
         true
     }
 
+    fn has_round(&self) -> bool {
+        true
+    }
+
     fn has_x86_blendv_lowering(&self, _: Type) -> bool {
         false
     }

cranelift/codegen/src/isa/s390x/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -183,6 +183,10 @@ impl TargetIsa for S390xBackend {
         true
     }
 
+    fn has_round(&self) -> bool {
+        true
+    }
+
     fn has_x86_blendv_lowering(&self, _: Type) -> bool {
         false
     }

cranelift/codegen/src/isa/x64/mod.rs

Lines changed: 4 additions & 0 deletions
@@ -167,6 +167,10 @@ impl TargetIsa for X64Backend {
         self.x64_flags.use_fma()
     }
 
+    fn has_round(&self) -> bool {
+        self.x64_flags.use_sse41()
+    }
+
     fn has_x86_blendv_lowering(&self, ty: Type) -> bool {
         // The `blendvpd`, `blendvps`, and `pblendvb` instructions are all only
         // available from SSE 4.1 and onwards. Otherwise the i16x8 type has no

crates/cranelift/src/func_environ.rs

Lines changed: 209 additions & 5 deletions
@@ -12,7 +12,7 @@ use cranelift_codegen::ir::immediates::{Imm64, Offset32};
 use cranelift_codegen::ir::pcc::Fact;
 use cranelift_codegen::ir::types::*;
 use cranelift_codegen::ir::{self, types};
-use cranelift_codegen::ir::{ArgumentPurpose, Function, InstBuilder, MemFlags};
+use cranelift_codegen::ir::{ArgumentPurpose, ConstantData, Function, InstBuilder, MemFlags};
 use cranelift_codegen::isa::{TargetFrontendConfig, TargetIsa};
 use cranelift_entity::{EntityRef, PrimaryMap, SecondaryMap};
 use cranelift_frontend::FunctionBuilder;
@@ -3219,10 +3219,6 @@ impl FuncEnvironment<'_> {
         self.isa.has_x86_blendv_lowering(ty)
     }
 
-    pub fn use_x86_pshufb_for_relaxed_swizzle(&self) -> bool {
-        self.isa.has_x86_pshufb_lowering()
-    }
-
     pub fn use_x86_pmulhrsw_for_relaxed_q15mul(&self) -> bool {
         self.isa.has_x86_pmulhrsw_lowering()
     }
@@ -3323,6 +3319,214 @@
         let _ = (builder, num_pages, mem_index);
     }
 
+    pub fn ceil_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // If the ISA has rounding instructions, let Cranelift use them. But if
+        // not, lower to a libcall here, rather than having Cranelift do it. We
+        // can pass our libcall the vmctx pointer, which we use for stack
+        // overflow checking.
+        if self.isa.has_round() {
+            builder.ins().ceil(value)
+        } else {
+            let ceil = self.builtin_functions.ceil_f32(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(ceil, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn ceil_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().ceil(value)
+        } else {
+            let ceil = self.builtin_functions.ceil_f64(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(ceil, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn floor_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().floor(value)
+        } else {
+            let floor = self.builtin_functions.floor_f32(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(floor, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn floor_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().floor(value)
+        } else {
+            let floor = self.builtin_functions.floor_f64(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(floor, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn trunc_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().trunc(value)
+        } else {
+            let trunc = self.builtin_functions.trunc_f32(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(trunc, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn trunc_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().trunc(value)
+        } else {
+            let trunc = self.builtin_functions.trunc_f64(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(trunc, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn nearest_f32(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().nearest(value)
+        } else {
+            let nearest = self.builtin_functions.nearest_f32(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(nearest, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn nearest_f64(&mut self, builder: &mut FunctionBuilder, value: ir::Value) -> ir::Value {
+        // See the comments in `ceil_f32` about libcalls.
+        if self.isa.has_round() {
+            builder.ins().nearest(value)
+        } else {
+            let nearest = self.builtin_functions.nearest_f64(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(nearest, &[vmctx, value]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn swizzle(
+        &mut self,
+        builder: &mut FunctionBuilder,
+        a: ir::Value,
+        b: ir::Value,
+    ) -> ir::Value {
+        // On x86, swizzle would typically be compiled to `pshufb`, except
+        // that that's not available on CPUs that lack SSSE3. In that case,
+        // fall back to a builtin function.
+        if !self.is_x86() || self.isa.has_x86_pshufb_lowering() {
+            builder.ins().swizzle(a, b)
+        } else {
+            let swizzle = self.builtin_functions.i8x16_swizzle(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(swizzle, &[vmctx, a, b]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn relaxed_swizzle(
+        &mut self,
+        builder: &mut FunctionBuilder,
+        a: ir::Value,
+        b: ir::Value,
+    ) -> ir::Value {
+        // As above, fall back to a builtin if we lack SSSE3.
+        if !self.is_x86() || self.isa.has_x86_pshufb_lowering() {
+            if !self.is_x86() || self.relaxed_simd_deterministic() {
+                builder.ins().swizzle(a, b)
+            } else {
+                builder.ins().x86_pshufb(a, b)
+            }
+        } else {
+            let swizzle = self.builtin_functions.i8x16_swizzle(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(swizzle, &[vmctx, a, b]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn i8x16_shuffle(
+        &mut self,
+        builder: &mut FunctionBuilder,
+        a: ir::Value,
+        b: ir::Value,
+        lanes: &[u8; 16],
+    ) -> ir::Value {
+        // As with swizzle, i8x16.shuffle would also commonly be implemented
+        // with pshufb, so if we lack SSSE3, fall back to a builtin.
+        if !self.is_x86() || self.isa.has_x86_pshufb_lowering() {
+            let lanes = ConstantData::from(&lanes[..]);
+            let mask = builder.func.dfg.immediates.push(lanes);
+            builder.ins().shuffle(a, b, mask)
+        } else {
+            let lanes = builder
+                .func
+                .dfg
+                .constants
+                .insert(ConstantData::from(&lanes[..]));
+            let lanes = builder.ins().vconst(I8X16, lanes);
+            let i8x16_shuffle = self.builtin_functions.i8x16_shuffle(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(i8x16_shuffle, &[vmctx, a, b, lanes]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        }
+    }
+
+    pub fn fma_f32x4(
+        &mut self,
+        builder: &mut FunctionBuilder,
+        a: ir::Value,
+        b: ir::Value,
+        c: ir::Value,
+    ) -> ir::Value {
+        if self.has_native_fma() {
+            builder.ins().fma(a, b, c)
+        } else if self.relaxed_simd_deterministic() {
+            // Deterministic semantics are "fused multiply and add".
+            let fma = self.builtin_functions.fma_f32x4(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(fma, &[vmctx, a, b, c]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        } else {
+            let mul = builder.ins().fmul(a, b);
+            builder.ins().fadd(mul, c)
+        }
+    }
+
+    pub fn fma_f64x2(
+        &mut self,
+        builder: &mut FunctionBuilder,
+        a: ir::Value,
+        b: ir::Value,
+        c: ir::Value,
+    ) -> ir::Value {
+        if self.has_native_fma() {
+            builder.ins().fma(a, b, c)
+        } else if self.relaxed_simd_deterministic() {
+            // Deterministic semantics are "fused multiply and add".
+            let fma = self.builtin_functions.fma_f64x2(builder.func);
+            let vmctx = self.vmctx_val(&mut builder.cursor());
+            let call = builder.ins().call(fma, &[vmctx, a, b, c]);
+            *builder.func.dfg.inst_results(call).first().unwrap()
+        } else {
+            let mul = builder.ins().fmul(a, b);
+            builder.ins().fadd(mul, c)
+        }
+    }
+
     pub fn isa(&self) -> &dyn TargetIsa {
         &*self.isa
     }
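
These helpers are presumably invoked from the Wasm operator translator (one of the 38 changed files, not shown in this excerpt). The fragments below illustrate what such call sites might look like, assuming a code_translator-style match over operators with a value stack; the `state.pop1`/`state.push1` plumbing is a sketch, not copied from the commit.

    // Hypothetical call sites in the operator-translation match.
    Operator::F32Ceil => {
        let arg = state.pop1();
        state.push1(environ.ceil_f32(builder, arg));
    }
    Operator::I8x16Swizzle => {
        let (a, b) = state.pop2();
        state.push1(environ.swizzle(builder, a, b));
    }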

crates/cranelift/src/lib.rs

Lines changed: 23 additions & 20 deletions
@@ -307,8 +307,9 @@ fn mach_reloc_to_reloc(
             }
         }
         FinalizedRelocTarget::ExternalName(ExternalName::LibCall(libcall)) => {
-            let libcall = libcall_cranelift_to_wasmtime(libcall);
-            RelocationTarget::HostLibcall(libcall)
+            // We should have avoided any code that needs this style of libcalls
+            // in the Wasm-to-Cranelift translator.
+            panic!("unexpected libcall {libcall:?}");
         }
         _ => panic!("unrecognized external name"),
     };
@@ -320,24 +321,6 @@
     }
 }
 
-fn libcall_cranelift_to_wasmtime(call: ir::LibCall) -> wasmtime_environ::obj::LibCall {
-    use wasmtime_environ::obj::LibCall as LC;
-    match call {
-        ir::LibCall::FloorF32 => LC::FloorF32,
-        ir::LibCall::FloorF64 => LC::FloorF64,
-        ir::LibCall::NearestF32 => LC::NearestF32,
-        ir::LibCall::NearestF64 => LC::NearestF64,
-        ir::LibCall::CeilF32 => LC::CeilF32,
-        ir::LibCall::CeilF64 => LC::CeilF64,
-        ir::LibCall::TruncF32 => LC::TruncF32,
-        ir::LibCall::TruncF64 => LC::TruncF64,
-        ir::LibCall::FmaF32 => LC::FmaF32,
-        ir::LibCall::FmaF64 => LC::FmaF64,
-        ir::LibCall::X86Pshufb => LC::X86Pshufb,
-        _ => panic!("cranelift emitted a libcall wasmtime does not support: {call:?}"),
-    }
-}
-
 /// Helper structure for creating a `Signature` for all builtins.
 struct BuiltinFunctionSignatures {
     pointer_type: ir::Type,
@@ -373,10 +356,30 @@ impl BuiltinFunctionSignatures {
         AbiParam::new(ir::types::I64)
     }
 
+    fn f32(&self) -> AbiParam {
+        AbiParam::new(ir::types::F32)
+    }
+
+    fn f64(&self) -> AbiParam {
+        AbiParam::new(ir::types::F64)
+    }
+
     fn u8(&self) -> AbiParam {
         AbiParam::new(ir::types::I8)
    }
 
+    fn i8x16(&self) -> AbiParam {
+        AbiParam::new(ir::types::I8X16)
+    }
+
+    fn f32x4(&self) -> AbiParam {
+        AbiParam::new(ir::types::F32X4)
+    }
+
+    fn f64x2(&self) -> AbiParam {
+        AbiParam::new(ir::types::F64X2)
+    }
+
     fn bool(&self) -> AbiParam {
         AbiParam::new(ir::types::I8)
     }
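
These `AbiParam` helpers let `BuiltinFunctionSignatures` describe the new float and SIMD builtins. As a rough, hand-written illustration of the shape of one such signature (the real code derives signatures from Wasmtime's builtin-function listing, and the calling convention shown here is an assumption):

    use cranelift_codegen::ir::{self, AbiParam, ArgumentPurpose};
    use cranelift_codegen::isa::CallConv;

    // Approximate signature of the `ceil_f32` builtin: a vmctx pointer first,
    // then the f32 operand, returning the rounded f32.
    fn example_ceil_f32_signature(pointer_type: ir::Type) -> ir::Signature {
        let mut sig = ir::Signature::new(CallConv::SystemV); // assumed host call conv
        sig.params
            .push(AbiParam::special(pointer_type, ArgumentPurpose::VMContext));
        sig.params.push(AbiParam::new(ir::types::F32));
        sig.returns.push(AbiParam::new(ir::types::F32));
        sig
    }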
