Skip to content

Commit 403d9a2

Browse files
committed
[WebAssembly] Optimize convert_iKxN_u into convert_iKxN_s
convert_iKxN_s is canonicalized into convert_iKxN_u when the argument is known to have a zero sign bit. This results in emitting Wasm opcodes that, on some targets (such as x86_64), are dramatically slower than their signed counterparts on major engines. Similarly to X86, we now fix this up during instruction selection (isel): the unsigned conversion is turned back into the signed one when the instruction carries the nonneg flag from canonicalization or when the source is known to have a zero sign bit.
1 parent 8349bbd commit 403d9a2

File tree

3 files changed

+55
-5
lines changed

3 files changed

+55
-5
lines changed

llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2934,6 +2934,25 @@ performVectorExtendToFPCombine(SDNode *N,
29342934
return DAG.getNode(N->getOpcode(), SDLoc(N), ResVT, Conv);
29352935
}
29362936

2937+
/// Rewrites a vector UINT_TO_FP node as SINT_TO_FP when its operand is known
/// to be non-negative.
///
/// Invoked from PerformDAGCombine for ISD::UINT_TO_FP nodes, after
/// performVectorExtendToFPCombine has declined to produce a replacement.
///
/// \param N   The node being combined; its operand 0 is the integer source.
/// \param DCI Combiner state; only its SelectionDAG is used here.
/// \returns A new ISD::SINT_TO_FP node with the same result type and operand
///          if the result type is a vector and either the node carries the
///          nonneg flag or the operand's sign bit is known to be zero;
///          otherwise an empty SDValue, meaning "no change".
static SDValue
2938+
performVectorNonNegToFPCombine(SDNode *N,
2939+
TargetLowering::DAGCombinerInfo &DCI) {
2940+
auto &DAG = DCI.DAG;
2941+
2942+
SDNodeFlags Flags = N->getFlags();
2943+
SDValue Op0 = N->getOperand(0);
2944+
EVT VT = N->getValueType(0);
2945+
2946+
// Optimize uitofp to sitofp when the sign bit is known to be zero.
2947+
// Depending on the target (runtime) backend, this might be performance
2948+
// neutral (e.g. AArch64) or a significant improvement (e.g. x86_64).
2949+
if (VT.isVector() && (Flags.hasNonNeg() || DAG.SignBitIsZero(Op0))) {
2950+
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
2951+
}
2952+
2953+
// Scalar result, or the operand may be negative: leave the node untouched.
return SDValue();
2954+
}
2955+
29372956
static SDValue
29382957
performVectorExtendCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
29392958
auto &DAG = DCI.DAG;
@@ -3515,6 +3534,9 @@ WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
35153534
case ISD::ZERO_EXTEND:
35163535
return performVectorExtendCombine(N, DCI);
35173536
case ISD::UINT_TO_FP:
3537+
if (auto ExtCombine = performVectorExtendToFPCombine(N, DCI))
3538+
return ExtCombine;
3539+
return performVectorNonNegToFPCombine(N, DCI);
35183540
case ISD::SINT_TO_FP:
35193541
return performVectorExtendToFPCombine(N, DCI);
35203542
case ISD::FP_TO_SINT_SAT:

llvm/test/CodeGen/WebAssembly/simd-conversions.ll

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -441,3 +441,31 @@ define <2 x double> @promote_mixed_v2f64(<4 x float> %x, <4 x float> %y) {
441441
%a = fpext <2 x float> %v to <2 x double>
442442
ret <2 x double> %a
443443
}
444+
445+
; Negative test: ashr replicates the sign bit of %x, so %a may still be
; negative and the uitofp is expected to stay an unsigned conversion
; (f32x4.convert_i32x4_u).
define <4 x float> @convert_u_v4f32_maybeneg(<4 x i32> %x) {
446+
; CHECK-LABEL: convert_u_v4f32_maybeneg:
447+
; CHECK: .functype convert_u_v4f32_maybeneg (v128) -> (v128)
448+
; CHECK-NEXT: # %bb.0:
449+
; CHECK-NEXT: local.get 0
450+
; CHECK-NEXT: i32.const 1
451+
; CHECK-NEXT: i32x4.shr_s
452+
; CHECK-NEXT: f32x4.convert_i32x4_u
453+
; CHECK-NEXT: # fallthrough-return
454+
%a = ashr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
455+
%b = uitofp <4 x i32> %a to <4 x float>
456+
ret <4 x float> %b
457+
}
458+
459+
; Positive test: lshr by 1 shifts a zero into the sign bit, so %a is known
; non-negative and the uitofp is expected to be emitted as the signed
; conversion (f32x4.convert_i32x4_s).
define <4 x float> @convert_u_v4f32_nonneg(<4 x i32> %x) {
460+
; CHECK-LABEL: convert_u_v4f32_nonneg:
461+
; CHECK: .functype convert_u_v4f32_nonneg (v128) -> (v128)
462+
; CHECK-NEXT: # %bb.0:
463+
; CHECK-NEXT: local.get 0
464+
; CHECK-NEXT: i32.const 1
465+
; CHECK-NEXT: i32x4.shr_u
466+
; CHECK-NEXT: f32x4.convert_i32x4_s
467+
; CHECK-NEXT: # fallthrough-return
468+
%a = lshr <4 x i32> %x, <i32 1, i32 1, i32 1, i32 1>
469+
%b = uitofp <4 x i32> %a to <4 x float>
470+
ret <4 x float> %b
471+
}

llvm/test/CodeGen/WebAssembly/simd-extending-convert.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ define <4 x float> @extend_to_float_low_i16x8_u(<8 x i16> %x) {
1212
; CHECK-NEXT: # %bb.0:
1313
; CHECK-NEXT: local.get 0
1414
; CHECK-NEXT: i32x4.extend_low_i16x8_u
15-
; CHECK-NEXT: f32x4.convert_i32x4_u
15+
; CHECK-NEXT: f32x4.convert_i32x4_s
1616
; CHECK-NEXT: # fallthrough-return
1717
%low = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
1818
%extended = uitofp <4 x i16> %low to <4 x float>
@@ -25,7 +25,7 @@ define <4 x float> @extend_to_float_high_i16x8_u(<8 x i16> %x) {
2525
; CHECK-NEXT: # %bb.0:
2626
; CHECK-NEXT: local.get 0
2727
; CHECK-NEXT: i32x4.extend_high_i16x8_u
28-
; CHECK-NEXT: f32x4.convert_i32x4_u
28+
; CHECK-NEXT: f32x4.convert_i32x4_s
2929
; CHECK-NEXT: # fallthrough-return
3030
%high = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3131
%extended = uitofp <4 x i16> %high to <4 x float>
@@ -39,7 +39,7 @@ define <4 x float> @extend_to_float_low_i8x16_u(<8 x i8> %x) {
3939
; CHECK-NEXT: local.get 0
4040
; CHECK-NEXT: i16x8.extend_low_i8x16_u
4141
; CHECK-NEXT: i32x4.extend_low_i16x8_u
42-
; CHECK-NEXT: f32x4.convert_i32x4_u
42+
; CHECK-NEXT: f32x4.convert_i32x4_s
4343
; CHECK-NEXT: # fallthrough-return
4444
%low = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
4545
%extended = uitofp <4 x i8> %low to <4 x float>
@@ -55,7 +55,7 @@ define <4 x float> @extend_to_float_high_i8x16_u(<8 x i8> %x) {
5555
; CHECK-NEXT: i8x16.shuffle 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
5656
; CHECK-NEXT: i16x8.extend_low_i8x16_u
5757
; CHECK-NEXT: i32x4.extend_low_i16x8_u
58-
; CHECK-NEXT: f32x4.convert_i32x4_u
58+
; CHECK-NEXT: f32x4.convert_i32x4_s
5959
; CHECK-NEXT: # fallthrough-return
6060
%high = shufflevector <8 x i8> %x, <8 x i8> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
6161
%extended = uitofp <4 x i8> %high to <4 x float>
@@ -136,7 +136,7 @@ define <2 x double> @extend_to_double_low_i16x4_u(<4 x i16> %x) {
136136
; CHECK-NEXT: # %bb.0:
137137
; CHECK-NEXT: local.get 0
138138
; CHECK-NEXT: i32x4.extend_low_i16x8_u
139-
; CHECK-NEXT: f64x2.convert_low_i32x4_u
139+
; CHECK-NEXT: f64x2.convert_low_i32x4_s
140140
; CHECK-NEXT: # fallthrough-return
141141
%low = shufflevector <4 x i16> %x, <4 x i16> undef, <2 x i32> <i32 0, i32 1>
142142
%extended = uitofp <2 x i16> %low to <2 x double>

0 commit comments

Comments
 (0)