Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
: TargetLowering(TM), Subtarget(&STI) {
auto MVTPtr = Subtarget->hasAddr64() ? MVT::i64 : MVT::i32;

// Set the load count for memcmp expand optimization
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;

// Booleans always contain 0 or 1.
setBooleanContents(ZeroOrOneBooleanContent);
// Except in SIMD vectors
Expand Down
17 changes: 17 additions & 0 deletions llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,23 @@ InstructionCost WebAssemblyTTIImpl::getCastInstrCost(
return BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I);
}

// Builds the option set that describes how a memcmp call may be expanded into
// inline loads for WebAssembly.
//
// OptSize is forwarded to the lowering info to pick the load budget.
// IsZeroCmp is not consulted: the same options are returned either way.
WebAssemblyTTIImpl::TTI::MemCmpExpansionOptions
WebAssemblyTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
  TTI::MemCmpExpansionOptions Options;

  // The load budget comes from the target lowering info; the per-block limit
  // is set to the same value.
  Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
  Options.NumLoadsPerBlock = Options.MaxNumLoads;

  // TODO: Teach WebAssembly backend about load v128.
  // if (ST->hasSIMD128())
  //   Options.LoadSizes.push_back(16);

  // Candidate load widths, listed from widest to narrowest.
  Options.LoadSizes.append({8, 4, 2, 1});

  // Loads are allowed to overlap each other.
  Options.AllowOverlappingLoads = true;

  return Options;
}

InstructionCost WebAssemblyTTIImpl::getMemoryOpCost(
unsigned Opcode, Type *Ty, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind, TTI::OperandValueInfo OpInfo,
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
TTI::CastContextHint CCH, TTI::TargetCostKind CostKind,
const Instruction *I = nullptr) const override;

/// Returns the options used when expanding memcmp calls on this target.
/// \p OptSize and \p IsZeroCmp describe the requested expansion.
TTI::MemCmpExpansionOptions
enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const override;

InstructionCost getMemoryOpCost(
unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
TTI::TargetCostKind CostKind,
Expand Down
144 changes: 144 additions & 0 deletions llvm/test/CodeGen/WebAssembly/memcmp-expand.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -O3 -verify-machineinstrs -disable-wasm-fallthrough-return-opt -wasm-disable-explicit-locals -wasm-keep-registers -mattr=+simd128 | FileCheck %s

target triple = "wasm32-unknown-unknown"

declare i32 @memcmp(ptr, ptr, i32)

; Equality-only memcmp of 3 bytes. The expected output is fully inline: each
; pointer gets a 16-bit load at offset 0 and an 8-bit load at offset 2, the
; pairs are xor'ed, or'ed together and tested with i32.eqz. No call to memcmp
; remains.
define i1 @memcmp_expand_3(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_3:
; CHECK: .functype memcmp_expand_3 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.load16_u $push7=, 0($0):p2align=0
; CHECK-NEXT: i32.load16_u $push6=, 0($1):p2align=0
; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6
; CHECK-NEXT: i32.const $push0=, 2
; CHECK-NEXT: i32.add $push3=, $0, $pop0
; CHECK-NEXT: i32.load8_u $push4=, 0($pop3)
; CHECK-NEXT: i32.const $push13=, 2
; CHECK-NEXT: i32.add $push1=, $1, $pop13
; CHECK-NEXT: i32.load8_u $push2=, 0($pop1)
; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2
; CHECK-NEXT: i32.or $push9=, $pop8, $pop5
; CHECK-NEXT: i32.const $push10=, 65535
; CHECK-NEXT: i32.and $push11=, $pop9, $pop10
; CHECK-NEXT: i32.eqz $push12=, $pop11
; CHECK-NEXT: return $pop12
%cmp_3 = call i32 @memcmp(ptr %a, ptr %b, i32 3)
%res = icmp eq i32 %cmp_3, 0
ret i1 %res
}

; Equality-only memcmp of 5 bytes. The expected output is fully inline: each
; pointer gets a 32-bit load at offset 0 and an 8-bit load at offset 4, the
; pairs are xor'ed, or'ed together and tested with i32.eqz.
define i1 @memcmp_expand_5(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_5:
; CHECK: .functype memcmp_expand_5 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0
; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0
; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6
; CHECK-NEXT: i32.const $push0=, 4
; CHECK-NEXT: i32.add $push3=, $0, $pop0
; CHECK-NEXT: i32.load8_u $push4=, 0($pop3)
; CHECK-NEXT: i32.const $push11=, 4
; CHECK-NEXT: i32.add $push1=, $1, $pop11
; CHECK-NEXT: i32.load8_u $push2=, 0($pop1)
; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2
; CHECK-NEXT: i32.or $push9=, $pop8, $pop5
; CHECK-NEXT: i32.eqz $push10=, $pop9
; CHECK-NEXT: return $pop10
%cmp_5 = call i32 @memcmp(ptr %a, ptr %b, i32 5)
%res = icmp eq i32 %cmp_5, 0
ret i1 %res
}

; Equality-only memcmp of 7 bytes. The expected output uses two 32-bit loads
; per pointer, at offsets 0 and 3, so the two loads overlap on byte 3 rather
; than splitting the tail into narrower loads.
define i1 @memcmp_expand_7(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_7:
; CHECK: .functype memcmp_expand_7 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.load $push7=, 0($0):p2align=0
; CHECK-NEXT: i32.load $push6=, 0($1):p2align=0
; CHECK-NEXT: i32.xor $push8=, $pop7, $pop6
; CHECK-NEXT: i32.const $push0=, 3
; CHECK-NEXT: i32.add $push3=, $0, $pop0
; CHECK-NEXT: i32.load $push4=, 0($pop3):p2align=0
; CHECK-NEXT: i32.const $push11=, 3
; CHECK-NEXT: i32.add $push1=, $1, $pop11
; CHECK-NEXT: i32.load $push2=, 0($pop1):p2align=0
; CHECK-NEXT: i32.xor $push5=, $pop4, $pop2
; CHECK-NEXT: i32.or $push9=, $pop8, $pop5
; CHECK-NEXT: i32.eqz $push10=, $pop9
; CHECK-NEXT: return $pop10
%cmp_7 = call i32 @memcmp(ptr %a, ptr %b, i32 7)
%res = icmp eq i32 %cmp_7, 0
ret i1 %res
}

; INFO: Negative test
; Should not expand even with simd128
; A 129-byte comparison is expected to stay a library call: the output passes
; the constant 129 to memcmp and tests the result with i32.eqz.
define i1 @memcmp_expand_129(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_129:
; CHECK: .functype memcmp_expand_129 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.const $push0=, 129
; CHECK-NEXT: call $push1=, memcmp, $0, $1, $pop0
; CHECK-NEXT: i32.eqz $push2=, $pop1
; CHECK-NEXT: return $pop2
%cmp_129 = call i32 @memcmp(ptr %a, ptr %b, i32 129)
%res = icmp eq i32 %cmp_129, 0
ret i1 %res
}

define i1 @memcmp_expand_2(ptr %a, ptr %b) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you add a test where the pointers are aligned and see if we omit the p2aligns?

So basically the same test as this but just with

define i1 @memcmp_expand_2_align(ptr align(2) %a, ptr align(2) %b) {

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yep looks like it works

; Equality-only memcmp of 2 bytes through pointers with no stated alignment.
; The expected output is one i32.load16_u per pointer, each annotated with
; p2align=0, compared directly with i32.eq.
define i1 @memcmp_expand_2(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_2:
; CHECK:         .functype memcmp_expand_2 (i32, i32) -> (i32)
; CHECK-NEXT:  # %bb.0:
; CHECK-NEXT:    i32.load16_u $push1=, 0($0):p2align=0
; CHECK-NEXT:    i32.load16_u $push0=, 0($1):p2align=0
; CHECK-NEXT:    i32.eq $push2=, $pop1, $pop0
; CHECK-NEXT:    return $pop2
  %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2)
  %res = icmp eq i32 %cmp_2, 0
  ret i1 %res
}

; Same 2-byte equality comparison as memcmp_expand_2, but both pointer
; arguments are declared align(2). The expected 16-bit loads carry no p2align
; annotation.
define i1 @memcmp_expand_2_align(ptr align(2) %a, ptr align(2) %b) {
; CHECK-LABEL: memcmp_expand_2_align:
; CHECK:         .functype memcmp_expand_2_align (i32, i32) -> (i32)
; CHECK-NEXT:  # %bb.0:
; CHECK-NEXT:    i32.load16_u $push1=, 0($0)
; CHECK-NEXT:    i32.load16_u $push0=, 0($1)
; CHECK-NEXT:    i32.eq $push2=, $pop1, $pop0
; CHECK-NEXT:    return $pop2
  %cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2)
  %res = icmp eq i32 %cmp_2, 0
  ret i1 %res
}

; CHECK-LABEL: memcmp_expand_2:
; CHECK: .functype memcmp_expand_2 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i32.load16_u $push1=, 0($0):p2align=0
; CHECK-NEXT: i32.load16_u $push0=, 0($1):p2align=0
; CHECK-NEXT: i32.eq $push2=, $pop1, $pop0
; CHECK-NEXT: return $pop2
%cmp_2 = call i32 @memcmp(ptr %a, ptr %b, i32 2)
%res = icmp eq i32 %cmp_2, 0
ret i1 %res
}


; Equality-only memcmp of 8 bytes. The expected output is a single i64.load
; per pointer compared directly with i64.eq.
define i1 @memcmp_expand_8(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_8:
; CHECK: .functype memcmp_expand_8 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i64.load $push1=, 0($0):p2align=0
; CHECK-NEXT: i64.load $push0=, 0($1):p2align=0
; CHECK-NEXT: i64.eq $push2=, $pop1, $pop0
; CHECK-NEXT: return $pop2
%cmp_8 = call i32 @memcmp(ptr %a, ptr %b, i32 8)
%res = icmp eq i32 %cmp_8, 0
ret i1 %res
}


; TODO: Should be using a single load i64x2 or equivalent in bitsizes
; Current expected output for a 16-byte equality comparison: two i64 loads
; per pointer at offsets 0 and 8, xor'ed pairwise, or'ed together and tested
; with i64.eqz.
define i1 @memcmp_expand_16(ptr %a, ptr %b) {
; CHECK-LABEL: memcmp_expand_16:
; CHECK: .functype memcmp_expand_16 (i32, i32) -> (i32)
; CHECK-NEXT: # %bb.0:
; CHECK-NEXT: i64.load $push7=, 0($0):p2align=0
; CHECK-NEXT: i64.load $push6=, 0($1):p2align=0
; CHECK-NEXT: i64.xor $push8=, $pop7, $pop6
; CHECK-NEXT: i32.const $push0=, 8
; CHECK-NEXT: i32.add $push3=, $0, $pop0
; CHECK-NEXT: i64.load $push4=, 0($pop3):p2align=0
; CHECK-NEXT: i32.const $push11=, 8
; CHECK-NEXT: i32.add $push1=, $1, $pop11
; CHECK-NEXT: i64.load $push2=, 0($pop1):p2align=0
; CHECK-NEXT: i64.xor $push5=, $pop4, $pop2
; CHECK-NEXT: i64.or $push9=, $pop8, $pop5
; CHECK-NEXT: i64.eqz $push10=, $pop9
; CHECK-NEXT: return $pop10
%cmp_16 = call i32 @memcmp(ptr %a, ptr %b, i32 16)
%res = icmp eq i32 %cmp_16, 0
ret i1 %res
}




Loading