Skip to content

[NVPTX] Add prefetch tensormap variant #146203

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 22 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion llvm/docs/NVPTXUsage.rst
Original file line number Diff line number Diff line change
Expand Up @@ -971,6 +971,10 @@ Syntax:
declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)

declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)

declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)

Expand All @@ -983,7 +987,10 @@ The '``@llvm.nvvm.prefetch.*``' and '``@llvm.nvvm.prefetchu.*``' intrinsic
correspond to the '``prefetch.*``;' and '``prefetchu.*``' family of PTX instructions.
The '``prefetch.*``' instructions bring the cache line containing the
specified address in global or local memory address space into the
specified cache level (L1 or L2). The '`prefetchu.*``' instruction brings the cache line
specified cache level (L1 or L2). If the '``.tensormap``' qualifier is specified, then the
prefetch instruction brings the cache line containing the specified address in the
'``.const``' or '``.param``' memory state space for subsequent use by the '``cp.async.bulk.tensor``'
instruction. The '``prefetchu.*``' instruction brings the cache line
containing the specified generic address into the specified uniform cache level.
If no address space is specified, it is assumed to be a generic address. The intrinsic
uses an eviction priority which can be specified by the '``.level::eviction_priority``' modifier.
Expand Down
13 changes: 8 additions & 5 deletions llvm/include/llvm/IR/IntrinsicsNVVM.td
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@

def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr
def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr
def llvm_constant_ptr_ty: LLVMQualPointerType<4>; // (const)ptr
def llvm_local_ptr_ty : LLVMQualPointerType<5>; // (local)ptr
def llvm_tmem_ptr_ty : LLVMQualPointerType<6>; // (tensor memory)ptr
def llvm_shared_cluster_ptr_ty : LLVMQualPointerType<7>; // (shared_cluster)ptr
Expand Down Expand Up @@ -2087,15 +2088,17 @@ foreach dim = 1...5 in {
// Intrinsics for Prefetch and Prefetchu
let IntrProperties = [IntrArgMemOnly, ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>] in {
foreach level = ["L1", "L2"] in {
def int_nvvm_prefetch_ # level : Intrinsic<[], [llvm_ptr_ty]>;
def int_nvvm_prefetch_global_ # level : Intrinsic<[], [llvm_global_ptr_ty]>;
def int_nvvm_prefetch_local_ # level : Intrinsic<[], [llvm_local_ptr_ty]>;
def int_nvvm_prefetch_ # level : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
def int_nvvm_prefetch_global_ # level : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;
def int_nvvm_prefetch_local_ # level : DefaultAttrsIntrinsic<[], [llvm_local_ptr_ty]>;
}

def int_nvvm_prefetch_tensormap : DefaultAttrsIntrinsic<[], [llvm_anyptr_ty]>;

foreach eviction_priority = ["evict_normal", "evict_last"] in
def int_nvvm_prefetch_global_L2_ # eviction_priority : Intrinsic<[], [llvm_global_ptr_ty]>;
def int_nvvm_prefetch_global_L2_ # eviction_priority : DefaultAttrsIntrinsic<[], [llvm_global_ptr_ty]>;

def int_nvvm_prefetchu_L1 : Intrinsic<[], [llvm_ptr_ty]>;
def int_nvvm_prefetchu_L1 : DefaultAttrsIntrinsic<[], [llvm_ptr_ty]>;
}

// applypriority
Expand Down
12 changes: 12 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3973,6 +3973,18 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
return true;
}

case Intrinsic::nvvm_prefetch_tensormap: {
auto &DL = I.getDataLayout();
Info.opc = ISD::INTRINSIC_VOID;
Info.memVT = getPointerTy(DL);
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags =
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable;
Info.align.reset();
return true;
}

case Intrinsic::nvvm_ldu_global_i:
case Intrinsic::nvvm_ldu_global_f:
case Intrinsic::nvvm_ldu_global_p: {
Expand Down
63 changes: 40 additions & 23 deletions llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ def AS_match {
code global = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_GLOBAL);
}];
code const = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_CONST);
}];
code param = [{
return ChkMemSDNodeAddressSpace(N, llvm::ADDRESS_SPACE_PARAM);
}];
}

// A node that will be replaced with the current PTX version.
Expand Down Expand Up @@ -744,35 +750,46 @@ foreach dim = [1, 2, 3, 4, 5] in {
}
}

//Prefetch and Prefetchu
//Prefetchu and Prefetch

class PREFETCH_INTRS<string InstName> :
BasicNVPTXInst<(outs), (ins ADDR:$addr),
InstName,
[(!cast<Intrinsic>(!strconcat("int_nvvm_",
!subst(".", "_", InstName))) addr:$addr)]>,
Requires<[hasPTX<80>, hasSM<90>]>;

defvar frag_pat = (int_nvvm_prefetch_tensormap node:$addr);

def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1">;
def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2">;
def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1">;
def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">;
def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">;
def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">;
multiclass PREFETCH_TENSORMAP_PATFRAG<string suffix, code predicate> {
def !tolower(suffix) : PatFrag<!setdagop(frag_pat, ops), frag_pat, predicate>;
}

def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr),
"prefetch.global.L2::evict_normal",
[(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>,
Requires<[hasPTX<80>, hasSM<90>]>;
defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"CONST", AS_match.const>;
defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"GENERIC", AS_match.generic>;
defm prefetch_tensormap_ : PREFETCH_TENSORMAP_PATFRAG<"PARAM", AS_match.param>;

def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr),
"prefetch.global.L2::evict_last",
[(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>,
Requires<[hasPTX<80>, hasSM<90>]>;
multiclass PREFETCH_TENSORMAP_INST<string addrspace_name, PatFrag pattern_frag> {
def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr),
"prefetch" # addrspace_name # ".tensormap",
[(pattern_frag addr:$addr)]>,
Requires<[hasPTX<80>, hasSM<90>]>;
}

defm PREFETCH_CONST_TENSORMAP : PREFETCH_TENSORMAP_INST<".const", prefetch_tensormap_const>;
defm PREFETCH_GENERIC_TENSORMAP : PREFETCH_TENSORMAP_INST<"", prefetch_tensormap_generic>;
defm PREFETCH_PARAM_TENSORMAP : PREFETCH_TENSORMAP_INST<".param", prefetch_tensormap_param>;

class PREFETCH_INTRS<string InstName, Intrinsic Intr> :
BasicNVPTXInst<(outs), (ins ADDR:$addr),
InstName,
[(Intr addr:$addr)]>,
Requires<[hasPTX<80>, hasSM<90>]>;

def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">;
def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1", int_nvvm_prefetchu_L1>;
def PREFETCH_L1 : PREFETCH_INTRS<"prefetch.L1", int_nvvm_prefetch_L1>;
def PREFETCH_L2 : PREFETCH_INTRS<"prefetch.L2", int_nvvm_prefetch_L2>;
def PREFETCH_GLOBAL_L1 : PREFETCH_INTRS<"prefetch.global.L1", int_nvvm_prefetch_global_L1>;
def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1", int_nvvm_prefetch_local_L1>;
def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2", int_nvvm_prefetch_global_L2>;
def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2", int_nvvm_prefetch_local_L2>;
def PREFETCH_GLOBAL_L2_EVICT_NORMAL : PREFETCH_INTRS<"prefetch.global.L2::evict_normal",
int_nvvm_prefetch_global_L2_evict_normal>;
def PREFETCH_GLOBAL_L2_EVICT_LAST : PREFETCH_INTRS<"prefetch.global.L2::evict_last",
int_nvvm_prefetch_global_L2_evict_last>;

//Applypriority intrinsics
class APPLYPRIORITY_L2_INTRS<string addrspace> :
Expand Down
8 changes: 7 additions & 1 deletion llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,7 +564,8 @@ bool NVPTXTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
case Intrinsic::nvvm_isspacep_global:
case Intrinsic::nvvm_isspacep_local:
case Intrinsic::nvvm_isspacep_shared:
case Intrinsic::nvvm_isspacep_shared_cluster: {
case Intrinsic::nvvm_isspacep_shared_cluster:
case Intrinsic::nvvm_prefetch_tensormap: {
OpIndexes.push_back(0);
return true;
}
Expand All @@ -587,6 +588,11 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
return ConstantInt::get(II->getType(), *R);
return nullptr;
}
case Intrinsic::nvvm_prefetch_tensormap: {
IRBuilder<> Builder(II);
return Builder.CreateUnaryIntrinsic(Intrinsic::nvvm_prefetch_tensormap,
NewV);
}
}
return nullptr;
}
Expand Down
78 changes: 78 additions & 0 deletions llvm/test/CodeGen/NVPTX/prefetch-inferas-test.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
; RUN: opt < %s -S -passes=infer-address-spaces | FileCheck %s --check-prefix=INFER
; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck %s --check-prefix=PTX
; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify %}

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

@constant_tensormap = addrspace(4) global [64 x i8] zeroinitializer, align 64

; Inference from const address space
; Inference from a cast of a const-space global: infer-address-spaces should
; drop the addrspacecast and rewrite the call to the .p4 (const) variant.
; Note: the previous "; BOTH:" check line was removed — no RUN line defines a
; BOTH prefix, so FileCheck never evaluated it.
define void @test_infer_const_from_cast() {
; INFER-LABEL: @test_infer_const_from_cast
; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
; PTX-LABEL: .visible .func test_infer_const_from_cast(
; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
entry:
  %casted = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %casted)
  ret void
}

; Cast from Const space to Generic
define void @test_const_to_generic_cast(ptr addrspace(4) %const_ptr) {
; INFER-LABEL: @test_const_to_generic_cast
; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
; PTX-LABEL: .visible .func test_const_to_generic_cast(
; PTX: prefetch.const.tensormap [%rd{{[0-9]+}}];
entry:
  ; The const -> generic addrspacecast should be folded by infer-address-spaces,
  ; leaving a direct call to the .p4 (const) variant, which in turn lowers to
  ; the "prefetch.const.tensormap" PTX form.
  %cast = addrspacecast ptr addrspace(4) %const_ptr to ptr
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
  ret void
}

; No inference possible
define void @test_no_inference_possible(ptr %generic_ptr) {
; INFER-LABEL: @test_no_inference_possible
; INFER: call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
; PTX-LABEL: .visible .func test_no_inference_possible(
; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
entry:
  ; No address-space information is available for %generic_ptr, so the call
  ; must remain on the generic (.p0) variant and lower to the unqualified
  ; "prefetch.tensormap" PTX form.
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %generic_ptr)
  ret void
}

; Cast from Parameter space to Generic
define void @test_param_to_generic_cast(ptr addrspace(101) %param_ptr) {
; INFER-LABEL: @test_param_to_generic_cast
; INFER: call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
; PTX-LABEL: .visible .func test_param_to_generic_cast(
; PTX: prefetch.param.tensormap [%rd{{[0-9]+}}];
entry:
  ; The param -> generic addrspacecast should be folded by infer-address-spaces,
  ; leaving a direct call to the .p101 (param) variant, which lowers to the
  ; "prefetch.param.tensormap" PTX form.
  %cast = addrspacecast ptr addrspace(101) %param_ptr to ptr
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast)
  ret void
}

; Multiple casts in sequence
; Inference through a chain of round-trip casts: const -> generic -> const ->
; generic collapses, so the intrinsic is inferred to operate directly on the
; const (.p4) pointer.
; Fix: the call used the unmangled name @llvm.nvvm.prefetch.tensormap; the
; intrinsic is overloaded on the pointer type, and overloaded intrinsics must
; be referenced by their mangled name (.p0 here), matching the declarations
; below and every other call site in this file.
define void @test_infer_through_multiple_casts() {
; INFER-LABEL: @test_infer_through_multiple_casts
; INFER: call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) @constant_tensormap)
; PTX-LABEL: .visible .func test_infer_through_multiple_casts(
; PTX: mov.b64 %rd{{[0-9]+}}, constant_tensormap;
; PTX: cvta.const.u64 %rd{{[0-9]+}}, %rd{{[0-9]+}};
; PTX: prefetch.tensormap [%rd{{[0-9]+}}];
entry:
  %cast1 = addrspacecast ptr addrspace(4) @constant_tensormap to ptr
  %cast2 = addrspacecast ptr %cast1 to ptr addrspace(4)
  %cast3 = addrspacecast ptr addrspace(4) %cast2 to ptr
  call void @llvm.nvvm.prefetch.tensormap.p0(ptr %cast3)
  ret void
}

declare void @llvm.nvvm.prefetch.tensormap.p0(ptr)
declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4))
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101))
43 changes: 43 additions & 0 deletions llvm/test/CodeGen/NVPTX/prefetch.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,10 @@ declare void @llvm.nvvm.prefetch.local.L2(ptr addrspace(5) %local_ptr)
declare void @llvm.nvvm.prefetch.L1(ptr %ptr)
declare void @llvm.nvvm.prefetch.L2(ptr %ptr)

declare void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
declare void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
declare void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)

declare void @llvm.nvvm.prefetch.global.L2.evict.normal(ptr addrspace(1) %global_ptr)
declare void @llvm.nvvm.prefetch.global.L2.evict.last(ptr addrspace(1) %global_ptr)

Expand Down Expand Up @@ -78,4 +82,43 @@ define void @prefetchu_l1(ptr %ptr) {
; CHECK-PTX64-NEXT: ret;
tail call void @llvm.nvvm.prefetchu.L1(ptr %ptr)
ret void
}

; Generic-pointer (.p0) variant: lowers to the unqualified
; "prefetch.tensormap" PTX instruction.
define void @prefetch_tensormap(ptr %ptr) {
; CHECK-PTX64-LABEL: prefetch_tensormap(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_tensormap_param_0];
; CHECK-PTX64-NEXT:    prefetch.tensormap [%rd1];
; CHECK-PTX64-NEXT:    ret;
  tail call void @llvm.nvvm.prefetch.tensormap.p0(ptr %ptr)
  ret void
}

; Const-pointer (.p4) variant: lowers to the address-space-qualified
; "prefetch.const.tensormap" PTX instruction.
define void @prefetch_const_tensormap(ptr addrspace(4) %const_ptr) {
; CHECK-PTX64-LABEL: prefetch_const_tensormap(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_const_tensormap_param_0];
; CHECK-PTX64-NEXT:    prefetch.const.tensormap [%rd1];
; CHECK-PTX64-NEXT:    ret;
  tail call void @llvm.nvvm.prefetch.tensormap.p4(ptr addrspace(4) %const_ptr)
  ret void
}

; Param-pointer (.p101) variant: lowers to the address-space-qualified
; "prefetch.param.tensormap" PTX instruction.
define void @prefetch_param_tensormap(ptr addrspace(101) %param_ptr) {
; CHECK-PTX64-LABEL: prefetch_param_tensormap(
; CHECK-PTX64:       {
; CHECK-PTX64-NEXT:    .reg .b64 %rd<2>;
; CHECK-PTX64-EMPTY:
; CHECK-PTX64-NEXT:  // %bb.0:
; CHECK-PTX64-NEXT:    ld.param.b64 %rd1, [prefetch_param_tensormap_param_0];
; CHECK-PTX64-NEXT:    prefetch.param.tensormap [%rd1];
; CHECK-PTX64-NEXT:    ret;
  tail call void @llvm.nvvm.prefetch.tensormap.p101(ptr addrspace(101) %param_ptr)
  ret void
}