Skip to content

Commit 4f70917

Browse files
authored
[NVPTX] Add intrinsics for the bmsk instruction (#139299)
1 parent 9ca4664 commit 4f70917

File tree

4 files changed

+125
-0
lines changed

4 files changed

+125
-0
lines changed

llvm/docs/NVPTXUsage.rst

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -598,6 +598,32 @@ operand %b clamped to the range [0, 32]. The N lowest bits are then
598598
zero-extended in the case of the '``zext``' variants, or sign-extended in the case of
599599
the '``sext``' variants. If N is 0, the result is 0.
600600

601+
'``llvm.nvvm.bmsk.{wrap,clamp}``' Intrinsic
602+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
603+
604+
Syntax:
605+
"""""""
606+
607+
.. code-block:: llvm
608+
609+
declare i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
610+
declare i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
611+
612+
Overview:
613+
"""""""""
614+
615+
The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics creates a bit mask
616+
given a starting bit position and a bit width.
617+
618+
Semantics:
619+
""""""""""
620+
621+
The '``llvm.nvvm.bmsk.{wrap,clamp}``' family of intrinsics returns a value with
622+
all bits set to 0 except for %b bits starting at bit position %a. For the
623+
'``wrap``' variants, the values of %a and %b modulo 32 are used. For the
624+
'``clamp``' variants, the values of %a and %b are clamped to the range [0, 32],
625+
which in practice is equivalent to using them as is.
626+
601627
TMA family of Intrinsics
602628
------------------------
603629

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1367,6 +1367,16 @@ let TargetPrefix = "nvvm" in {
13671367
[llvm_i32_ty, llvm_i32_ty],
13681368
[IntrNoMem, IntrSpeculatable]>;
13691369

1370+
1371+
//
1372+
// BMSK - bit mask
1373+
//
1374+
// Declares one intrinsic per mode: int_nvvm_bmsk_wrap and int_nvvm_bmsk_clamp.
// Each takes two i32 operands (per the usage docs: the starting bit position,
// then the bit width) and returns the resulting i32 mask. Both are marked
// IntrNoMem and IntrSpeculatable.
foreach mode = ["wrap", "clamp"] in
1375+
def int_nvvm_bmsk_ # mode :
1376+
DefaultAttrsIntrinsic<[llvm_i32_ty],
1377+
[llvm_i32_ty, llvm_i32_ty],
1378+
[IntrNoMem, IntrSpeculatable]>;
1379+
13701380
//
13711381
// Convert
13721382
//

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1693,6 +1693,18 @@ foreach sign = ["s", "u"] in {
16931693
}
16941694
}
16951695

1696+
//
1697+
// BMSK
1698+
//
1699+
1700+
// Instantiates the selection patterns for both bmsk modes (BMSK_wrap and
// BMSK_clamp), each emitting the PTX instruction "bmsk.<mode>.b32".
foreach mode = ["wrap", "clamp"] in {
1701+
// Look up the matching intrinsic record (int_nvvm_bmsk_<mode>) by name.
defvar intrin = !cast<Intrinsic>("int_nvvm_bmsk_" # mode);
1702+
defm BMSK_ # mode
1703+
: I3Inst<"bmsk." # mode # ".b32",
1704+
// commutative = false: the two operands are not interchangeable (start
// position vs. width).
// NOTE(review): I3Inst is defined outside this hunk; presumably it expands to
// the register/immediate operand variants exercised by the test -- confirm.
intrin, I32RT, commutative = false,
1705+
// Gated on the hasSM<70> and hasPTX<76> predicates; the accompanying test runs
// with -mcpu=sm_70 -mattr=+ptx76.
requires = [hasSM<70>, hasPTX<76>]>;
1706+
}
1707+
16961708
//
16971709
// Convert
16981710
//

llvm/test/CodeGen/NVPTX/bmsk.ll

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -o - < %s -mcpu=sm_70 -mattr=+ptx76 | FileCheck %s
3+
; RUN: %if ptxas %{ llc < %s -mcpu=sm_70 -mattr=+ptx76 | %ptxas-verify -arch=sm_70 %}
4+
5+
target triple = "nvptx64-unknown-cuda"
6+
7+
; Both operands come from function parameters: expects a single bmsk.wrap.b32
; with two register source operands.
define i32 @bmsk_wrap(i32 %a, i32 %b) {
8+
; CHECK-LABEL: bmsk_wrap(
9+
; CHECK: {
10+
; CHECK-NEXT: .reg .b32 %r<4>;
11+
; CHECK-EMPTY:
12+
; CHECK-NEXT: // %bb.0:
13+
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_param_0];
14+
; CHECK-NEXT: ld.param.u32 %r2, [bmsk_wrap_param_1];
15+
; CHECK-NEXT: bmsk.wrap.b32 %r3, %r1, %r2;
16+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
17+
; CHECK-NEXT: ret;
18+
%c = call i32 @llvm.nvvm.bmsk.wrap(i32 %a, i32 %b)
19+
ret i32 %c
20+
}
21+
22+
; Both operands come from function parameters: expects a single bmsk.clamp.b32
; with two register source operands.
define i32 @bmsk_clamp(i32 %a, i32 %b) {
23+
; CHECK-LABEL: bmsk_clamp(
24+
; CHECK: {
25+
; CHECK-NEXT: .reg .b32 %r<4>;
26+
; CHECK-EMPTY:
27+
; CHECK-NEXT: // %bb.0:
28+
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_param_0];
29+
; CHECK-NEXT: ld.param.u32 %r2, [bmsk_clamp_param_1];
30+
; CHECK-NEXT: bmsk.clamp.b32 %r3, %r1, %r2;
31+
; CHECK-NEXT: st.param.b32 [func_retval0], %r3;
32+
; CHECK-NEXT: ret;
33+
%c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 %b)
34+
ret i32 %c
35+
}
36+
37+
; Both operands are constants (5 and 6). The expected PTX moves the first
; constant into a register with mov.b32 and passes the second as an immediate.
define i32 @bmsk_wrap_ii() {
38+
; CHECK-LABEL: bmsk_wrap_ii(
39+
; CHECK: {
40+
; CHECK-NEXT: .reg .b32 %r<3>;
41+
; CHECK-EMPTY:
42+
; CHECK-NEXT: // %bb.0:
43+
; CHECK-NEXT: mov.b32 %r1, 5;
44+
; CHECK-NEXT: bmsk.wrap.b32 %r2, %r1, 6;
45+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
46+
; CHECK-NEXT: ret;
47+
%c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 6)
48+
ret i32 %c
49+
}
50+
51+
; First operand is a parameter, second is the constant 7: expects
; bmsk.clamp.b32 with a register followed by an immediate.
; NOTE(review): the "_ir" suffix reads as immediate-then-register, but the
; operand order tested here is register-then-immediate; the suffixes of this
; test and bmsk_wrap_ri look swapped.
define i32 @bmsk_clamp_ir(i32 %a) {
52+
; CHECK-LABEL: bmsk_clamp_ir(
53+
; CHECK: {
54+
; CHECK-NEXT: .reg .b32 %r<3>;
55+
; CHECK-EMPTY:
56+
; CHECK-NEXT: // %bb.0:
57+
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_clamp_ir_param_0];
58+
; CHECK-NEXT: bmsk.clamp.b32 %r2, %r1, 7;
59+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
60+
; CHECK-NEXT: ret;
61+
%c = call i32 @llvm.nvvm.bmsk.clamp(i32 %a, i32 7)
62+
ret i32 %c
63+
}
64+
65+
; First operand is the constant 5, second is a parameter: expects
; bmsk.wrap.b32 with an immediate followed by a register.
; NOTE(review): the "_ri" suffix reads as register-then-immediate, but the
; operand order tested here is immediate-then-register; the suffixes of this
; test and bmsk_clamp_ir look swapped.
define i32 @bmsk_wrap_ri(i32 %a) {
66+
; CHECK-LABEL: bmsk_wrap_ri(
67+
; CHECK: {
68+
; CHECK-NEXT: .reg .b32 %r<3>;
69+
; CHECK-EMPTY:
70+
; CHECK-NEXT: // %bb.0:
71+
; CHECK-NEXT: ld.param.u32 %r1, [bmsk_wrap_ri_param_0];
72+
; CHECK-NEXT: bmsk.wrap.b32 %r2, 5, %r1;
73+
; CHECK-NEXT: st.param.b32 [func_retval0], %r2;
74+
; CHECK-NEXT: ret;
75+
%c = call i32 @llvm.nvvm.bmsk.wrap(i32 5, i32 %a)
76+
ret i32 %c
77+
}

0 commit comments

Comments
 (0)