Skip to content

Commit 34eb634

Browse files
vsemenov368 authored and pszymich committed
Make vector decomposer operate on 32-byte chunks in VC
Making vector decomposer more efficient. (cherry picked from commit 354298c)
1 parent 710bc6a commit 34eb634

File tree

5 files changed

+104
-53
lines changed

5 files changed

+104
-53
lines changed

IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/*========================== begin_copyright_notice ============================
22
3-
Copyright (C) 2017-2022 Intel Corporation
3+
Copyright (C) 2017-2024 Intel Corporation
44
55
SPDX-License-Identifier: MIT
66
@@ -100,7 +100,7 @@ bool GenXPostLegalization::runOnFunction(Function &F)
100100
.getTM<GenXTargetMachine>()
101101
.getGenXSubtarget();
102102

103-
VectorDecomposer VD(ST);
103+
VectorDecomposer VD;
104104

105105
bool Modified = false;
106106
Modified |= vc::breakConstantExprs(&F, vc::LegalizationStage::Legalized);

IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/*========================== begin_copyright_notice ============================
22
3-
Copyright (C) 2017-2022 Intel Corporation
3+
Copyright (C) 2017-2024 Intel Corporation
44
55
SPDX-License-Identifier: MIT
66
@@ -167,15 +167,15 @@ bool VectorDecomposer::determineDecomposition(Instruction *Inst) {
167167
NotDecomposingReportInst = Inst;
168168
Web.clear();
169169
Decomposition.clear();
170-
unsigned GRFWidth = genx::ByteBits * GRFByteSize;
171-
unsigned NumGrfs =
172-
alignTo(DL->getTypeSizeInBits(Inst->getType()), GRFWidth) / GRFWidth;
173-
if (NumGrfs == 1)
174-
return false; // Ignore single GRF vector.
170+
unsigned ChunkWidth = genx::ByteBits * ChunkByteSize;
171+
unsigned NumChunks =
172+
alignTo(DL->getTypeSizeInBits(Inst->getType()), ChunkWidth) / ChunkWidth;
173+
if (NumChunks == 1)
174+
return false; // Ignore single chunk vector.
175175
LLVM_DEBUG(dbgs() << "VectorDecomposer::determineDecomposition(" << *Inst
176-
<< ")\n");
176+
<< " NumChunks: " << NumChunks << ")\n");
177177
NotDecomposing = false;
178-
for (unsigned i = 0; i != NumGrfs; ++i)
178+
for (unsigned i = 0; i != NumChunks; ++i)
179179
Decomposition.push_back(i);
180180
addToWeb(Inst);
181181
for (unsigned Idx = 0; Idx != Web.size(); ++Idx) {
@@ -263,7 +263,7 @@ bool VectorDecomposer::determineDecomposition(Instruction *Inst) {
263263
//
264264
// Change Decomposition[] so the indices used are contiguous, changing the
265265
// example above to { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3 }, and create the Offsets[]
266-
// array to translate a value from Decomposition[] into the GRF offset, so
266+
// array to translate a value from Decomposition[] into the chunk offset, so
267267
// for this example { 0, 2, 4, 8 }.
268268
Offsets.clear();
269269
for (unsigned Last = UINT_MAX, i = 0, e = Decomposition.size(); i != e; ++i) {
@@ -349,15 +349,15 @@ void VectorDecomposer::adjustDecomposition(Instruction *Inst) {
349349
Last = (R.NumElements / R.Width - 1) * R.VStride;
350350
Last += (R.Width - 1) * R.Stride;
351351
Last = R.Offset + Last * R.ElementBytes;
352-
// Compute the GRF number of the first and last byte of the region.
353-
unsigned First = R.Offset / GRFByteSize;
354-
Last /= GRFByteSize;
352+
// Compute the chunk number of the first and last byte of the region.
353+
unsigned First = R.Offset / ChunkByteSize;
354+
Last /= ChunkByteSize;
355355
if ((First >= Decomposition.size()) || (Last >= Decomposition.size())) {
356356
setNotDecomposing(Inst, "out-of-bounds");
357357
return; // don't attempt to decompose out-of-bounds accesses
358358
}
359359
if (First != Last) {
360-
// This region spans more than one GRF. Ensure they are all in the same
360+
// This region spans more than one chunk. Ensure they are all in the same
361361
// decomposed vector.
362362
for (unsigned i = Last + 1;
363363
i != Decomposition.size() && Decomposition[i] == Decomposition[Last];
@@ -705,28 +705,28 @@ void VectorDecomposer::decomposeBitCast(Instruction *Inst,
705705
* VectorDecomposer::getPartIndex : get the part index for the region
706706
*/
707707
unsigned VectorDecomposer::getPartIndex(vc::Region *R) {
708-
return Decomposition[R->Offset / GRFByteSize];
708+
return Decomposition[R->Offset / ChunkByteSize];
709709
}
710710

711711
/***********************************************************************
712712
* VectorDecomposer::getPartOffset : get the byte offset of a part
713713
*/
714714
unsigned VectorDecomposer::getPartOffset(unsigned PartIndex) {
715-
// Offsets[] has the index in GRFs.
716-
return Offsets[PartIndex] * GRFByteSize;
715+
// Offsets[] has the index in chunks.
716+
return Offsets[PartIndex] * ChunkByteSize;
717717
}
718718

719719
/***********************************************************************
720720
* VectorDecomposer::getPartNumBytes : get the size of a part in bytes
721721
*/
722722
unsigned VectorDecomposer::getPartNumBytes(Type *WholeTy, unsigned PartIndex) {
723723
if (PartIndex + 1 != Offsets.size()) {
724-
// Not the last part. We can use the offset (in GRFs) difference.
725-
return GRFByteSize * (Offsets[PartIndex + 1] - Offsets[PartIndex]);
724+
// Not the last part. We can use the offset (in chunks) difference.
725+
return ChunkByteSize * (Offsets[PartIndex + 1] - Offsets[PartIndex]);
726726
}
727727
// For the last part, we need to get the total size from WholeTy.
728728
return DL->getTypeSizeInBits(WholeTy) / genx::ByteBits -
729-
GRFByteSize * Offsets[PartIndex];
729+
ChunkByteSize * Offsets[PartIndex];
730730
}
731731

732732
/***********************************************************************

IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h

Lines changed: 2 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
/*========================== begin_copyright_notice ============================
22
3-
Copyright (C) 2017-2022 Intel Corporation
3+
Copyright (C) 2017-2024 Intel Corporation
44
55
SPDX-License-Identifier: MIT
66
@@ -68,10 +68,7 @@ class VectorDecomposer {
6868
using Value = llvm::Value;
6969
using VectorType = llvm::VectorType;
7070

71-
using GenXSubtarget = llvm::GenXSubtarget;
72-
7371
const DataLayout *DL = nullptr;
74-
const GenXSubtarget *ST;
7572

7673
llvm::SmallVector<Instruction *, 16> StartWrRegions;
7774
std::set<Instruction *> Seen;
@@ -84,11 +81,9 @@ class VectorDecomposer {
8481
std::map<PHINode *, llvm::SmallVector<Value *, 8>> PhiParts;
8582
llvm::SmallVector<Instruction *, 8> NewInsts;
8683
unsigned DecomposedCount = 0;
87-
const unsigned GRFByteSize;
84+
static constexpr unsigned ChunkByteSize = 32;
8885

8986
public:
90-
explicit VectorDecomposer(const GenXSubtarget *ST)
91-
: ST(ST), GRFByteSize(ST ? ST->getGRFByteSize() : llvm::genx::defaultGRFByteSize) {}
9287

9388
// clear : clear anything stored
9489
void clear() {
Lines changed: 69 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,69 @@
1+
;=========================== begin_copyright_notice ============================
2+
;
3+
; Copyright (C) 2024 Intel Corporation
4+
;
5+
; SPDX-License-Identifier: MIT
6+
;
7+
;============================ end_copyright_notice =============================
8+
9+
; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPC -mtriple=spir64 -S < %s | FileCheck %s
10+
11+
target datalayout = "e-p:64:64-p3:32:32-p6:32:32-i64:64-n8:16:32:64"
12+
target triple = "genx64-unknown-unknown"
13+
14+
declare <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double>, i32, i32, i32, i16, i32)
15+
declare void @llvm.vc.internal.lsc.store.ugm.v1i1.v2i8.i64.v2i64(<1 x i1>, i8, i8, i8, <2 x i8>, i64, i64, i16, i32, <2 x i64>)
16+
declare <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double>, <4 x double>, i32, i32, i32, i16, i32, i1)
17+
declare <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double>, i32, i32, i32, i16, i32)
18+
declare <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double>, <2 x double>, i32, i32, i32, i16, i32, i1)
19+
declare <4 x double> @llvm.genx.rdregionf.v4f64.v24f64.i16(<24 x double>, i32, i32, i32, i16, i32)
20+
declare <2 x double> @llvm.genx.rdregionf.v2f64.v24f64.i16(<24 x double>, i32, i32, i32, i16, i32)
21+
22+
define void @foo(<64 x i64> %src, i64 %ptrtoint) {
23+
; CHECK: %[[BITCAST1:[^ ]+]] = bitcast <64 x i64> %src to <64 x double>
24+
; CHECK: %[[SPLIT1:[^ ]+]] = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 4, i32 4, i32 1, i16 128, i32 undef)
25+
; CHECK: %[[DECOMP1:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v4f64.i16.i1(<8 x double> undef, <4 x double> %[[SPLIT1]], i32 0, i32 4, i32 1, i16 16, i32 undef, i1 true)
26+
; CHECK: %[[SPLIT2:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 2, i32 2, i32 1, i16 160, i32 undef)
27+
; CHECK: %[[DECOMP2:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v2f64.i16.i1(<8 x double> %[[DECOMP1]], <2 x double> %[[SPLIT2]], i32 0, i32 2, i32 1, i16 48, i32 undef, i1 true)
28+
; CHECK: %[[SPLIT3:[^ ]+]] = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 4, i32 4, i32 1, i16 0, i32 undef)
29+
; CHECK: %[[FDIV1:[^ ]+]] = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %[[SPLIT3]]
30+
; CHECK: %[[SPLIT4:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 2, i32 2, i32 1, i16 32, i32 undef)
31+
; CHECK: %[[FDIV2:[^ ]+]] = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[SPLIT4]]
32+
; CHECK: %[[DECOMP3:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v2f64.i16.i1(<8 x double> %[[DECOMP2]], <2 x double> %[[FDIV2]], i32 0, i32 2, i32 1, i16 0, i32 undef, i1 true)
33+
; CHECK: %[[SPLIT5:[^ ]+]] = call <4 x double> @llvm.genx.rdregionf.v4f64.v8f64.i16(<8 x double> %[[DECOMP3]], i32 0, i32 4, i32 1, i16 16, i32 undef)
34+
; CHECK: %[[SPLIT6:[^ ]+]] = fmul <4 x double> %[[FDIV1]], %[[SPLIT5]]
35+
; CHECK: %[[DECOMP4:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v4f64.i16.i1(<8 x double> %[[DECOMP3]], <4 x double> %[[SPLIT6]], i32 0, i32 4, i32 1, i16 16, i32 undef, i1 true)
36+
; CHECK: %[[SPLIT7:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v8f64.i16(<8 x double> %[[DECOMP4]], i32 0, i32 2, i32 1, i16 0, i32 undef)
37+
; CHECK: %[[SPLIT8:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v8f64.i16(<8 x double> %[[DECOMP4]], i32 0, i32 2, i32 1, i16 48, i32 undef)
38+
; CHECK: fmul <2 x double> %[[SPLIT7]], %[[SPLIT8]]
39+
40+
%.bitcast_before_collapse = bitcast <64 x i64> %src to <64 x double>
41+
%.esimd3.split6 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 128, i32 undef)
42+
%.esimd3.split6.join6 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> undef, <4 x double> %.esimd3.split6, i32 0, i32 4, i32 1, i16 48, i32 undef, i1 true)
43+
%.esimd3.split10 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 160, i32 undef)
44+
%.esimd3.split10.join10 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd3.split6.join6, <2 x double> %.esimd3.split10, i32 0, i32 2, i32 1, i16 80, i32 undef, i1 true)
45+
%.esimd3.split12 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 256, i32 undef)
46+
%.esimd3.split12.join12 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd3.split10.join10, <4 x double> %.esimd3.split12, i32 0, i32 4, i32 1, i16 96, i32 undef, i1 true)
47+
%.esimd3.split16 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 288, i32 undef)
48+
%.esimd3.split16.join16 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd3.split12.join12, <2 x double> %.esimd3.split16, i32 0, i32 2, i32 1, i16 128, i32 undef, i1 true)
49+
%.esimd3.split18 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 384, i32 undef)
50+
%.esimd3.split18.join18 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd3.split16.join16, <4 x double> %.esimd3.split18, i32 0, i32 4, i32 1, i16 144, i32 undef, i1 true)
51+
%.esimd3.split22 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 416, i32 undef)
52+
%.esimd3.split22.join22 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd3.split18.join18, <2 x double> %.esimd3.split22, i32 0, i32 2, i32 1, i16 176, i32 undef, i1 true)
53+
%.esimd9.regioncollapsed.split0 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 0, i32 undef)
54+
%.split0258 = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %.esimd9.regioncollapsed.split0
55+
%.esimd10.join0 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd3.split22.join22, <4 x double> %.split0258, i32 4, i32 4, i32 1, i16 0, i32 undef, i1 true)
56+
%.esimd9.regioncollapsed.split4 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 32, i32 undef)
57+
%.split4259 = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %.esimd9.regioncollapsed.split4
58+
%.esimd10.join4 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd10.join0, <2 x double> %.split4259, i32 2, i32 2, i32 1, i16 32, i32 undef, i1 true)
59+
%.esimd11.split0 = call <4 x double> @llvm.genx.rdregionf.v4f64.v24f64.i16(<24 x double> %.esimd10.join4, i32 4, i32 4, i32 1, i16 0, i32 undef)
60+
%.esimd12.split0 = call <4 x double> @llvm.genx.rdregionf.v4f64.v24f64.i16(<24 x double> %.esimd10.join4, i32 4, i32 4, i32 1, i16 48, i32 undef)
61+
%.split0256 = fmul <4 x double> %.esimd11.split0, %.esimd12.split0
62+
%.esimd13.join0 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd10.join4, <4 x double> %.split0256, i32 4, i32 4, i32 1, i16 48, i32 undef, i1 true)
63+
%.esimd11.split4 = call <2 x double> @llvm.genx.rdregionf.v2f64.v24f64.i16(<24 x double> %.esimd13.join0, i32 2, i32 2, i32 1, i16 32, i32 undef)
64+
%.esimd12.split4 = call <2 x double> @llvm.genx.rdregionf.v2f64.v24f64.i16(<24 x double> %.esimd13.join0, i32 2, i32 2, i32 1, i16 80, i32 undef)
65+
%.split4257 = fmul <2 x double> %.esimd11.split4, %.esimd12.split4
66+
%bitcast3 = bitcast <2 x double> %.split4257 to <2 x i64>
67+
call void @llvm.vc.internal.lsc.store.ugm.v1i1.v2i8.i64.v2i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 4, <2 x i8> zeroinitializer, i64 0, i64 %ptrtoint, i16 1, i32 0, <2 x i64> %bitcast3)
68+
ret void
69+
}

IGC/VectorCompiler/test/PostLegalization/wrregion.ll

Lines changed: 12 additions & 25 deletions
Original file line number | Diff line number | Diff line change
@@ -6,17 +6,14 @@
66
;
77
;============================ end_copyright_notice =============================
88

9-
; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPC -mtriple=spir64 -S < %s | FileCheck %s --check-prefix=XeHPC
10-
; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPG -mtriple=spir64 -S < %s | FileCheck %s --check-prefix=XeHPG
9+
; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPC -mtriple=spir64 -S < %s | FileCheck %s
1110

1211
declare <48 x i32> @llvm.genx.wrregioni.v48i32.v32i32.i16.i1(<48 x i32>, <32 x i32>, i32, i32, i32, i16, i32, i1)
1312
declare <32 x i16> @llvm.genx.rdregioni.v32i16.v96i16.i16(<96 x i16>, i32, i32, i32, i16, i32)
1413

15-
; XeHPC-LABEL: test1
16-
; XeHPG-LABEL: test1
14+
; CHECK-LABEL: test1
1715
define <32 x i16> @test1(<32 x i32> %arg) {
18-
; XeHPC-NEXT: call <48 x i32> @llvm.genx.wrregioni.v48i32.v32i32.i16.i1(<48 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 64, i32 undef, i1 true)
19-
; XeHPG-NEXT: call <40 x i32> @llvm.genx.wrregioni.v40i32.v32i32.i16.i1(<40 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 32, i32 undef, i1 true)
16+
; CHECK-NEXT: call <40 x i32> @llvm.genx.wrregioni.v40i32.v32i32.i16.i1(<40 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 32, i32 undef, i1 true)
2017
%1 = call <48 x i32> @llvm.genx.wrregioni.v48i32.v32i32.i16.i1(<48 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 64, i32 undef, i1 true)
2118
%cast = bitcast <48 x i32> %1 to <192 x i8>
2219
%postcast = bitcast <192 x i8> %cast to <96 x i16>
@@ -32,26 +29,16 @@ declare <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x flo
3229
declare <256 x float> @llvm.genx.wrregionf.v256f32.v128f32.i16.i1(<256 x float>, <128 x float>, i32, i32, i32, i16, i32, i1)
3330
declare <64 x i64> @llvm.genx.rdregioni.v64i64.v128i64.i16(<128 x i64>, i32, i32, i32, i16, i32)
3431

35-
; XeHPC-LABEL: test2
36-
; XeHPG-LABEL: test2
32+
; CHECK-LABEL: test2
3733
define <64 x i64> @test2(<256 x i32> %src1, <128 x i32> %src2) {
38-
; XeHPC-NEXT: %[[DPAS1_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
39-
; XeHPC-NEXT: %[[DPAS1_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)
40-
; XeHPC-NEXT: %[[DPAS1_D:[^ ]+]] = call <128 x float> @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(<128 x i32> %[[DPAS1_1]], <64 x i32> %[[DPAS1_2]], i32 134744329)
41-
; XeHPC-NEXT: %[[DPAS2_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 512, i32 undef)
42-
; XeHPC-NEXT: %[[DPAS2_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 256, i32 undef)
43-
; XeHPC-NEXT: tail call <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x float> zeroinitializer, <128 x i32> %[[DPAS2_1]], <64 x i32> %[[DPAS2_2]], i32 9, i32 9, i32 8, i32 8, i32 0, i32 0)
44-
; XeHPC-NEXT: %[[RET:[^ ]+]] = bitcast <128 x float> %[[DPAS1_D]] to <64 x i64>
45-
; XeHPC-NEXT: ret <64 x i64> %[[RET]]
46-
47-
; XeHPG-NEXT: %[[DPAS1_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
48-
; XeHPG-NEXT: %[[DPAS1_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)
49-
; XeHPG-NEXT: %[[DPAS1_D:[^ ]+]] = call <128 x float> @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(<128 x i32> %[[DPAS1_1]], <64 x i32> %[[DPAS1_2]], i32 134744329)
50-
; XeHPG-NEXT: %[[DPAS2_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 512, i32 undef)
51-
; XeHPG-NEXT: %[[DPAS2_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 256, i32 undef)
52-
; XeHPG-NEXT: tail call <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x float> zeroinitializer, <128 x i32> %[[DPAS2_1]], <64 x i32> %[[DPAS2_2]], i32 9, i32 9, i32 8, i32 8, i32 0, i32 0)
53-
; XeHPG-NEXT: %[[RET:[^ ]+]] = bitcast <128 x float> %[[DPAS1_D]] to <64 x i64>
54-
; XeHPG-NEXT: ret <64 x i64> %[[RET]]
34+
; CHECK-NEXT: %[[DPAS1_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
35+
; CHECK-NEXT: %[[DPAS1_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)
36+
; CHECK-NEXT: %[[DPAS1_D:[^ ]+]] = call <128 x float> @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(<128 x i32> %[[DPAS1_1]], <64 x i32> %[[DPAS1_2]], i32 134744329)
37+
; CHECK-NEXT: %[[DPAS2_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 512, i32 undef)
38+
; CHECK-NEXT: %[[DPAS2_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 256, i32 undef)
39+
; CHECK-NEXT: tail call <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x float> zeroinitializer, <128 x i32> %[[DPAS2_1]], <64 x i32> %[[DPAS2_2]], i32 9, i32 9, i32 8, i32 8, i32 0, i32 0)
40+
; CHECK-NEXT: %[[RET:[^ ]+]] = bitcast <128 x float> %[[DPAS1_D]] to <64 x i64>
41+
; CHECK-NEXT: ret <64 x i64> %[[RET]]
5542

5643
%1 = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
5744
%2 = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)

0 commit comments

Comments
 (0)