Make vector decomposer operate on 32-byte chunks in VC

vsemenov368 · pszymich · commit 34eb634364ba · 2024-10-01T10:46:04.000+02:00
Make the vector decomposer more efficient by splitting vectors on fixed 32-byte chunks instead of subtarget GRF-sized units. (cherry picked from commit 354298c)
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXPostLegalization.cpp
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2022 Intel Corporation
+Copyright (C) 2017-2024 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -100,7 +100,7 @@ bool GenXPostLegalization::runOnFunction(Function &F)
             .getTM<GenXTargetMachine>()
             .getGenXSubtarget();
 
-  VectorDecomposer VD(ST);
+  VectorDecomposer VD;
 
   bool Modified = false;
   Modified |= vc::breakConstantExprs(&F, vc::LegalizationStage::Legalized);
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.cpp
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2022 Intel Corporation
+Copyright (C) 2017-2024 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -167,15 +167,15 @@ bool VectorDecomposer::determineDecomposition(Instruction *Inst) {
   NotDecomposingReportInst = Inst;
   Web.clear();
   Decomposition.clear();
-  unsigned GRFWidth = genx::ByteBits * GRFByteSize;
-  unsigned NumGrfs =
-      alignTo(DL->getTypeSizeInBits(Inst->getType()), GRFWidth) / GRFWidth;
-  if (NumGrfs == 1)
-    return false; // Ignore single GRF vector.
+  unsigned ChunkWidth = genx::ByteBits * ChunkByteSize;
+  unsigned NumChunks =
+      alignTo(DL->getTypeSizeInBits(Inst->getType()), ChunkWidth) / ChunkWidth;
+  if (NumChunks == 1)
+    return false; // Ignore single chunk vector.
   LLVM_DEBUG(dbgs() << "VectorDecomposer::determineDecomposition(" << *Inst
-                    << ")\n");
+                    << " NumChunks: " << NumChunks << ")\n");
   NotDecomposing = false;
-  for (unsigned i = 0; i != NumGrfs; ++i)
+  for (unsigned i = 0; i != NumChunks; ++i)
     Decomposition.push_back(i);
   addToWeb(Inst);
   for (unsigned Idx = 0; Idx != Web.size(); ++Idx) {
@@ -263,7 +263,7 @@ bool VectorDecomposer::determineDecomposition(Instruction *Inst) {
   //
   // Change Decomposition[] so the indices used are contiguous, changing the
   // example above to { 0, 0, 1, 1, 2, 2, 2, 2, 3, 3 }, and create the Offsets[]
-  // array to translate a value from Decomposition[] into the GRF offset, so
+  // array to translate a value from Decomposition[] into the chunk offset, so
   // for this example { 0, 2, 4, 8 }.
   Offsets.clear();
   for (unsigned Last = UINT_MAX, i = 0, e = Decomposition.size(); i != e; ++i) {
@@ -349,15 +349,15 @@ void VectorDecomposer::adjustDecomposition(Instruction *Inst) {
     Last = (R.NumElements / R.Width - 1) * R.VStride;
   Last += (R.Width - 1) * R.Stride;
   Last = R.Offset + Last * R.ElementBytes;
-  // Compute the GRF number of the first and last byte of the region.
-  unsigned First = R.Offset / GRFByteSize;
-  Last /= GRFByteSize;
+  // Compute the chunk number of the first and last byte of the region.
+  unsigned First = R.Offset / ChunkByteSize;
+  Last /= ChunkByteSize;
   if ((First >= Decomposition.size()) || (Last >= Decomposition.size())) {
     setNotDecomposing(Inst, "out-of-bounds");
     return; // don't attempt to decompose out-of-bounds accesses
   }
   if (First != Last) {
-    // This region spans more than one GRF. Ensure they are all in the same
+    // This region spans more than one chunk. Ensure they are all in the same
     // decomposed vector.
     for (unsigned i = Last + 1;
          i != Decomposition.size() && Decomposition[i] == Decomposition[Last];
@@ -705,28 +705,28 @@ void VectorDecomposer::decomposeBitCast(Instruction *Inst,
  * VectorDecomposer::getPartIndex : get the part index for the region
  */
 unsigned VectorDecomposer::getPartIndex(vc::Region *R) {
-  return Decomposition[R->Offset / GRFByteSize];
+  return Decomposition[R->Offset / ChunkByteSize];
 }
 
 /***********************************************************************
  * VectorDecomposer::getPartOffset : get the byte offset of a part
  */
 unsigned VectorDecomposer::getPartOffset(unsigned PartIndex) {
-  // Offsets[] has the index in GRFs.
-  return Offsets[PartIndex] * GRFByteSize;
+  // Offsets[] stores part starts in chunk units; convert to bytes.
+  return Offsets[PartIndex] * ChunkByteSize;
 }
 
 /***********************************************************************
  * VectorDecomposer::getPartNumBytes : get the size of a part in bytes
  */
 unsigned VectorDecomposer::getPartNumBytes(Type *WholeTy, unsigned PartIndex) {
   if (PartIndex + 1 != Offsets.size()) {
-    // Not the last part. We can use the offset (in GRFs) difference.
-    return GRFByteSize * (Offsets[PartIndex + 1] - Offsets[PartIndex]);
+    // Not the last part: its size is the chunk-offset gap to the next part.
+    return ChunkByteSize * (Offsets[PartIndex + 1] - Offsets[PartIndex]);
   }
   // For the last part, we need to get the total size from WholeTy.
   return DL->getTypeSizeInBits(WholeTy) / genx::ByteBits -
-         GRFByteSize * Offsets[PartIndex];
+         ChunkByteSize * Offsets[PartIndex];
 }
 
 /***********************************************************************
diff --git a/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h b/IGC/VectorCompiler/lib/GenXCodeGen/GenXVectorDecomposer.h
@@ -1,6 +1,6 @@
 /*========================== begin_copyright_notice ============================
 
-Copyright (C) 2017-2022 Intel Corporation
+Copyright (C) 2017-2024 Intel Corporation
 
 SPDX-License-Identifier: MIT
 
@@ -68,10 +68,7 @@ class VectorDecomposer {
   using Value = llvm::Value;
   using VectorType = llvm::VectorType;
 
-  using GenXSubtarget = llvm::GenXSubtarget;
-
   const DataLayout *DL = nullptr;
-  const GenXSubtarget *ST;
 
   llvm::SmallVector<Instruction *, 16> StartWrRegions;
   std::set<Instruction *> Seen;
@@ -84,11 +81,9 @@ class VectorDecomposer {
   std::map<PHINode *, llvm::SmallVector<Value *, 8>> PhiParts;
   llvm::SmallVector<Instruction *, 8> NewInsts;
   unsigned DecomposedCount = 0;
-  const unsigned GRFByteSize;
+  static constexpr unsigned ChunkByteSize = 32;
 
 public:
-  explicit VectorDecomposer(const GenXSubtarget *ST)
-      : ST(ST), GRFByteSize(ST ? ST->getGRFByteSize() : llvm::genx::defaultGRFByteSize) {}
 
   // clear : clear anything stored
   void clear() {
diff --git a/IGC/VectorCompiler/test/PostLegalization/decomp.ll b/IGC/VectorCompiler/test/PostLegalization/decomp.ll
@@ -0,0 +1,69 @@
+;=========================== begin_copyright_notice ============================
+;
+; Copyright (C) 2024 Intel Corporation
+;
+; SPDX-License-Identifier: MIT
+;
+;============================ end_copyright_notice =============================
+
+; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPC -mtriple=spir64 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p3:32:32-p6:32:32-i64:64-n8:16:32:64"
+target triple = "genx64-unknown-unknown"
+
+declare <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double>, i32, i32, i32, i16, i32)
+declare void @llvm.vc.internal.lsc.store.ugm.v1i1.v2i8.i64.v2i64(<1 x i1>, i8, i8, i8, <2 x i8>, i64, i64, i16, i32, <2 x i64>)
+declare <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double>, <4 x double>, i32, i32, i32, i16, i32, i1)
+declare <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double>, i32, i32, i32, i16, i32)
+declare <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double>, <2 x double>, i32, i32, i32, i16, i32, i1)
+declare <4 x double> @llvm.genx.rdregionf.v4f64.v24f64.i16(<24 x double>, i32, i32, i32, i16, i32)
+declare <2 x double> @llvm.genx.rdregionf.v2f64.v24f64.i16(<24 x double>, i32, i32, i32, i16, i32)
+
+define void @foo(<64 x i64> %src, i64 %ptrtoint) {
+; CHECK: %[[BITCAST1:[^ ]+]] = bitcast <64 x i64> %src to <64 x double>
+; CHECK: %[[SPLIT1:[^ ]+]] = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 4, i32 4, i32 1, i16 128, i32 undef)
+; CHECK: %[[DECOMP1:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v4f64.i16.i1(<8 x double> undef, <4 x double> %[[SPLIT1]], i32 0, i32 4, i32 1, i16 16, i32 undef, i1 true)
+; CHECK: %[[SPLIT2:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 2, i32 2, i32 1, i16 160, i32 undef)
+; CHECK: %[[DECOMP2:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v2f64.i16.i1(<8 x double> %[[DECOMP1]], <2 x double> %[[SPLIT2]], i32 0, i32 2, i32 1, i16 48, i32 undef, i1 true)
+; CHECK: %[[SPLIT3:[^ ]+]] = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 4, i32 4, i32 1, i16 0, i32 undef)
+; CHECK: %[[FDIV1:[^ ]+]] = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %[[SPLIT3]]
+; CHECK: %[[SPLIT4:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %[[BITCAST1]], i32 2, i32 2, i32 1, i16 32, i32 undef)
+; CHECK: %[[FDIV2:[^ ]+]] = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %[[SPLIT4]]
+; CHECK: %[[DECOMP3:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v2f64.i16.i1(<8 x double> %[[DECOMP2]], <2 x double> %[[FDIV2]], i32 0, i32 2, i32 1, i16 0, i32 undef, i1 true)
+; CHECK: %[[SPLIT5:[^ ]+]] = call <4 x double> @llvm.genx.rdregionf.v4f64.v8f64.i16(<8 x double> %[[DECOMP3]], i32 0, i32 4, i32 1, i16 16, i32 undef)
+; CHECK: %[[SPLIT6:[^ ]+]] = fmul <4 x double> %[[FDIV1]], %[[SPLIT5]]
+; CHECK: %[[DECOMP4:[^ ]+]] = call <8 x double> @llvm.genx.wrregionf.v8f64.v4f64.i16.i1(<8 x double> %[[DECOMP3]], <4 x double> %[[SPLIT6]], i32 0, i32 4, i32 1, i16 16, i32 undef, i1 true)
+; CHECK: %[[SPLIT7:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v8f64.i16(<8 x double> %[[DECOMP4]], i32 0, i32 2, i32 1, i16 0, i32 undef)
+; CHECK: %[[SPLIT8:[^ ]+]] = call <2 x double> @llvm.genx.rdregionf.v2f64.v8f64.i16(<8 x double> %[[DECOMP4]], i32 0, i32 2, i32 1, i16 48, i32 undef)
+; CHECK: fmul <2 x double> %[[SPLIT7]], %[[SPLIT8]]
+
+  %.bitcast_before_collapse = bitcast <64 x i64> %src to <64 x double>
+  %.esimd3.split6 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 128, i32 undef)
+  %.esimd3.split6.join6 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> undef, <4 x double> %.esimd3.split6, i32 0, i32 4, i32 1, i16 48, i32 undef, i1 true)
+  %.esimd3.split10 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 160, i32 undef)
+  %.esimd3.split10.join10 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd3.split6.join6, <2 x double> %.esimd3.split10, i32 0, i32 2, i32 1, i16 80, i32 undef, i1 true)
+  %.esimd3.split12 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 256, i32 undef)
+  %.esimd3.split12.join12 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd3.split10.join10, <4 x double> %.esimd3.split12, i32 0, i32 4, i32 1, i16 96, i32 undef, i1 true)
+  %.esimd3.split16 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 288, i32 undef)
+  %.esimd3.split16.join16 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd3.split12.join12, <2 x double> %.esimd3.split16, i32 0, i32 2, i32 1, i16 128, i32 undef, i1 true)
+  %.esimd3.split18 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 384, i32 undef)
+  %.esimd3.split18.join18 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd3.split16.join16, <4 x double> %.esimd3.split18, i32 0, i32 4, i32 1, i16 144, i32 undef, i1 true)
+  %.esimd3.split22 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 416, i32 undef)
+  %.esimd3.split22.join22 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd3.split18.join18, <2 x double> %.esimd3.split22, i32 0, i32 2, i32 1, i16 176, i32 undef, i1 true)
+  %.esimd9.regioncollapsed.split0 = call <4 x double> @llvm.genx.rdregionf.v4f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 4, i32 4, i32 1, i16 0, i32 undef)
+  %.split0258 = fdiv <4 x double> <double 1.000000e+00, double 1.000000e+00, double 1.000000e+00, double 1.000000e+00>, %.esimd9.regioncollapsed.split0
+  %.esimd10.join0 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd3.split22.join22, <4 x double> %.split0258, i32 4, i32 4, i32 1, i16 0, i32 undef, i1 true)
+  %.esimd9.regioncollapsed.split4 = call <2 x double> @llvm.genx.rdregionf.v2f64.v64f64.i16(<64 x double> %.bitcast_before_collapse, i32 2, i32 2, i32 1, i16 32, i32 undef)
+  %.split4259 = fdiv <2 x double> <double 1.000000e+00, double 1.000000e+00>, %.esimd9.regioncollapsed.split4
+  %.esimd10.join4 = call <24 x double> @llvm.genx.wrregionf.v24f64.v2f64.i16.i1(<24 x double> %.esimd10.join0, <2 x double> %.split4259, i32 2, i32 2, i32 1, i16 32, i32 undef, i1 true)
+  %.esimd11.split0 = call <4 x double> @llvm.genx.rdregionf.v4f64.v24f64.i16(<24 x double> %.esimd10.join4, i32 4, i32 4, i32 1, i16 0, i32 undef)
+  %.esimd12.split0 = call <4 x double> @llvm.genx.rdregionf.v4f64.v24f64.i16(<24 x double> %.esimd10.join4, i32 4, i32 4, i32 1, i16 48, i32 undef)
+  %.split0256 = fmul <4 x double> %.esimd11.split0, %.esimd12.split0
+  %.esimd13.join0 = call <24 x double> @llvm.genx.wrregionf.v24f64.v4f64.i16.i1(<24 x double> %.esimd10.join4, <4 x double> %.split0256, i32 4, i32 4, i32 1, i16 48, i32 undef, i1 true)
+  %.esimd11.split4 = call <2 x double> @llvm.genx.rdregionf.v2f64.v24f64.i16(<24 x double> %.esimd13.join0, i32 2, i32 2, i32 1, i16 32, i32 undef)
+  %.esimd12.split4 = call <2 x double> @llvm.genx.rdregionf.v2f64.v24f64.i16(<24 x double> %.esimd13.join0, i32 2, i32 2, i32 1, i16 80, i32 undef)
+  %.split4257 = fmul <2 x double> %.esimd11.split4, %.esimd12.split4
+  %bitcast3 = bitcast <2 x double> %.split4257 to <2 x i64>
+  call void @llvm.vc.internal.lsc.store.ugm.v1i1.v2i8.i64.v2i64(<1 x i1> <i1 true>, i8 3, i8 4, i8 4, <2 x i8> zeroinitializer, i64 0, i64 %ptrtoint, i16 1, i32 0, <2 x i64> %bitcast3)
+  ret void
+}
diff --git a/IGC/VectorCompiler/test/PostLegalization/wrregion.ll b/IGC/VectorCompiler/test/PostLegalization/wrregion.ll
@@ -6,17 +6,14 @@
 ;
 ;============================ end_copyright_notice =============================
 
-; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPC -mtriple=spir64 -S < %s | FileCheck %s --check-prefix=XeHPC
-; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPG -mtriple=spir64 -S < %s | FileCheck %s --check-prefix=XeHPG
+; RUN: %opt %use_old_pass_manager% -GenXPostLegalization -march=genx64 -mcpu=XeHPC -mtriple=spir64 -S < %s | FileCheck %s
 
 declare <48 x i32> @llvm.genx.wrregioni.v48i32.v32i32.i16.i1(<48 x i32>, <32 x i32>, i32, i32, i32, i16, i32, i1)
 declare <32 x i16> @llvm.genx.rdregioni.v32i16.v96i16.i16(<96 x i16>, i32, i32, i32, i16, i32)
 
-; XeHPC-LABEL: test1
-; XeHPG-LABEL: test1
+; CHECK-LABEL: test1
 define <32 x i16> @test1(<32 x i32> %arg) {
-; XeHPC-NEXT: call <48 x i32> @llvm.genx.wrregioni.v48i32.v32i32.i16.i1(<48 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 64, i32 undef, i1 true)
-; XeHPG-NEXT: call <40 x i32> @llvm.genx.wrregioni.v40i32.v32i32.i16.i1(<40 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 32, i32 undef, i1 true)
+; CHECK-NEXT: call <40 x i32> @llvm.genx.wrregioni.v40i32.v32i32.i16.i1(<40 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 32, i32 undef, i1 true)
   %1 = call <48 x i32> @llvm.genx.wrregioni.v48i32.v32i32.i16.i1(<48 x i32> zeroinitializer, <32 x i32> %arg, i32 0, i32 32, i32 1, i16 64, i32 undef, i1 true)
   %cast = bitcast <48 x i32> %1 to <192 x i8>
   %postcast = bitcast <192 x i8> %cast to <96 x i16>
@@ -32,26 +29,16 @@ declare <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x flo
 declare <256 x float> @llvm.genx.wrregionf.v256f32.v128f32.i16.i1(<256 x float>, <128 x float>, i32, i32, i32, i16, i32, i1)
 declare <64 x i64> @llvm.genx.rdregioni.v64i64.v128i64.i16(<128 x i64>, i32, i32, i32, i16, i32)
 
-; XeHPC-LABEL: test2
-; XeHPG-LABEL: test2
+; CHECK-LABEL: test2
 define <64 x i64> @test2(<256 x i32> %src1, <128 x i32> %src2) {
-  ; XeHPC-NEXT: %[[DPAS1_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
-  ; XeHPC-NEXT: %[[DPAS1_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)
-  ; XeHPC-NEXT: %[[DPAS1_D:[^ ]+]] = call <128 x float> @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(<128 x i32> %[[DPAS1_1]], <64 x i32> %[[DPAS1_2]], i32 134744329)
-  ; XeHPC-NEXT: %[[DPAS2_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 512, i32 undef)
-  ; XeHPC-NEXT: %[[DPAS2_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 256, i32 undef)
-  ; XeHPC-NEXT: tail call <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x float> zeroinitializer, <128 x i32> %[[DPAS2_1]], <64 x i32> %[[DPAS2_2]], i32 9, i32 9, i32 8, i32 8, i32 0, i32 0)
-  ; XeHPC-NEXT: %[[RET:[^ ]+]] = bitcast <128 x float> %[[DPAS1_D]] to <64 x i64>
-  ; XeHPC-NEXT: ret <64 x i64> %[[RET]]
-
-  ; XeHPG-NEXT: %[[DPAS1_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
-  ; XeHPG-NEXT: %[[DPAS1_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)
-  ; XeHPG-NEXT: %[[DPAS1_D:[^ ]+]] = call <128 x float> @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(<128 x i32> %[[DPAS1_1]], <64 x i32> %[[DPAS1_2]], i32 134744329)
-  ; XeHPG-NEXT: %[[DPAS2_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 512, i32 undef)
-  ; XeHPG-NEXT: %[[DPAS2_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 256, i32 undef)
-  ; XeHPG-NEXT: tail call <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x float> zeroinitializer, <128 x i32> %[[DPAS2_1]], <64 x i32> %[[DPAS2_2]], i32 9, i32 9, i32 8, i32 8, i32 0, i32 0)
-  ; XeHPG-NEXT: %[[RET:[^ ]+]] = bitcast <128 x float> %[[DPAS1_D]] to <64 x i64>
-  ; XeHPG-NEXT: ret <64 x i64> %[[RET]]
+  ; CHECK-NEXT: %[[DPAS1_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
+  ; CHECK-NEXT: %[[DPAS1_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)
+  ; CHECK-NEXT: %[[DPAS1_D:[^ ]+]] = call <128 x float> @llvm.genx.dpas.nosrc0.v128f32.v128i32.v64i32(<128 x i32> %[[DPAS1_1]], <64 x i32> %[[DPAS1_2]], i32 134744329)
+  ; CHECK-NEXT: %[[DPAS2_1:[^ ]+]] = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 512, i32 undef)
+  ; CHECK-NEXT: %[[DPAS2_2:[^ ]+]] = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 256, i32 undef)
+  ; CHECK-NEXT: tail call <128 x float> @llvm.genx.dpas2.v128f32.v128f32.v128i32.v64i32(<128 x float> zeroinitializer, <128 x i32> %[[DPAS2_1]], <64 x i32> %[[DPAS2_2]], i32 9, i32 9, i32 8, i32 8, i32 0, i32 0)
+  ; CHECK-NEXT: %[[RET:[^ ]+]] = bitcast <128 x float> %[[DPAS1_D]] to <64 x i64>
+  ; CHECK-NEXT: ret <64 x i64> %[[RET]]
 
   %1 = tail call <128 x i32> @llvm.genx.rdregioni.v128i32.v256i32.i16(<256 x i32> %src1, i32 0, i32 128, i32 1, i16 0, i32 undef)
   %2 = tail call <64 x i32> @llvm.genx.rdregioni.v64i32.v128i32.i16(<128 x i32> %src2, i32 0, i32 64, i32 1, i16 0, i32 undef)