Skip to content

Commit 69abf4b

Browse files
fda0 authored and pszymich committed
Init address register to avoid unaligned cross grf access
Zero-initialize the address register before each use to avoid unaligned accesses that cross a GRF boundary. Some kernels use a mix of indirect accesses with 16-bit, 32-bit and 64-bit alignments. This can create a situation where inactive channels of the address register that should be aligned to 32 bits contain data aligned to 16 bits that points to the last element of a GRF register. The resulting access crosses the GRF boundary, which causes a GPU hang. (cherry picked from commit ddab4d1)
1 parent 34eb634 commit 69abf4b

File tree

4 files changed

+26
-10
lines changed

4 files changed

+26
-10
lines changed

IGC/AdaptorOCL/UnifyIROCL.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -211,6 +211,7 @@ static void CommonOCLBasedPasses(OpenCLProgramContext* pContext)
211211

212212
unify_opt_PreProcess(pContext);
213213
pContext->m_checkFastFlagPerInstructionInCustomUnsafeOptPass = true;
214+
pContext->m_mayHaveUnalignedAddressRegister = true;
214215

215216
DumpLLVMIR(pContext, "beforeUnification");
216217

IGC/Compiler/CISACodeGen/CISABuilder.cpp

Lines changed: 17 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2775,20 +2775,27 @@ namespace IGC
27752775
void CEncoder::AddrAdd(CVariable* dst, CVariable* src0, CVariable* src1)
27762776
{
27772777
// On ICL+ platforms address register must be initialized if it is used
2778-
// in VxH indirect addressing to avoid out-of-bounds access on inactive
2779-
// lanes. VISA initializes address register at the beginning of the
2780-
// shader which is sufficient for shaders that use address register only
2781-
// for indirect addressing but is not sufficient if shader also uses
2782-
// address register in send descriptors. The latter case is handled by
2783-
// the initialization below.
2784-
// see VISA Optimizer::resetA0()
2778+
// in VxH indirect addressing to avoid out-of-bounds on inactive lanes.
2779+
// VISA initializes address register [see VISA Optimizer::resetA0()]
2780+
// at the beginning of the shader which is sufficient for some shaders.
2781+
// It is insufficient if the shader uses:
2782+
// 1. address register in send descriptors (may cause out of bounds access).
2783+
// 2. indirect addressing on types with different alignment (may cause cross grf boundary access).
2784+
// To cover these cases we introduce zero initialization of address register below.
2785+
27852786
const bool mayUseA0InSendDesc =
27862787
m_program->GetContext()->m_instrTypes.mayHaveIndirectResources;
2787-
const bool needsA0Reset =
2788+
const bool mayHaveUnalignedA0 =
2789+
m_program->GetContext()->m_mayHaveUnalignedAddressRegister;
2790+
2791+
const bool softwareNeedsA0Reset =
2792+
mayUseA0InSendDesc || mayHaveUnalignedA0;
2793+
const bool platformNeedsA0Reset =
27882794
m_program->m_Platform->NeedResetA0forVxHA0();
2795+
const bool initializeA0 = (softwareNeedsA0Reset && platformNeedsA0Reset) ||
2796+
IGC_IS_FLAG_ENABLED(InitializeAddressRegistersBeforeUse);
27892797

2790-
if (((mayUseA0InSendDesc && needsA0Reset) ||
2791-
IGC_IS_FLAG_ENABLED(InitializeAddressRegistersBeforeUse)) &&
2798+
if (initializeA0 &&
27922799
!dst->IsUniform() &&
27932800
!m_encoderState.m_noMask)
27942801
{

IGC/Compiler/CodeGenPublic.h

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1020,6 +1020,10 @@ namespace IGC
10201020
// Ignore per module fast math flag and use only per instruction fast math flags
10211021
// Add few changes to CustomUnsafeOptPass related to fast flag propagation
10221022
bool m_checkFastFlagPerInstructionInCustomUnsafeOptPass = false;
1023+
// Specifies if this compilation uses indirect addressing with
1024+
// differently aligned types. This can result in cross grf boundary
1025+
// access in inactive channels of address register.
1026+
bool m_mayHaveUnalignedAddressRegister = false;
10231027
// Map to store global offsets in original global buffer
10241028
std::map<std::string, uint64_t> inlineProgramScopeGlobalOffsets;
10251029
std::vector<std::string> entry_names;

IGC/ocloc_tests/Builtins/intel_sub_group_shuffle.cl

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -47,8 +47,10 @@ kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_simd32(
4747
uint which_sub_group_local_id = ids[gid];
4848

4949
// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
50+
// CHECK-NEXT: mov (M1_NM, 16) A0(0)<1> 0x0:uw
5051
// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
5152
// CHECK-NEXT: mov (M1, 16) simdShuffle(0,0)<1> r[A0(0),0]<1,0>:d
53+
// CHECK-NEXT: mov (M5_NM, 16) A0(0)<1> 0x0:uw
5254
// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
5355
// CHECK-NEXT: mov (M5, 16) simdShuffle(1,0)<1> r[A0(0),0]<1,0>:d
5456

@@ -66,8 +68,10 @@ kernel void test_intel_sub_group_shuffle_non_uniform_non_immediate_index_src_the
6668
for (uint i = 0; i < num_iterations; ++i)
6769
{
6870
// CHECK: shl (M1, 32) ShuffleTmp(0,0)<1> {{V[0-9]+}}(0,0)<16;8,2> 0x2:uw
71+
// CHECK-NEXT: mov (M1_NM, 16) A0(0)<1> 0x0:uw
6972
// CHECK-NEXT: addr_add (M1, 16) A0(0)<1> &[[X:V[0-9]+]] ShuffleTmp(0,0)<1;1,0>
7073
// CHECK-NEXT: mov (M1, 16) first16LanesResult(0,0)<1> r[A0(0),0]<1,0>:d
74+
// CHECK-NEXT: mov (M5_NM, 16) A0(0)<1> 0x0:uw
7175
// CHECK-NEXT: addr_add (M5, 16) A0(0)<1> &[[X]] ShuffleTmp(0,16)<1;1,0>
7276
// CHECK-NEXT: mov (M5, 16) [[X]](1,0)<1> r[A0(0),0]<1,0>:d
7377
// CHECK-NEXT: mov (M1, 16) [[X]](0,0)<1> first16LanesResult(0,0)<1;1,0>

0 commit comments

Comments
 (0)