Skip to content

Commit b2124f4

Browse files
Add implicit scaling barrier implementation
Related-To: NEO-6262 Signed-off-by: Zbigniew Zdanowicz <[email protected]>
1 parent 9561018 commit b2124f4

File tree

6 files changed

+555
-4
lines changed

6 files changed

+555
-4
lines changed

opencl/test/unit_test/command_queue/walker_partition_tests_xehp_and_later_2.cpp

Lines changed: 297 additions & 0 deletions
Large diffs are not rendered by default.

shared/source/command_container/implicit_scaling.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,13 @@ struct ImplicitScalingDispatch {
5353

5454
static bool &getPipeControlStallRequired();
5555

56+
static size_t getBarrierSize(bool apiSelfCleanup);
57+
static void dispatchBarrierCommands(LinearStream &commandStream,
58+
const DeviceBitfield &devices,
59+
bool apiSelfCleanup,
60+
bool dcFlush,
61+
bool useSecondaryBatchBuffer);
62+
5663
private:
5764
static bool pipeControlStallRequired;
5865
};

shared/source/command_container/implicit_scaling_xehp_and_later.inl

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -96,10 +96,12 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
9696
staticPartitioning,
9797
useSecondaryBatchBuffer);
9898

99+
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
100+
void *commandBuffer = commandStream.getSpace(0u);
99101
if (staticPartitioning) {
100102
UNRECOVERABLE_IF(tileCount != partitionCount);
101-
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
102-
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
103+
WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
104+
cmdBufferGpuAddress,
103105
&walkerCmd,
104106
totalProgrammedSize,
105107
args);
@@ -112,8 +114,8 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
112114
args.partitionCount = partitionCount;
113115
}
114116

115-
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandStream.getSpace(0u),
116-
commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(),
117+
WalkerPartition::constructDynamicallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
118+
cmdBufferGpuAddress,
117119
&walkerCmd,
118120
totalProgrammedSize,
119121
args);
@@ -126,4 +128,38 @@ bool &ImplicitScalingDispatch<GfxFamily>::getPipeControlStallRequired() {
126128
return ImplicitScalingDispatch<GfxFamily>::pipeControlStallRequired;
127129
}
128130

131+
// Estimates the command-stream space needed for an implicit scaling barrier.
// apiSelfCleanup is copied into args.emitSelfCleanup; whether atomics are used for
// self-cleanup comes from ImplicitScalingHelper::isAtomicsUsedForSelfCleanup().
// Every other WalkerPartitionArgs field keeps its value-initialized default.
// Returns the uint64_t estimate from
// WalkerPartition::estimateBarrierSpaceRequiredInCommandBuffer converted to size_t.
template <typename GfxFamily>
132+
size_t ImplicitScalingDispatch<GfxFamily>::getBarrierSize(bool apiSelfCleanup) {
133+
WalkerPartition::WalkerPartitionArgs args = {};
134+
args.emitSelfCleanup = apiSelfCleanup;
135+
args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
136+
137+
return static_cast<size_t>(WalkerPartition::estimateBarrierSpaceRequiredInCommandBuffer<GfxFamily>(args));
138+
}
139+
140+
// Programs the implicit scaling barrier sequence into commandStream.
//   commandStream           - stream that receives the commands
//   devices                 - its count() is used as args.tileCount
//   apiSelfCleanup          - forwarded as args.emitSelfCleanup
//   dcFlush                 - forwarded as args.dcFlush
//   useSecondaryBatchBuffer - forwarded as args.secondaryBatchBuffer
template <typename GfxFamily>
141+
void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &commandStream,
142+
const DeviceBitfield &devices,
143+
bool apiSelfCleanup,
144+
bool dcFlush,
145+
bool useSecondaryBatchBuffer) {
146+
// Byte counter handed to constructBarrierCommandBuffer by reference; starts at zero.
uint32_t totalProgrammedSize = 0u;
147+
148+
WalkerPartition::WalkerPartitionArgs args = {};
149+
args.emitSelfCleanup = apiSelfCleanup;
150+
args.dcFlush = dcFlush;
151+
args.useAtomicsForSelfCleanup = ImplicitScalingHelper::isAtomicsUsedForSelfCleanup();
152+
args.tileCount = static_cast<uint32_t>(devices.count());
153+
args.secondaryBatchBuffer = useSecondaryBatchBuffer;
154+
155+
// GPU address of the current write position: allocation base plus bytes already used.
uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
156+
// Zero-byte request supplies the CPU pointer to write at.
// NOTE(review): presumably getSpace(0u) does not advance the stream - confirm against LinearStream.
void *commandBuffer = commandStream.getSpace(0u);
157+
158+
WalkerPartition::constructBarrierCommandBuffer<GfxFamily>(commandBuffer,
159+
cmdBufferGpuAddress,
160+
totalProgrammedSize,
161+
args);
162+
// Request exactly the byte count reported back by constructBarrierCommandBuffer;
// the returned pointer is not needed.
commandStream.getSpace(totalProgrammedSize);
163+
}
164+
129165
} // namespace NEO

shared/source/command_container/walker_partition_interface.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ struct WalkerPartitionArgs {
2626
bool initializeWparidRegister = false;
2727
bool emitPipeControlStall = false;
2828
bool preferredStaticPartitioning = false;
29+
bool dcFlush = false;
2930
};
3031

3132
constexpr uint32_t wparidCCSOffset = 0x221C;
@@ -54,4 +55,10 @@ struct StaticPartitioningControlSection {
5455
uint32_t finalSyncTileCounter = 0;
5556
};
5657
constexpr size_t staticPartitioningFieldsForCleanupCount = sizeof(StaticPartitioningControlSection) / sizeof(uint32_t) - 1;
58+
59+
// Pair of 32-bit counters, both defaulting to zero, embedded in the barrier command
// buffer. constructBarrierCommandBuffer passes the GPU address of crossTileSyncCount
// to the tile synchronization helper and the GPU address of finalSyncTileCount to
// the self-cleanup helpers.
struct BarrierControlSection {
60+
uint32_t crossTileSyncCount = 0u;
61+
uint32_t finalSyncTileCount = 0;
62+
};
63+
// Number of uint32_t-sized fields in BarrierControlSection minus one
// (evaluates to 1 for the two fields declared above).
constexpr size_t barrierControlSectionFieldsForCleanupCount = sizeof(BarrierControlSection) / sizeof(uint32_t) - 1;
5764
} // namespace WalkerPartition

shared/source/command_container/walker_partition_xehp_and_later.h

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -720,4 +720,64 @@ uint64_t estimateSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
720720
return size;
721721
}
722722

723+
// Computes the byte offset, counted from the start of the barrier command sequence,
// at which constructBarrierCommandBuffer places the BarrierControlSection.
// The offset is the sum of:
//   - the self-cleanup section size, only when args.emitSelfCleanup is set,
//   - one PIPE_CONTROL,
//   - the tile synchronization (with atomics) section,
//   - one BATCH_BUFFER_START.
template <typename GfxFamily>
724+
uint64_t computeBarrierControlSectionOffset(WalkerPartitionArgs &args) {
725+
uint64_t offset = 0u;
726+
if (args.emitSelfCleanup) {
727+
offset += computeSelfCleanupSectionSize<GfxFamily>(args.useAtomicsForSelfCleanup);
728+
}
729+
offset += (sizeof(PIPE_CONTROL<GfxFamily>) +
730+
computeTilesSynchronizationWithAtomicsSectionSize<GfxFamily>() +
731+
sizeof(BATCH_BUFFER_START<GfxFamily>));
732+
return offset;
733+
}
734+
735+
// Estimates the total byte size of the barrier command sequence:
// the control section offset, plus sizeof(BarrierControlSection), plus the
// self-cleanup end section when args.emitSelfCleanup is set.
template <typename GfxFamily>
736+
uint64_t estimateBarrierSpaceRequiredInCommandBuffer(WalkerPartitionArgs &args) {
737+
uint64_t size = computeBarrierControlSectionOffset<GfxFamily>(args) +
738+
sizeof(BarrierControlSection);
739+
if (args.emitSelfCleanup) {
740+
size += computeSelfCleanupEndSectionSize<GfxFamily>(barrierControlSectionFieldsForCleanupCount, args.useAtomicsForSelfCleanup);
741+
}
742+
return size;
743+
}
744+
745+
// Writes the barrier command sequence starting at cpuPointer.
//   cpuPointer             - CPU address where programming starts
//   gpuAddressOfAllocation - GPU address used as the base for every address computed here
//   totalBytesProgrammed   - in/out byte counter passed to each programming helper
//   args                   - emitSelfCleanup, useAtomicsForSelfCleanup, dcFlush, tileCount
//                            and secondaryBatchBuffer are read
// Order of programming: optional self-cleanup section, pipe control, tile
// synchronization with atomics, batch buffer start, BarrierControlSection,
// optional self-cleanup end section.
// NOTE(review): totalBytesProgrammed is not reset in this function, while the
// DEBUG_BREAK_IF checks compare it against offsets computed from zero - presumably
// callers must pass it in as 0 (dispatchBarrierCommands does); confirm.
template <typename GfxFamily>
746+
void constructBarrierCommandBuffer(void *cpuPointer,
747+
uint64_t gpuAddressOfAllocation,
748+
uint32_t &totalBytesProgrammed,
749+
WalkerPartitionArgs &args) {
750+
void *currentBatchBufferPointer = cpuPointer;
751+
const auto controlSectionOffset = computeBarrierControlSectionOffset<GfxFamily>(args);
752+
753+
// GPU address of the finalSyncTileCount field of the control section written further below.
const auto finalSyncTileCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, finalSyncTileCount);
754+
if (args.emitSelfCleanup) {
755+
programSelfCleanupSection<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, finalSyncTileCountField, args.useAtomicsForSelfCleanup);
756+
}
757+
758+
// args.dcFlush is forwarded to the pipe control programming helper.
programPipeControlCommand<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, args.dcFlush);
759+
760+
// GPU address of the crossTileSyncCount field, handed to the tile synchronization helper together with the tile count.
const auto crossTileSyncCountField = gpuAddressOfAllocation + controlSectionOffset + offsetof(BarrierControlSection, crossTileSyncCount);
761+
programTilesSynchronizationWithAtomics<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, crossTileSyncCountField, args.tileCount);
762+
763+
// The batch buffer start targets the address immediately after the control section.
// NOTE(review): presumably so execution skips over the counter data - confirm.
const auto afterControlSectionOffset = controlSectionOffset + sizeof(BarrierControlSection);
764+
programMiBatchBufferStart<GfxFamily>(currentBatchBufferPointer, totalBytesProgrammed, gpuAddressOfAllocation + afterControlSectionOffset, false, args.secondaryBatchBuffer);
765+
766+
// Bytes programmed so far must match the precomputed control section offset.
DEBUG_BREAK_IF(totalBytesProgrammed != controlSectionOffset);
767+
BarrierControlSection *controlSection = putCommand<BarrierControlSection>(currentBatchBufferPointer, totalBytesProgrammed);
768+
// Both counters start at zero.
controlSection->crossTileSyncCount = 0u;
769+
controlSection->finalSyncTileCount = 0u;
770+
DEBUG_BREAK_IF(totalBytesProgrammed != afterControlSectionOffset);
771+
772+
if (args.emitSelfCleanup) {
773+
// The end section receives the finalSyncTileCount address, the control section
// start address and the number of fields to clean up.
programSelfCleanupEndSection<GfxFamily>(currentBatchBufferPointer,
774+
totalBytesProgrammed,
775+
finalSyncTileCountField,
776+
gpuAddressOfAllocation + controlSectionOffset,
777+
barrierControlSectionFieldsForCleanupCount,
778+
args.tileCount,
779+
args.useAtomicsForSelfCleanup);
780+
}
781+
}
782+
723783
} // namespace WalkerPartition

shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp

Lines changed: 144 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -736,3 +736,147 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
736736
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
737737
EXPECT_EQ(3u, miSemaphoreList.size());
738738
}
739+
740+
// Barrier dispatched without self-cleanup: the size estimate and the bytes used in
// the stream must both equal one PIPE_CONTROL, one MI_ATOMIC + MI_SEMAPHORE_WAIT pair,
// one MI_BATCH_BUFFER_START and the BarrierControlSection.
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
741+
givenBarrierDispatchWhenApiNotRequiresSelfCleanupThenExpectMinimalCommandBuffer) {
742+
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
743+
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
744+
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
745+
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
746+
747+
size_t expectedSize = sizeof(PIPE_CONTROL) +
748+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
749+
sizeof(MI_BATCH_BUFFER_START) +
750+
sizeof(WalkerPartition::BarrierControlSection);
751+
752+
size_t estimatedSize = 0;
753+
size_t totalBytesProgrammed = 0;
754+
755+
estimatedSize = ImplicitScalingDispatch<FamilyType>::getBarrierSize(false);
756+
EXPECT_EQ(expectedSize, estimatedSize);
757+
758+
// Trailing arguments: apiSelfCleanup=false, dcFlush=false, useSecondaryBatchBuffer=false.
ImplicitScalingDispatch<FamilyType>::dispatchBarrierCommands(commandStream, twoTile, false, false, false);
759+
// NOTE(review): getUsed() is compared directly with expectedSize, which presumes
// the fixture's commandStream is empty on entry - confirm.
totalBytesProgrammed = commandStream.getUsed();
760+
EXPECT_EQ(expectedSize, totalBytesProgrammed);
761+
762+
HardwareParse hwParser;
763+
hwParser.parsePipeControl = true;
764+
hwParser.parseCommands<FamilyType>(commandStream, 0);
765+
hwParser.findHardwareCommands<FamilyType>();
766+
767+
EXPECT_EQ(1u, hwParser.pipeControlList.size());
768+
769+
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*hwParser.pipeControlList.begin());
770+
// dcFlush was passed as false, so the pipe control must not enable DC flush.
EXPECT_EQ(false, pipeControl->getDcFlushEnable());
771+
772+
auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
773+
EXPECT_EQ(1u, miAtomicList.size());
774+
775+
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
776+
EXPECT_EQ(1u, miSemaphoreList.size());
777+
778+
auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
779+
EXPECT_EQ(1u, bbStartList.size());
780+
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
781+
// useSecondaryBatchBuffer was false, so a first-level batch buffer start is expected.
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
782+
}
783+
784+
// Barrier dispatched with self-cleanup requested: on top of the minimal sequence,
// the expected size contains two MI_STORE_DATA_IMM commands and two additional
// MI_ATOMIC + MI_SEMAPHORE_WAIT pairs.
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
785+
givenBarrierDispatchWhenApiRequiresSelfCleanupThenExpectDefaultSelfCleanupSection) {
786+
using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM;
787+
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
788+
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
789+
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
790+
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
791+
792+
size_t expectedSize = sizeof(MI_STORE_DATA_IMM) +
793+
sizeof(PIPE_CONTROL) +
794+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
795+
sizeof(MI_BATCH_BUFFER_START) +
796+
sizeof(WalkerPartition::BarrierControlSection) +
797+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
798+
sizeof(MI_STORE_DATA_IMM) +
799+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
800+
801+
size_t estimatedSize = 0;
802+
size_t totalBytesProgrammed = 0;
803+
804+
estimatedSize = ImplicitScalingDispatch<FamilyType>::getBarrierSize(true);
805+
EXPECT_EQ(expectedSize, estimatedSize);
806+
807+
// Trailing arguments: apiSelfCleanup=true, dcFlush=true, useSecondaryBatchBuffer=true.
ImplicitScalingDispatch<FamilyType>::dispatchBarrierCommands(commandStream, twoTile, true, true, true);
808+
totalBytesProgrammed = commandStream.getUsed();
809+
EXPECT_EQ(expectedSize, totalBytesProgrammed);
810+
811+
HardwareParse hwParser;
812+
hwParser.parsePipeControl = true;
813+
hwParser.parseCommands<FamilyType>(commandStream, 0);
814+
hwParser.findHardwareCommands<FamilyType>();
815+
816+
auto storeDataImmList = hwParser.getCommandsList<MI_STORE_DATA_IMM>();
817+
EXPECT_EQ(2u, storeDataImmList.size());
818+
819+
EXPECT_EQ(1u, hwParser.pipeControlList.size());
820+
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*hwParser.pipeControlList.begin());
821+
// With dcFlush requested, the programmed bit is expected to match isDcFlushAllowed().
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
822+
823+
auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
824+
EXPECT_EQ(3u, miAtomicList.size());
825+
826+
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
827+
EXPECT_EQ(3u, miSemaphoreList.size());
828+
829+
auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
830+
EXPECT_EQ(1u, bbStartList.size());
831+
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
832+
// useSecondaryBatchBuffer was true, so a second-level batch buffer start is expected.
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
833+
}
834+
835+
// Barrier dispatched with self-cleanup while the UseAtomicsForSelfCleanupSection
// debug flag is set to 1: the expected size contains MI_ATOMIC commands where the
// default self-cleanup test expects MI_STORE_DATA_IMM, giving five MI_ATOMIC in total.
HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
836+
givenBarrierDispatchWhenApiRequiresSelfCleanupForcedUseAtomicThenExpectUseAtomicForSelfCleanupSection) {
837+
using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
838+
using MI_ATOMIC = typename FamilyType::MI_ATOMIC;
839+
using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;
840+
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
841+
842+
size_t expectedSize = sizeof(MI_ATOMIC) +
843+
sizeof(PIPE_CONTROL) +
844+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
845+
sizeof(MI_BATCH_BUFFER_START) +
846+
sizeof(WalkerPartition::BarrierControlSection) +
847+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) +
848+
sizeof(MI_ATOMIC) +
849+
sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT);
850+
851+
// NOTE(review): no debug-flag restore is visible in this test; presumably the
// ImplicitScalingTests fixture restores DebugManager state - confirm.
DebugManager.flags.UseAtomicsForSelfCleanupSection.set(1);
852+
853+
size_t estimatedSize = 0;
854+
size_t totalBytesProgrammed = 0;
855+
856+
estimatedSize = ImplicitScalingDispatch<FamilyType>::getBarrierSize(true);
857+
EXPECT_EQ(expectedSize, estimatedSize);
858+
859+
// Trailing arguments: apiSelfCleanup=true, dcFlush=true, useSecondaryBatchBuffer=true.
ImplicitScalingDispatch<FamilyType>::dispatchBarrierCommands(commandStream, twoTile, true, true, true);
860+
totalBytesProgrammed = commandStream.getUsed();
861+
EXPECT_EQ(expectedSize, totalBytesProgrammed);
862+
863+
HardwareParse hwParser;
864+
hwParser.parsePipeControl = true;
865+
hwParser.parseCommands<FamilyType>(commandStream, 0);
866+
hwParser.findHardwareCommands<FamilyType>();
867+
868+
EXPECT_EQ(1u, hwParser.pipeControlList.size());
869+
auto pipeControl = reinterpret_cast<PIPE_CONTROL *>(*hwParser.pipeControlList.begin());
870+
// With dcFlush requested, the programmed bit is expected to match isDcFlushAllowed().
EXPECT_EQ(MemorySynchronizationCommands<FamilyType>::isDcFlushAllowed(), pipeControl->getDcFlushEnable());
871+
872+
auto miAtomicList = hwParser.getCommandsList<MI_ATOMIC>();
873+
EXPECT_EQ(5u, miAtomicList.size());
874+
875+
auto miSemaphoreList = hwParser.getCommandsList<MI_SEMAPHORE_WAIT>();
876+
EXPECT_EQ(3u, miSemaphoreList.size());
877+
878+
auto bbStartList = hwParser.getCommandsList<MI_BATCH_BUFFER_START>();
879+
EXPECT_EQ(1u, bbStartList.size());
880+
auto bbStart = reinterpret_cast<MI_BATCH_BUFFER_START *>(*bbStartList.begin());
881+
// useSecondaryBatchBuffer was true, so a second-level batch buffer start is expected.
EXPECT_EQ(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER::SECOND_LEVEL_BATCH_BUFFER_SECOND_LEVEL_BATCH, bbStart->getSecondLevelBatchBuffer());
882+
}

0 commit comments

Comments
 (0)