@@ -92,7 +92,7 @@ int IR_Builder::translateVISAQWScatterInst(
92
92
unsigned int instOpt = Get_Gen4_Emask (eMask, instExSize);
93
93
bool useSplitSend = useSends ();
94
94
95
- PayloadSource sources[2 ]; // Maximal 2 sources, optional header + offsets
95
+ PayloadSource sources[2 ]; // Maximal 2 sources, offsets + src
96
96
unsigned len = 0 ;
97
97
98
98
sources[len].opnd = addresses;
@@ -109,7 +109,11 @@ int IR_Builder::translateVISAQWScatterInst(
109
109
110
110
G4_SrcRegRegion *msgs[2 ] {0 , 0 };
111
111
unsigned sizes[2 ] {0 , 0 };
112
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
112
+ // For a send that has a smaller execsize than exSize, such as
113
+ // "send (4) ...",
114
+ // make sure to use the send's execsize (4) as the batchsize, not 8/16/32.
115
+ // Thus, the batchsize is min(exSize, instExSize).
116
+ preparePayload (msgs, sizes, std::min (exSize, instExSize), useSplitSend, sources, len);
113
117
114
118
uint32_t desc = buildDescForScatter (DC_QWORD_SCATTERED_WRITE, numBlocks,
115
119
execSize == EXEC_SIZE_8 ? MDC_SM2_SIMD8 : MDC_SM2_SIMD16);
@@ -1484,8 +1488,9 @@ int IR_Builder::translateVISADwordAtomicInst(
1484
1488
G4_SrcRegRegion *header
1485
1489
= createSrcRegRegion (dcl, getRegionStride1 ());
1486
1490
sources[len].opnd = header;
1487
- sources[len].numElts = g4::SIMD8 ;
1491
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
1488
1492
sources[len].instOpt = InstOpt_WriteEnable;
1493
+ sources[len].copyExecSize = g4::SIMD8; // header has 8 DWs
1489
1494
++len;
1490
1495
}
1491
1496
@@ -1510,7 +1515,7 @@ int IR_Builder::translateVISADwordAtomicInst(
1510
1515
1511
1516
G4_SrcRegRegion *msgs[2 ] = {0 , 0 };
1512
1517
unsigned sizes[2 ] = {0 , 0 };
1513
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
1518
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
1514
1519
1515
1520
SFID sfid = SFID::DP_DC1;
1516
1521
unsigned MD = 0 ;
@@ -1669,8 +1674,9 @@ int IR_Builder::translateVISAGather4TypedInst(
1669
1674
G4_SrcRegRegion *header
1670
1675
= createSrcRegRegion (dcl, getRegionStride1 ());
1671
1676
sources[len].opnd = header;
1672
- sources[len].numElts = g4::SIMD8 ;
1677
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
1673
1678
sources[len].instOpt = InstOpt_WriteEnable;
1679
+ sources[len].copyExecSize = g4::SIMD8;
1674
1680
++len;
1675
1681
}
1676
1682
@@ -1755,8 +1761,9 @@ int IR_Builder::translateVISAScatter4TypedInst(
1755
1761
G4_SrcRegRegion *header
1756
1762
= createSrcRegRegion (dcl, getRegionStride1 ());
1757
1763
sources[len].opnd = header;
1758
- sources[len].numElts = g4::SIMD8 ;
1764
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
1759
1765
sources[len].instOpt = InstOpt_WriteEnable;
1766
+ sources[len].copyExecSize = g4::SIMD8;
1760
1767
++len;
1761
1768
}
1762
1769
@@ -1877,7 +1884,7 @@ int IR_Builder::translateVISATypedAtomicInst(
1877
1884
1878
1885
G4_SrcRegRegion *msgs[2 ] = {0 , 0 };
1879
1886
unsigned sizes[2 ] = {0 , 0 };
1880
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
1887
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
1881
1888
1882
1889
unsigned dstLength = dst->isNullReg () ? 0 : 1 ;
1883
1890
@@ -2081,8 +2088,9 @@ int IR_Builder::translateGather4Inst(
2081
2088
G4_SrcRegRegion *header
2082
2089
= createSrcRegRegion (dcl, getRegionStride1 ());
2083
2090
sources[len].opnd = header;
2084
- sources[len].numElts = g4::SIMD8 ;
2091
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
2085
2092
sources[len].instOpt = InstOpt_WriteEnable;
2093
+ sources[len].copyExecSize = g4::SIMD8;
2086
2094
++len;
2087
2095
}
2088
2096
@@ -2093,7 +2101,7 @@ int IR_Builder::translateGather4Inst(
2093
2101
2094
2102
G4_SrcRegRegion *msgs[2 ] = {0 , 0 };
2095
2103
unsigned sizes[2 ] = {0 , 0 };
2096
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
2104
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
2097
2105
2098
2106
SFID sfid = SFID::DP_DC1;
2099
2107
@@ -2183,8 +2191,9 @@ int IR_Builder::translateScatter4Inst(
2183
2191
G4_SrcRegRegion *header
2184
2192
= createSrcRegRegion (dcl, getRegionStride1 ());
2185
2193
sources[len].opnd = header;
2186
- sources[len].numElts = g4::SIMD8 ;
2194
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
2187
2195
sources[len].instOpt = InstOpt_WriteEnable;
2196
+ sources[len].copyExecSize = g4::SIMD8;
2188
2197
++len;
2189
2198
}
2190
2199
@@ -2199,7 +2208,7 @@ int IR_Builder::translateScatter4Inst(
2199
2208
2200
2209
G4_SrcRegRegion *msgs[2 ] = {0 , 0 };
2201
2210
unsigned sizes[2 ] = {0 , 0 };
2202
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
2211
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
2203
2212
2204
2213
SFID sfid = SFID::DP_DC1;
2205
2214
@@ -2373,8 +2382,9 @@ int IR_Builder::translateByteGatherInst(
2373
2382
G4_SrcRegRegion *header
2374
2383
= createSrcRegRegion (dcl, getRegionStride1 ());
2375
2384
sources[len].opnd = header;
2376
- sources[len].numElts = g4::SIMD8 ;
2385
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
2377
2386
sources[len].instOpt = InstOpt_WriteEnable;
2387
+ sources[len].copyExecSize = g4::SIMD8;
2378
2388
++len;
2379
2389
}
2380
2390
@@ -2385,7 +2395,7 @@ int IR_Builder::translateByteGatherInst(
2385
2395
2386
2396
G4_SrcRegRegion *msgs[2 ] = {0 , 0 };
2387
2397
unsigned sizes[2 ] = {0 , 0 };
2388
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
2398
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
2389
2399
2390
2400
SFID sfid = SFID::DP_DC0;
2391
2401
@@ -2483,8 +2493,9 @@ int IR_Builder::translateByteScatterInst(
2483
2493
G4_SrcRegRegion *header
2484
2494
= createSrcRegRegion (dcl, getRegionStride1 ());
2485
2495
sources[len].opnd = header;
2486
- sources[len].numElts = g4::SIMD8 ;
2496
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
2487
2497
sources[len].instOpt = InstOpt_WriteEnable;
2498
+ sources[len].copyExecSize = g4::SIMD8;
2488
2499
++len;
2489
2500
}
2490
2501
@@ -2499,7 +2510,7 @@ int IR_Builder::translateByteScatterInst(
2499
2510
2500
2511
G4_SrcRegRegion *msgs[2 ] = {0 , 0 };
2501
2512
unsigned sizes[2 ] = {0 , 0 };
2502
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
2513
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
2503
2514
2504
2515
SFID sfid = SFID::DP_DC0;
2505
2516
@@ -2652,8 +2663,9 @@ int IR_Builder::translateVISASVMBlockWriteInst(
2652
2663
unsigned len = 0 ;
2653
2664
2654
2665
sources[len].opnd = createSrcRegRegion (dcl, getRegionStride1 ());
2655
- sources[len].numElts = g4::SIMD8 ;
2666
+ sources[len].numElts = numEltPerGRF<Type_UD>() ;
2656
2667
sources[len].instOpt = InstOpt_WriteEnable;
2668
+ sources[len].copyExecSize = g4::SIMD8; // block msg header has 8 DWs
2657
2669
++len;
2658
2670
2659
2671
if (src->getElemSize () < TypeSize (Type_UD))
@@ -2824,7 +2836,7 @@ int IR_Builder::translateVISASVMScatterWriteInst(
2824
2836
2825
2837
bool useSplitSend = useSends ();
2826
2838
2827
- PayloadSource sources[2 ]; // Maximal 2 sources, optional header + offsets
2839
+ PayloadSource sources[2 ]; // Maximal 2 sources, offsets + src
2828
2840
unsigned len = 0 ;
2829
2841
2830
2842
sources[len].opnd = addresses;
@@ -2860,7 +2872,7 @@ int IR_Builder::translateVISASVMScatterWriteInst(
2860
2872
(TypeSize (srcType) != 4 ))
2861
2873
src->setType (*this , Type_UD);
2862
2874
2863
- preparePayload (msgs, sizes, exSize, useSplitSend, sources, len);
2875
+ preparePayload (msgs, sizes, std::min ( exSize, instExSize) , useSplitSend, sources, len);
2864
2876
2865
2877
// set the type back in case we changed it for preparePayload
2866
2878
src->setType (*this , srcType);
0 commit comments