Skip to content

Commit 684a542

Browse files
committed
[SelectionDAG][X86] Remove unused elements from atomic vector.
After splitting, all elements are created; the two components must then be recovered by taking the upper and lower halves via EXTRACT_ELEMENT. This change extends EltsFromConsecutiveLoads to understand AtomicSDNode so that unused elements can be removed. commit-id:b83937a8
1 parent 4fccbd6 commit 684a542

File tree

4 files changed

+65
-153
lines changed

4 files changed

+65
-153
lines changed

llvm/include/llvm/CodeGen/SelectionDAG.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1873,7 +1873,7 @@ class SelectionDAG {
18731873
/// chain to the token factor. This ensures that the new memory node will have
18741874
/// the same relative memory dependency position as the old load. Returns the
18751875
/// new merged load chain.
1876-
SDValue makeEquivalentMemoryOrdering(LoadSDNode *OldLoad, SDValue NewMemOp);
1876+
SDValue makeEquivalentMemoryOrdering(MemSDNode *OldLoad, SDValue NewMemOp);
18771877

18781878
/// Topologically sort the AllNodes list and
18791879
/// assign a unique node id for each node in the DAG based on their

llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12215,7 +12215,7 @@ SDValue SelectionDAG::makeEquivalentMemoryOrdering(SDValue OldChain,
1221512215
return TokenFactor;
1221612216
}
1221712217

12218-
SDValue SelectionDAG::makeEquivalentMemoryOrdering(LoadSDNode *OldLoad,
12218+
SDValue SelectionDAG::makeEquivalentMemoryOrdering(MemSDNode *OldLoad,
1221912219
SDValue NewMemOp) {
1222012220
assert(isa<MemSDNode>(NewMemOp.getNode()) && "Expected a memop node");
1222112221
SDValue OldChain = SDValue(OldLoad, 1);

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 51 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -7193,15 +7193,19 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
71937193
}
71947194

71957195
// Recurse to find a LoadSDNode source and the accumulated ByteOffset.
7196-
static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
7197-
if (ISD::isNON_EXTLoad(Elt.getNode())) {
7198-
auto *BaseLd = cast<LoadSDNode>(Elt);
7199-
if (!BaseLd->isSimple())
7200-
return false;
7196+
static bool findEltLoadSrc(SDValue Elt, MemSDNode *&Ld, int64_t &ByteOffset) {
7197+
if (auto *BaseLd = dyn_cast<AtomicSDNode>(Elt)) {
72017198
Ld = BaseLd;
72027199
ByteOffset = 0;
72037200
return true;
7204-
}
7201+
} else if (auto *BaseLd = dyn_cast<LoadSDNode>(Elt))
7202+
if (ISD::isNON_EXTLoad(Elt.getNode())) {
7203+
if (!BaseLd->isSimple())
7204+
return false;
7205+
Ld = BaseLd;
7206+
ByteOffset = 0;
7207+
return true;
7208+
}
72057209

72067210
switch (Elt.getOpcode()) {
72077211
case ISD::BITCAST:
@@ -7254,7 +7258,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
72547258
APInt ZeroMask = APInt::getZero(NumElems);
72557259
APInt UndefMask = APInt::getZero(NumElems);
72567260

7257-
SmallVector<LoadSDNode*, 8> Loads(NumElems, nullptr);
7261+
SmallVector<MemSDNode *, 8> Loads(NumElems, nullptr);
72587262
SmallVector<int64_t, 8> ByteOffsets(NumElems, 0);
72597263

72607264
// For each element in the initializer, see if we've found a load, zero or an
@@ -7304,7 +7308,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
73047308
EVT EltBaseVT = EltBase.getValueType();
73057309
assert(EltBaseVT.getSizeInBits() == EltBaseVT.getStoreSizeInBits() &&
73067310
"Register/Memory size mismatch");
7307-
LoadSDNode *LDBase = Loads[FirstLoadedElt];
7311+
MemSDNode *LDBase = Loads[FirstLoadedElt];
73087312
assert(LDBase && "Did not find base load for merging consecutive loads");
73097313
unsigned BaseSizeInBits = EltBaseVT.getStoreSizeInBits();
73107314
unsigned BaseSizeInBytes = BaseSizeInBits / 8;
@@ -7318,16 +7322,18 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
73187322

73197323
// Check to see if the element's load is consecutive to the base load
73207324
// or offset from a previous (already checked) load.
7321-
auto CheckConsecutiveLoad = [&](LoadSDNode *Base, int EltIdx) {
7322-
LoadSDNode *Ld = Loads[EltIdx];
7325+
auto CheckConsecutiveLoad = [&](MemSDNode *Base, int EltIdx) {
7326+
MemSDNode *Ld = Loads[EltIdx];
73237327
int64_t ByteOffset = ByteOffsets[EltIdx];
73247328
if (ByteOffset && (ByteOffset % BaseSizeInBytes) == 0) {
73257329
int64_t BaseIdx = EltIdx - (ByteOffset / BaseSizeInBytes);
73267330
return (0 <= BaseIdx && BaseIdx < (int)NumElems && LoadMask[BaseIdx] &&
73277331
Loads[BaseIdx] == Ld && ByteOffsets[BaseIdx] == 0);
73287332
}
7329-
return DAG.areNonVolatileConsecutiveLoads(Ld, Base, BaseSizeInBytes,
7330-
EltIdx - FirstLoadedElt);
7333+
auto *L = dyn_cast<LoadSDNode>(Ld);
7334+
auto *B = dyn_cast<LoadSDNode>(Base);
7335+
return L && B && DAG.areNonVolatileConsecutiveLoads(L, B, BaseSizeInBytes,
7336+
EltIdx - FirstLoadedElt);
73317337
};
73327338

73337339
// Consecutive loads can contain UNDEFS but not ZERO elements.
@@ -7347,7 +7353,7 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
73477353
}
73487354
}
73497355

7350-
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
7356+
auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, MemSDNode *LDBase) {
73517357
auto MMOFlags = LDBase->getMemOperand()->getFlags();
73527358
assert(LDBase->isSimple() &&
73537359
"Cannot merge volatile or atomic loads.");
@@ -9452,8 +9458,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
94529458
{
94539459
SmallVector<SDValue, 64> Ops(Op->ops().take_front(NumElems));
94549460
if (SDValue LD =
9455-
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false))
9461+
EltsFromConsecutiveLoads(VT, Ops, dl, DAG, Subtarget, false)) {
94569462
return LD;
9463+
}
94579464
}
94589465

94599466
// If this is a splat of pairs of 32-bit elements, we can use a narrower
@@ -60388,6 +60395,35 @@ static SDValue combineINTRINSIC_VOID(SDNode *N, SelectionDAG &DAG,
6038860395
return SDValue();
6038960396
}
6039060397

60398+
static SDValue combineVZEXT_LOAD(SDNode *N, SelectionDAG &DAG,
60399+
TargetLowering::DAGCombinerInfo &DCI) {
60400+
// Find the TokenFactor to locate the associated AtomicLoad.
60401+
SDNode *ALD = nullptr;
60402+
for (auto &TF : N->uses())
60403+
if (TF.getUser()->getOpcode() == ISD::TokenFactor) {
60404+
SDValue L = TF.getUser()->getOperand(0);
60405+
SDValue R = TF.getUser()->getOperand(1);
60406+
if (L.getNode() == N)
60407+
ALD = R.getNode();
60408+
else if (R.getNode() == N)
60409+
ALD = L.getNode();
60410+
}
60411+
60412+
if (!ALD)
60413+
return SDValue();
60414+
if (!isa<AtomicSDNode>(ALD))
60415+
return SDValue();
60416+
60417+
// Replace the VZEXT_LOAD with the AtomicLoad.
60418+
SDLoc dl(N);
60419+
SDValue SV =
60420+
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
60421+
N->getValueType(0).changeTypeToInteger(), SDValue(ALD, 0));
60422+
SDValue BC = DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), SV);
60423+
BC = DCI.CombineTo(N, BC, SDValue(ALD, 1));
60424+
return BC;
60425+
}
60426+
6039160427
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
6039260428
DAGCombinerInfo &DCI) const {
6039360429
SelectionDAG &DAG = DCI.DAG;
@@ -60584,6 +60620,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
6058460620
case ISD::INTRINSIC_VOID: return combineINTRINSIC_VOID(N, DAG, DCI);
6058560621
case ISD::FP_TO_SINT_SAT:
6058660622
case ISD::FP_TO_UINT_SAT: return combineFP_TO_xINT_SAT(N, DAG, Subtarget);
60623+
case X86ISD::VZEXT_LOAD: return combineVZEXT_LOAD(N, DAG, DCI);
6058760624
// clang-format on
6058860625
}
6058960626

llvm/test/CodeGen/X86/atomic-load-store.ll

Lines changed: 12 additions & 137 deletions
Original file line numberDiff line numberDiff line change
@@ -208,29 +208,12 @@ define <2 x float> @atomic_vec2_float_align(ptr %x) {
208208
define <2 x half> @atomic_vec2_half(ptr %x) {
209209
; CHECK3-LABEL: atomic_vec2_half:
210210
; CHECK3: ## %bb.0:
211-
; CHECK3-NEXT: movl (%rdi), %eax
212-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
213-
; CHECK3-NEXT: shrl $16, %eax
214-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
215-
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
211+
; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
216212
; CHECK3-NEXT: retq
217213
;
218214
; CHECK0-LABEL: atomic_vec2_half:
219215
; CHECK0: ## %bb.0:
220-
; CHECK0-NEXT: movl (%rdi), %eax
221-
; CHECK0-NEXT: movl %eax, %ecx
222-
; CHECK0-NEXT: shrl $16, %ecx
223-
; CHECK0-NEXT: movw %cx, %dx
224-
; CHECK0-NEXT: ## implicit-def: $ecx
225-
; CHECK0-NEXT: movw %dx, %cx
226-
; CHECK0-NEXT: ## implicit-def: $xmm1
227-
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
228-
; CHECK0-NEXT: movw %ax, %cx
229-
; CHECK0-NEXT: ## implicit-def: $eax
230-
; CHECK0-NEXT: movw %cx, %ax
231-
; CHECK0-NEXT: ## implicit-def: $xmm0
232-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
233-
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
216+
; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
234217
; CHECK0-NEXT: retq
235218
%ret = load atomic <2 x half>, ptr %x acquire, align 4
236219
ret <2 x half> %ret
@@ -239,29 +222,12 @@ define <2 x half> @atomic_vec2_half(ptr %x) {
239222
define <2 x bfloat> @atomic_vec2_bfloat(ptr %x) {
240223
; CHECK3-LABEL: atomic_vec2_bfloat:
241224
; CHECK3: ## %bb.0:
242-
; CHECK3-NEXT: movl (%rdi), %eax
243-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
244-
; CHECK3-NEXT: shrl $16, %eax
245-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
246-
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
225+
; CHECK3-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
247226
; CHECK3-NEXT: retq
248227
;
249228
; CHECK0-LABEL: atomic_vec2_bfloat:
250229
; CHECK0: ## %bb.0:
251-
; CHECK0-NEXT: movl (%rdi), %eax
252-
; CHECK0-NEXT: movl %eax, %ecx
253-
; CHECK0-NEXT: shrl $16, %ecx
254-
; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
255-
; CHECK0-NEXT: movw %ax, %dx
256-
; CHECK0-NEXT: ## implicit-def: $eax
257-
; CHECK0-NEXT: movw %dx, %ax
258-
; CHECK0-NEXT: ## implicit-def: $xmm0
259-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
260-
; CHECK0-NEXT: ## implicit-def: $eax
261-
; CHECK0-NEXT: movw %cx, %ax
262-
; CHECK0-NEXT: ## implicit-def: $xmm1
263-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
264-
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
230+
; CHECK0-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
265231
; CHECK0-NEXT: retq
266232
%ret = load atomic <2 x bfloat>, ptr %x acquire, align 4
267233
ret <2 x bfloat> %ret
@@ -440,110 +406,19 @@ define <4 x i16> @atomic_vec4_i16(ptr %x) nounwind {
440406
}
441407

442408
define <4 x half> @atomic_vec4_half(ptr %x) nounwind {
443-
; CHECK3-LABEL: atomic_vec4_half:
444-
; CHECK3: ## %bb.0:
445-
; CHECK3-NEXT: movq (%rdi), %rax
446-
; CHECK3-NEXT: movl %eax, %ecx
447-
; CHECK3-NEXT: shrl $16, %ecx
448-
; CHECK3-NEXT: pinsrw $0, %ecx, %xmm1
449-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
450-
; CHECK3-NEXT: movq %rax, %rcx
451-
; CHECK3-NEXT: shrq $32, %rcx
452-
; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
453-
; CHECK3-NEXT: shrq $48, %rax
454-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm3
455-
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
456-
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
457-
; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
458-
; CHECK3-NEXT: retq
459-
;
460-
; CHECK0-LABEL: atomic_vec4_half:
461-
; CHECK0: ## %bb.0:
462-
; CHECK0-NEXT: movq (%rdi), %rax
463-
; CHECK0-NEXT: movl %eax, %ecx
464-
; CHECK0-NEXT: shrl $16, %ecx
465-
; CHECK0-NEXT: movw %cx, %dx
466-
; CHECK0-NEXT: ## implicit-def: $ecx
467-
; CHECK0-NEXT: movw %dx, %cx
468-
; CHECK0-NEXT: ## implicit-def: $xmm2
469-
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm2
470-
; CHECK0-NEXT: movw %ax, %dx
471-
; CHECK0-NEXT: ## implicit-def: $ecx
472-
; CHECK0-NEXT: movw %dx, %cx
473-
; CHECK0-NEXT: ## implicit-def: $xmm0
474-
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm0
475-
; CHECK0-NEXT: movq %rax, %rcx
476-
; CHECK0-NEXT: shrq $32, %rcx
477-
; CHECK0-NEXT: movw %cx, %dx
478-
; CHECK0-NEXT: ## implicit-def: $ecx
479-
; CHECK0-NEXT: movw %dx, %cx
480-
; CHECK0-NEXT: ## implicit-def: $xmm1
481-
; CHECK0-NEXT: pinsrw $0, %ecx, %xmm1
482-
; CHECK0-NEXT: shrq $48, %rax
483-
; CHECK0-NEXT: movw %ax, %cx
484-
; CHECK0-NEXT: ## implicit-def: $eax
485-
; CHECK0-NEXT: movw %cx, %ax
486-
; CHECK0-NEXT: ## implicit-def: $xmm3
487-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm3
488-
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
489-
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
490-
; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
491-
; CHECK0-NEXT: retq
409+
; CHECK-LABEL: atomic_vec4_half:
410+
; CHECK: ## %bb.0:
411+
; CHECK-NEXT: movq (%rdi), %xmm0
412+
; CHECK-NEXT: retq
492413
%ret = load atomic <4 x half>, ptr %x acquire, align 8
493414
ret <4 x half> %ret
494415
}
495416

496417
define <4 x bfloat> @atomic_vec4_bfloat(ptr %x) nounwind {
497-
; CHECK3-LABEL: atomic_vec4_bfloat:
498-
; CHECK3: ## %bb.0:
499-
; CHECK3-NEXT: movq (%rdi), %rax
500-
; CHECK3-NEXT: movq %rax, %rcx
501-
; CHECK3-NEXT: movq %rax, %rdx
502-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm0
503-
; CHECK3-NEXT: ## kill: def $eax killed $eax killed $rax
504-
; CHECK3-NEXT: shrl $16, %eax
505-
; CHECK3-NEXT: shrq $32, %rcx
506-
; CHECK3-NEXT: shrq $48, %rdx
507-
; CHECK3-NEXT: pinsrw $0, %edx, %xmm1
508-
; CHECK3-NEXT: pinsrw $0, %ecx, %xmm2
509-
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
510-
; CHECK3-NEXT: pinsrw $0, %eax, %xmm1
511-
; CHECK3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
512-
; CHECK3-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
513-
; CHECK3-NEXT: retq
514-
;
515-
; CHECK0-LABEL: atomic_vec4_bfloat:
516-
; CHECK0: ## %bb.0:
517-
; CHECK0-NEXT: movq (%rdi), %rax
518-
; CHECK0-NEXT: movl %eax, %ecx
519-
; CHECK0-NEXT: shrl $16, %ecx
520-
; CHECK0-NEXT: ## kill: def $cx killed $cx killed $ecx
521-
; CHECK0-NEXT: movw %ax, %dx
522-
; CHECK0-NEXT: movq %rax, %rsi
523-
; CHECK0-NEXT: shrq $32, %rsi
524-
; CHECK0-NEXT: ## kill: def $si killed $si killed $rsi
525-
; CHECK0-NEXT: shrq $48, %rax
526-
; CHECK0-NEXT: movw %ax, %di
527-
; CHECK0-NEXT: ## implicit-def: $eax
528-
; CHECK0-NEXT: movw %di, %ax
529-
; CHECK0-NEXT: ## implicit-def: $xmm0
530-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
531-
; CHECK0-NEXT: ## implicit-def: $eax
532-
; CHECK0-NEXT: movw %si, %ax
533-
; CHECK0-NEXT: ## implicit-def: $xmm1
534-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm1
535-
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
536-
; CHECK0-NEXT: ## implicit-def: $eax
537-
; CHECK0-NEXT: movw %dx, %ax
538-
; CHECK0-NEXT: ## implicit-def: $xmm0
539-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm0
540-
; CHECK0-NEXT: ## implicit-def: $eax
541-
; CHECK0-NEXT: movw %cx, %ax
542-
; CHECK0-NEXT: ## implicit-def: $xmm2
543-
; CHECK0-NEXT: pinsrw $0, %eax, %xmm2
544-
; CHECK0-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
545-
; CHECK0-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
546-
; CHECK0-NEXT: retq
418+
; CHECK-LABEL: atomic_vec4_bfloat:
419+
; CHECK: ## %bb.0:
420+
; CHECK-NEXT: movq (%rdi), %xmm0
421+
; CHECK-NEXT: retq
547422
%ret = load atomic <4 x bfloat>, ptr %x acquire, align 8
548423
ret <4 x bfloat> %ret
549424
}

0 commit comments

Comments
 (0)