Skip to content

Commit

Permalink
[DAGCombine] Fold (store (insert_elt (load p)) x p) -> (store x)
Browse files Browse the repository at this point in the history
If we have a store of a load with no other uses in between it, it's
considered dead and is removed. So sometimes when legalizing a fixed
length vector store of an insert, we end up producing better code
through scalarization than without.
An example is the following:

  %a = load <4 x i64>, ptr %x
  %b = insertelement <4 x i64> %a, i64 %y, i32 2
  store <4 x i64> %b, ptr %x

If this is scalarized, then DAGCombine successfully removes 3 of the 4
stores which are considered dead, and on RISC-V we get:

  sd a1, 16(a0)

However if we make the vector type legal (-mattr=+v), then we lose the
optimisation because we don't scalarize it.

This patch attempts to recover the optimisation for vectors by
identifying patterns where we store a load with a single insert
in between, and replacing the vector store with a scalar store of the
inserted element.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D152276
  • Loading branch information
lukel97 committed Jun 28, 2023
1 parent 9ad29e7 commit 742fb8b
Show file tree
Hide file tree
Showing 11 changed files with 200 additions and 304 deletions.
61 changes: 61 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,7 @@ namespace {

SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
SDValue replaceStoreOfInsertLoad(StoreSDNode *ST);

bool refineExtractVectorEltIntoMultipleNarrowExtractVectorElts(SDNode *N);

Expand Down Expand Up @@ -20409,6 +20410,62 @@ SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
}
}

// (store (insert_vector_elt (load p), x, i), p) -> (store x, p+offset)
//
// If a vector is loaded from p, has exactly one element replaced, and is
// stored back to p with no side effects on the chain in between, then every
// lane other than the inserted one is written back unchanged. The vector
// store can therefore be replaced with a scalar store of just the inserted
// element.
//
// Returns the replacement scalar store, or an empty SDValue when the pattern
// does not match or the scalar access is not both legal and fast.
SDValue DAGCombiner::replaceStoreOfInsertLoad(StoreSDNode *ST) {
  SDLoc DL(ST);
  SDValue Value = ST->getValue();
  SDValue Ptr = ST->getBasePtr();
  SDValue Chain = ST->getChain();
  if (Value.getOpcode() != ISD::INSERT_VECTOR_ELT || !Value.hasOneUse())
    return SDValue();

  SDValue Vec = Value.getOperand(0);
  SDValue Elt = Value.getOperand(1);
  SDValue Idx = Value.getOperand(2);
  EVT VecVT = Value.getValueType();

  // If the element isn't byte sized then we can't compute an offset.
  // INSERT_VECTOR_ELT also permits an integer operand that is wider than the
  // vector element type (it is implicitly truncated); storing such an operand
  // directly would write too many bytes at the wrong stride, so require the
  // operand type to be exactly the vector element type.
  EVT EltVT = Elt.getValueType();
  if (!EltVT.isByteSized() || EltVT != VecVT.getVectorElementType())
    return SDValue();

  auto *Ld = dyn_cast<LoadSDNode>(Vec);
  if (!Ld || Ld->getBasePtr() != Ptr ||
      ST->getMemoryVT() != Ld->getMemoryVT() || !ST->isSimple() ||
      !ISD::isNormalStore(ST) ||
      Ld->getAddressSpace() != ST->getAddressSpace() ||
      !Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1)))
    return SDValue();

  const uint64_t EltBytes = EltVT.getFixedSizeInBits() / 8;

  // Resolve a constant index up front so that no nodes are created before we
  // know the transform is going ahead.
  auto *CIdx = dyn_cast<ConstantSDNode>(Idx);
  uint64_t COffset = 0;
  if (CIdx) {
    const uint64_t IdxVal = CIdx->getZExtValue();
    // An out-of-range insert produces an undefined vector; do not turn it
    // into a scalar store that lands outside the vector's memory.
    if (VecVT.isFixedLengthVector() && IdxVal >= VecVT.getVectorNumElements())
      return SDValue();
    COffset = IdxVal * EltBytes;
  }

  // Alignment that is actually known for the element's address. With a
  // constant index it follows from the byte offset; with a variable index only
  // the alignment common to every element position can be assumed.
  const Align EltAlign =
      commonAlignment(ST->getAlign(), CIdx ? COffset : EltBytes);

  unsigned IsFast = 0;
  if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), EltVT,
                              ST->getAddressSpace(), EltAlign,
                              ST->getMemOperand()->getFlags(), &IsFast) ||
      !IsFast)
    return SDValue();

  // If the offset is a known constant then recover the pointer info. The
  // memory operand derives the effective alignment from the base alignment
  // and the recorded offset, so the vector store's alignment is passed as is.
  if (CIdx) {
    SDValue NewPtr =
        DAG.getMemBasePlusOffset(Ptr, TypeSize::Fixed(COffset), DL);
    MachinePointerInfo PointerInfo =
        ST->getPointerInfo().getWithOffset(COffset);
    return DAG.getStore(Chain, DL, Elt, NewPtr, PointerInfo, ST->getAlign(),
                        ST->getMemOperand()->getFlags());
  }

  // Variable index: let the target lowering build the element address. This
  // converts the index to the pointer type, keeps it within the vector and
  // scales it by the element size. No offset is recorded in the pointer info,
  // so the reduced element alignment must be supplied explicitly.
  SDValue NewPtr = TLI.getVectorElementPointer(DAG, Ptr, VecVT, Idx);
  return DAG.getStore(Chain, DL, Elt, NewPtr,
                      MachinePointerInfo(ST->getAddressSpace()), EltAlign,
                      ST->getMemOperand()->getFlags());
}

SDValue DAGCombiner::visitSTORE(SDNode *N) {
StoreSDNode *ST = cast<StoreSDNode>(N);
SDValue Chain = ST->getChain();
Expand Down Expand Up @@ -20548,6 +20605,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
}

// Try scalarizing vector stores of loads where we only change one element
if (SDValue NewST = replaceStoreOfInsertLoad(ST))
return NewST;

// TODO: Can relax for unordered atomics (see D66309)
if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
if (ST->isUnindexed() && ST->isSimple() &&
Expand Down
4 changes: 1 addition & 3 deletions llvm/test/CodeGen/AArch64/vector-insert-shuffle-cycle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@ define void @test(i1 %c, ptr %ptr) {
; CHECK-NEXT: ; %bb.1: ; %bb1
; CHECK-NEXT: ldr d0, [x1]
; CHECK-NEXT: LBB0_2: ; %bb2
; CHECK-NEXT: ldr q1, [x8]
; CHECK-NEXT: mov.d v1[0], v0[0]
; CHECK-NEXT: str q1, [x8]
; CHECK-NEXT: str d0, [x8]
; CHECK-NEXT: ret
entry:
br i1 %c, label %bb1, label %bb2
Expand Down
12 changes: 9 additions & 3 deletions llvm/test/CodeGen/ARM/fp16-promote.ll
Original file line number Diff line number Diff line change
Expand Up @@ -865,8 +865,14 @@ define void @test_fmuladd(ptr %p, ptr %q, ptr %r) #0 {
; CHECK-VFP: ldrh
; CHECK-VFP: stm
; CHECK-VFP: strh
; CHECK-VFP: ldm
; CHECK-VFP: stm
; CHECK-VFP: ldrh
; CHECK-VFP: ldrh
; CHECK-VFP: ldrh
; CHECK-VFP: ldrh
; CHECK-VFP: strh
; CHECK-VFP: strh
; CHECK-VFP: strh
; CHECK-VFP: strh

; CHECK-NOVFP: ldrh
; CHECK-NOVFP: ldrh
Expand All @@ -893,7 +899,7 @@ define void @test_insertelement(ptr %p, ptr %q, i32 %i) #0 {
%a = load half, ptr %p, align 2
%b = load <4 x half>, ptr %q, align 8
%c = insertelement <4 x half> %b, half %a, i32 %i
store <4 x half> %c, ptr %q
store volatile <4 x half> %c, ptr %q
ret void
}

Expand Down
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/ARM/vector-DAGCombine.ll
Original file line number Diff line number Diff line change
Expand Up @@ -134,9 +134,8 @@ define void @i64_buildvector(ptr %ptr, ptr %vp) nounwind {
define void @i64_insertelement(ptr %ptr, ptr %vp) nounwind {
; CHECK-LABEL: i64_insertelement:
; CHECK: @ %bb.0:
; CHECK-NEXT: vld1.64 {d16, d17}, [r1]
; CHECK-NEXT: vldr d16, [r0]
; CHECK-NEXT: vst1.64 {d16, d17}, [r1]
; CHECK-NEXT: ldm r0, {r2, r3}
; CHECK-NEXT: strd r2, r3, [r1]
; CHECK-NEXT: bx lr
%t0 = load i64, ptr %ptr, align 4
%vec = load <2 x i64>, ptr %vp
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/CodeGen/Hexagon/autohvx/hfinsert.ll
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ define ptr @fred(ptr %v0) local_unnamed_addr #0 {
b0:
%v1 = load <64 x half>, ptr %v0, align 2
%v2 = insertelement <64 x half> %v1, half 0xH4170, i32 17
store <64 x half> %v2, ptr %v0, align 2
store volatile <64 x half> %v2, ptr %v0, align 2
ret ptr %v0
}

Expand Down
Loading

0 comments on commit 742fb8b

Please sign in to comment.