From a018c8cdbb3a06a523d10268905e82fc39bfe93e Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Tue, 19 Dec 2023 08:32:16 +0100 Subject: [PATCH] GFX12: Add LoopDataPrefetchPass (#75625) It is currently disabled by default. It will need experiments on a real HW to tune and decide on the profitability. --------- Co-authored-by: Stanislav Mekhanoshin --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 + .../AMDGPU/AMDGPUTargetTransformInfo.cpp | 8 + .../Target/AMDGPU/AMDGPUTargetTransformInfo.h | 10 + llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 4 + llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 8 +- .../test/CodeGen/AMDGPU/loop-prefetch-data.ll | 185 ++++++++++++++++++ 6 files changed, 221 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e8c04ecf39ba02..fdc2077868cf99 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -345,6 +345,11 @@ static cl::opt EnableImageIntrinsicOptimizer( cl::desc("Enable image intrinsic optimizer pass"), cl::init(true), cl::Hidden); +static cl::opt + EnableLoopPrefetch("amdgpu-loop-prefetch", + cl::desc("Enable loop data prefetch on AMDGPU"), + cl::Hidden, cl::init(false)); + static cl::opt EnableMaxIlpSchedStrategy( "amdgpu-enable-max-ilp-scheduling-strategy", cl::desc("Enable scheduling strategy to maximize ILP for a single wave."), @@ -982,6 +987,8 @@ void AMDGPUPassConfig::addEarlyCSEOrGVNPass() { } void AMDGPUPassConfig::addStraightLineScalarOptimizationPasses() { + if (isPassEnabled(EnableLoopPrefetch, CodeGenOptLevel::Aggressive)) + addPass(createLoopDataPrefetchPass()); addPass(createSeparateConstOffsetFromGEPPass()); // ReassociateGEPs exposes more opportunities for SLSR. See // the example in reassociate-geps-and-slsr.ll. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index f1da1a61bf4dd5..ebe0b8551b236a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -1345,3 +1345,11 @@ GCNTTIImpl::getTypeLegalizationCost(Type *Ty) const { Cost.first += (Size + 255) / 256; return Cost; } + +unsigned GCNTTIImpl::getPrefetchDistance() const { + return ST->hasPrefetch() ? 128 : 0; +} + +bool GCNTTIImpl::shouldPrefetchAddressSpace(unsigned AS) const { + return AMDGPU::isFlatGlobalAddrSpace(AS); +} diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 1e6c5bbfc0d75b..cd8e9fd10bbf21 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -254,6 +254,16 @@ class GCNTTIImpl final : public BasicTTIImplBase { InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, FastMathFlags FMF, TTI::TargetCostKind CostKind); + + /// Data cache line size for LoopDataPrefetch pass. Has no use before GFX12. + unsigned getCacheLineSize() const override { return 128; } + + /// How much before a load we should place the prefetch instruction. + /// This is currently measured in number of IR instructions. + unsigned getPrefetchDistance() const override; + + /// \return if target want to issue a prefetch in address space \p AS. 
+ bool shouldPrefetchAddressSpace(unsigned AS) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5d6462f355fab9..e599f23101c81e 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -245,6 +245,10 @@ bool SIInstrInfo::areLoadsFromSameBasePtr(SDNode *Load0, SDNode *Load1, if (!get(Opc0).mayLoad() || !get(Opc1).mayLoad()) return false; + // A mayLoad instruction without a def is not a load. Likely a prefetch. + if (!get(Opc0).getNumDefs() || !get(Opc1).getNumDefs()) + return false; + if (isDS(Opc0) && isDS(Opc1)) { // FIXME: Handle this case: diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 488dbe2e3189bf..8b0b6263832243 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -4,7 +4,7 @@ ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \ ; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O1 %s ; RUN: llc -O1 -mtriple=amdgcn--amdhsa -disable-verify -amdgpu-scalar-ir-passes -amdgpu-sdwa-peephole \ -; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -debug-pass=Structure < %s 2>&1 \ +; RUN: -amdgpu-load-store-vectorizer -amdgpu-enable-pre-ra-optimizations -amdgpu-loop-prefetch -debug-pass=Structure < %s 2>&1 \ ; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O1-OPTS %s ; RUN: llc -O2 -mtriple=amdgcn--amdhsa -disable-verify -debug-pass=Structure < %s 2>&1 \ ; RUN: | FileCheck -match-full-lines -strict-whitespace -check-prefix=GCN-O2 %s @@ -461,6 +461,12 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca ; GCN-O1-OPTS-NEXT: Dominator Tree Construction ; GCN-O1-OPTS-NEXT: Natural Loop Information +; GCN-O1-OPTS-NEXT: Canonicalize natural loops +; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis +; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis +; GCN-O1-OPTS-NEXT: Optimization Remark Emitter +; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis +; GCN-O1-OPTS-NEXT: Loop Data Prefetch ; GCN-O1-OPTS-NEXT: Split GEPs to a variadic base and a constant offset for better CSE ; GCN-O1-OPTS-NEXT: Scalar Evolution Analysis ; GCN-O1-OPTS-NEXT: Straight line strength reduction diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll new file mode 100644 index 00000000000000..fb3c04235b8e4d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll @@ -0,0 +1,185 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s + +define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { +; GCN-LABEL: copy_flat: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB0_3 +; GCN-NEXT: ; %bb.1: ; %for.body.preheader +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GCN-NEXT: .p2align 6 +; GCN-NEXT: .LBB0_2: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; 
GCN-NEXT: s_add_co_i32 s4, s4, -1 +; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 +; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: flat_store_b128 v[4:5], v[0:3] +; GCN-NEXT: s_cbranch_scc1 .LBB0_2 +; GCN-NEXT: .LBB0_3: ; %for.end +; GCN-NEXT: s_endpgm +entry: + %cmp6.not = icmp eq i32 %n, 0 + br i1 %cmp6.not, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %idxprom = zext i32 %i.07 to i64 + %arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom + %ld = load <4 x i32>, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom + store <4 x i32> %ld, ptr %arrayidx2, align 4 + %inc = add nuw i32 %i.07, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { +; GCN-LABEL: copy_global: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB1_3 +; GCN-NEXT: ; %bb.1: ; %for.body.preheader +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 +; GCN-NEXT: .LBB1_2: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 +; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 +; GCN-NEXT: s_add_co_i32 s4, s4, -1 +; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GCN-NEXT: s_cbranch_scc1 .LBB1_2 +; GCN-NEXT: .LBB1_3: ; %for.end +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +entry: + %cmp6.not = icmp eq i32 %n, 0 + br i1 %cmp6.not, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %idxprom = zext i32 %i.07 to i64 + %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom + %ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom + store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 + %inc = add nuw i32 %i.07, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { +; GCN-LABEL: copy_constant: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB2_3 +; GCN-NEXT: ; %bb.1: ; %for.body.preheader +; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 +; GCN-NEXT: v_mov_b32_e32 v0, 0 +; GCN-NEXT: .LBB2_2: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 +; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 +; GCN-NEXT: s_add_co_i32 s4, s4, -1 +; GCN-NEXT: s_add_nc_u64 s[2:3], 
s[2:3], 16 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 +; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 +; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] +; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 +; GCN-NEXT: s_cbranch_scc1 .LBB2_2 +; GCN-NEXT: .LBB2_3: ; %for.end +; GCN-NEXT: s_nop 0 +; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GCN-NEXT: s_endpgm +entry: + %cmp6.not = icmp eq i32 %n, 0 + br i1 %cmp6.not, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %idxprom = zext i32 %i.07 to i64 + %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom + %ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom + store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 + %inc = add nuw i32 %i.07, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +} + +define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { +; GCN-LABEL: copy_local: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s2, 0 +; GCN-NEXT: s_cbranch_scc1 .LBB3_2 +; GCN-NEXT: .LBB3_1: ; %for.body +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: v_mov_b32_e32 v2, s1 +; GCN-NEXT: v_mov_b32_e32 v4, s0 +; GCN-NEXT: s_add_co_i32 s2, s2, -1 +; GCN-NEXT: s_add_co_i32 s0, s0, 16 +; GCN-NEXT: s_add_co_i32 s1, s1, 16 +; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3 +; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1 +; GCN-NEXT: s_cmp_lg_u32 s2, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3 +; GCN-NEXT: s_waitcnt lgkmcnt(1) +; GCN-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1 +; GCN-NEXT: s_cbranch_scc1 .LBB3_1 +; GCN-NEXT: .LBB3_2: ; %for.end +; GCN-NEXT: s_endpgm +entry: + %cmp6.not = icmp eq i32 %n, 0 + br i1 %cmp6.not, label %for.end, label %for.body + +for.body: ; preds = %entry, %for.body + %i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %idxprom = zext i32 %i.07 to i64 + %arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom + %ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom + store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4 + %inc = add nuw i32 %i.07, 1 + %exitcond.not = icmp eq i32 %inc, %n + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + ret void +}
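
The three TTI hooks added above (getCacheLineSize, getPrefetchDistance, shouldPrefetchAddressSpace) only matter once the generic Loop Data Prefetch pass actually runs, which this patch keeps behind the off-by-default -amdgpu-loop-prefetch flag; the RUN line of loop-prefetch-data.ll shows how the tests turn it on. The standalone C++ sketch below is only a rough illustration of how a prefetch-ahead decision could consume these hooks; it is not the in-tree LoopDataPrefetch implementation, and the names Gfx12PrefetchTTI, itersAhead, AddrSpace, and LoopSizeInInstrs are made up for the example. The returned values (128-byte cache line, 128-instruction distance gated on hasPrefetch(), flat/global/constant address spaces) do come from the patch and its tests.

// Hedged sketch, not the actual LoopDataPrefetch pass: approximates how the
// GFX12 prefetch hooks could drive an "iterations ahead" decision. Per the
// comment on getPrefetchDistance() in AMDGPUTargetTransformInfo.h, the
// distance is measured in IR instructions.
#include <cstdio>

struct Gfx12PrefetchTTI {
  bool HasPrefetch = true; // stands in for ST->hasPrefetch() on GFX12
  unsigned getCacheLineSize() const { return 128; } // bytes
  unsigned getPrefetchDistance() const {            // IR instructions
    return HasPrefetch ? 128 : 0;
  }
  bool shouldPrefetchAddressSpace(unsigned AS) const {
    // Flat (0), global (1), and constant (4) address spaces, approximating
    // AMDGPU::isFlatGlobalAddrSpace(); LDS and scratch are never prefetched,
    // which matches the copy_local test emitting no s_prefetch_data.
    return AS == 0 || AS == 1 || AS == 4;
  }
};

// How many iterations ahead a prefetch would be placed for a load in address
// space AddrSpace, inside a loop body of LoopSizeInInstrs IR instructions.
// Returns 0 when no prefetch should be emitted at all.
unsigned itersAhead(const Gfx12PrefetchTTI &TTI, unsigned AddrSpace,
                    unsigned LoopSizeInInstrs) {
  if (LoopSizeInInstrs == 0 || TTI.getPrefetchDistance() == 0 ||
      !TTI.shouldPrefetchAddressSpace(AddrSpace))
    return 0;
  unsigned Ahead = TTI.getPrefetchDistance() / LoopSizeInInstrs;
  return Ahead ? Ahead : 1; // always look at least one iteration ahead
}

int main() {
  Gfx12PrefetchTTI TTI;
  // A small copy loop of ~8 IR instructions reading global memory (AS 1)
  // would be prefetched 128 / 8 = 16 iterations ahead under this model.
  std::printf("iterations ahead: %u\n", itersAhead(TTI, /*AddrSpace=*/1, 8));
  return 0;
}

Whether 128 is the right distance is exactly the open question in the commit message: the value still needs experiments on real GFX12 hardware before the pass can be considered for enabling by default.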