forked from llvm/llvm-project
-
Notifications
You must be signed in to change notification settings - Fork 56
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
GFX12: Add LoopDataPrefetchPass (llvm#75625)
It is currently disabled by default. It will need experiments on real hardware to tune it and decide on its profitability. --------- Co-authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin@amd.com>
- Loading branch information
1 parent
e8d98fa
commit a018c8c
Showing
6 changed files
with
221 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,185 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py | ||
; RUN: llc -march=amdgcn -mcpu=gfx1200 -amdgpu-loop-prefetch < %s | FileCheck --check-prefix=GCN %s | ||
|
||
; copy_flat: kernel that copies %n <4 x i32> elements (16 bytes each) from %s
; to %d through pointers in the default address space, one element per loop
; iteration.
; In the expected output the source pointer is advanced by 0xb0 (176) bytes
; before the loop. s_prefetch_data is issued at that advanced address with
; offset 0x0, while the flat load reads at offset:-176, so the prefetch stays
; 176 bytes (11 elements) ahead of the load.
define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s, i32 %n) { | ||
; GCN-LABEL: copy_flat: | ||
; GCN: ; %bb.0: ; %entry | ||
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_cmp_eq_u32 s4, 0 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB0_3 | ||
; GCN-NEXT: ; %bb.1: ; %for.body.preheader | ||
; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 | ||
; GCN-NEXT: .p2align 6 | ||
; GCN-NEXT: .LBB0_2: ; %for.body | ||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | ||
; GCN-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 | ||
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 | ||
; GCN-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 | ||
; GCN-NEXT: s_add_co_i32 s4, s4, -1 | ||
; GCN-NEXT: flat_load_b128 v[0:3], v[0:1] offset:-176 | ||
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 | ||
; GCN-NEXT: s_cmp_lg_u32 s4, 0 | ||
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 | ||
; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) | ||
; GCN-NEXT: flat_store_b128 v[4:5], v[0:3] | ||
; GCN-NEXT: s_cbranch_scc1 .LBB0_2 | ||
; GCN-NEXT: .LBB0_3: ; %for.end | ||
; GCN-NEXT: s_endpgm | ||
entry: | ||
; Bypass the loop entirely when %n is zero.
%cmp6.not = icmp eq i32 %n, 0 | ||
br i1 %cmp6.not, label %for.end, label %for.body | ||
|
||
for.body: ; preds = %entry, %for.body | ||
; %i.07 is the element index, starting at 0; it is zero-extended to i64 and
; used to address both the source and the destination.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] | ||
%idxprom = zext i32 %i.07 to i64 | ||
%arrayidx = getelementptr inbounds <4 x i32>, ptr %s, i64 %idxprom | ||
%ld = load <4 x i32>, ptr %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr %d, i64 %idxprom | ||
store <4 x i32> %ld, ptr %arrayidx2, align 4 | ||
; Leave the loop once the incremented index reaches %n.
%inc = add nuw i32 %i.07, 1 | ||
%exitcond.not = icmp eq i32 %inc, %n | ||
br i1 %exitcond.not, label %for.end, label %for.body | ||
|
||
for.end: ; preds = %for.body, %entry | ||
ret void | ||
} | ||
|
||
; copy_global: same copy loop as a kernel over addrspace(1) pointers; it
; copies %n <4 x i32> elements (16 bytes each) from %s to %d.
; In the expected output the source pointer is advanced by 0xb0 (176) bytes
; before the loop. The global load reads at offset:-176 and s_prefetch_data
; is issued at the advanced address with offset 0x0, so the prefetch stays
; 176 bytes (11 elements) ahead of the load.
define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrspace(1) nocapture readonly %s, i32 %n) { | ||
; GCN-LABEL: copy_global: | ||
; GCN: ; %bb.0: ; %entry | ||
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_cmp_eq_u32 s4, 0 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB1_3 | ||
; GCN-NEXT: ; %bb.1: ; %for.body.preheader | ||
; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; GCN-NEXT: v_mov_b32_e32 v0, 0 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 0xb0 | ||
; GCN-NEXT: .LBB1_2: ; %for.body | ||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; GCN-NEXT: global_load_b128 v[1:4], v0, s[2:3] offset:-176 | ||
; GCN-NEXT: s_prefetch_data s[2:3], 0x0, null, 0 | ||
; GCN-NEXT: s_add_co_i32 s4, s4, -1 | ||
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 | ||
; GCN-NEXT: s_cmp_lg_u32 s4, 0 | ||
; GCN-NEXT: s_waitcnt vmcnt(0) | ||
; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] | ||
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB1_2 | ||
; GCN-NEXT: .LBB1_3: ; %for.end | ||
; GCN-NEXT: s_nop 0 | ||
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GCN-NEXT: s_endpgm | ||
entry: | ||
; Bypass the loop entirely when %n is zero.
%cmp6.not = icmp eq i32 %n, 0 | ||
br i1 %cmp6.not, label %for.end, label %for.body | ||
|
||
for.body: ; preds = %entry, %for.body | ||
; %i.07 is the element index, starting at 0; it is zero-extended to i64 and
; used to address both the source and the destination.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] | ||
%idxprom = zext i32 %i.07 to i64 | ||
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(1) %s, i64 %idxprom | ||
%ld = load <4 x i32>, ptr addrspace(1) %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom | ||
store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 | ||
; Leave the loop once the incremented index reaches %n.
%inc = add nuw i32 %i.07, 1 | ||
%exitcond.not = icmp eq i32 %inc, %n | ||
br i1 %exitcond.not, label %for.end, label %for.body | ||
|
||
for.end: ; preds = %for.body, %entry | ||
ret void | ||
} | ||
|
||
; copy_constant: kernel that copies %n <4 x i32> elements (16 bytes each)
; from an addrspace(4) source %s to an addrspace(1) destination %d.
; Unlike the flat and global variants, the expected output does not advance
; the source pointer before the loop. The scalar load uses offset 0x0 and
; s_prefetch_data carries the 0xb0 (176-byte, 11-element) look-ahead as its
; own immediate offset.
define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addrspace(4) nocapture readonly %s, i32 %n) { | ||
; GCN-LABEL: copy_constant: | ||
; GCN: ; %bb.0: ; %entry | ||
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x34 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_cmp_eq_u32 s4, 0 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB2_3 | ||
; GCN-NEXT: ; %bb.1: ; %for.body.preheader | ||
; GCN-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 | ||
; GCN-NEXT: v_mov_b32_e32 v0, 0 | ||
; GCN-NEXT: .LBB2_2: ; %for.body | ||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_load_b128 s[8:11], s[2:3], 0x0 | ||
; GCN-NEXT: s_prefetch_data s[2:3], 0xb0, null, 0 | ||
; GCN-NEXT: s_add_co_i32 s4, s4, -1 | ||
; GCN-NEXT: s_add_nc_u64 s[2:3], s[2:3], 16 | ||
; GCN-NEXT: s_cmp_lg_u32 s4, 0 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: v_dual_mov_b32 v1, s8 :: v_dual_mov_b32 v2, s9 | ||
; GCN-NEXT: v_dual_mov_b32 v3, s10 :: v_dual_mov_b32 v4, s11 | ||
; GCN-NEXT: global_store_b128 v0, v[1:4], s[0:1] | ||
; GCN-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB2_2 | ||
; GCN-NEXT: .LBB2_3: ; %for.end | ||
; GCN-NEXT: s_nop 0 | ||
; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) | ||
; GCN-NEXT: s_endpgm | ||
entry: | ||
; Bypass the loop entirely when %n is zero.
%cmp6.not = icmp eq i32 %n, 0 | ||
br i1 %cmp6.not, label %for.end, label %for.body | ||
|
||
for.body: ; preds = %entry, %for.body | ||
; %i.07 is the element index, starting at 0; it is zero-extended to i64 and
; used to address both the source and the destination.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] | ||
%idxprom = zext i32 %i.07 to i64 | ||
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(4) %s, i64 %idxprom | ||
%ld = load <4 x i32>, ptr addrspace(4) %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(1) %d, i64 %idxprom | ||
store <4 x i32> %ld, ptr addrspace(1) %arrayidx2, align 4 | ||
; Leave the loop once the incremented index reaches %n.
%inc = add nuw i32 %i.07, 1 | ||
%exitcond.not = icmp eq i32 %inc, %n | ||
br i1 %exitcond.not, label %for.end, label %for.body | ||
|
||
for.end: ; preds = %for.body, %entry | ||
ret void | ||
} | ||
|
||
; copy_local: kernel that copies %n <4 x i32> elements (16 bytes each) from
; %s to %d, both addrspace(3) pointers.
; The expected output contains no s_prefetch_data: the loop body is only
; ds_load_2addr_b32 / ds_store_2addr_b32 pairs plus pointer and counter
; updates.
; NOTE(review): presumably prefetching is deliberately not applied to this
; address space -- confirm against the pass implementation.
define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspace(3) nocapture readonly %s, i32 %n) { | ||
; GCN-LABEL: copy_local: | ||
; GCN: ; %bb.0: ; %entry | ||
; GCN-NEXT: s_load_b96 s[0:2], s[0:1], 0x24 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(0) | ||
; GCN-NEXT: s_cmp_eq_u32 s2, 0 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB3_2 | ||
; GCN-NEXT: .LBB3_1: ; %for.body | ||
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 | ||
; GCN-NEXT: v_mov_b32_e32 v2, s1 | ||
; GCN-NEXT: v_mov_b32_e32 v4, s0 | ||
; GCN-NEXT: s_add_co_i32 s2, s2, -1 | ||
; GCN-NEXT: s_add_co_i32 s0, s0, 16 | ||
; GCN-NEXT: s_add_co_i32 s1, s1, 16 | ||
; GCN-NEXT: ds_load_2addr_b32 v[0:1], v2 offset0:2 offset1:3 | ||
; GCN-NEXT: ds_load_2addr_b32 v[2:3], v2 offset1:1 | ||
; GCN-NEXT: s_cmp_lg_u32 s2, 0 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(1) | ||
; GCN-NEXT: ds_store_2addr_b32 v4, v0, v1 offset0:2 offset1:3 | ||
; GCN-NEXT: s_waitcnt lgkmcnt(1) | ||
; GCN-NEXT: ds_store_2addr_b32 v4, v2, v3 offset1:1 | ||
; GCN-NEXT: s_cbranch_scc1 .LBB3_1 | ||
; GCN-NEXT: .LBB3_2: ; %for.end | ||
; GCN-NEXT: s_endpgm | ||
entry: | ||
; Bypass the loop entirely when %n is zero.
%cmp6.not = icmp eq i32 %n, 0 | ||
br i1 %cmp6.not, label %for.end, label %for.body | ||
|
||
for.body: ; preds = %entry, %for.body | ||
; %i.07 is the element index, starting at 0; it is zero-extended to i64 and
; used to address both the source and the destination.
%i.07 = phi i32 [ %inc, %for.body ], [ 0, %entry ] | ||
%idxprom = zext i32 %i.07 to i64 | ||
%arrayidx = getelementptr inbounds <4 x i32>, ptr addrspace(3) %s, i64 %idxprom | ||
%ld = load <4 x i32>, ptr addrspace(3) %arrayidx, align 4 | ||
%arrayidx2 = getelementptr inbounds <4 x i32>, ptr addrspace(3) %d, i64 %idxprom | ||
store <4 x i32> %ld, ptr addrspace(3) %arrayidx2, align 4 | ||
; Leave the loop once the incremented index reaches %n.
%inc = add nuw i32 %i.07, 1 | ||
%exitcond.not = icmp eq i32 %inc, %n | ||
br i1 %exitcond.not, label %for.end, label %for.body | ||
|
||
for.end: ; preds = %for.body, %entry | ||
ret void | ||
} |