-
Notifications
You must be signed in to change notification settings - Fork 12.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Use LSH for lowering ctlz_zero_undef.i8/i16 #88512
Conversation
✅ With the latest revision this PR passed the C/C++ code formatter. |
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Author: Leon Clark (PeddleSpam) ChangesUse LSH to lower ctlz_zero_undef instead of subtracting leading zeros for i8 and i16. Related to 77615. Patch is 21.00 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/88512.diff 5 Files Affected:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index db69d50799e70b..015474b58c0932 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -3081,20 +3081,30 @@ static bool isCttzOpc(unsigned Opc) {
SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op,
SelectionDAG &DAG) const {
auto SL = SDLoc(Op);
+ auto Opc = Op.getOpcode();
auto Arg = Op.getOperand(0u);
auto ResultVT = Op.getValueType();
if (ResultVT != MVT::i8 && ResultVT != MVT::i16)
return {};
- assert(isCtlzOpc(Op.getOpcode()));
+ assert(isCtlzOpc(Opc));
assert(ResultVT == Arg.getValueType());
- auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits();
- auto SubVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32);
- auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
- NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp);
- NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, SubVal);
+ auto const NumBits = ResultVT.getFixedSizeInBits();
+ auto NumExtBits = DAG.getConstant(32u - NumBits, SL, MVT::i32);
+ auto NewOp = SDValue();
+
+ if (Opc == ISD::CTLZ_ZERO_UNDEF) {
+ NewOp = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Arg);
+ NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, NumExtBits);
+ NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+ } else {
+ NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg);
+ NewOp = DAG.getNode(Opc, SL, MVT::i32, NewOp);
+ NewOp = DAG.getNode(ISD::SUB, SL, MVT::i32, NewOp, NumExtBits);
+ }
+
return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 780dfaae11ef3e..5eb474a7a045de 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1270,13 +1270,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.custom();
// The 64-bit versions produce 32-bit results, but only on the SALU.
- getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
- .legalFor({{S32, S32}, {S32, S64}})
- .clampScalar(0, S32, S32)
- .clampScalar(1, S32, S64)
- .scalarize(0)
- .widenScalarToNextPow2(0, 32)
- .widenScalarToNextPow2(1, 32);
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .legalFor({{S32, S32}, {S32, S64}})
+ .customFor({{S32, S8}, {S32, S16}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
+
+ getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
+ .legalFor({{S32, S32}, {S32, S64}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
// S64 is only legal on SALU, and needs to be broken into 32-bit elements in
// RegBankSelect.
@@ -2128,6 +2137,8 @@ bool AMDGPULegalizerInfo::legalizeCustom(
case TargetOpcode::G_CTLZ:
case TargetOpcode::G_CTTZ:
return legalizeCTLZ_CTTZ(MI, MRI, B);
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ return legalizeCTLZ_ZERO_UNDEF(MI, MRI, B);
case TargetOpcode::G_INTRINSIC_FPTRUNC_ROUND:
return legalizeFPTruncRound(MI, B);
case TargetOpcode::G_STACKSAVE:
@@ -4145,6 +4156,25 @@ bool AMDGPULegalizerInfo::legalizeCTLZ_CTTZ(MachineInstr &MI,
return true;
}
+bool AMDGPULegalizerInfo::legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ TypeSize NumBits = SrcTy.getSizeInBits();
+
+ assert(NumBits < 32u);
+
+ auto ShiftAmt = B.buildConstant(S32, 32u - NumBits);
+ auto Tmp = B.buildAnyExt(S32, {Src}).getReg(0u);
+ Tmp = B.buildLShr(S32, {Tmp}, ShiftAmt).getReg(0u);
+ B.buildInstr(AMDGPU::G_AMDGPU_FFBH_U32, {Dst}, {Tmp});
+ MI.eraseFromParent();
+ return true;
+}
+
// Check that this is a G_XOR x, -1
static bool isNot(const MachineRegisterInfo &MRI, const MachineInstr &MI) {
if (MI.getOpcode() != TargetOpcode::G_XOR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index e5ba84a74a0f8a..4b1d821dadc215 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -108,6 +108,8 @@ class AMDGPULegalizerInfo final : public LegalizerInfo {
bool legalizeMul(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeCTLZ_CTTZ(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeCTLZ_ZERO_UNDEF(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index fed277d7d10d08..32af5e1975a6c3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -81,14 +81,12 @@ body: |
; CHECK: liveins: $vgpr0
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C]]
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
- ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C1]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AND1]](s32)
+ ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+ ; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[COPY]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR]](s32)
+ ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+ ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s16) = G_TRUNC %0
%2:_(s16) = G_CTLZ_ZERO_UNDEF %1
@@ -149,18 +147,15 @@ body: |
; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(s32) = G_BITCAST [[COPY]](<2 x s16>)
; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
; CHECK-NEXT: [[LSHR:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK-NEXT: [[LSHR1:%[0-9]+]]:_(s32) = G_LSHR [[BITCAST]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U32:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR1]](s32)
+ ; CHECK-NEXT: [[LSHR2:%[0-9]+]]:_(s32) = G_LSHR [[LSHR]], [[C]](s32)
+ ; CHECK-NEXT: [[AMDGPU_FFBH_U321:%[0-9]+]]:_(s32) = G_AMDGPU_FFBH_U32 [[LSHR2]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[BITCAST]], [[C1]]
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[AND]](s32)
- ; CHECK-NEXT: [[SUB:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF]], [[C]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
- ; CHECK-NEXT: [[CTLZ_ZERO_UNDEF1:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[LSHR]](s32)
- ; CHECK-NEXT: [[SUB1:%[0-9]+]]:_(s32) = G_SUB [[CTLZ_ZERO_UNDEF1]], [[C]]
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB1]](s32)
- ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[COPY1]], [[C1]]
- ; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[COPY2]], [[C1]]
- ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND2]], [[C]](s32)
- ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND1]], [[SHL]]
+ ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U32]], [[C1]]
+ ; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[AMDGPU_FFBH_U321]], [[C1]]
+ ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND1]], [[C]](s32)
+ ; CHECK-NEXT: [[OR:%[0-9]+]]:_(s32) = G_OR [[AND]], [[SHL]]
; CHECK-NEXT: [[BITCAST1:%[0-9]+]]:_(<2 x s16>) = G_BITCAST [[OR]](s32)
; CHECK-NEXT: $vgpr0 = COPY [[BITCAST1]](<2 x s16>)
%0:_(<2 x s16>) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index 21aff62b9226d0..617a60a47b1777 100644
--- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -314,9 +314,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 0xff
-; SI-NEXT: s_flbit_i32_b32 s2, s2
-; SI-NEXT: s_sub_i32 s4, s2, 24
+; SI-NEXT: s_lshl_b32 s2, s2, 24
+; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0
@@ -327,9 +326,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: s_load_dword s2, s[0:1], 0x2c
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: s_and_b32 s2, s2, 0xff
+; VI-NEXT: s_lshl_b32 s2, s2, 24
; VI-NEXT: s_flbit_i32_b32 s2, s2
-; VI-NEXT: s_sub_i32 s2, s2, 24
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
@@ -349,13 +347,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -371,9 +369,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xff
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 24
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_byte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -391,9 +388,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: s_and_b32 s2, s2, 0xffff
-; SI-NEXT: s_flbit_i32_b32 s2, s2
-; SI-NEXT: s_add_i32 s4, s2, -16
+; SI-NEXT: s_lshl_b32 s2, s2, 16
+; SI-NEXT: s_flbit_i32_b32 s4, s2
; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -426,13 +422,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, 0.0,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -16(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -448,9 +444,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT: s_and_b32 s0, s4, 0xffff
+; GFX9-GISEL-NEXT: s_lshr_b32 s0, s4, 16
; GFX9-GISEL-NEXT: s_flbit_i32_b32 s0, s0
-; GFX9-GISEL-NEXT: s_sub_i32 s0, s0, 16
; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX9-GISEL-NEXT: global_store_short v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_endpgm
@@ -590,8 +585,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
@@ -605,8 +600,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-; VI-NEXT: v_subrev_u32_e32 v1, vcc, 24, v1
+; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0
+; VI-NEXT: v_ffbh_u32_e32 v1, v1
; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0
; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -618,7 +613,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -627,10 +622,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT * T0.W, T0.X,
-; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: -24(nan), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -651,8 +647,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa
; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-GISEL-NEXT: global_load_ubyte v1, v0, s[2:3]
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 24, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -685,8 +680,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_ffbh_u32_e32 v1, v0
-; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_ffbh_u32_e32 v1, v1
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -721,7 +716,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG: ; %bb.0:
; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[]
; EG-NEXT: TEX 0 @6
-; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[]
+; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[]
; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X
; EG-NEXT: CF_END
; EG-NEXT: PAD
@@ -730,10 +725,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: MOV * T0.X, KC0[2].Z,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT * T0.W, T0.X,
-; EG-NEXT: ADD_INT T0.W, PV.W, literal.x,
-; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y,
-; EG-NEXT: -16(nan), 3(4.203895e-45)
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
+; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
+; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W,
; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
@@ -756,8 +752,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no
; GFX9-GISEL-NEXT: global_load_ubyte v2, v0, s[2:3] offset:1
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
; GFX9-GISEL-NEXT: v_lshl_or_b32 v1, v2, 8, v1
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v2, v1
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v2, 16, v2
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v2, v1
; GFX9-GISEL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX9-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; GFX9-GISEL-NEXT: v_cndmask_b32_e32 v1, 32, v2, vcc
@@ -1102,8 +1097,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
; SI-NEXT: v_ffbh_u32_e32 v0, v0
-; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0
; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -1116,8 +1111,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_ubyte v0, v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_ffbh_u32_e32 v0, v0
-; VI-NEXT: v_subrev_u32_e32 v2, vcc, 24, v0
+; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0
+; VI-NEXT: v_ffbh_u32_e32 v2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -1136,13 +1131,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; EG-NEXT: ALU clause starting at 8:
; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X,
; EG-NEXT: ALU clause starting at 9:
-; EG-NEXT: FFBH_UINT T0.W, T0.X,
+; EG-NEXT: LSHL * T0.W, T0.X, literal.x,
+; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00)
+; EG-NEXT: FFBH_UINT T0.W, PV.W,
; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x,
; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00)
-; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x,
-; EG-NEXT: -24(nan), 0(0.000000e+00)
; EG-NEXT: AND_INT T0.W, PV.W, literal.x,
-; EG-NEXT: LSHL * T1.W, T1.W, literal.y,
+; EG-NEXT: LSHL * T1.W, PS, literal.y,
; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45)
; EG-NEXT: LSHL T0.X, PV.W, PS,
; EG-NEXT: LSHL * T0.W, literal.x, PS,
@@ -1164,8 +1159,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p
; GFX9-GISEL-NEXT: global_load_ubyte v0, v[0:1], off
; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT: v_ffbh_u32_e32 v0, v0
-; GFX9-GISEL-NEXT: v_subrev_u32_e32 v0, 24, v0
+; GFX9-GISEL-NEXT: v_ffbh_u32_sdwa v0, v0
; GFX9-GISEL-NEXT: glo...
[truncated]
|
Does this need an update after ac09292? |
Updated and ran tests again. |
This reverts commit fb2c659.
Use LSH to lower ctlz_zero_undef instead of subtracting leading zeros for i8 and i16.
Related to 77615.