From 7fad04e94b7b594389111ae7eca0883ef18dc90b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell <benjamin.maxwell@arm.com>
Date: Wed, 24 Jul 2024 10:06:34 +0100
Subject: [PATCH] [LSR] Fix matching vscale immediates (#100080)

Somewhat confusingly, a `SCEVMulExpr` is a `SCEVNAryExpr`, so it can
have more than two operands. Previously, the vscale immediate matching
did not check the number of operands of the `SCEVMulExpr`, so it would
ignore any operands after the first two.

This led to incorrect codegen (and results) for ArmSME in IREE
(https://github.com/iree-org/iree), which sometimes addresses things
that are a `vscale * vscale` multiple away.

The test added with this change shows an example reduced from IREE.
The second write should be offset from the first by
`16 * vscale * vscale` (* 4 bytes); however, LSR previously dropped the
second vscale and instead offset the write by `#4, mul vl`, which is an
offset of only `16 * vscale` (* 4 bytes).
---
 .../Transforms/Scalar/LoopStrengthReduce.cpp |  6 ++++--
 .../AArch64/vscale-fixups.ll                 | 20 +++++++++++--------
 2 files changed, 16 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 11f9f7822a15c8..91461d1ed27592 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -946,13 +946,15 @@ static Immediate ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) {
                          // FIXME: AR->getNoWrapFlags(SCEV::FlagNW)
                          SCEV::FlagAnyWrap);
     return Result;
-  } else if (EnableVScaleImmediates)
-    if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S))
+  } else if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
+    if (EnableVScaleImmediates && M->getNumOperands() == 2) {
       if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
         if (isa<SCEVVScale>(M->getOperand(1))) {
           S = SE.getConstant(M->getType(), 0);
           return Immediate::getScalable(C->getValue()->getSExtValue());
         }
+    }
+  }
   return Immediate::getZero();
 }
 
diff --git a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
index 56b59012eef40c..588696d20227fd 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/AArch64/vscale-fixups.ll
@@ -384,27 +384,31 @@ for.exit:
   ret void
 }
 
-;; This test demonstrates an incorrect MUL VL address calculation. Here there
-;; are two writes that should be `16 * vscale * vscale` apart, however,
-;; loop-strength-reduce has ignored the second `vscale` and offset the second
-;; write by `#4, mul vl` which is an offset of `16 * vscale` dropping a vscale.
+;; Here are two writes that should be `16 * vscale * vscale` apart, so MUL VL
+;; addressing cannot be used to offset the second write, as for example,
+;; `#4, mul vl` would only be an offset of `16 * vscale` (dropping a vscale).
 define void @vscale_squared_offset(ptr %alloc) #0 {
 ; COMMON-LABEL: vscale_squared_offset:
 ; COMMON:       // %bb.0: // %entry
+; COMMON-NEXT:    rdvl x9, #1
 ; COMMON-NEXT:    fmov z0.s, #4.00000000
 ; COMMON-NEXT:    mov x8, xzr
-; COMMON-NEXT:    cntw x9
+; COMMON-NEXT:    lsr x9, x9, #4
 ; COMMON-NEXT:    fmov z1.s, #8.00000000
+; COMMON-NEXT:    cntw x10
 ; COMMON-NEXT:    ptrue p0.s, vl1
-; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    umull x9, w9, w9
+; COMMON-NEXT:    lsl x9, x9, #6
+; COMMON-NEXT:    cmp x8, x10
 ; COMMON-NEXT:    b.ge .LBB6_2
 ; COMMON-NEXT:  .LBB6_1: // %for.body
 ; COMMON-NEXT:    // =>This Inner Loop Header: Depth=1
+; COMMON-NEXT:    add x11, x0, x9
 ; COMMON-NEXT:    st1w { z0.s }, p0, [x0]
 ; COMMON-NEXT:    add x8, x8, #1
-; COMMON-NEXT:    st1w { z1.s }, p0, [x0, #4, mul vl]
+; COMMON-NEXT:    st1w { z1.s }, p0, [x11]
 ; COMMON-NEXT:    addvl x0, x0, #1
-; COMMON-NEXT:    cmp x8, x9
+; COMMON-NEXT:    cmp x8, x10
 ; COMMON-NEXT:    b.lt .LBB6_1
 ; COMMON-NEXT:  .LBB6_2: // %for.exit
 ; COMMON-NEXT:    ret
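
As an illustration of the bug that the `M->getNumOperands() == 2` guard
fixes, here is a minimal standalone sketch. The `Node` struct and the
`matchScalableImm*` helpers below are hypothetical stand-ins for LLVM's
`SCEVMulExpr`/`SCEVConstant`/`SCEVVScale`, not the real API: an n-ary
product such as `16 * vscale * vscale` still begins with a constant
followed by a vscale, so a match that inspects only the first two
operands misreads it as the scalable immediate `16 * vscale`.

  // Toy model (hypothetical, for illustration only): an n-ary multiply
  // node with a kind tag and an operand list.
  #include <cstdint>
  #include <iostream>
  #include <vector>

  struct Node {
    enum Kind { Constant, VScale, Mul } kind;
    int64_t value = 0;     // Meaningful only for Constant.
    std::vector<Node> ops; // Meaningful only for Mul.
  };

  // Pre-fix behaviour: only the first two operands are checked, so the
  // three-operand product (16 * vscale * vscale) is misread as the
  // scalable immediate 16 * vscale.
  bool matchScalableImmBuggy(const Node &N, int64_t &Imm) {
    if (N.kind != Node::Mul || N.ops.size() < 2)
      return false;
    if (N.ops[0].kind != Node::Constant || N.ops[1].kind != Node::VScale)
      return false;
    Imm = N.ops[0].value;
    return true;
  }

  // Post-fix behaviour: additionally require exactly two operands,
  // mirroring the M->getNumOperands() == 2 guard added by this patch.
  bool matchScalableImmFixed(const Node &N, int64_t &Imm) {
    if (N.kind != Node::Mul || N.ops.size() != 2)
      return false;
    if (N.ops[0].kind != Node::Constant || N.ops[1].kind != Node::VScale)
      return false;
    Imm = N.ops[0].value;
    return true;
  }

  int main() {
    // (16 * vscale * vscale): three operands, so it is NOT a plain
    // scalable immediate.
    Node M{Node::Mul, 0,
           {{Node::Constant, 16}, {Node::VScale}, {Node::VScale}}};
    int64_t Imm = 0;
    std::cout << "buggy: " << matchScalableImmBuggy(M, Imm)   // 1 (wrong)
              << " fixed: " << matchScalableImmFixed(M, Imm)  // 0 (correct)
              << "\n";
  }

With the guard in place, LSR no longer folds such products into a
scalable immediate and instead materializes the full offset, which is
what the updated CHECK lines above verify: the `rdvl`/`lsr`/`umull`/`lsl`
sequence computes `64 * vscale * vscale` bytes (i.e. `16 * vscale *
vscale` 4-byte elements) into `x9` for the second store.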