From 5808c4dfdd73b1d22fb2898131a0898e5ce7f127 Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 21 Aug 2023 13:18:51 -0700 Subject: [PATCH 1/8] [ARM] distribute shifts as muls --- Makefile | 2 + src/CMakeLists.txt | 2 + src/CodeGen_ARM.cpp | 4 + src/DistributeShifts.cpp | 197 +++++++++++++++++++++++++ src/DistributeShifts.h | 21 +++ src/HexagonOptimize.cpp | 132 +---------------- test/correctness/simd_op_check_arm.cpp | 12 ++ 7 files changed, 240 insertions(+), 130 deletions(-) create mode 100644 src/DistributeShifts.cpp create mode 100644 src/DistributeShifts.h diff --git a/Makefile b/Makefile index 0679590bd8a5..f54fbd24cd94 100644 --- a/Makefile +++ b/Makefile @@ -502,6 +502,7 @@ SOURCE_FILES = \ DeviceArgument.cpp \ DeviceInterface.cpp \ Dimension.cpp \ + DistributeShifts.cpp \ EarlyFree.cpp \ Elf.cpp \ EliminateBoolVectors.cpp \ @@ -691,6 +692,7 @@ HEADER_FILES = \ DeviceArgument.h \ DeviceInterface.h \ Dimension.h \ + DistributeShifts.h \ EarlyFree.h \ Elf.h \ EliminateBoolVectors.h \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 47e90864de40..1edc296aa775 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -57,6 +57,7 @@ set(HEADER_FILES DeviceArgument.h DeviceInterface.h Dimension.h + DistributeShifts.h EarlyFree.h Elf.h EliminateBoolVectors.h @@ -227,6 +228,7 @@ set(SOURCE_FILES DeviceArgument.cpp DeviceInterface.cpp Dimension.cpp + DistributeShifts.cpp EarlyFree.cpp Elf.cpp EliminateBoolVectors.cpp diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index 782a49f910fd..ed443b53a631 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -6,6 +6,7 @@ #include "CodeGen_Posix.h" #include "ConciseCasts.h" #include "Debug.h" +#include "DistributeShifts.h" #include "IREquality.h" #include "IRMatch.h" #include "IRMutator.h" @@ -852,6 +853,9 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, // actually faster. func.body = SubstituteInStridedLoads().mutate(func.body); } + // Look for opportunities to turn a + (b << c) into umlal/smlal + // and a - (b << c) into umlsl/smlsl. + func.body = distribute_shifts(func.body, /* polynomials_only */ true); CodeGen_Posix::compile_func(func, simple_name, extern_name); } diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp new file mode 100644 index 000000000000..2fa9ed9ba522 --- /dev/null +++ b/src/DistributeShifts.cpp @@ -0,0 +1,197 @@ +#include "FindIntrinsics.h" +#include "CSE.h" +#include "CodeGen_Internal.h" +#include "ConciseCasts.h" +#include "IRMatch.h" +#include "IRMutator.h" +#include "Simplify.h" + +namespace Halide { +namespace Internal { + +// Distribute constant RHS widening shift lefts as multiplies. +// TODO: This is an extremely unfortunate mess. I think the better +// solution is for the simplifier to distribute constant multiplications +// instead of factoring them, and then this logic is unnecessary (find_mpy_ops +// would need to handle shifts, but that's easy). +// Another possibility would be adding a widening_mul_add intrinsic that takes +// a list of pairs of operands, and computes a widening sum of widening multiplies +// of these pairs. FindIntrinsics could aggressively rewrite shifts as +// widening_mul_add operands. +class DistributeShiftsAsMuls : public IRMutator { +public: + DistributeShiftsAsMuls(const bool polys_only) : polynomials_only(polys_only) {} +private: + const bool polynomials_only; + + static bool is_cast(const Expr &e, Type value_t) { + if (const Cast *cast = e.as()) { + return cast->value.type() == value_t; + } + return false; + } + + static Expr distribute(const Expr &a, const Expr &b) { + if (const Add *add = a.as()) { + return Add::make(distribute(add->a, b), distribute(add->b, b)); + } else if (const Sub *sub = a.as()) { + Expr sub_a = distribute(sub->a, b); + Expr sub_b = distribute(sub->b, b); + Expr negative_sub_b = lossless_negate(sub_b); + if (negative_sub_b.defined()) { + return Add::make(sub_a, negative_sub_b); + } else { + return Sub::make(sub_a, sub_b); + } + } else if (const Cast *cast = a.as()) { + Expr cast_b = lossless_cast(b.type().with_bits(cast->value.type().bits()), b); + if (cast_b.defined()) { + Expr mul = widening_mul(cast->value, cast_b); + if (mul.type().bits() <= cast->type.bits()) { + if (mul.type() != cast->type) { + mul = Cast::make(cast->type, mul); + } + return mul; + } + } + } else if (const Call *add = Call::as_intrinsic(a, {Call::widening_add})) { + Expr add_a = Cast::make(add->type, add->args[0]); + Expr add_b = Cast::make(add->type, add->args[1]); + add_a = distribute(add_a, b); + add_b = distribute(add_b, b); + // If add_a and add_b are the same kind of cast, we should remake a widening add. + const Cast *add_a_cast = add_a.as(); + const Cast *add_b_cast = add_b.as(); + if (add_a_cast && add_b_cast && + add_a_cast->value.type() == add->args[0].type() && + add_b_cast->value.type() == add->args[1].type()) { + return widening_add(add_a_cast->value, add_b_cast->value); + } else { + return Add::make(add_a, add_b); + } + } else if (const Call *sub = Call::as_intrinsic(a, {Call::widening_sub})) { + Expr sub_a = Cast::make(sub->type, sub->args[0]); + Expr sub_b = Cast::make(sub->type, sub->args[1]); + sub_a = distribute(sub_a, b); + sub_b = distribute(sub_b, b); + Expr negative_sub_b = lossless_negate(sub_b); + if (negative_sub_b.defined()) { + sub_b = negative_sub_b; + } + // If sub_a and sub_b are the same kind of cast, we should remake a widening sub. + const Cast *sub_a_cast = sub_a.as(); + const Cast *sub_b_cast = sub_b.as(); + if (sub_a_cast && sub_b_cast && + sub_a_cast->value.type() == sub->args[0].type() && + sub_b_cast->value.type() == sub->args[1].type()) { + if (negative_sub_b.defined()) { + return widening_add(sub_a_cast->value, sub_b_cast->value); + } else { + return widening_sub(sub_a_cast->value, sub_b_cast->value); + } + } else { + if (negative_sub_b.defined()) { + return Add::make(sub_a, sub_b); + } else { + return Sub::make(sub_a, sub_b); + } + } + } else if (const Call *mul = Call::as_intrinsic(a, {Call::widening_mul})) { + Expr mul_a = Cast::make(mul->type, mul->args[0]); + Expr mul_b = Cast::make(mul->type, mul->args[1]); + mul_a = distribute(mul_a, b); + if (const Cast *mul_a_cast = mul_a.as()) { + if (mul_a_cast->value.type() == mul->args[0].type()) { + return widening_mul(mul_a_cast->value, mul->args[1]); + } + } + mul_b = distribute(mul_b, b); + if (const Cast *mul_b_cast = mul_b.as()) { + if (mul_b_cast->value.type() == mul->args[1].type()) { + return widening_mul(mul->args[0], mul_b_cast->value); + } + } + } + return simplify(Mul::make(a, b)); + } + + Expr distribute_shift(const Call *op) { + if (op->is_intrinsic(Call::shift_left)) { + if (const uint64_t *const_b = as_const_uint(op->args[1])) { + Expr a = op->args[0]; + // Only rewrite widening shifts. + const Cast *cast_a = a.as(); + bool is_widening_cast = cast_a && cast_a->type.bits() >= cast_a->value.type().bits() * 2; + if (is_widening_cast || Call::as_intrinsic(a, {Call::widening_add, Call::widening_mul, Call::widening_sub})) { + const uint64_t const_m = 1ull << *const_b; + Expr b = make_const(a.type(), const_m); + return mutate(distribute(a, b)); + } + } + } else if (op->is_intrinsic(Call::widening_shift_left)) { + if (const uint64_t *const_b = as_const_uint(op->args[1])) { + const uint64_t const_m = 1ull << *const_b; + Expr b = make_const(op->type, const_m); + Expr a = Cast::make(op->type, op->args[0]); + return mutate(distribute(a, b)); + } + } + return IRMutator::visit(op); + } + + template + Expr visit_add_sub(const T *op) { + if (polynomials_only) { + Expr a, b; + if (const Call *a_call = op->a.template as()) { + if (a_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { + a = distribute_shift(a_call); + } + } + if (const Call *b_call = op->b.template as()) { + if (b_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { + b = distribute_shift(b_call); + } + } + + if (a.defined() && b.defined()) { + return T::make(a, b); + } else if (a.defined()) { + b = mutate(op->b); + return T::make(a, b); + } else if (b.defined()) { + a = mutate(op->a); + return T::make(a, b); + } else { + return IRMutator::visit(op); + } + } else { + return IRMutator::visit(op); + } + } + + using IRMutator::visit; + + Expr visit(const Call *op) override { + if (polynomials_only) { + return IRMutator::visit(op); + } else { + return distribute_shift(op); + } + } + + Expr visit(const Add *op) override { + return visit_add_sub(op); + } + + Expr visit(const Sub *op) override { + return visit_add_sub(op); + } +}; + +Stmt distribute_shifts(const Stmt &s, const bool polynomials_only) { + return DistributeShiftsAsMuls(polynomials_only).mutate(s); +} + +} // namespace Internal +} // namespace Halide diff --git a/src/DistributeShifts.h b/src/DistributeShifts.h new file mode 100644 index 000000000000..69b9d77b94dc --- /dev/null +++ b/src/DistributeShifts.h @@ -0,0 +1,21 @@ +#ifndef HALIDE_DISTRIBUTE_SHIFTS_H +#define HALIDE_DISTRIBUTE_SHIFTS_H + +/** \file + * A tool to distribute shifts as multiplies, useful for some backends. (e.g. ARM, HVX). + */ + +#include "IR.h" + +namespace Halide { +namespace Internal { + +// Distributes shifts as multiplies. If `polynomials_only` is set, +// then only distributes the patterns `a + widening_shl(b, c)` / +// `a - widening_shl(b, c)` and `a + b << c` / `a - b << c`. +Stmt distribute_shifts(const Stmt &stmt, const bool polynomials_only = false); + +} // namespace Internal +} // namespace Halide + +#endif diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 5814bd7a0df4..317e5c1a1324 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -3,6 +3,7 @@ #include "CSE.h" #include "CodeGen_Internal.h" #include "ConciseCasts.h" +#include "DistributeShifts.h" #include "ExprUsesVar.h" #include "FindIntrinsics.h" #include "HexagonAlignment.h" @@ -1929,135 +1930,6 @@ class FuseInterleaves : public IRMutator { using IRMutator::visit; }; -// Distribute constant RHS widening shift lefts as multiplies. -// TODO: This is an extremely unfortunate mess. I think the better -// solution is for the simplifier to distribute constant multiplications -// instead of factoring them, and then this logic is unnecessary (find_mpy_ops -// would need to handle shifts, but that's easy). -// Another possibility would be adding a widening_mul_add intrinsic that takes -// a list of pairs of operands, and computes a widening sum of widening multiplies -// of these pairs. FindIntrinsics could aggressively rewrite shifts as -// widening_mul_add operands. -class DistributeShiftsAsMuls : public IRMutator { -private: - static bool is_cast(const Expr &e, Type value_t) { - if (const Cast *cast = e.as()) { - return cast->value.type() == value_t; - } - return false; - } - - static Expr distribute(const Expr &a, const Expr &b) { - if (const Add *add = a.as()) { - return Add::make(distribute(add->a, b), distribute(add->b, b)); - } else if (const Sub *sub = a.as()) { - Expr sub_a = distribute(sub->a, b); - Expr sub_b = distribute(sub->b, b); - Expr negative_sub_b = lossless_negate(sub_b); - if (negative_sub_b.defined()) { - return Add::make(sub_a, negative_sub_b); - } else { - return Sub::make(sub_a, sub_b); - } - } else if (const Cast *cast = a.as()) { - Expr cast_b = lossless_cast(b.type().with_bits(cast->value.type().bits()), b); - if (cast_b.defined()) { - Expr mul = widening_mul(cast->value, cast_b); - if (mul.type().bits() <= cast->type.bits()) { - if (mul.type() != cast->type) { - mul = Cast::make(cast->type, mul); - } - return mul; - } - } - } else if (const Call *add = Call::as_intrinsic(a, {Call::widening_add})) { - Expr add_a = Cast::make(add->type, add->args[0]); - Expr add_b = Cast::make(add->type, add->args[1]); - add_a = distribute(add_a, b); - add_b = distribute(add_b, b); - // If add_a and add_b are the same kind of cast, we should remake a widening add. - const Cast *add_a_cast = add_a.as(); - const Cast *add_b_cast = add_b.as(); - if (add_a_cast && add_b_cast && - add_a_cast->value.type() == add->args[0].type() && - add_b_cast->value.type() == add->args[1].type()) { - return widening_add(add_a_cast->value, add_b_cast->value); - } else { - return Add::make(add_a, add_b); - } - } else if (const Call *sub = Call::as_intrinsic(a, {Call::widening_sub})) { - Expr sub_a = Cast::make(sub->type, sub->args[0]); - Expr sub_b = Cast::make(sub->type, sub->args[1]); - sub_a = distribute(sub_a, b); - sub_b = distribute(sub_b, b); - Expr negative_sub_b = lossless_negate(sub_b); - if (negative_sub_b.defined()) { - sub_b = negative_sub_b; - } - // If sub_a and sub_b are the same kind of cast, we should remake a widening sub. - const Cast *sub_a_cast = sub_a.as(); - const Cast *sub_b_cast = sub_b.as(); - if (sub_a_cast && sub_b_cast && - sub_a_cast->value.type() == sub->args[0].type() && - sub_b_cast->value.type() == sub->args[1].type()) { - if (negative_sub_b.defined()) { - return widening_add(sub_a_cast->value, sub_b_cast->value); - } else { - return widening_sub(sub_a_cast->value, sub_b_cast->value); - } - } else { - if (negative_sub_b.defined()) { - return Add::make(sub_a, sub_b); - } else { - return Sub::make(sub_a, sub_b); - } - } - } else if (const Call *mul = Call::as_intrinsic(a, {Call::widening_mul})) { - Expr mul_a = Cast::make(mul->type, mul->args[0]); - Expr mul_b = Cast::make(mul->type, mul->args[1]); - mul_a = distribute(mul_a, b); - if (const Cast *mul_a_cast = mul_a.as()) { - if (mul_a_cast->value.type() == mul->args[0].type()) { - return widening_mul(mul_a_cast->value, mul->args[1]); - } - } - mul_b = distribute(mul_b, b); - if (const Cast *mul_b_cast = mul_b.as()) { - if (mul_b_cast->value.type() == mul->args[1].type()) { - return widening_mul(mul->args[0], mul_b_cast->value); - } - } - } - return simplify(Mul::make(a, b)); - } - - using IRMutator::visit; - - Expr visit(const Call *op) override { - if (op->is_intrinsic(Call::shift_left)) { - if (const uint64_t *const_b = as_const_uint(op->args[1])) { - Expr a = op->args[0]; - // Only rewrite widening shifts. - const Cast *cast_a = a.as(); - bool is_widening_cast = cast_a && cast_a->type.bits() >= cast_a->value.type().bits() * 2; - if (is_widening_cast || Call::as_intrinsic(a, {Call::widening_add, Call::widening_mul, Call::widening_sub})) { - const uint64_t const_m = 1ull << *const_b; - Expr b = make_const(a.type(), const_m); - return mutate(distribute(a, b)); - } - } - } else if (op->is_intrinsic(Call::widening_shift_left)) { - if (const uint64_t *const_b = as_const_uint(op->args[1])) { - const uint64_t const_m = 1ull << *const_b; - Expr b = make_const(op->type, const_m); - Expr a = Cast::make(op->type, op->args[0]); - return mutate(distribute(a, b)); - } - } - return IRMutator::visit(op); - } -}; - // Try generating vgathers instead of shuffles. // At present, we request VTCM memory with single page allocation flag for all // store_in allocations. So it's always safe to generate a vgather. @@ -2333,7 +2205,7 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { // Hexagon prefers widening shifts to be expressed as multiplies to // hopefully hit compound widening multiplies. - s = DistributeShiftsAsMuls().mutate(s); + s = distribute_shifts(s, /* polynomials_only */ false); // Pattern match VectorReduce IR node. Handle vector reduce instructions // before OptimizePatterns to prevent being mutated by patterns like diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index 75fff68206b4..ba588b090bae 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -401,19 +401,31 @@ class SimdOpCheckARM : public SimdOpCheckTest { // VMLAL I - Multiply Accumulate Long check(arm32 ? "vmlal.s8" : "smlal", 8 * w, i16_1 + i16(i8_2) * i8_3); + check(arm32 ? "vmlal.s8" : "smlal", 8 * w, i16_1 + i16(i8_2) * 2); check(arm32 ? "vmlal.u8" : "umlal", 8 * w, u16_1 + u16(u8_2) * u8_3); + check(arm32 ? "vmlal.u8" : "umlal", 8 * w, u16_1 + u16(u8_2) * 2); check(arm32 ? "vmlal.s16" : "smlal", 4 * w, i32_1 + i32(i16_2) * i16_3); + check(arm32 ? "vmlal.s16" : "smlal", 4 * w, i32_1 + i32(i16_2) * 2); check(arm32 ? "vmlal.u16" : "umlal", 4 * w, u32_1 + u32(u16_2) * u16_3); + check(arm32 ? "vmlal.u16" : "umlal", 4 * w, u32_1 + u32(u16_2) * 2); check(arm32 ? "vmlal.s32" : "smlal", 2 * w, i64_1 + i64(i32_2) * i32_3); + check(arm32 ? "vmlal.s32" : "smlal", 2 * w, i64_1 + i64(i32_2) * 2); check(arm32 ? "vmlal.u32" : "umlal", 2 * w, u64_1 + u64(u32_2) * u32_3); + check(arm32 ? "vmlal.u32" : "umlal", 2 * w, u64_1 + u64(u32_2) * 2); // VMLSL I - Multiply Subtract Long check(arm32 ? "vmlsl.s8" : "smlsl", 8 * w, i16_1 - i16(i8_2) * i8_3); + check(arm32 ? "vmlsl.s8" : "smlsl", 8 * w, i16_1 - i16(i8_2) * 2); check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, u16_1 - u16(u8_2) * u8_3); + check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, u16_1 - u16(u8_2) * 2); check(arm32 ? "vmlsl.s16" : "smlsl", 4 * w, i32_1 - i32(i16_2) * i16_3); + check(arm32 ? "vmlsl.s16" : "smlsl", 4 * w, i32_1 - i32(i16_2) * 2); check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, u32_1 - u32(u16_2) * u16_3); + check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, u32_1 - u32(u16_2) * 2); check(arm32 ? "vmlsl.s32" : "smlsl", 2 * w, i64_1 - i64(i32_2) * i32_3); + check(arm32 ? "vmlsl.s32" : "smlsl", 2 * w, i64_1 - i64(i32_2) * 2); check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, u64_1 - u64(u32_2) * u32_3); + check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, u64_1 - u64(u32_2) * 2); // VMOV X F, D Move Register or Immediate // This is for loading immediates, which we won't do in the inner loop anyway From 77eb9b516a2f99b0123b6f1b6fc4aa638b2edd17 Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 21 Aug 2023 13:35:01 -0700 Subject: [PATCH 2/8] clang-format --- src/DistributeShifts.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp index 2fa9ed9ba522..bc2cc7c74f9a 100644 --- a/src/DistributeShifts.cpp +++ b/src/DistributeShifts.cpp @@ -1,7 +1,7 @@ -#include "FindIntrinsics.h" #include "CSE.h" #include "CodeGen_Internal.h" #include "ConciseCasts.h" +#include "FindIntrinsics.h" #include "IRMatch.h" #include "IRMutator.h" #include "Simplify.h" @@ -20,7 +20,10 @@ namespace Internal { // widening_mul_add operands. class DistributeShiftsAsMuls : public IRMutator { public: - DistributeShiftsAsMuls(const bool polys_only) : polynomials_only(polys_only) {} + DistributeShiftsAsMuls(const bool polys_only) + : polynomials_only(polys_only) { + } + private: const bool polynomials_only; From a5df3a2ed7b40a899481db71b3a09ebdb57cf8ab Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 21 Aug 2023 16:13:47 -0700 Subject: [PATCH 3/8] update comment for clarity of 'distribute' --- src/DistributeShifts.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/DistributeShifts.h b/src/DistributeShifts.h index 69b9d77b94dc..e7436714c22b 100644 --- a/src/DistributeShifts.h +++ b/src/DistributeShifts.h @@ -12,7 +12,9 @@ namespace Internal { // Distributes shifts as multiplies. If `polynomials_only` is set, // then only distributes the patterns `a + widening_shl(b, c)` / -// `a - widening_shl(b, c)` and `a + b << c` / `a - b << c`. +// `a - widening_shl(b, c)` and `a + b << c` / `a - b << c`, to +// produce `a (+/-) widening_mul(b, 1 << c)` and `a (+/-) b * (1 << c)`, +// respectively Stmt distribute_shifts(const Stmt &stmt, const bool polynomials_only = false); } // namespace Internal From 9372b5f59ac9f9b0020f12c336d7cc9dd8d11863 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 22 Aug 2023 10:39:00 -0700 Subject: [PATCH 4/8] polynomials_only -> multiply_adds --- src/CodeGen_ARM.cpp | 2 +- src/DistributeShifts.cpp | 14 +++++++------- src/DistributeShifts.h | 4 ++-- src/HexagonOptimize.cpp | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/CodeGen_ARM.cpp b/src/CodeGen_ARM.cpp index ed443b53a631..8e4304f89941 100644 --- a/src/CodeGen_ARM.cpp +++ b/src/CodeGen_ARM.cpp @@ -855,7 +855,7 @@ void CodeGen_ARM::compile_func(const LoweredFunc &f, } // Look for opportunities to turn a + (b << c) into umlal/smlal // and a - (b << c) into umlsl/smlsl. - func.body = distribute_shifts(func.body, /* polynomials_only */ true); + func.body = distribute_shifts(func.body, /* multiply_adds */ true); CodeGen_Posix::compile_func(func, simple_name, extern_name); } diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp index bc2cc7c74f9a..652f97fbe8d3 100644 --- a/src/DistributeShifts.cpp +++ b/src/DistributeShifts.cpp @@ -20,12 +20,12 @@ namespace Internal { // widening_mul_add operands. class DistributeShiftsAsMuls : public IRMutator { public: - DistributeShiftsAsMuls(const bool polys_only) - : polynomials_only(polys_only) { + DistributeShiftsAsMuls(const bool _multiply_adds) + : multiply_adds(_multiply_adds) { } private: - const bool polynomials_only; + const bool multiply_adds; static bool is_cast(const Expr &e, Type value_t) { if (const Cast *cast = e.as()) { @@ -144,7 +144,7 @@ class DistributeShiftsAsMuls : public IRMutator { template Expr visit_add_sub(const T *op) { - if (polynomials_only) { + if (multiply_adds) { Expr a, b; if (const Call *a_call = op->a.template as()) { if (a_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { @@ -176,7 +176,7 @@ class DistributeShiftsAsMuls : public IRMutator { using IRMutator::visit; Expr visit(const Call *op) override { - if (polynomials_only) { + if (multiply_adds) { return IRMutator::visit(op); } else { return distribute_shift(op); @@ -192,8 +192,8 @@ class DistributeShiftsAsMuls : public IRMutator { } }; -Stmt distribute_shifts(const Stmt &s, const bool polynomials_only) { - return DistributeShiftsAsMuls(polynomials_only).mutate(s); +Stmt distribute_shifts(const Stmt &s, const bool multiply_adds) { + return DistributeShiftsAsMuls(multiply_adds).mutate(s); } } // namespace Internal diff --git a/src/DistributeShifts.h b/src/DistributeShifts.h index e7436714c22b..fb7b796f8959 100644 --- a/src/DistributeShifts.h +++ b/src/DistributeShifts.h @@ -10,12 +10,12 @@ namespace Halide { namespace Internal { -// Distributes shifts as multiplies. If `polynomials_only` is set, +// Distributes shifts as multiplies. If `multiply_adds` is set, // then only distributes the patterns `a + widening_shl(b, c)` / // `a - widening_shl(b, c)` and `a + b << c` / `a - b << c`, to // produce `a (+/-) widening_mul(b, 1 << c)` and `a (+/-) b * (1 << c)`, // respectively -Stmt distribute_shifts(const Stmt &stmt, const bool polynomials_only = false); +Stmt distribute_shifts(const Stmt &stmt, const bool multiply_adds = false); } // namespace Internal } // namespace Halide diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp index 317e5c1a1324..7b8fe6650934 100644 --- a/src/HexagonOptimize.cpp +++ b/src/HexagonOptimize.cpp @@ -2205,7 +2205,7 @@ Stmt optimize_hexagon_instructions(Stmt s, const Target &t) { // Hexagon prefers widening shifts to be expressed as multiplies to // hopefully hit compound widening multiplies. - s = distribute_shifts(s, /* polynomials_only */ false); + s = distribute_shifts(s, /* multiply_adds */ false); // Pattern match VectorReduce IR node. Handle vector reduce instructions // before OptimizePatterns to prevent being mutated by patterns like From 762d722604923e8f334603ec1ce0d5abb70c6df0 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 22 Aug 2023 10:45:41 -0700 Subject: [PATCH 5/8] move DistributeShiftsAsMuls to namespace and rename arg --- src/DistributeShifts.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp index 652f97fbe8d3..a90b343b781a 100644 --- a/src/DistributeShifts.cpp +++ b/src/DistributeShifts.cpp @@ -9,6 +9,8 @@ namespace Halide { namespace Internal { +namespace { + // Distribute constant RHS widening shift lefts as multiplies. // TODO: This is an extremely unfortunate mess. I think the better // solution is for the simplifier to distribute constant multiplications @@ -20,8 +22,8 @@ namespace Internal { // widening_mul_add operands. class DistributeShiftsAsMuls : public IRMutator { public: - DistributeShiftsAsMuls(const bool _multiply_adds) - : multiply_adds(_multiply_adds) { + DistributeShiftsAsMuls(const bool multiply_adds) + : multiply_adds(multiply_adds) { } private: @@ -192,6 +194,8 @@ class DistributeShiftsAsMuls : public IRMutator { } }; +} // namespace + Stmt distribute_shifts(const Stmt &s, const bool multiply_adds) { return DistributeShiftsAsMuls(multiply_adds).mutate(s); } From 19d4cdc3724e4c3389d6a4c25ebf222982def2c8 Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 22 Aug 2023 13:17:51 -0700 Subject: [PATCH 6/8] doxygen comment and fix clang tidy --- src/DistributeShifts.cpp | 17 ++++++++--------- src/DistributeShifts.h | 2 +- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp index a90b343b781a..7bb7e655ee38 100644 --- a/src/DistributeShifts.cpp +++ b/src/DistributeShifts.cpp @@ -11,15 +11,14 @@ namespace Internal { namespace { -// Distribute constant RHS widening shift lefts as multiplies. -// TODO: This is an extremely unfortunate mess. I think the better -// solution is for the simplifier to distribute constant multiplications -// instead of factoring them, and then this logic is unnecessary (find_mpy_ops -// would need to handle shifts, but that's easy). -// Another possibility would be adding a widening_mul_add intrinsic that takes -// a list of pairs of operands, and computes a widening sum of widening multiplies -// of these pairs. FindIntrinsics could aggressively rewrite shifts as -// widening_mul_add operands. +/** + * Distribute constant RHS widening shift lefts as multiplies. + * This is an extremely unfortunate mess. Unfortunately, the + * simplifier needs to lift constant multiplications due to its + * cost model. This transformation is very architecture and data- + * type specific (e.g. useful on ARM and HVX due to a plethora of + * dot product / widening multiply instructions). + */ class DistributeShiftsAsMuls : public IRMutator { public: DistributeShiftsAsMuls(const bool multiply_adds) diff --git a/src/DistributeShifts.h b/src/DistributeShifts.h index fb7b796f8959..f4c9d99ba7a8 100644 --- a/src/DistributeShifts.h +++ b/src/DistributeShifts.h @@ -15,7 +15,7 @@ namespace Internal { // `a - widening_shl(b, c)` and `a + b << c` / `a - b << c`, to // produce `a (+/-) widening_mul(b, 1 << c)` and `a (+/-) b * (1 << c)`, // respectively -Stmt distribute_shifts(const Stmt &stmt, const bool multiply_adds = false); +Stmt distribute_shifts(const Stmt &stmt, bool multiply_adds); } // namespace Internal } // namespace Halide From eba8f325edfaaa7b11c52a19435200f6b28e539a Mon Sep 17 00:00:00 2001 From: Alexander Date: Tue, 22 Aug 2023 17:04:25 -0700 Subject: [PATCH 7/8] handle mixed-sign widening_shl --- src/DistributeShifts.cpp | 30 ++++++++++++++++---------- src/FindIntrinsics.cpp | 13 +++++++++++ test/correctness/simd_op_check_arm.cpp | 6 ++++++ 3 files changed, 38 insertions(+), 11 deletions(-) diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp index 7bb7e655ee38..22c6dbc35d8e 100644 --- a/src/DistributeShifts.cpp +++ b/src/DistributeShifts.cpp @@ -143,20 +143,28 @@ class DistributeShiftsAsMuls : public IRMutator { return IRMutator::visit(op); } - template - Expr visit_add_sub(const T *op) { - if (multiply_adds) { - Expr a, b; - if (const Call *a_call = op->a.template as()) { - if (a_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { - a = distribute_shift(a_call); - } + Expr handle_shift(const Expr &expr) { + Expr ret; + if (const Call *as_call = expr.template as()) { + if (as_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { + ret = distribute_shift(as_call); } - if (const Call *b_call = op->b.template as()) { - if (b_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { - b = distribute_shift(b_call); + } else if (const Cast *as_cast = expr.template as()) { + if (as_cast->is_reinterpret()) { + ret = handle_shift(as_cast->value); + if (ret.defined()) { + ret = cast(as_cast->type, ret); } } + } + return ret; + } + + template + Expr visit_add_sub(const T *op) { + if (multiply_adds) { + Expr a = handle_shift(op->a); + Expr b = handle_shift(op->b); if (a.defined() && b.defined()) { return T::make(a, b); diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index 29a8913e1068..14ef9fc4caa4 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -879,6 +879,19 @@ class FindIntrinsics : public IRMutator { return mutate(result); } + // Try to lossless cast to uint. + if (op->type.is_int() && bits >= 16) { + Type uint_type = op->type.narrow().with_code(halide_type_uint); + Expr a_narrow = lossless_cast(uint_type, op->args[0]); + Expr b_narrow = lossless_cast(uint_type, op->args[1]); + if (a_narrow.defined() && b_narrow.defined()) { + Expr result = op->is_intrinsic(Call::shift_left) ? widening_shift_left(a_narrow, b_narrow) : widening_shift_right(a_narrow, b_narrow); + internal_assert(result.type() != op->type); + result = Cast::make(op->type, result); + return mutate(result); + } + } + // Try to turn this into a rounding shift. Expr rounding_shift = to_rounding_shift(op); if (rounding_shift.defined()) { diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index ba588b090bae..6809454f4a38 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -404,28 +404,34 @@ class SimdOpCheckARM : public SimdOpCheckTest { check(arm32 ? "vmlal.s8" : "smlal", 8 * w, i16_1 + i16(i8_2) * 2); check(arm32 ? "vmlal.u8" : "umlal", 8 * w, u16_1 + u16(u8_2) * u8_3); check(arm32 ? "vmlal.u8" : "umlal", 8 * w, u16_1 + u16(u8_2) * 2); + check(arm32 ? "vmlal.u8" : "umlal", 8 * w, i16_1 + i16(u8_2) * 2); check(arm32 ? "vmlal.s16" : "smlal", 4 * w, i32_1 + i32(i16_2) * i16_3); check(arm32 ? "vmlal.s16" : "smlal", 4 * w, i32_1 + i32(i16_2) * 2); check(arm32 ? "vmlal.u16" : "umlal", 4 * w, u32_1 + u32(u16_2) * u16_3); check(arm32 ? "vmlal.u16" : "umlal", 4 * w, u32_1 + u32(u16_2) * 2); + check(arm32 ? "vmlal.u16" : "umlal", 4 * w, i32_1 + i32(u16_2) * 2); check(arm32 ? "vmlal.s32" : "smlal", 2 * w, i64_1 + i64(i32_2) * i32_3); check(arm32 ? "vmlal.s32" : "smlal", 2 * w, i64_1 + i64(i32_2) * 2); check(arm32 ? "vmlal.u32" : "umlal", 2 * w, u64_1 + u64(u32_2) * u32_3); check(arm32 ? "vmlal.u32" : "umlal", 2 * w, u64_1 + u64(u32_2) * 2); + check(arm32 ? "vmlal.u32" : "umlal", 2 * w, i64_1 + i64(u32_2) * 2); // VMLSL I - Multiply Subtract Long check(arm32 ? "vmlsl.s8" : "smlsl", 8 * w, i16_1 - i16(i8_2) * i8_3); check(arm32 ? "vmlsl.s8" : "smlsl", 8 * w, i16_1 - i16(i8_2) * 2); check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, u16_1 - u16(u8_2) * u8_3); check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, u16_1 - u16(u8_2) * 2); + check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, i16_1 - i16(u8_2) * 2); check(arm32 ? "vmlsl.s16" : "smlsl", 4 * w, i32_1 - i32(i16_2) * i16_3); check(arm32 ? "vmlsl.s16" : "smlsl", 4 * w, i32_1 - i32(i16_2) * 2); check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, u32_1 - u32(u16_2) * u16_3); check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, u32_1 - u32(u16_2) * 2); + check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, i32_1 - i32(u16_2) * 2); check(arm32 ? "vmlsl.s32" : "smlsl", 2 * w, i64_1 - i64(i32_2) * i32_3); check(arm32 ? "vmlsl.s32" : "smlsl", 2 * w, i64_1 - i64(i32_2) * 2); check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, u64_1 - u64(u32_2) * u32_3); check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, u64_1 - u64(u32_2) * 2); + check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, i64_1 - i64(u32_2) * 2); // VMOV X F, D Move Register or Immediate // This is for loading immediates, which we won't do in the inner loop anyway From be7857bdcfee86e8f0007a7abc8397691a53abef Mon Sep 17 00:00:00 2001 From: Alexander Date: Wed, 23 Aug 2023 10:28:08 -0700 Subject: [PATCH 8/8] Revert "handle mixed-sign widening_shl" This reverts commit eba8f325edfaaa7b11c52a19435200f6b28e539a. --- src/DistributeShifts.cpp | 30 ++++++++++---------------- src/FindIntrinsics.cpp | 13 ----------- test/correctness/simd_op_check_arm.cpp | 6 ------ 3 files changed, 11 insertions(+), 38 deletions(-) diff --git a/src/DistributeShifts.cpp b/src/DistributeShifts.cpp index 22c6dbc35d8e..7bb7e655ee38 100644 --- a/src/DistributeShifts.cpp +++ b/src/DistributeShifts.cpp @@ -143,28 +143,20 @@ class DistributeShiftsAsMuls : public IRMutator { return IRMutator::visit(op); } - Expr handle_shift(const Expr &expr) { - Expr ret; - if (const Call *as_call = expr.template as()) { - if (as_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { - ret = distribute_shift(as_call); - } - } else if (const Cast *as_cast = expr.template as()) { - if (as_cast->is_reinterpret()) { - ret = handle_shift(as_cast->value); - if (ret.defined()) { - ret = cast(as_cast->type, ret); - } - } - } - return ret; - } - template Expr visit_add_sub(const T *op) { if (multiply_adds) { - Expr a = handle_shift(op->a); - Expr b = handle_shift(op->b); + Expr a, b; + if (const Call *a_call = op->a.template as()) { + if (a_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { + a = distribute_shift(a_call); + } + } + if (const Call *b_call = op->b.template as()) { + if (b_call->is_intrinsic({Call::shift_left, Call::widening_shift_left})) { + b = distribute_shift(b_call); + } + } if (a.defined() && b.defined()) { return T::make(a, b); diff --git a/src/FindIntrinsics.cpp b/src/FindIntrinsics.cpp index 14ef9fc4caa4..29a8913e1068 100644 --- a/src/FindIntrinsics.cpp +++ b/src/FindIntrinsics.cpp @@ -879,19 +879,6 @@ class FindIntrinsics : public IRMutator { return mutate(result); } - // Try to lossless cast to uint. - if (op->type.is_int() && bits >= 16) { - Type uint_type = op->type.narrow().with_code(halide_type_uint); - Expr a_narrow = lossless_cast(uint_type, op->args[0]); - Expr b_narrow = lossless_cast(uint_type, op->args[1]); - if (a_narrow.defined() && b_narrow.defined()) { - Expr result = op->is_intrinsic(Call::shift_left) ? widening_shift_left(a_narrow, b_narrow) : widening_shift_right(a_narrow, b_narrow); - internal_assert(result.type() != op->type); - result = Cast::make(op->type, result); - return mutate(result); - } - } - // Try to turn this into a rounding shift. Expr rounding_shift = to_rounding_shift(op); if (rounding_shift.defined()) { diff --git a/test/correctness/simd_op_check_arm.cpp b/test/correctness/simd_op_check_arm.cpp index 6809454f4a38..ba588b090bae 100644 --- a/test/correctness/simd_op_check_arm.cpp +++ b/test/correctness/simd_op_check_arm.cpp @@ -404,34 +404,28 @@ class SimdOpCheckARM : public SimdOpCheckTest { check(arm32 ? "vmlal.s8" : "smlal", 8 * w, i16_1 + i16(i8_2) * 2); check(arm32 ? "vmlal.u8" : "umlal", 8 * w, u16_1 + u16(u8_2) * u8_3); check(arm32 ? "vmlal.u8" : "umlal", 8 * w, u16_1 + u16(u8_2) * 2); - check(arm32 ? "vmlal.u8" : "umlal", 8 * w, i16_1 + i16(u8_2) * 2); check(arm32 ? "vmlal.s16" : "smlal", 4 * w, i32_1 + i32(i16_2) * i16_3); check(arm32 ? "vmlal.s16" : "smlal", 4 * w, i32_1 + i32(i16_2) * 2); check(arm32 ? "vmlal.u16" : "umlal", 4 * w, u32_1 + u32(u16_2) * u16_3); check(arm32 ? "vmlal.u16" : "umlal", 4 * w, u32_1 + u32(u16_2) * 2); - check(arm32 ? "vmlal.u16" : "umlal", 4 * w, i32_1 + i32(u16_2) * 2); check(arm32 ? "vmlal.s32" : "smlal", 2 * w, i64_1 + i64(i32_2) * i32_3); check(arm32 ? "vmlal.s32" : "smlal", 2 * w, i64_1 + i64(i32_2) * 2); check(arm32 ? "vmlal.u32" : "umlal", 2 * w, u64_1 + u64(u32_2) * u32_3); check(arm32 ? "vmlal.u32" : "umlal", 2 * w, u64_1 + u64(u32_2) * 2); - check(arm32 ? "vmlal.u32" : "umlal", 2 * w, i64_1 + i64(u32_2) * 2); // VMLSL I - Multiply Subtract Long check(arm32 ? "vmlsl.s8" : "smlsl", 8 * w, i16_1 - i16(i8_2) * i8_3); check(arm32 ? "vmlsl.s8" : "smlsl", 8 * w, i16_1 - i16(i8_2) * 2); check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, u16_1 - u16(u8_2) * u8_3); check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, u16_1 - u16(u8_2) * 2); - check(arm32 ? "vmlsl.u8" : "umlsl", 8 * w, i16_1 - i16(u8_2) * 2); check(arm32 ? "vmlsl.s16" : "smlsl", 4 * w, i32_1 - i32(i16_2) * i16_3); check(arm32 ? "vmlsl.s16" : "smlsl", 4 * w, i32_1 - i32(i16_2) * 2); check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, u32_1 - u32(u16_2) * u16_3); check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, u32_1 - u32(u16_2) * 2); - check(arm32 ? "vmlsl.u16" : "umlsl", 4 * w, i32_1 - i32(u16_2) * 2); check(arm32 ? "vmlsl.s32" : "smlsl", 2 * w, i64_1 - i64(i32_2) * i32_3); check(arm32 ? "vmlsl.s32" : "smlsl", 2 * w, i64_1 - i64(i32_2) * 2); check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, u64_1 - u64(u32_2) * u32_3); check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, u64_1 - u64(u32_2) * 2); - check(arm32 ? "vmlsl.u32" : "umlsl", 2 * w, i64_1 - i64(u32_2) * 2); // VMOV X F, D Move Register or Immediate // This is for loading immediates, which we won't do in the inner loop anyway