halide · rootjalex · Apr 30, 2024 · Aug 23, 2023 · Aug 24, 2023 · Aug 24, 2023
diff --git a/src/Bounds.cpp b/src/Bounds.cpp
@@ -1517,15 +1517,28 @@ class Bounds : public IRVisitor {
                 result.include(arg_bounds.get(i));
             }
             interval = result;
-        } else if (op->is_intrinsic(Call::widen_right_add)) {
-            Expr add = Add::make(op->args[0], cast(op->args[0].type(), op->args[1]));
-            add.accept(this);
-        } else if (op->is_intrinsic(Call::widen_right_sub)) {
-            Expr sub = Sub::make(op->args[0], cast(op->args[0].type(), op->args[1]));
-            sub.accept(this);
-        } else if (op->is_intrinsic(Call::widen_right_mul)) {
-            Expr mul = Mul::make(op->args[0], cast(op->args[0].type(), op->args[1]));
-            mul.accept(this);
+        } else if (op->is_intrinsic({Call::widening_add,
+                                     Call::widening_mul,
+                                     Call::widening_shift_left,
+                                     Call::widening_shift_right,
+                                     Call::widening_sub,
+                                     Call::widen_right_add,
+                                     Call::widen_right_sub,
+                                     Call::widen_right_mul,
+                                     // TODO: the intrinsics below should not be bounded via the optimal
+                                     // lowering, because that form is harder for bounds inference to reason about.
+                                     Call::rounding_halving_add,
+                                     Call::halving_add,
+                                     Call::saturating_add,
+                                     Call::saturating_sub,
+                                     Call::halving_sub,
+                                     Call::rounding_shift_left,
+                                     Call::rounding_shift_right,
+                                     Call::mul_shift_right,
+                                     Call::rounding_mul_shift_right})) {
+            Expr a = lower_intrinsic(op);  // Bound the call by visiting its lowered form instead.
+            internal_assert(a.defined());  // Every intrinsic listed above is expected to have a lowering.
+            a.accept(this);
         } else if (op->call_type == Call::Halide) {
             bounds_of_func(op->name, op->value_index, op->type);
         } else {

diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
@@ -1,3 +1,4 @@
+#include "Bounds.h"
 #include "CodeGen_Internal.h"
 #include "CodeGen_Posix.h"
 #include "ConciseCasts.h"
@@ -488,7 +489,6 @@ void CodeGen_X86::visit(const Select *op) {
 }
 
 void CodeGen_X86::visit(const Cast *op) {
-
     if (!op->type.is_vector()) {
         // We only have peephole optimizations for vectors in here.
         CodeGen_Posix::visit(op);
@@ -501,7 +501,7 @@ void CodeGen_X86::visit(const Cast *op) {
     };
 
     // clang-format off
-    static Pattern patterns[] = {
+    static const Pattern patterns[] = {
         // This isn't rounding_multiply_quantzied(i16, i16, 15) because it doesn't
         // saturate the result.
         {"pmulhrs", i16(rounding_shift_right(widening_mul(wild_i16x_, wild_i16x_), 15))},
@@ -611,7 +611,7 @@ void CodeGen_X86::visit(const Call *op) {
     };
 
     // clang-format off
-    static Pattern patterns[] = {
+    static const Pattern patterns[] = {
         {"pmulh", mul_shift_right(wild_i16x_, wild_i16x_, 16)},
         {"pmulh", mul_shift_right(wild_u16x_, wild_u16x_, 16)},
         {"saturating_narrow", i16_sat(wild_i32x_)},
@@ -631,6 +631,37 @@ void CodeGen_X86::visit(const Call *op) {
         }
     }
 
+    // clang-format off
+    static const Pattern reinterpret_patterns[] = {
+        {"saturating_narrow", i16_sat(wild_u32x_)},
+        {"saturating_narrow", u16_sat(wild_u32x_)},
+        {"saturating_narrow", i8_sat(wild_u16x_)},
+        {"saturating_narrow", u8_sat(wild_u16x_)},
+    };
+    // clang-format on
+
+    // Search for saturating casts of unsigned values whose constant upper
+    // bound fits in the signed type of the same width. Those values can be
+    // cast to signed, so that the signed saturating_narrow patterns apply.
+    for (const auto &pattern : reinterpret_patterns) {
+        if (expr_match(pattern.pattern, op, matches)) {
+            const Expr &expr = matches[0];
+            Expr upper_bound = find_constant_bound(expr, Direction::Upper);
+            if (const uint64_t *bound = as_const_uint(upper_bound)) {
+                Type t = matches[0].type();
+                if (*bound <= (uint64_t)max_int(t.bits())) {
+                    // Can safely reinterpret to signed integer.
+                    matches[0] = cast(t.with_code(halide_type_int), matches[0]);
+                    value = call_overloaded_intrin(op->type, pattern.intrin, matches);
+                    if (value) {
+                        return;
+                    }
+                }
+            }
+            break;  // A pattern matched but could not be used; stop searching and fall through.
+        }
+    }
+
     static const vector<pair<Expr, Expr>> cast_rewrites = {
         // Some double-narrowing saturating casts can be better expressed as
         // combinations of single-narrowing saturating casts.

diff --git a/src/HexagonOptimize.cpp b/src/HexagonOptimize.cpp
@@ -185,8 +185,10 @@ struct Pattern {
         // re-interleave the result.
         ReinterleaveOp0 = InterleaveResult | DeinterleaveOp0,
 
-        v65orLater = 1 << 10,  // Pattern should be matched only for v65 target or later
-        v66orLater = 1 << 11,  // Pattern should be matched only for v66 target or later
+        SafeReinterpretOp0 = 1 << 10,  // Pattern should be matched only if the first arg can be safely reinterpreted as the opposite signedness.
+
+        v65orLater = 1 << 11,  // Pattern should be matched only for v65 target or later
+        v66orLater = 1 << 12,  // Pattern should be matched only for v66 target or later
     };
 
     string intrin;  // Name of the intrinsic
@@ -256,6 +258,30 @@ bool process_match_flags(vector<Expr> &matches, int flags) {
         internal_assert(matches.size() >= 3);
         std::swap(matches[1], matches[2]);
     }
+    if (flags & Pattern::SafeReinterpretOp0) {
+        // Use bounds inference to check if the first operand can be safely
+        // reinterpreted as the opposite signedness. This only accepts or
+        // rejects the match; matches[0] itself is left unchanged here.
+        const Type &t = matches[0].type();
+        if (t.is_int()) {
+            // A signed integer can be reinterpreted as unsigned if it is provably non-negative.
+            Expr bound = find_constant_bound(matches[0], Direction::Lower);
+            if (const int64_t *lower = as_const_int(bound)) {
+                return *lower >= 0;
+            } else {
+                return false;
+            }
+        } else {
+            internal_assert(t.is_uint());
+            // An unsigned integer can be reinterpreted as signed if bounded by int max.
+            Expr bound = find_constant_bound(matches[0], Direction::Upper);
+            if (const uint64_t *upper = as_const_uint(bound)) {
+                return *upper <= (uint64_t)max_int(t.bits());
+            } else {
+                return false;
+            }
+        }
+    }
     return true;
 }
 
@@ -914,6 +940,12 @@ class OptimizePatterns : public IRMutator {
             {"halide.hexagon.pack_satuh.vw", u16_sat(wild_i32x)},
             {"halide.hexagon.pack_satb.vh", i8_sat(wild_i16x)},
             {"halide.hexagon.pack_sath.vw", i16_sat(wild_i32x)},
+            // The same patterns as above, but for unsigned arguments that bounds
+            // inference proves can be safely reinterpreted as signed.
+            {"halide.hexagon.pack_satub.vh", u8_sat(wild_u16x), Pattern::SafeReinterpretOp0},
+            {"halide.hexagon.pack_satuh.vw", u16_sat(wild_u32x), Pattern::SafeReinterpretOp0},
+            {"halide.hexagon.pack_satb.vh", i8_sat(wild_u16x), Pattern::SafeReinterpretOp0},
+            {"halide.hexagon.pack_sath.vw", i16_sat(wild_u32x), Pattern::SafeReinterpretOp0},
 
             // We don't have a vpack equivalent to this one, so we match it directly.
             {"halide.hexagon.trunc_satuh.vuw", u16_sat(wild_u32x), Pattern::DeinterleaveOp0},

diff --git a/src/Type.cpp b/src/Type.cpp
@@ -6,23 +6,6 @@ namespace Halide {
 
 using std::ostringstream;
 
-namespace {
-uint64_t max_uint(int bits) {
-    uint64_t max_val = 0xffffffffffffffffULL;
-    return max_val >> (64 - bits);
-}
-
-int64_t max_int(int bits) {
-    int64_t max_val = 0x7fffffffffffffffLL;
-    return max_val >> (64 - bits);
-}
-
-int64_t min_int(int bits) {
-    return -max_int(bits) - 1;
-}
-
-}  // namespace
-
 /** Return an expression which is the maximum value of this type */
 Halide::Expr Type::max() const {
     if (is_vector()) {

diff --git a/src/Type.h b/src/Type.h
@@ -561,6 +561,23 @@ inline Type type_of() {
 /** Halide type to a C++ type */
 std::string type_to_c_type(Type type, bool include_space, bool c_plus_plus = true);
 
+/** Returns the largest value of an unsigned integer `bits` wide. Requires 1 <= bits <= 64; the shift below is undefined otherwise. */
+constexpr uint64_t max_uint(int bits) {
+    uint64_t max_val = 0xffffffffffffffffULL;
+    return max_val >> (64 - bits);
+}
+
+/** Returns the largest value of a signed integer `bits` wide. Requires 1 <= bits <= 64; the shift below is undefined otherwise. */
+constexpr int64_t max_int(int bits) {
+    int64_t max_val = 0x7fffffffffffffffLL;
+    return max_val >> (64 - bits);
+}
+
+/** Returns the smallest value of a signed integer `bits` wide, computed as -max_int(bits) - 1 (same `bits` range as max_int). */
+constexpr int64_t min_int(int bits) {
+    return -max_int(bits) - 1;
+}
+
 }  // namespace Halide
 
 #endif
diff --git a/test/correctness/simd_op_check_hvx.cpp b/test/correctness/simd_op_check_hvx.cpp
@@ -304,6 +304,13 @@ class SimdOpCheckHVX : public SimdOpCheckTest {
         check("v*.b = vpack(v*.h,v*.h):sat", hvx_width / 1, i8_sat(i16_1));
         check("v*.uh = vpack(v*.w,v*.w):sat", hvx_width / 2, u16_sat(i32_1));
         check("v*.h = vpack(v*.w,v*.w):sat", hvx_width / 2, i16_sat(i32_1));
+        // Test bounds-based instruction selection: `>> 1` bounds the unsigned operand by the signed max, so the signed-input vpack can be used.
+        check("v*.ub = vpack(v*.h,v*.h):sat", hvx_width / 1, u8_sat(u16_1 >> 1));
+        check("v*.b = vpack(v*.h,v*.h):sat", hvx_width / 1, i8_sat(u16_1 >> 1));
+        // These tests don't work yet because bounds inference currently gives up on
+        // u32. See https://github.com/halide/Halide/issues/7807
+        // check("v*.uh = vpack(v*.w,v*.w):sat", hvx_width / 2, u16_sat(u32_1 >> 1));
+        // check("v*.h = vpack(v*.w,v*.w):sat", hvx_width / 2, i16_sat(u32_1 >> 1));
 
         // vpack doesn't interleave its inputs, which means it doesn't
         // simplify with widening. This is preferable for when the