Support for ARM SVE2. #8051

Merged: 51 commits, Mar 15, 2024
Changes from 17 commits
Commits
51 commits
77ea0a0
Checkpoint SVE2 restart.
Dec 14, 2023
c203d1e
Remove dead code. Add new test.
Dec 14, 2023
27ee93e
Update cmake for new file.
Dec 14, 2023
bf0e925
Checkpoint progress on SVE2.
Dec 16, 2023
f40eeb5
Merge branch 'main' into arm_sve_redux
Jan 9, 2024
deb5fbc
Checkpoint ARM SVE2 support. Passes correctness_simd_op_check_sve2 te…
Jan 18, 2024
51c4568
Merge branch 'main' into arm_sve_redux
Jan 18, 2024
5f98675
Remove an opportunity for RISC V codegen to change due to SVE2 support.
Jan 18, 2024
1b8a75e
Ensure SVE intrinsics get vscale vectors and non-SVE ones get fixed v…
Jan 19, 2024
5eeef77
Checkpoint SVE2 work. Generally passes test, though using both NEON
Jan 26, 2024
f57f1d3
Remove an unfavored implementation possibility.
Jan 26, 2024
da3c259
Fix opcode recognition in test to handle some cases that show up.
Jan 29, 2024
06fa66c
Merge branch 'main' into arm_sve_redux
Jan 29, 2024
a069e6e
Formatting fixes.
Jan 29, 2024
1e8a540
Formatting fix.
Jan 29, 2024
93fb752
Limit SVE2 test to LLVM 19.
Jan 30, 2024
de11e8f
Fix a degenerate case asking for zero-sized vectors via a Halide type
Jan 31, 2024
9b2897c
Merge branch 'main' into arm_sve_redux
Feb 6, 2024
2bc10e3
Merge branch 'main' into arm_sve_redux
steven-johnson Feb 7, 2024
c598c9d
Merge branch 'main' into arm_sve_redux
Feb 7, 2024
bb73c00
Fix confusion about Neon64/Neon128 and make it clear this is just the
Feb 11, 2024
65fff76
Merge branch 'arm_sve_redux' of https://github.com/halide/Halide into…
Feb 11, 2024
93d7ba9
Remove extraneous commented-out line.
Feb 11, 2024
ba934e9
Address some review feedback. Mostly comment fixes.
Feb 11, 2024
00cb4ce
Merge branch 'main' into arm_sve_redux
Feb 11, 2024
229bb60
Fix missed conflict resolution.
Feb 11, 2024
42206a5
Fix some TODOs in SVE code. Move utility function to Util.h and common
Feb 12, 2024
90186ad
Formatting.
Feb 12, 2024
bc149bc
Add missed refactor change.
Feb 12, 2024
79776e0
Add issue to TODO comment.
Feb 12, 2024
c3ca689
Remove TODOs that don't seem necessary.
Feb 13, 2024
b0e4f99
Add issue for TODO.
Feb 13, 2024
417d762
Add issue for TODO.
Feb 13, 2024
6e6e491
Merge branch 'main' into arm_sve_redux
Feb 15, 2024
e25a947
Merge branch 'main' into arm_sve_redux
Feb 21, 2024
fe30990
Remove dubious looking FP to int code that was ifdef'ed out. Doesn't
Feb 21, 2024
dc3be8a
Add issues for TODOs.
Feb 22, 2024
7627e0d
Merge branch 'main' into arm_sve_redux
Feb 22, 2024
4a269bd
Merge branch 'main' into arm_sve_redux
Feb 23, 2024
6afdcff
Update simd_op_check_sve2.cpp
steven-johnson Feb 23, 2024
b03b3c7
Merge branch 'main' into arm_sve_redux
steven-johnson Feb 23, 2024
f8952c2
Make a deep copy of each piece of test IR so that we can parallelize
abadams Feb 23, 2024
eaed2ef
Merge branch 'arm_sve_redux' of https://github.com/halide/Halide into…
Mar 5, 2024
2ac96c8
Merge branch 'main' into arm_sve_redux
steven-johnson Mar 5, 2024
4324bc5
Fix two clang-tidy warnings
steven-johnson Mar 5, 2024
a63439b
Remove try/catch block from simd-op-check-sve2
steven-johnson Mar 5, 2024
f84c764
Merge branch 'arm_sve_redux' of https://github.com/halide/Halide into…
Mar 6, 2024
210e5d7
Don't try to run SVE2 code if vector_bits doesn't match host.
Mar 6, 2024
9d8e2c6
Add support for fcvtm/p, make scalars go through pattern matching too…
abadams Mar 13, 2024
32d1fcb
Merge remote-tracking branch 'origin/main' into arm_sve_redux
abadams Mar 13, 2024
9dbfcd5
Don't do arm neon instruction selection on scalars
abadams Mar 14, 2024
1,368 changes: 1,112 additions & 256 deletions src/CodeGen_ARM.cpp

Large diffs are not rendered by default.

226 changes: 182 additions & 44 deletions src/CodeGen_LLVM.cpp

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions src/CodeGen_LLVM.h
@@ -579,6 +579,13 @@ class CodeGen_LLVM : public IRVisitor {
llvm::Constant *get_splat(int lanes, llvm::Constant *value,
VectorTypeConstraint type_constraint = VectorTypeConstraint::None) const;

/** Make sure a value type has the same scalable/fixed vector type as a guide. */
// @{
llvm::Value *match_vector_type_scalable(llvm::Value *value, VectorTypeConstraint constraint);
llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Type *guide);
llvm::Value *match_vector_type_scalable(llvm::Value *value, llvm::Value *guide);
// @}

/** Support for generating LLVM vector predication intrinsics
* ("@llvm.vp.*" and "@llvm.experimental.vp.*")
*/
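The three match_vector_type_scalable overloads exist because SVE2 codegen mixes fixed-width (NEON) and vscale (SVE) vector values in one function: before handing a value to an intrinsic, it is coerced to whichever flavour the guide type or guide value uses. A minimal free-function sketch of the idea, built on LLVM's vector.insert/vector.extract builder helpers — an editorial illustration under assumed behaviour, not the member functions added by this PR:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"

// Sketch: give `value` the fixed/scalable flavour of `guide`. Hypothetical helper.
llvm::Value *match_scalable_sketch(llvm::IRBuilderBase &builder,
                                   llvm::Value *value, llvm::Type *guide) {
    auto *from = llvm::dyn_cast<llvm::VectorType>(value->getType());
    auto *to = llvm::dyn_cast<llvm::VectorType>(guide);
    bool from_scalable = from && llvm::isa<llvm::ScalableVectorType>(from);
    bool to_scalable = to && llvm::isa<llvm::ScalableVectorType>(to);
    if (!from || !to || from_scalable == to_scalable) {
        return value;  // Not vectors, or already the same flavour.
    }
    if (to_scalable) {
        // Fixed -> scalable: insert the fixed vector at lane 0 of a poison scalable vector.
        auto *fixed = llvm::cast<llvm::FixedVectorType>(from);
        llvm::Type *sv = llvm::ScalableVectorType::get(fixed->getElementType(),
                                                       fixed->getNumElements());
        return builder.CreateInsertVector(sv, llvm::PoisonValue::get(sv),
                                          value, builder.getInt64(0));
    }
    // Scalable -> fixed: extract the low fixed-width subvector.
    llvm::Type *fv = llvm::FixedVectorType::get(to->getElementType(),
                                                to->getElementCount().getKnownMinValue());
    return builder.CreateExtractVector(fv, value, builder.getInt64(0));
}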
1 change: 1 addition & 0 deletions src/IR.cpp
@@ -689,6 +689,7 @@ const char *const intrinsic_op_names[] = {
"widening_shift_left",
"widening_shift_right",
"widening_sub",
"get_runtime_vscale",
};

static_assert(sizeof(intrinsic_op_names) / sizeof(intrinsic_op_names[0]) == Call::IntrinsicOpCount,
2 changes: 2 additions & 0 deletions src/IR.h
@@ -625,6 +625,8 @@ struct Call : public ExprNode<Call> {
widening_shift_right,
widening_sub,

get_runtime_vscale,

IntrinsicOpCount // Sentinel: keep last.
};

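The new get_runtime_vscale intrinsic gives lowered Halide IR a way to read the hardware's actual vscale multiplier at run time (for SVE, the vector length in bits divided by 128), which is what the runtime check against the compile-time vscale is built on. A plausible lowering sketch in a CodeGen_LLVM-style visitor — the member names and wiring here are assumptions, not the PR's exact code — maps it straight onto LLVM's llvm.vscale intrinsic:

// Hypothetical lowering inside visit(const Call *op).
if (op->is_intrinsic(Call::get_runtime_vscale)) {
    llvm::Function *vscale_fn = llvm::Intrinsic::getDeclaration(
        module.get(), llvm::Intrinsic::vscale, {llvm::Type::getInt32Ty(*context)});
    value = builder->CreateCall(vscale_fn);  // i32 result: the runtime vscale
    return;
}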
6 changes: 6 additions & 0 deletions src/LLVM_Output.cpp
@@ -331,6 +331,12 @@ std::unique_ptr<llvm::Module> clone_module(const llvm::Module &module_in) {
// Read it back in.
llvm::MemoryBufferRef buffer_ref(llvm::StringRef(clone_buffer.data(), clone_buffer.size()), "clone_buffer");
auto cloned_module = llvm::parseBitcodeFile(buffer_ref, module_in.getContext());

// TODO(<add issue>): Add support for returning the error.
if (!cloned_module) {
llvm::dbgs() << cloned_module.takeError();
module_in.print(llvm::dbgs(), nullptr, false, true);
}
internal_assert(cloned_module);

return std::move(cloned_module.get());
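The hunk above logs the parse error and dumps the module before internal_assert fires; the TODO notes that the error is not yet returned to the caller. One hedged alternative — an editorial sketch assuming the standard LLVM bitcode reader/writer APIs, not code from this PR — would hand back the llvm::Expected so callers can handle the failure themselves:

#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical variant of clone_module that propagates the parse failure.
llvm::Expected<std::unique_ptr<llvm::Module>>
clone_module_or_error(const llvm::Module &module_in) {
    llvm::SmallVector<char, 1024> bitcode;
    llvm::raw_svector_ostream stream(bitcode);
    llvm::WriteBitcodeToFile(module_in, stream);
    llvm::MemoryBufferRef buffer_ref(
        llvm::StringRef(bitcode.data(), bitcode.size()), "clone_buffer");
    return llvm::parseBitcodeFile(buffer_ref, module_in.getContext());
}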
5 changes: 1 addition & 4 deletions src/StorageFolding.cpp
@@ -10,17 +10,14 @@
#include "Monotonic.h"
#include "Simplify.h"
#include "Substitute.h"
#include "Util.h"
#include <utility>

namespace Halide {
namespace Internal {

namespace {

int64_t next_power_of_two(int64_t x) {
return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

using std::map;
using std::string;
using std::vector;
6 changes: 6 additions & 0 deletions src/Util.h
@@ -13,6 +13,7 @@
/** \file
* Various utility functions used internally by Halide. */

#include <cmath>
#include <cstdint>
#include <cstring>
#include <functional>
@@ -532,6 +533,11 @@ int clz64(uint64_t x);
int ctz64(uint64_t x);
// @}

/** Return an integer 2^n, for some n, which is >= x. Argument x must be > 0. */
inline int64_t next_power_of_two(int64_t x) {
return static_cast<int64_t>(1) << static_cast<int64_t>(std::ceil(std::log2(x)));
}

} // namespace Internal
} // namespace Halide

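With next_power_of_two now a shared utility in Util.h, a quick illustration of the behaviour implied by the definition above (assuming a normal Halide source-tree build; illustrative only):

#include <cassert>
#include "Util.h"  // assumed include path within the Halide source tree

int main() {
    using Halide::Internal::next_power_of_two;
    assert(next_power_of_two(1) == 1);    // exact powers of two are unchanged
    assert(next_power_of_two(5) == 8);    // otherwise rounds up
    assert(next_power_of_two(64) == 64);
    assert(next_power_of_two(65) == 128);
    return 0;
}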
5 changes: 5 additions & 0 deletions src/runtime/HalideRuntime.h
@@ -1242,6 +1242,10 @@ enum halide_error_code_t {
/** An explicit storage bound provided is too small to store
* all the values produced by the function. */
halide_error_code_storage_bound_too_small = -45,

/** "vscale" value of Scalable Vector detected in runtime does not match
* the vscale value used in compilation. */
halide_error_code_vscale_invalid = -46,
};

/** Halide calls the functions below on various error conditions. The
@@ -1316,6 +1320,7 @@ extern int halide_error_device_dirty_with_no_device_support(void *user_context,
extern int halide_error_storage_bound_too_small(void *user_context, const char *func_name, const char *var_name,
int provided_size, int required_size);
extern int halide_error_device_crop_failed(void *user_context);
extern int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale);
// @}

/** Optional features a compilation Target can have.
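halide_error_vscale_invalid backs the guard that SVE2-compiled pipelines run on entry: the vscale baked in at compile time is compared against the value the hardware actually reports. A hand-written sketch of that check — the real one is emitted by the compiler, and the helper name and wiring here are hypothetical:

// 'runtime_vscale' would come from the get_runtime_vscale intrinsic
// (llvm.vscale) in generated code; 'compiletime_vscale' from the Target's
// vector_bits setting (vector_bits / 128).
int check_vscale(void *user_context, const char *func_name,
                 int runtime_vscale, int compiletime_vscale) {
    if (compiletime_vscale != 0 && runtime_vscale != compiletime_vscale) {
        return halide_error_vscale_invalid(user_context, func_name,
                                           runtime_vscale, compiletime_vscale);
    }
    return halide_error_code_success;
}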
76 changes: 63 additions & 13 deletions src/runtime/aarch64.ll
@@ -48,25 +48,34 @@ define weak_odr <2 x i64> @vabdl_u32x2(<2 x i32> %a, <2 x i32> %b) nounwind alwa

declare <4 x float> @llvm.aarch64.neon.frecpe.v4f32(<4 x float> %x) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frecpe.v2f32(<2 x float> %x) nounwind readnone;
declare float @llvm.aarch64.neon.frecpe.f32(float)
declare <4 x float> @llvm.aarch64.neon.frsqrte.v4f32(<4 x float> %x) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frsqrte.v2f32(<2 x float> %x) nounwind readnone;
declare float @llvm.aarch64.neon.frsqrte.f32(float)
declare <4 x float> @llvm.aarch64.neon.frecps.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frecps.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone;
declare float @llvm.aarch64.neon.frecps.f32(float, float)
declare <4 x float> @llvm.aarch64.neon.frsqrts.v4f32(<4 x float> %x, <4 x float> %y) nounwind readnone;
declare <2 x float> @llvm.aarch64.neon.frsqrts.v2f32(<2 x float> %x, <2 x float> %y) nounwind readnone;
declare float @llvm.aarch64.neon.frsqrts.f32(float, float)

declare <8 x half> @llvm.aarch64.neon.frecpe.v8f16(<8 x half> %x) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frecpe.v4f16(<4 x half> %x) nounwind readnone;
declare half @llvm.aarch64.neon.frecpe.f16(half)
declare <8 x half> @llvm.aarch64.neon.frsqrte.v8f16(<8 x half> %x) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frsqrte.v4f16(<4 x half> %x) nounwind readnone;
declare half @llvm.aarch64.neon.frsqrte.f16(half)
declare <8 x half> @llvm.aarch64.neon.frecps.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frecps.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone;
declare half @llvm.aarch64.neon.frecps.f16(half, half)
declare <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %x, <8 x half> %y) nounwind readnone;
declare <4 x half> @llvm.aarch64.neon.frsqrts.v4f16(<4 x half> %x, <4 x half> %y) nounwind readnone;
declare half @llvm.aarch64.neon.frsqrts.f16(half, half)

define weak_odr float @fast_inverse_f32(float %x) nounwind alwaysinline {
%vec = insertelement <2 x float> poison, float %x, i32 0
%approx = tail call <2 x float> @fast_inverse_f32x2(<2 x float> %vec)
%result = extractelement <2 x float> %approx, i32 0
%approx = tail call float @llvm.aarch64.neon.frecpe.f32(float %x)
%correction = tail call float @llvm.aarch64.neon.frecps.f32(float %approx, float %x)
%result = fmul float %approx, %correction
ret float %result
}

@@ -85,9 +94,9 @@ define weak_odr <4 x float> @fast_inverse_f32x4(<4 x float> %x) nounwind alwaysi
}

define weak_odr half @fast_inverse_f16(half %x) nounwind alwaysinline {
%vec = insertelement <4 x half> poison, half %x, i32 0
%approx = tail call <4 x half> @fast_inverse_f16x4(<4 x half> %vec)
%result = extractelement <4 x half> %approx, i32 0
%approx = tail call half @llvm.aarch64.neon.frecpe.f16(half %x)
%correction = tail call half @llvm.aarch64.neon.frecps.f16(half %approx, half %x)
%result = fmul half %approx, %correction
ret half %result
}

@@ -106,9 +115,10 @@ define weak_odr <8 x half> @fast_inverse_f16x8(<8 x half> %x) nounwind alwaysinl
}

define weak_odr float @fast_inverse_sqrt_f32(float %x) nounwind alwaysinline {
%vec = insertelement <2 x float> poison, float %x, i32 0
%approx = tail call <2 x float> @fast_inverse_sqrt_f32x2(<2 x float> %vec)
%result = extractelement <2 x float> %approx, i32 0
%approx = tail call float @llvm.aarch64.neon.frsqrte.f32(float %x)
%approx2 = fmul float %approx, %approx
%correction = tail call float @llvm.aarch64.neon.frsqrts.f32(float %approx2, float %x)
%result = fmul float %approx, %correction
ret float %result
}

@@ -129,9 +139,10 @@ define weak_odr <4 x float> @fast_inverse_sqrt_f32x4(<4 x float> %x) nounwind al
}

define weak_odr half @fast_inverse_sqrt_f16(half %x) nounwind alwaysinline {
%vec = insertelement <4 x half> poison, half %x, i32 0
%approx = tail call <4 x half> @fast_inverse_sqrt_f16x4(<4 x half> %vec)
%result = extractelement <4 x half> %approx, i32 0
%approx = tail call half @llvm.aarch64.neon.frsqrte.f16(half %x)
%approx2 = fmul half %approx, %approx
%correction = tail call half @llvm.aarch64.neon.frsqrts.f16(half %approx2, half %x)
%result = fmul half %approx, %correction
ret half %result
}

@@ -149,4 +160,43 @@ define weak_odr <8 x half> @fast_inverse_sqrt_f16x8(<8 x half> %x) nounwind alwa
%correction = tail call <8 x half> @llvm.aarch64.neon.frsqrts.v8f16(<8 x half> %approx2, <8 x half> %x)
%result = fmul <8 x half> %approx, %correction
ret <8 x half> %result
}
}

declare <vscale x 4 x float> @llvm.aarch64.sve.frecpe.x.nxv4f32(<vscale x 4 x float> %x) nounwind readnone;
declare <vscale x 4 x float> @llvm.aarch64.sve.frsqrte.x.nxv4f32(<vscale x 4 x float> %x) nounwind readnone;
declare <vscale x 4 x float> @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y) nounwind readnone;
declare <vscale x 4 x float> @llvm.aarch64.sve.frsqrts.x.nxv4f32(<vscale x 4 x float> %x, <vscale x 4 x float> %y) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frecpe.x.nxv8f16(<vscale x 8 x half> %x) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frsqrte.x.nxv8f16(<vscale x 8 x half> %x) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x half> %y) nounwind readnone;
declare <vscale x 8 x half> @llvm.aarch64.sve.frsqrts.x.nxv8f16(<vscale x 8 x half> %x, <vscale x 8 x half> %y) nounwind readnone;

define weak_odr <vscale x 4 x float> @fast_inverse_f32nx4(<vscale x 4 x float> %x) nounwind alwaysinline {
%approx = tail call <vscale x 4 x float> @llvm.aarch64.sve.frecpe.x.nxv4f32(<vscale x 4 x float> %x)
%correction = tail call <vscale x 4 x float> @llvm.aarch64.sve.frecps.x.nxv4f32(<vscale x 4 x float> %approx, <vscale x 4 x float> %x)
%result = fmul <vscale x 4 x float> %approx, %correction
ret <vscale x 4 x float> %result
}

define weak_odr <vscale x 8 x half> @fast_inverse_f16nx8(<vscale x 8 x half> %x) nounwind alwaysinline {
%approx = tail call <vscale x 8 x half> @llvm.aarch64.sve.frecpe.x.nxv8f16(<vscale x 8 x half> %x)
%correction = tail call <vscale x 8 x half> @llvm.aarch64.sve.frecps.x.nxv8f16(<vscale x 8 x half> %approx, <vscale x 8 x half> %x)
%result = fmul <vscale x 8 x half> %approx, %correction
ret <vscale x 8 x half> %result
}

define weak_odr <vscale x 4 x float> @fast_inverse_sqrt_f32nx4(<vscale x 4 x float> %x) nounwind alwaysinline {
%approx = tail call <vscale x 4 x float> @llvm.aarch64.sve.frsqrte.x.nxv4f32(<vscale x 4 x float> %x)
%approx2 = fmul <vscale x 4 x float> %approx, %approx
%correction = tail call <vscale x 4 x float> @llvm.aarch64.sve.frsqrts.x.nxv4f32(<vscale x 4 x float> %approx2, <vscale x 4 x float> %x)
%result = fmul <vscale x 4 x float> %approx, %correction
ret <vscale x 4 x float> %result
}

define weak_odr <vscale x 8 x half> @fast_inverse_sqrt_f16nx8(<vscale x 8 x half> %x) nounwind alwaysinline {
%approx = tail call <vscale x 8 x half> @llvm.aarch64.sve.frsqrte.x.nxv8f16(<vscale x 8 x half> %x)
%approx2 = fmul <vscale x 8 x half> %approx, %approx
%correction = tail call <vscale x 8 x half> @llvm.aarch64.sve.frsqrts.x.nxv8f16(<vscale x 8 x half> %approx2, <vscale x 8 x half> %x)
%result = fmul <vscale x 8 x half> %approx, %correction
ret <vscale x 8 x half> %result
}
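Both the rewritten scalar routines and the new SVE variants above follow the same estimate-plus-refinement pattern: a hardware reciprocal (or reciprocal square root) estimate, then one Newton-Raphson step using the matching "step" instruction. As an editorial note on the math (the FRECPS/FRSQRTS semantics are from the Arm architecture reference; the derivation is not part of the patch):

% Reciprocal: FRECPS(x, a) = 2 - a x, so one refinement of an estimate x_n for 1/a is
x_{n+1} = x_n \cdot \mathrm{FRECPS}(x_n, a) = x_n (2 - a x_n)
% Reciprocal square root: FRSQRTS(x^2, a) = (3 - a x^2)/2, so for 1/\sqrt{a}
x_{n+1} = x_n \cdot \mathrm{FRSQRTS}(x_n^2, a) = \tfrac{1}{2}\, x_n (3 - a x_n^2)
% These are the standard Newton steps for f(x) = 1/x - a and f(x) = 1/x^2 - a respectively.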
8 changes: 8 additions & 0 deletions src/runtime/errors.cpp
@@ -291,4 +291,12 @@ WEAK int halide_error_device_crop_failed(void *user_context) {
return halide_error_code_device_crop_failed;
}

WEAK int halide_error_vscale_invalid(void *user_context, const char *func_name, int runtime_vscale, int compiletime_vscale) {
error(user_context)
<< "The function " << func_name
<< " is compiled with the assumption that vscale of Scalable Vector is " << compiletime_vscale
<< ". However, the detected runtime vscale is " << runtime_vscale << ".";
return halide_error_code_vscale_invalid;
}

} // extern "C"
28 changes: 27 additions & 1 deletion src/runtime/posix_math.ll
@@ -322,4 +322,30 @@ define weak_odr double @neg_inf_f64() nounwind uwtable readnone alwaysinline {

define weak_odr double @nan_f64() nounwind uwtable readnone alwaysinline {
ret double 0x7FF8000000000000
}
}

; For scalable vectors with a non-natural vector size, LLVM doesn't auto-vectorize the scalar versions above
define weak_odr <vscale x 4 x float> @inf_f32nx4() nounwind uwtable readnone alwaysinline {
ret <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> undef, float 0x7FF0000000000000, i32 0), <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer)
}

define weak_odr <vscale x 4 x float> @neg_inf_f32nx4() nounwind uwtable readnone alwaysinline {
ret <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> undef, float 0xFFF0000000000000, i32 0), <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer)
}

define weak_odr <vscale x 4 x float> @nan_f32nx4() nounwind uwtable readnone alwaysinline {
ret <vscale x 4 x float> shufflevector (<vscale x 4 x float> insertelement (<vscale x 4 x float> undef, float 0x7FF8000000000000, i32 0), <vscale x 4 x float> undef, <vscale x 4 x i32> zeroinitializer)
}


define weak_odr <vscale x 2 x double> @inf_f64nx2() nounwind uwtable readnone alwaysinline {
ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> undef, double 0x7FF0000000000000, i32 0), <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer)
}

define weak_odr <vscale x 2 x double> @neg_inf_f64nx2() nounwind uwtable readnone alwaysinline {
ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> undef, double 0xFFF0000000000000, i32 0), <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer)
}

define weak_odr <vscale x 2 x double> @nan_f64nx2() nounwind uwtable readnone alwaysinline {
ret <vscale x 2 x double> shufflevector (<vscale x 2 x double> insertelement (<vscale x 2 x double> undef, double 0x7FF8000000000000, i32 0), <vscale x 2 x double> undef, <vscale x 2 x i32> zeroinitializer)
}
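The scalable-vector constants above use the canonical LLVM splat idiom: insert the scalar into lane 0 of an undef vector, then shufflevector with an all-zero mask so lane 0 is broadcast across every (runtime-sized) lane. For reference, the same splat can be produced from C++ through IRBuilder — a hedged illustration, not code from this PR:

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"

// Hypothetical helper: splat +inf across a <vscale x 4 x float> value.
llvm::Value *splat_inf_f32nx4(llvm::IRBuilderBase &builder, llvm::LLVMContext &ctx) {
    llvm::Constant *inf =
        llvm::ConstantFP::getInfinity(llvm::Type::getFloatTy(ctx), /*Negative=*/false);
    return builder.CreateVectorSplat(llvm::ElementCount::getScalable(4), inf);
}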
1 change: 1 addition & 0 deletions src/runtime/runtime_api.cpp
@@ -89,6 +89,7 @@ extern "C" __attribute__((used)) void *halide_runtime_api_functions[] = {
(void *)&halide_error_unaligned_host_ptr,
(void *)&halide_error_storage_bound_too_small,
(void *)&halide_error_device_crop_failed,
(void *)&halide_error_vscale_invalid,
(void *)&halide_float16_bits_to_double,
(void *)&halide_float16_bits_to_float,
(void *)&halide_free,
1 change: 1 addition & 0 deletions test/correctness/CMakeLists.txt
@@ -276,6 +276,7 @@ tests(GROUPS correctness
simd_op_check_hvx.cpp
simd_op_check_powerpc.cpp
simd_op_check_riscv.cpp
simd_op_check_sve2.cpp
simd_op_check_wasm.cpp
simd_op_check_x86.cpp
simplified_away_embedded_image.cpp