diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
index 51535e7cb1f9f..57ec30ca57947 100644
--- a/src/llvm-demote-float16.cpp
+++ b/src/llvm-demote-float16.cpp
@@ -25,6 +25,8 @@
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/Verifier.h>
 #include <llvm/Support/Debug.h>
+#include "julia.h"
+#include "jitlayers.h"
 
 #define DEBUG_TYPE "demote_float16"
 
@@ -43,13 +45,47 @@ INST_STATISTIC(FRem);
 INST_STATISTIC(FCmp);
 #undef INST_STATISTIC
 
+extern JuliaOJIT *jl_ExecutionEngine;
+
+Optional<bool> always_have_fp16() {
+#if defined(_CPU_X86_) || defined(_CPU_X86_64_)
+    // x86 doesn't support fp16
+    // TODO: update for sapphire rapids when it comes out
+    return false;
+#else
+    return {};
+#endif
+}
+
 namespace {
 
+bool have_fp16(Function &caller) {
+    auto unconditional = always_have_fp16();
+    if (unconditional.hasValue())
+        return unconditional.getValue();
+
+    Attribute FSAttr = caller.getFnAttribute("target-features");
+    StringRef FS =
+        FSAttr.isValid() ? FSAttr.getValueAsString() : jl_ExecutionEngine->getTargetFeatureString();
+#if defined(_CPU_AARCH64_)
+    if (FS.find("+fp16fml") != llvm::StringRef::npos || FS.find("+fullfp16") != llvm::StringRef::npos){
+        return true;
+    }
+#else
+    if (FS.find("+avx512fp16") != llvm::StringRef::npos){
+        return true;
+    }
+#endif
+    return false;
+}
+
 static bool demoteFloat16(Function &F)
 {
+    if (have_fp16(F))
+        return false;
+
     auto &ctx = F.getContext();
     auto T_float32 = Type::getFloatTy(ctx);
-
     SmallVector<Instruction *, 0> erase;
     for (auto &BB : F) {
         for (auto &I : BB) {
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 7e0bed6276a2d..f2fdb6f4fd1c8 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -45,6 +45,8 @@ using namespace llvm;
 
 extern Optional<bool> always_have_fma(Function&);
 
+extern Optional<bool> always_have_fp16();
+
 namespace {
 constexpr uint32_t clone_mask = JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU;
@@ -480,6 +482,14 @@ uint32_t CloneCtx::collect_func_info(Function &F)
                 flag |= JL_TARGET_CLONE_MATH;
                 }
             }
+            if(!always_have_fp16().hasValue()){
+                for (size_t i = 0; i < I.getNumOperands(); i++) {
+                    if(I.getOperand(i)->getType()->isHalfTy()){
+                        flag |= JL_TARGET_CLONE_FLOAT16;
+                    }
+                    // Check for BFloat16 when they are added to julia can be done here
+                }
+            }
             if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) {
                 return flag;
             }
diff --git a/src/processor.h b/src/processor.h
index f3b571cf9b937..4b9071fb4f663 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -112,6 +112,8 @@ enum {
     JL_TARGET_MINSIZE = 1 << 7,
     // Clone when the function queries CPU features
     JL_TARGET_CLONE_CPU = 1 << 8,
+    // Clone when the function uses fp16
+    JL_TARGET_CLONE_FLOAT16 = 1 << 9,
 };
 
 #define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp
index ea8dddf629d62..eaa950662d0de 100644
--- a/src/processor_arm.cpp
+++ b/src/processor_arm.cpp
@@ -1602,12 +1602,19 @@ static void ensure_jit_target(bool imaging)
         auto &t = jit_targets[i];
         if (t.en.flags & JL_TARGET_CLONE_ALL)
             continue;
+        auto &features0 = jit_targets[t.base].en.features;
         // Always clone when code checks CPU features
         t.en.flags |= JL_TARGET_CLONE_CPU;
+        static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16};
+        for (auto fe: clone_fp16) {
+            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
+                t.en.flags |= JL_TARGET_CLONE_FLOAT16;
+                break;
+            }
+        }
         // The most useful one in general...
         t.en.flags |= JL_TARGET_CLONE_LOOP;
 #ifdef _CPU_ARM_
-        auto &features0 = jit_targets[t.base].en.features;
         static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon};
         for (auto fe: clone_math) {
             if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {