diff --git a/src/llvm-demote-float16.cpp b/src/llvm-demote-float16.cpp
index 51535e7cb1f9f..57ec30ca57947 100644
--- a/src/llvm-demote-float16.cpp
+++ b/src/llvm-demote-float16.cpp
@@ -25,6 +25,8 @@
 #include <llvm/IR/PassManager.h>
 #include <llvm/IR/Verifier.h>
 #include <llvm/Support/Debug.h>
+#include "julia.h"
+#include "jitlayers.h"
 
 #define DEBUG_TYPE "demote_float16"
 
@@ -43,13 +45,47 @@ INST_STATISTIC(FRem);
 INST_STATISTIC(FCmp);
 #undef INST_STATISTIC
 
+extern JuliaOJIT *jl_ExecutionEngine;
+
+Optional<bool> always_have_fp16() {
+#if defined(_CPU_X86_) || defined(_CPU_X86_64_)
+    // x86 doesn't support fp16
+    // TODO: update for sapphire rapids when it comes out
+    return false;
+#else
+    return {};
+#endif
+}
+
 namespace {
 
+bool have_fp16(Function &caller) {
+    auto unconditional = always_have_fp16();
+    if (unconditional.hasValue())
+        return unconditional.getValue();
+
+    Attribute FSAttr = caller.getFnAttribute("target-features");
+    StringRef FS =
+        FSAttr.isValid() ? FSAttr.getValueAsString() : jl_ExecutionEngine->getTargetFeatureString();
+#if defined(_CPU_AARCH64_)
+    if (FS.find("+fp16fml") != llvm::StringRef::npos || FS.find("+fullfp16") != llvm::StringRef::npos){
+        return true;
+    }
+#else
+    if (FS.find("+avx512fp16") != llvm::StringRef::npos){
+        return true;
+    }
+#endif
+    return false;
+}
+
 static bool demoteFloat16(Function &F)
 {
+    if (have_fp16(F))
+        return false;
+
     auto &ctx = F.getContext();
     auto T_float32 = Type::getFloatTy(ctx);
-
     SmallVector<Instruction *, 0> erase;
     for (auto &BB : F) {
         for (auto &I : BB) {
diff --git a/src/llvm-multiversioning.cpp b/src/llvm-multiversioning.cpp
index 7e0bed6276a2d..f2fdb6f4fd1c8 100644
--- a/src/llvm-multiversioning.cpp
+++ b/src/llvm-multiversioning.cpp
@@ -45,6 +45,8 @@ using namespace llvm;
 
 extern Optional<bool> always_have_fma(Function&);
 
+extern Optional<bool> always_have_fp16();
+
 namespace {
 constexpr uint32_t clone_mask = JL_TARGET_CLONE_LOOP | JL_TARGET_CLONE_SIMD | JL_TARGET_CLONE_MATH | JL_TARGET_CLONE_CPU;
@@ -480,6 +482,14 @@ uint32_t CloneCtx::collect_func_info(Function &F)
                 flag |= JL_TARGET_CLONE_MATH;
                 }
             }
+            if(!always_have_fp16().hasValue()){
+                for (size_t i = 0; i < I.getNumOperands(); i++) {
+                    if(I.getOperand(i)->getType()->isHalfTy()){
+                        flag |= JL_TARGET_CLONE_FLOAT16;
+                    }
+                    // Check for BFloat16 when they are added to julia can be done here
+                }
+            }
             if (has_veccall && (flag & JL_TARGET_CLONE_SIMD) && (flag & JL_TARGET_CLONE_MATH)) {
                 return flag;
             }
diff --git a/src/processor.h b/src/processor.h
index f3b571cf9b937..4b9071fb4f663 100644
--- a/src/processor.h
+++ b/src/processor.h
@@ -112,6 +112,8 @@ enum {
     JL_TARGET_MINSIZE = 1 << 7,
     // Clone when the function queries CPU features
     JL_TARGET_CLONE_CPU = 1 << 8,
+    // Clone when the function uses fp16
+    JL_TARGET_CLONE_FLOAT16 = 1 << 9,
 };
 
 #define JL_FEATURE_DEF_NAME(name, bit, llvmver, str) JL_FEATURE_DEF(name, bit, llvmver)
diff --git a/src/processor_arm.cpp b/src/processor_arm.cpp
index ea8dddf629d62..eaa950662d0de 100644
--- a/src/processor_arm.cpp
+++ b/src/processor_arm.cpp
@@ -1602,12 +1602,19 @@ static void ensure_jit_target(bool imaging)
         auto &t = jit_targets[i];
         if (t.en.flags & JL_TARGET_CLONE_ALL)
             continue;
+        auto &features0 = jit_targets[t.base].en.features;
         // Always clone when code checks CPU features
         t.en.flags |= JL_TARGET_CLONE_CPU;
+        static constexpr uint32_t clone_fp16[] = {Feature::fp16fml,Feature::fullfp16};
+        for (auto fe: clone_fp16) {
+            if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {
+                t.en.flags |= JL_TARGET_CLONE_FLOAT16;
+                break;
+            }
+        }
         // The most useful one in general...
         t.en.flags |= JL_TARGET_CLONE_LOOP;
 #ifdef _CPU_ARM_
-        auto &features0 = jit_targets[t.base].en.features;
         static constexpr uint32_t clone_math[] = {Feature::vfp3, Feature::vfp4, Feature::neon};
         for (auto fe: clone_math) {
             if (!test_nbit(features0, fe) && test_nbit(t.en.features, fe)) {