
Commit

Enabling fp16 execution
Signed-off-by: quic-shanagra <quic_shanagra@quicinc.com>
quic-shanagra authored and quic-zhanweiw committed Sep 19, 2024
1 parent 9a87fbd commit 4baf57c
Showing 3 changed files with 196 additions and 0 deletions.
156 changes: 156 additions & 0 deletions src/Utils/DataUtil.cpp
@@ -276,6 +276,162 @@ datautil::StatusCode datautil::writeBinaryToFile(std::string fileDir,
return StatusCode::SUCCESS;
}


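/*
 * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to
 * a 32-bit floating-point number in IEEE single-precision format.
 *
 * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
 * floating-point operations and bitcasts between integer and floating-point variables.
 * For reference (informal, not part of this change): 0x3C00 -> 1.0f, 0xC000 -> -2.0f, 0x7C00 -> +infinity.
 */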
static inline float datautil::fp16_ieee_to_fp32_value(uint16_t h) {
  const uint32_t w = (uint32_t) h << 16;
  const uint32_t sign = w & UINT32_C(0x80000000);
  const uint32_t two_w = w + w;
  const uint32_t exp_offset = UINT32_C(0xE0) << 23;
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
  const float exp_scale = 0x1.0p-112f;
#else
  const float exp_scale = fp32_from_bits(UINT32_C(0x7800000));
#endif
  const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale;
  const uint32_t magic_mask = UINT32_C(126) << 23;
  const float magic_bias = 0.5f;
  const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias;
  const uint32_t denormalized_cutoff = UINT32_C(1) << 27;
  const uint32_t result = sign |
      (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value));
  return fp32_from_bits(result);
}

bool datautil::floatNToFloat32(float* out,
                               uint8_t* in,
                               size_t numElements,
                               uint8_t bitWidth)
{
  if(numElements == 0) {
    return false;
  }

  if(bitWidth == 16){
#ifndef __hexagon__
    uint16_t *temp = (uint16_t *)in;
    for(size_t i = 0; i < numElements; i++){
      out[i] = fp16_ieee_to_fp32_value(temp[i]);
    }
#else
    return false;
#endif //__hexagon__
  }
  else if(bitWidth == 32) {
    float* inFloat = reinterpret_cast<float*>(in);
    for (size_t i = 0; i < numElements; i++) {
      out[i] = inFloat[i];
    }
  }
  else {
    return false;
  }

  return true;
}

static inline float datautil::fp32_from_bits(uint32_t w) {
#if defined(__OPENCL_VERSION__)
  return as_float(w);
#elif defined(__CUDA_ARCH__)
  return __uint_as_float((unsigned int) w);
#elif defined(__INTEL_COMPILER)
  return _castu32_f32(w);
#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
  return _CopyFloatFromInt32((__int32) w);
#else
  union {
    uint32_t as_bits;
    float as_value;
  } fp32 = { w };
  return fp32.as_value;
#endif
}

static inline uint32_t datautil::fp32_to_bits(float f) {
#if defined(__OPENCL_VERSION__)
  return as_uint(f);
#elif defined(__CUDA_ARCH__)
  return (uint32_t) __float_as_uint(f);
#elif defined(__INTEL_COMPILER)
  return _castf32_u32(f);
#elif defined(_MSC_VER) && (defined(_M_ARM) || defined(_M_ARM64))
  return (uint32_t) _CopyInt32FromFloat(f);
#else
  union {
    float as_value;
    uint32_t as_bits;
  } fp32 = { f };
  return fp32.as_bits;
#endif
}
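/* For reference (informal): fp32_to_bits(1.0f) == UINT32_C(0x3F800000) and fp32_from_bits(UINT32_C(0x3F800000)) == 1.0f. */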

/*
 * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in
 * IEEE half-precision format, in bit representation.
 *
 * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals)
 * floating-point operations and bitcasts between integer and floating-point variables.
 */
static inline uint16_t datautil::fp16_ieee_from_fp32_value(float f) {
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__)
  const float scale_to_inf = 0x1.0p+112f;
  const float scale_to_zero = 0x1.0p-110f;
#else
  const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000));
  const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000));
#endif
  float base = (fabsf(f) * scale_to_inf) * scale_to_zero;

  const uint32_t w = fp32_to_bits(f);
  const uint32_t shl1_w = w + w;
  const uint32_t sign = w & UINT32_C(0x80000000);
  uint32_t bias = shl1_w & UINT32_C(0xFF000000);
  if (bias < UINT32_C(0x71000000)) {
    bias = UINT32_C(0x71000000);
  }

  base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base;
  const uint32_t bits = fp32_to_bits(base);
  const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00);
  const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF);
  const uint32_t nonsign = exp_bits + mantissa_bits;
  return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign);
}
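/* For reference (informal): fp16_ieee_from_fp32_value(1.0f) == 0x3C00 and fp16_ieee_from_fp32_value(-2.0f) == 0xC000. */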

bool datautil::float32ToFloatN(uint8_t* out,
                               float* in,
                               size_t numElements,
                               uint8_t bitWidth)
{
  if(numElements == 0) {
    return false;
  }

  if(bitWidth == 16){
#ifndef __hexagon__
    uint16_t *temp = (uint16_t *)out;
    for(size_t i = 0; i < numElements; i++){
      temp[i] = fp16_ieee_from_fp32_value(in[i]);
    }
#else
    return false;
#endif //__hexagon__
  }
  else if(bitWidth == 32) {
    float* outFloat = reinterpret_cast<float*>(out);
    for (size_t i = 0; i < numElements; i++) {
      outFloat[i] = in[i];
    }
  }
  else {
    return false;
  }

  return true;
}

template <typename T_QuantType>
datautil::StatusCode datautil::floatToTfN(
    T_QuantType* out, float* in, int32_t offset, float scale, size_t numElements) {
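As an aside (not part of this commit), a minimal round-trip sketch of how the two new helpers above, float32ToFloatN and floatNToFloat32, could be exercised by a caller that links against DataUtil.cpp; the include path, test values, and main() harness are illustrative assumptions only:

// Hedged usage sketch: pack float32 values into fp16 bits and convert them back.
// Assumes "DataUtil.hpp" resolves to src/Utils/DataUtil.hpp on the include path.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "DataUtil.hpp"

int main() {
  std::vector<float> src = {0.0f, 1.0f, -2.0f, 0.5f};          // exactly representable in fp16
  std::vector<uint8_t> packed(src.size() * sizeof(uint16_t));  // fp16 occupies 2 bytes per element
  std::vector<float> back(src.size());

  // float32 -> fp16 bits, then fp16 bits -> float32; both return false for unsupported
  // bit widths and for bitWidth 16 on Hexagon builds.
  if (!datautil::float32ToFloatN(packed.data(), src.data(), src.size(), 16) ||
      !datautil::floatNToFloat32(back.data(), packed.data(), src.size(), 16)) {
    std::printf("fp16 conversion unavailable\n");
    return 1;
  }
  for (std::size_t i = 0; i < src.size(); i++) {
    std::printf("%.2f -> %.2f\n", src[i], back[i]);
  }
  return 0;
}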
16 changes: 16 additions & 0 deletions src/Utils/DataUtil.hpp
@@ -86,6 +86,22 @@ StatusCode writeBinaryToFile(std::string fileDir,
                             uint8_t* buffer,
                             size_t bufferSize);

static inline uint16_t fp16_ieee_from_fp32_value(float f);
static inline float fp16_ieee_to_fp32_value(uint16_t h);

static inline uint32_t fp32_to_bits(float f);
static inline float fp32_from_bits(uint32_t w);

bool floatNToFloat32(float* out,
                     uint8_t* in,
                     size_t numElements,
                     uint8_t bitWidth);

bool float32ToFloatN(uint8_t* out,
                     float* in,
                     size_t numElements,
                     uint8_t bitWidth);

template <typename T_QuantType>
datautil::StatusCode floatToTfN(
    T_QuantType* out, float* in, int32_t offset, float scale, size_t numElements);
24 changes: 24 additions & 0 deletions src/Utils/IOTensor.cpp
@@ -71,6 +71,21 @@ iotensor::StatusCode iotensor::IOTensor::copyFromFloatToNative(float* floatBuffer
  fillDims(dims, QNN_TENSOR_GET_DIMENSIONS(tensor), QNN_TENSOR_GET_RANK(tensor));

  switch (QNN_TENSOR_GET_DATA_TYPE(tensor)) {
    case QNN_DATATYPE_FLOAT_16:
#ifdef __hexagon__
      QNN_ERROR("failure in datautil::float32ToFloatN, not supported on Hexagon");
      returnStatus = StatusCode::FAILURE;
#else
      if (!datautil::float32ToFloatN(static_cast<uint8_t*>(QNN_TENSOR_GET_CLIENT_BUF(tensor).data),
                                     floatBuffer,
                                     datautil::calculateElementCount(dims),
                                     16)) {
        QNN_ERROR("failure in datautil::float32ToFloatN");
        returnStatus = StatusCode::FAILURE;
      }
#endif
      break;

    case QNN_DATATYPE_UFIXED_POINT_8:
      datautil::floatToTfN<uint8_t>(static_cast<uint8_t*>(QNN_TENSOR_GET_CLIENT_BUF(tensor).data),
                                    floatBuffer,
@@ -527,6 +542,7 @@ iotensor::StatusCode iotensor::IOTensor::allocateBuffer(uint8_t** buffer,
  size_t elementCount = datautil::calculateElementCount(dims);
  auto returnStatus   = StatusCode::SUCCESS;
  switch (dataType) {
    case QNN_DATATYPE_FLOAT_16:
    case QNN_DATATYPE_FLOAT_32:
      QNN_DEBUG("allocating float buffer");
      returnStatus = allocateBuffer<float>(reinterpret_cast<float**>(buffer), elementCount);
@@ -614,6 +630,14 @@ iotensor::StatusCode iotensor::IOTensor::convertToFloat(float** out, Qnn_Tensor_
    return returnStatus;
  }
  switch (QNN_TENSOR_GET_DATA_TYPE(tensor)) {
    case QNN_DATATYPE_FLOAT_16:
      if (!datautil::floatNToFloat32(
              *out, reinterpret_cast<uint8_t*>(QNN_TENSOR_GET_CLIENT_BUF(tensor).data), elementCount, 16)) {
        QNN_ERROR("failure in datautil::floatNToFloat32");
        returnStatus = StatusCode::FAILURE;
      }
      break;

    case QNN_DATATYPE_UFIXED_POINT_8:
      if (datautil::StatusCode::SUCCESS !=
          datautil::tfNToFloat<uint8_t>(
