From 2b8cd1708cc3884a11615e618a14d9011d98f421 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 25 Sep 2024 12:59:37 -0700 Subject: [PATCH 01/16] Add microbenchmark for layer normalization --- cmake/onnxruntime_unittests.cmake | 3 +- .../core/providers/cpu/nn/layer_norm_impl.cc | 113 +++++++++--------- .../core/providers/cpu/nn/layer_norm_impl.h | 39 ++++++ .../microbenchmark/layer_normalization.cc | 108 +++++++++++++++++ 4 files changed, 207 insertions(+), 56 deletions(-) create mode 100644 onnxruntime/test/onnx/microbenchmark/layer_normalization.cc diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index a4ba85e868896..f6ace371531f9 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -1128,7 +1128,8 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) ${BENCHMARK_DIR}/gelu.cc ${BENCHMARK_DIR}/activation.cc ${BENCHMARK_DIR}/quantize.cc - ${BENCHMARK_DIR}/reduceminmax.cc) + ${BENCHMARK_DIR}/reduceminmax.cc + ${BENCHMARK_DIR}/layer_normalization.cc) target_include_directories(onnxruntime_benchmark PRIVATE ${ONNXRUNTIME_ROOT} ${onnxruntime_graph_header} ${ONNXRUNTIME_ROOT}/core/mlas/inc) target_compile_definitions(onnxruntime_benchmark PRIVATE BENCHMARK_STATIC_DEFINE) if(WIN32) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 23630dcb63efa..57eb8c69a3067 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -12,6 +12,8 @@ namespace onnxruntime { +namespace { + // Utility to convert from MLFloat16 to float only when the input type is MLFloat16. template ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val); @@ -63,15 +65,16 @@ ORT_FORCEINLINE constexpr double ConvertToMLFloat16IfNeeded(double val) { return val; } +} // namespace + LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op) : OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op} { ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK()); ORT_ENFORCE(op_kernel_info.GetAttr("epsilon", &epsilon_).IsOK()); } -namespace { template -Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) { +Status LayerNormImpl::ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) const { // Inputs const Tensor* X = p_ctx->Input(0); const Tensor* scale = p_ctx->Input(1); @@ -81,21 +84,12 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo const T* bias_data = (simplified || nullptr == bias) ? nullptr : bias->Data(); const TensorShape& x_shape = X->Shape(); - const int64_t axis = HandleNegativeAxis(orig_axis, x_shape.NumDimensions()); - int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow(axis)); - int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); - - const auto scale_size = scale->Shape().Size(); - const auto bias_size = (bias_data) ? bias->Shape().Size() : 0; - if (scale_size != norm_size || (bias_data && bias_size != norm_size)) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Size of X.shape()[axis:] == ", norm_size, - ". Size of scale and bias (if provided) must match this. 
Got scale size of ", - scale_size, " and bias size of ", bias_size); - } - + const TensorShape& scale_shape = scale->Shape(); + const TensorShape& bias_shape = bias->Shape(); Tensor* Y = p_ctx->Output(0, x_shape); - auto Y_data = Y->MutableData(); + T* Y_data = Y->MutableData(); + + const int64_t axis = HandleNegativeAxis(orig_axis, x_shape.NumDimensions()); std::vector mean_inv_std_dev_dim; mean_inv_std_dev_dim.reserve(x_shape.NumDimensions()); @@ -107,17 +101,11 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo } } - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); - int output_index = 1; - + Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim)); U* mean_data = nullptr; - if (!simplified) { - Tensor* mean = p_ctx->Output(output_index++, TensorShape(mean_inv_std_dev_dim)); - if (mean != nullptr) { - mean_data = mean->MutableData(); - } + if (mean != nullptr) { + mean_data = mean->MutableData(); } U* inv_std_dev_data = nullptr; @@ -126,8 +114,51 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo inv_std_dev_data = inv_std_dev->MutableData(); } + onnxruntime::concurrency::ThreadPool* thread_pool = p_ctx->GetOperatorThreadPool(); + + return ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, + Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified); +} + +Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { + const auto elem_type = p_ctx->Input(0)->GetElementType(); + + using SupportedTypeList = boost::mp11::mp_list; + + utils::MLTypeCallDispatcherFromTypeList t_disp(elem_type); + return t_disp.InvokeRet(this, p_ctx, axis_, epsilon_, simplified_, contrib_op_); +} + +template +Status LayerNormImpl::ComputeWithoutContext( + const T* X_data, + const TensorShape& x_shape, + const T* scale_data, + const TensorShape& scale_shape, + const T* bias_data, + const TensorShape& bias_shape, + T* Y_data, + U* mean_data, + U* inv_std_dev_data, + onnxruntime::concurrency::ThreadPool* thread_pool, + int64_t axis, + float epsilon, + bool simplified +) const { + int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow(axis)); + int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); + + const auto scale_size = scale_shape.Size(); + const auto bias_size = (bias_data) ? bias_shape.Size() : 0; + if (scale_size != norm_size || (bias_data && bias_size != norm_size)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "Size of X.shape()[axis:] == ", norm_size, + ". Size of scale and bias (if provided) must match this. 
Got scale size of ", + scale_size, " and bias size of ", bias_size); + } + concurrency::ThreadPool::TryBatchParallelFor( - p_ctx->GetOperatorThreadPool(), static_cast(norm_count), + thread_pool, static_cast(norm_count), [&](ptrdiff_t task_idx) { const T* p_input = X_data + task_idx * norm_size; T* p_output = Y_data + task_idx * norm_size; @@ -159,7 +190,7 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo DoubleOrFloat scale_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(scale_data[h]); if (simplified) { p_output[h] = ConvertToMLFloat16IfNeeded(input_value / mean_square * scale_value); - } else if (nullptr == bias) { + } else if (nullptr == bias_data) { p_output[h] = ConvertToMLFloat16IfNeeded((input_value - mean) / mean_square * scale_value); } else { DoubleOrFloat bias_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(bias_data[h]); @@ -181,32 +212,4 @@ Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, boo return Status::OK(); } -template -struct SrcDispatcher { - Status operator()(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified, bool contrib_op) const { - // the contrib op kernel was always registered with the same type for all constraints. - // our implementation of the onnx op only supports 'float' as the U constraint. -#if !defined(DISABLE_CONTRIB_OPS) - if (contrib_op) { - return ComputeImpl(p_ctx, orig_axis, epsilon, simplified); - } else -#else - ORT_UNUSED_PARAMETER(contrib_op); -#endif - { - return ComputeImpl(p_ctx, orig_axis, epsilon, simplified); - } - } -}; -} // namespace - -Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { - const auto elem_type = p_ctx->Input(0)->GetElementType(); - - using SupportedTypeList = boost::mp11::mp_list; - - utils::MLTypeCallDispatcherFromTypeList t_disp(elem_type); - return t_disp.InvokeRet(p_ctx, axis_, epsilon_, simplified_, contrib_op_); -} - } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index 393c637dbda18..086adb8dfa94b 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -14,7 +14,46 @@ class LayerNormImpl : public OpKernel { LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified = false, bool contrib_op = false); Status Compute(OpKernelContext* p_op_kernel_context) const override; + // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`. + template + Status ComputeWithoutContext( + const T* X_data, + const TensorShape& x_shape, + const T* scale_data, + const TensorShape& scale_shape, + const T* bias_data, + const TensorShape& bias_shape, + T* Y_data, + U* mean_data, + U* inv_std_dev, + onnxruntime::concurrency::ThreadPool* thread_pool, + int64_t axis, + float epsilon = epsilon_, + bool simplified = simplified_ + ) const; + private: + template + Status ComputeImpl(OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified) const; + + template + struct SrcDispatcher { + Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified, bool contrib_op) const { + // the contrib op kernel was always registered with the same type for all constraints. + // our implementation of the onnx op only supports 'float' as the U constraint. 
+ #if !defined(DISABLE_CONTRIB_OPS) + if (contrib_op) { + return p_instance->ComputeImpl(p_ctx, orig_axis, epsilon, simplified); + } else + #else + ORT_UNUSED_PARAMETER(contrib_op); + #endif + { + return p_instance->ComputeImpl(p_ctx, orig_axis, epsilon, simplified); + } + } + }; + int64_t axis_; float epsilon_; const bool simplified_; diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc new file mode 100644 index 0000000000000..20089262e1d1c --- /dev/null +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -0,0 +1,108 @@ +#include "core/platform/threadpool.h" +#include "core/util/thread_utils.h" +#include + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmaybe-uninitialized" +#endif + +#include "core/framework/allocator.h" +#include "core/framework/config_options.h" +#include "core/framework/data_transfer_manager.h" +#include "core/framework/op_kernel_info.h" +#include "core/framework/ort_value_name_idx_map.h" +#include "core/platform/windows/env.h" +#include "core/providers/cpu/nn/layer_norm_impl.h" +#include "core/providers/cpu/cpu_provider_factory.h" +#include "core/providers/cpu/cpu_provider_factory_creator.h" +#include "core/util/thread_utils.h" + +#if defined(__GNUC__) && !defined(__clang__) +#pragma GCC diagnostic pop +#endif + +using namespace onnxruntime; + +template +static void BM_LayerNormalization(benchmark::State& state) { + bool simplified = false; + const float epsilon = 1e-05f; + int64_t axis = 1; + + onnxruntime::Node node; + // Required by LayerNormImpl constructor + node.AddAttribute("axis", axis); + node.AddAttribute("epsilon", epsilon); + + KernelDef kernel_def; + std::unique_ptr execution_provider = CPUProviderFactoryCreator::Create(true)->CreateProvider(); + std::unordered_map constant_initialized_tensors; + OrtValueNameIdxMap mlvalue_name_idx_map; + DataTransferManager data_transfer_mgr; + AllocatorMap allocators; + ConfigOptions config_options; + + OpKernelInfo op_kernel_info(node, kernel_def, *execution_provider, constant_initialized_tensors, mlvalue_name_idx_map, + data_transfer_mgr, allocators, config_options); + + LayerNormImpl layer_norm_impl(op_kernel_info); + + std::vector x_dims{2, 2, 2}; + TensorShape x_shape(x_dims); + std::vector x{1, 1, 1, 1, 1, 1, 1, 1}; + + std::vector scale_bias_dims{1, 2, 2}; + TensorShape scale_shape(scale_bias_dims); + TensorShape bias_shape(scale_bias_dims); + std::vector scale{1, 1, 1, 1}; + std::vector bias{1, 1, 1, 1}; + + T* X_data = static_cast(malloc(x.size() * sizeof(T))); + T* scale_data = static_cast(malloc(scale.size() * sizeof(T))); + T* bias_data = static_cast(malloc(bias.size() * sizeof(T))); + for (size_t i = 0; i < x.size(); i++) { + X_data[i] = T(x[i]); + } + for (size_t i = 0; i < scale.size(); i++) { + scale_data[i] = T(scale[i]); + } + for (size_t i = 0; i < bias.size(); i++) { + bias_data[i] = T(bias[i]); + } + + T* Y_data = static_cast(malloc(x.size() * sizeof(T))); + U* mean_data = static_cast(malloc(x.size() * sizeof(U))); + U* inv_std_dev_data = static_cast(malloc(x.size() * sizeof(U))); + + OrtThreadPoolParams tp_params; + tp_params.name = ORT_TSTR("intra-op"); + std::unique_ptr thread_pool = concurrency::CreateThreadPool( + &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); + + for (auto _ : state) { + auto status = layer_norm_impl.ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, + Y_data, 
mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); + + if (! status.IsOK()) + { + std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; + break; + } + } +} + + +BENCHMARK(BM_LayerNormalization) + ->Arg(1) + ->Arg(256) + ->Arg(1024) + ->UseRealTime() + ->Unit(benchmark::TimeUnit::kMicrosecond); + +BENCHMARK(BM_LayerNormalization) + ->Arg(1) + ->Arg(256) + ->Arg(1024) + ->UseRealTime() + ->Unit(benchmark::TimeUnit::kMicrosecond); From 0c89631e7f05a819494372d6ee093786aa381a3a Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Wed, 25 Sep 2024 14:25:02 -0700 Subject: [PATCH 02/16] fix warnings --- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 2 +- onnxruntime/core/providers/cpu/nn/layer_norm_impl.h | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 57eb8c69a3067..885e676998ed7 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -129,7 +129,7 @@ Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { return t_disp.InvokeRet(this, p_ctx, axis_, epsilon_, simplified_, contrib_op_); } -template +template Status LayerNormImpl::ComputeWithoutContext( const T* X_data, const TensorShape& x_shape, diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index 086adb8dfa94b..9c2ed303eef5e 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -15,7 +15,7 @@ class LayerNormImpl : public OpKernel { Status Compute(OpKernelContext* p_op_kernel_context) const override; // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`. 
- template + template Status ComputeWithoutContext( const T* X_data, const TensorShape& x_shape, @@ -28,8 +28,8 @@ class LayerNormImpl : public OpKernel { U* inv_std_dev, onnxruntime::concurrency::ThreadPool* thread_pool, int64_t axis, - float epsilon = epsilon_, - bool simplified = simplified_ + float epsilon, + bool simplified ) const; private: From bca13ca03647866f1d22769f080174f703727603 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 26 Sep 2024 08:46:11 -0700 Subject: [PATCH 03/16] initialize test input data at compile time --- .../microbenchmark/layer_normalization.cc | 64 ++++++++++--------- 1 file changed, 34 insertions(+), 30 deletions(-) diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc index 20089262e1d1c..a2987b4d7c25a 100644 --- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -24,6 +24,29 @@ using namespace onnxruntime; +namespace { + +static const std::vector dims{1, 256, 1024}; +static const size_t num_elems = dims[0] * dims[1] * dims[2]; +static const std::vector float_vals(num_elems, 1.0f); +static const std::vector MLFloat16_vals(num_elems, MLFloat16(1.0f)); + +} // namespace + +template +const T* getVector(); + +template <> +const float* getVector() { + return float_vals.data(); +} + +template <> +const MLFloat16* getVector() { + return MLFloat16_vals.data(); +} + + template static void BM_LayerNormalization(benchmark::State& state) { bool simplified = false; @@ -48,32 +71,17 @@ static void BM_LayerNormalization(benchmark::State& state) { LayerNormImpl layer_norm_impl(op_kernel_info); - std::vector x_dims{2, 2, 2}; - TensorShape x_shape(x_dims); - std::vector x{1, 1, 1, 1, 1, 1, 1, 1}; - - std::vector scale_bias_dims{1, 2, 2}; - TensorShape scale_shape(scale_bias_dims); - TensorShape bias_shape(scale_bias_dims); - std::vector scale{1, 1, 1, 1}; - std::vector bias{1, 1, 1, 1}; - - T* X_data = static_cast(malloc(x.size() * sizeof(T))); - T* scale_data = static_cast(malloc(scale.size() * sizeof(T))); - T* bias_data = static_cast(malloc(bias.size() * sizeof(T))); - for (size_t i = 0; i < x.size(); i++) { - X_data[i] = T(x[i]); - } - for (size_t i = 0; i < scale.size(); i++) { - scale_data[i] = T(scale[i]); - } - for (size_t i = 0; i < bias.size(); i++) { - bias_data[i] = T(bias[i]); - } + TensorShape x_shape(dims); + TensorShape scale_shape(dims); + TensorShape bias_shape(dims); + + const T* x_data = getVector(); + const T* scale_data = getVector(); + const T* bias_data = getVector(); - T* Y_data = static_cast(malloc(x.size() * sizeof(T))); - U* mean_data = static_cast(malloc(x.size() * sizeof(U))); - U* inv_std_dev_data = static_cast(malloc(x.size() * sizeof(U))); + T* Y_data = static_cast(malloc(num_elems * sizeof(T))); + U* mean_data = static_cast(malloc(num_elems * sizeof(U))); + U* inv_std_dev_data = static_cast(malloc(num_elems * sizeof(U))); OrtThreadPoolParams tp_params; tp_params.name = ORT_TSTR("intra-op"); @@ -81,7 +89,7 @@ static void BM_LayerNormalization(benchmark::State& state) { &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); for (auto _ : state) { - auto status = layer_norm_impl.ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, + auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); if 
(! status.IsOK()) @@ -95,14 +103,10 @@ static void BM_LayerNormalization(benchmark::State& state) { BENCHMARK(BM_LayerNormalization) ->Arg(1) - ->Arg(256) - ->Arg(1024) ->UseRealTime() ->Unit(benchmark::TimeUnit::kMicrosecond); BENCHMARK(BM_LayerNormalization) ->Arg(1) - ->Arg(256) - ->Arg(1024) ->UseRealTime() ->Unit(benchmark::TimeUnit::kMicrosecond); From 680cf4fcf2e88af9415e69786a408990ca063ffa Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Thu, 26 Sep 2024 09:12:38 -0700 Subject: [PATCH 04/16] remove unused specialization that fails on pipeline --- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 5 ----- onnxruntime/core/providers/cpu/nn/layer_norm_impl.h | 6 +++--- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 885e676998ed7..546557f6f9015 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -23,11 +23,6 @@ ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded( return val.ToFloat(); } -template <> -ORT_FORCEINLINE double ConvertMLFloat16ToDoubleOrFloatIfNeeded(MLFloat16 val) { - return double(ConvertMLFloat16ToDoubleOrFloatIfNeeded(val)); -} - template <> ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded(float val) { return val; diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index 9c2ed303eef5e..aa876357ed3c8 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -29,8 +29,7 @@ class LayerNormImpl : public OpKernel { onnxruntime::concurrency::ThreadPool* thread_pool, int64_t axis, float epsilon, - bool simplified - ) const; + bool simplified) const; private: template @@ -38,7 +37,8 @@ class LayerNormImpl : public OpKernel { template struct SrcDispatcher { - Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, float epsilon, bool simplified, bool contrib_op) const { + Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, + float epsilon, bool simplified, bool contrib_op) const { // the contrib op kernel was always registered with the same type for all constraints. // our implementation of the onnx op only supports 'float' as the U constraint. 
#if !defined(DISABLE_CONTRIB_OPS) From f0df5263f9b2d9ebcf7cef8aa34c7118bd3746b3 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 01:15:13 -0700 Subject: [PATCH 05/16] fix build on linux --- onnxruntime/test/onnx/microbenchmark/layer_normalization.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc index a2987b4d7c25a..5c7bd5716832a 100644 --- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -1,3 +1,5 @@ +#ifdef _WIN32 + #include "core/platform/threadpool.h" #include "core/util/thread_utils.h" #include @@ -110,3 +112,5 @@ BENCHMARK(BM_LayerNormalization) ->Arg(1) ->UseRealTime() ->Unit(benchmark::TimeUnit::kMicrosecond); + +#endif From 87725c37e4f65f4372ffcf05228b6e15ff081077 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 08:20:28 -0700 Subject: [PATCH 06/16] convert all inputs to float efficiently if needed --- .../contrib_ops/cpu/skip_layer_norm.cc | 81 ++++++++++++------- .../core/providers/cpu/nn/layer_norm_impl.cc | 69 ++++++++++------ 2 files changed, 99 insertions(+), 51 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index faf78cae80ee1..50ce160f38153 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "core/framework/tensor.h" +#include "core/mlas/inc/mlas.h" #include "core/util/math_cpuonly.h" #include "core/providers/common.h" #include "core/platform/threadpool.h" @@ -36,30 +37,32 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) -// Utility to convert from MLFloat16 to float only when the input type is MLFloat16. -template -ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val); -template <> -ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded(MLFloat16 val) { - return val.ToFloat(); -} +template +std::shared_ptr> ConvertHalfToFloatIfNeeded(const T* p_input, int num_elems); -template <> -ORT_FORCEINLINE double ConvertMLFloat16ToDoubleOrFloatIfNeeded(MLFloat16 val) { - return static_cast(ConvertMLFloat16ToDoubleOrFloatIfNeeded(val)); +template +std::shared_ptr> ConvertHalfToFloatIfNeeded( + const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) +{ + return nullptr; } -template <> -ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded(float val) { - return val; -} +template<> +std::shared_ptr> ConvertHalfToFloatIfNeeded(const MLFloat16* p_input, int num_elems) +{ + if (!p_input) { + return nullptr; + } -template <> -ORT_FORCEINLINE constexpr double ConvertMLFloat16ToDoubleOrFloatIfNeeded(double val) { - return val; + // Efficiently convert all the MLFloat16 values to floats. + std::shared_ptr> vec = std::make_shared>(num_elems); + MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems); + + return vec; } + // Function template that only converts the input value to MLFloat16 if T is MLFloat16. 
template ORT_FORCEINLINE constexpr typename std::enable_if_t || std::is_same_v, T> @@ -145,15 +148,28 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { DoubleOrFloat mean(0.0f); DoubleOrFloat mean_square(0.0f); + std::shared_ptr> float_input = ConvertHalfToFloatIfNeeded(p_input, hidden_size); + const DoubleOrFloat* converted_input = + float_input == nullptr + ? reinterpret_cast(p_input) + : reinterpret_cast(&(*float_input)[0]); + std::shared_ptr> float_skip = ConvertHalfToFloatIfNeeded(p_skip, hidden_size); + const DoubleOrFloat* converted_skip = + float_skip == nullptr + ? reinterpret_cast(p_skip) + : reinterpret_cast(&(*float_skip)[0]); + std::shared_ptr> float_bias = ConvertHalfToFloatIfNeeded(bias_data, hidden_size); + const DoubleOrFloat* converted_bias = + float_bias == nullptr + ? reinterpret_cast(bias_data) + : reinterpret_cast(&(*float_bias)[0]); + std::unique_ptr output_buffer = std::make_unique(hidden_size); for (size_t h = 0; h < static_cast(hidden_size); h++) { - DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(p_input[h]); - DoubleOrFloat skip_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(p_skip[h]); - - DoubleOrFloat value = input_value + skip_value; + DoubleOrFloat value = converted_input[h] + converted_skip[h]; if (nullptr != bias_data) { - value += ConvertMLFloat16ToDoubleOrFloatIfNeeded(bias_data[h]); + value += converted_bias[h]; } output_buffer[h] = value; @@ -173,15 +189,26 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); } + std::shared_ptr> float_gamma = ConvertHalfToFloatIfNeeded(gamma_data, hidden_size); + const DoubleOrFloat* converted_gamma = + float_gamma == nullptr + ? reinterpret_cast(gamma_data) + : reinterpret_cast(&(*float_gamma)[0]); + std::shared_ptr> float_beta = ConvertHalfToFloatIfNeeded(beta_data, hidden_size); + const DoubleOrFloat* converted_beta = + float_beta == nullptr + ? 
reinterpret_cast(beta_data) + : reinterpret_cast(&(*float_beta)[0]); for (size_t h = 0; h < static_cast(hidden_size); h++) { - DoubleOrFloat gamma_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(gamma_data[h]); if (simplified) { - p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded(output_buffer[h] / mean_square * gamma_value); + p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded( + output_buffer[h] / mean_square * converted_gamma[h]); } else if (nullptr == beta_data) { - p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded((output_buffer[h] - mean) / mean_square * gamma_value); + p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded( + (output_buffer[h] - mean) / mean_square * converted_gamma[h]); } else { - DoubleOrFloat beta_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(beta_data[h]); - p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded((output_buffer[h] - mean) / mean_square * gamma_value + beta_value); + p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded( + (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]); } } }, diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 546557f6f9015..9010dd7d6f1b4 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -5,6 +5,7 @@ #include "core/common/safeint.h" #include "core/framework/tensor.h" +#include "core/mlas/inc/mlas.h" #include "core/platform/threadpool.h" #include "core/providers/common.h" #include "core/util/force_inline.h" @@ -14,23 +15,28 @@ namespace onnxruntime { namespace { -// Utility to convert from MLFloat16 to float only when the input type is MLFloat16. -template -ORT_FORCEINLINE Ret ConvertMLFloat16ToDoubleOrFloatIfNeeded(T val); +template +std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded(const T* p_input, int num_elems); -template <> -ORT_FORCEINLINE float ConvertMLFloat16ToDoubleOrFloatIfNeeded(MLFloat16 val) { - return val.ToFloat(); +template +std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded( + const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) +{ + return nullptr; } -template <> -ORT_FORCEINLINE constexpr float ConvertMLFloat16ToDoubleOrFloatIfNeeded(float val) { - return val; -} +template<> +std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded(const MLFloat16* p_input, int num_elems) +{ + if (!p_input) { + return nullptr; + } -template <> -ORT_FORCEINLINE constexpr double ConvertMLFloat16ToDoubleOrFloatIfNeeded(double val) { - return val; + // Efficiently convert all the MLFloat16 values to floats. + std::shared_ptr> vec = std::make_shared>(num_elems); + MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems); + + return vec; } ORT_FORCEINLINE constexpr float ConvertToFloatIfNeeded(float val) { @@ -138,8 +144,7 @@ Status LayerNormImpl::ComputeWithoutContext( onnxruntime::concurrency::ThreadPool* thread_pool, int64_t axis, float epsilon, - bool simplified -) const { + bool simplified) const { int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow(axis)); int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); @@ -167,10 +172,17 @@ Status LayerNormImpl::ComputeWithoutContext( DoubleOrFloat mean(0.0f); DoubleOrFloat mean_square(0.0f); + std::shared_ptr> float_input = ConvertMLFloat16ToFloatIfNeeded(p_input, norm_size); + const DoubleOrFloat* converted_input = + float_input == nullptr + ? 
reinterpret_cast(p_input) + : reinterpret_cast(&(*float_input)[0]); + + std::unique_ptr output_buffer = std::make_unique(norm_size); for (int64_t h = 0; h < norm_size; h++) { - DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(p_input[h]); - mean += input_value; - mean_square += input_value * input_value; + output_buffer[h] = converted_input[h]; + mean += converted_input[h]; + mean_square += converted_input[h] * converted_input[h]; } mean = mean / norm_size; @@ -180,16 +192,25 @@ Status LayerNormImpl::ComputeWithoutContext( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } + std::shared_ptr> float_scale = ConvertMLFloat16ToFloatIfNeeded(scale_data, norm_size); + const DoubleOrFloat* converted_scale = + float_scale == nullptr + ? reinterpret_cast(scale_data) + : reinterpret_cast(&(*float_scale)[0]); + std::shared_ptr> float_bias = ConvertMLFloat16ToFloatIfNeeded(bias_data, norm_size); + const DoubleOrFloat* converted_bias = + float_bias == nullptr + ? reinterpret_cast(bias_data) + : reinterpret_cast(&(*float_bias)[0]); + for (int64_t h = 0; h < norm_size; h++) { - DoubleOrFloat input_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(p_input[h]); - DoubleOrFloat scale_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(scale_data[h]); if (simplified) { - p_output[h] = ConvertToMLFloat16IfNeeded(input_value / mean_square * scale_value); + p_output[h] = ConvertToMLFloat16IfNeeded(output_buffer[h] / mean_square * converted_scale[h]); } else if (nullptr == bias_data) { - p_output[h] = ConvertToMLFloat16IfNeeded((input_value - mean) / mean_square * scale_value); + p_output[h] = ConvertToMLFloat16IfNeeded((output_buffer[h] - mean) / mean_square * converted_scale[h]); } else { - DoubleOrFloat bias_value = ConvertMLFloat16ToDoubleOrFloatIfNeeded(bias_data[h]); - p_output[h] = ConvertToMLFloat16IfNeeded((input_value - mean) / mean_square * scale_value + bias_value); + p_output[h] = ConvertToMLFloat16IfNeeded( + (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h]); } } From 8aa80daa2532d4edb1d69111c43436b99a69a774 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 10:13:17 -0700 Subject: [PATCH 07/16] convert output buffer efficiently in layer_norm_impl --- .../contrib_ops/cpu/skip_layer_norm.cc | 16 ++--- .../core/providers/cpu/nn/layer_norm_impl.cc | 63 +++++++++++++++---- 2 files changed, 59 insertions(+), 20 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 50ce160f38153..9178a2f17015b 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -39,17 +39,17 @@ REGISTER_KERNEL_TYPED(MLFloat16) template -std::shared_ptr> ConvertHalfToFloatIfNeeded(const T* p_input, int num_elems); +std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems); template -std::shared_ptr> ConvertHalfToFloatIfNeeded( +std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; } template<> -std::shared_ptr> ConvertHalfToFloatIfNeeded(const MLFloat16* p_input, int num_elems) +std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) { if (!p_input) { return nullptr; @@ -148,17 +148,17 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { DoubleOrFloat mean(0.0f); DoubleOrFloat mean_square(0.0f); - std::shared_ptr> float_input = ConvertHalfToFloatIfNeeded(p_input, 
hidden_size); + std::shared_ptr> float_input = ConvertHalfToFloatBufferIfNeeded(p_input, hidden_size); const DoubleOrFloat* converted_input = float_input == nullptr ? reinterpret_cast(p_input) : reinterpret_cast(&(*float_input)[0]); - std::shared_ptr> float_skip = ConvertHalfToFloatIfNeeded(p_skip, hidden_size); + std::shared_ptr> float_skip = ConvertHalfToFloatBufferIfNeeded(p_skip, hidden_size); const DoubleOrFloat* converted_skip = float_skip == nullptr ? reinterpret_cast(p_skip) : reinterpret_cast(&(*float_skip)[0]); - std::shared_ptr> float_bias = ConvertHalfToFloatIfNeeded(bias_data, hidden_size); + std::shared_ptr> float_bias = ConvertHalfToFloatBufferIfNeeded(bias_data, hidden_size); const DoubleOrFloat* converted_bias = float_bias == nullptr ? reinterpret_cast(bias_data) @@ -189,12 +189,12 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); } - std::shared_ptr> float_gamma = ConvertHalfToFloatIfNeeded(gamma_data, hidden_size); + std::shared_ptr> float_gamma = ConvertHalfToFloatBufferIfNeeded(gamma_data, hidden_size); const DoubleOrFloat* converted_gamma = float_gamma == nullptr ? reinterpret_cast(gamma_data) : reinterpret_cast(&(*float_gamma)[0]); - std::shared_ptr> float_beta = ConvertHalfToFloatIfNeeded(beta_data, hidden_size); + std::shared_ptr> float_beta = ConvertHalfToFloatBufferIfNeeded(beta_data, hidden_size); const DoubleOrFloat* converted_beta = float_beta == nullptr ? reinterpret_cast(beta_data) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 9010dd7d6f1b4..a7ab7c6b526d6 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,18 +15,38 @@ namespace onnxruntime { namespace { +double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) +{ + return p_output; +} + +float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) +{ + return p_output; +} + +float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) +{ + if (!p_output) { + return nullptr; + } + + return new float[num_elems]; +} + + template -std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded(const T* p_input, int num_elems); +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); template -std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( + const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; } template<> -std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded(const MLFloat16* p_input, int num_elems) +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) { if (!p_input) { return nullptr; @@ -39,6 +59,17 @@ std::shared_ptr> ConvertMLFloat16ToFloatIfNeeded(c return vec; } + +void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) +{ + if (!output_buffer || !p_output) { + return; + } + + MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems); +} + + ORT_FORCEINLINE constexpr float ConvertToFloatIfNeeded(float val) { return val; } @@ -172,13 +203,16 @@ Status LayerNormImpl::ComputeWithoutContext( DoubleOrFloat mean(0.0f); DoubleOrFloat mean_square(0.0f); - std::shared_ptr> float_input = ConvertMLFloat16ToFloatIfNeeded(p_input, norm_size); + std::shared_ptr> float_input = 
ConvertMLFloat16ToFloatBufferIfNeeded(p_input, norm_size); const DoubleOrFloat* converted_input = float_input == nullptr ? reinterpret_cast(p_input) : reinterpret_cast(&(*float_input)[0]); - std::unique_ptr output_buffer = std::make_unique(norm_size); + // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. + // If T is MLFloat16, then we allocate norm_size floats in output_buffer. + DoubleOrFloat* output_buffer = static_cast(OnlyCreateBufferIfMLFloat16(p_output, norm_size)); + for (int64_t h = 0; h < norm_size; h++) { output_buffer[h] = converted_input[h]; mean += converted_input[h]; @@ -192,12 +226,12 @@ Status LayerNormImpl::ComputeWithoutContext( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } - std::shared_ptr> float_scale = ConvertMLFloat16ToFloatIfNeeded(scale_data, norm_size); + std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded(scale_data, norm_size); const DoubleOrFloat* converted_scale = float_scale == nullptr ? reinterpret_cast(scale_data) : reinterpret_cast(&(*float_scale)[0]); - std::shared_ptr> float_bias = ConvertMLFloat16ToFloatIfNeeded(bias_data, norm_size); + std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded(bias_data, norm_size); const DoubleOrFloat* converted_bias = float_bias == nullptr ? reinterpret_cast(bias_data) @@ -205,15 +239,20 @@ Status LayerNormImpl::ComputeWithoutContext( for (int64_t h = 0; h < norm_size; h++) { if (simplified) { - p_output[h] = ConvertToMLFloat16IfNeeded(output_buffer[h] / mean_square * converted_scale[h]); + output_buffer[h] = output_buffer[h] / mean_square * converted_scale[h]; } else if (nullptr == bias_data) { - p_output[h] = ConvertToMLFloat16IfNeeded((output_buffer[h] - mean) / mean_square * converted_scale[h]); + output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_scale[h]; } else { - p_output[h] = ConvertToMLFloat16IfNeeded( - (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h]); + output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_scale[h] + converted_bias[h]; } } + if (std::is_same_v) { + ConvertFloatBufferToMLFloat16( + reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); + delete[] output_buffer; + } + if (mean_data != nullptr) { // ONNX spec doesn't support 'double' for 'U' so when 'T' == double, 'U' == float and we need to narrow mean_data[task_idx] = ConvertToMLFloat16IfNeeded(ConvertToFloatIfNeeded(mean)); From 295d6527228ca7dc4a264ace35a2ca5d172c7760 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 11:46:33 -0700 Subject: [PATCH 08/16] convert output buffer efficiently in skip_layer_norm --- .../contrib_ops/cpu/skip_layer_norm.cc | 86 ++++++++++++------- .../core/providers/cpu/nn/layer_norm_impl.cc | 1 + 2 files changed, 57 insertions(+), 30 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 9178a2f17015b..47174ec54fafd 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -38,6 +38,28 @@ REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) +namespace { + +double* CreateBufferIfMLFloat16(double* p_output, int num_elems) +{ + return p_output; +} + +float* CreateBufferIfMLFloat16(float* p_output, int num_elems) +{ + return p_output; +} + +float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) +{ + if (!p_output) { + return nullptr; + } + + return new 
float[num_elems]; +} + + template std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems); @@ -63,24 +85,17 @@ std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( } -// Function template that only converts the input value to MLFloat16 if T is MLFloat16. -template -ORT_FORCEINLINE constexpr typename std::enable_if_t || std::is_same_v, T> -ConvertDoubleOrFloatToMLFloat16IfNeeded(T val) { - return val; -} +void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) +{ + if (!output_buffer || !p_output) { + return; + } -template -ORT_FORCEINLINE constexpr typename std::enable_if_t, T> -ConvertDoubleOrFloatToMLFloat16IfNeeded(float val) { - return MLFloat16(val); + MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems); } -template -ORT_FORCEINLINE constexpr typename std::enable_if_t, T> -ConvertDoubleOrFloatToMLFloat16IfNeeded(double val) { - return MLFloat16(static_cast(val)); -} +} // namespace + template SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) @@ -164,22 +179,30 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { ? reinterpret_cast(bias_data) : reinterpret_cast(&(*float_bias)[0]); - std::unique_ptr output_buffer = std::make_unique(hidden_size); + // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. + // If T is MLFloat16, then we allocate hidden_size floats in output_buffer. + DoubleOrFloat* output_buffer = static_cast(CreateBufferIfMLFloat16(p_output, hidden_size)); + for (size_t h = 0; h < static_cast(hidden_size); h++) { - DoubleOrFloat value = converted_input[h] + converted_skip[h]; + DoubleOrFloat val = converted_input[h] + converted_skip[h]; if (nullptr != bias_data) { - value += converted_bias[h]; + val += converted_bias[h]; } - output_buffer[h] = value; - T converted_value = ConvertDoubleOrFloatToMLFloat16IfNeeded(value); - if (nullptr != p_skip_input_bias_add_output_data) { - p_skip_input_bias_add_output_data[h] = converted_value; + output_buffer[h] = val; + mean += val; + mean_square += val * val; + + if (nullptr != p_skip_input_bias_add_output_data && (std::is_same_v || std::is_same_v)) { + p_skip_input_bias_add_output_data[h] = *(reinterpret_cast(&val)); } + } - mean += value; - mean_square += value * value; + if (nullptr != p_skip_input_bias_add_output_data && std::is_same_v) { + ConvertFloatBufferToMLFloat16(reinterpret_cast(output_buffer), + reinterpret_cast(p_skip_input_bias_add_output_data), + hidden_size); } mean = mean / hidden_size; @@ -201,16 +224,19 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { : reinterpret_cast(&(*float_beta)[0]); for (size_t h = 0; h < static_cast(hidden_size); h++) { if (simplified) { - p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded( - output_buffer[h] / mean_square * converted_gamma[h]); + output_buffer[h] = output_buffer[h] / mean_square * converted_gamma[h]; } else if (nullptr == beta_data) { - p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded( - (output_buffer[h] - mean) / mean_square * converted_gamma[h]); + output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h]; } else { - p_output[h] = ConvertDoubleOrFloatToMLFloat16IfNeeded( - (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]); + output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]; } } + + if (std::is_same_v) { + ConvertFloatBufferToMLFloat16( + reinterpret_cast(output_buffer), 
reinterpret_cast(p_output), hidden_size); + delete[] output_buffer; + } }, 0); diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index a7ab7c6b526d6..cc7bfb039d112 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -99,6 +99,7 @@ ORT_FORCEINLINE constexpr double ConvertToMLFloat16IfNeeded(double val) { } // namespace + LayerNormImpl::LayerNormImpl(const OpKernelInfo& op_kernel_info, bool simplified, bool contrib_op) : OpKernel(op_kernel_info), simplified_{simplified}, contrib_op_{contrib_op} { ORT_ENFORCE(op_kernel_info.GetAttr("axis", &axis_).IsOK()); From 405a0a0caf98f7688e6e7c306cbb90733935f6ba Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 11:55:17 -0700 Subject: [PATCH 09/16] add inline and fix some lint issues --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 16 ++++++---------- .../core/providers/cpu/nn/layer_norm_impl.cc | 16 ++++++---------- 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 47174ec54fafd..ff100e617d2f8 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,31 +40,27 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -double* CreateBufferIfMLFloat16(double* p_output, int num_elems) +ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) { return p_output; } -float* CreateBufferIfMLFloat16(float* p_output, int num_elems) +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) { return p_output; } -float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { - if (!p_output) { - return nullptr; - } - - return new float[num_elems]; + return p_output == nullptr ? nullptr : new float[num_elems]; } template -std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems); template -std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( +ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index cc7bfb039d112..35bac4b94d2c1 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,31 +15,27 @@ namespace onnxruntime { namespace { -double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) { return p_output; } -float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) { return p_output; } -float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { - if (!p_output) { - return nullptr; - } - - return new float[num_elems]; + return p_output == nullptr ? 
nullptr : new float[num_elems]; } template -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); template -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; From 245f298eba41e5d69f8e977865d5b96de16986e8 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 12:32:11 -0700 Subject: [PATCH 10/16] fix some lint errors --- .../contrib_ops/cpu/skip_layer_norm.cc | 18 ++++++----------- .../core/providers/cpu/nn/layer_norm_impl.cc | 20 +++++++------------ 2 files changed, 13 insertions(+), 25 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index ff100e617d2f8..5d4ae6f67d972 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,18 +40,15 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) -{ +ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) -{ +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) -{ +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { return p_output == nullptr ? nullptr : new float[num_elems]; } @@ -61,14 +58,12 @@ ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNe template ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) -{ + const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { return nullptr; } template<> -std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) -{ +std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) { if (!p_input) { return nullptr; } @@ -81,8 +76,7 @@ std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( } -void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) -{ +void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) { if (!output_buffer || !p_output) { return; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 35bac4b94d2c1..33e631152ffcf 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,35 +15,30 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) -{ +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) -{ +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) -{ +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { return p_output 
== nullptr ? nullptr : new float[num_elems]; } template -ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int64_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) -{ + const std::enable_if_t || std::is_same_v, T>* p_input, int64_t num_elems) { return nullptr; } template<> -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) -{ +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int64_t num_elems) { if (!p_input) { return nullptr; } @@ -56,8 +51,7 @@ std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded Date: Mon, 30 Sep 2024 12:44:07 -0700 Subject: [PATCH 11/16] fix warning --- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 33e631152ffcf..00cc0900e9577 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,15 +15,15 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int num_elems) { +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int64_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int64_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t num_elems) { return p_output == nullptr ? 
nullptr : new float[num_elems]; } From a483ca480cb09275d46158061e8f6c0c57056ada Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Mon, 30 Sep 2024 22:50:00 -0700 Subject: [PATCH 12/16] maybe_unused --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 10 ++++++---- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 10 ++++++---- 2 files changed, 12 insertions(+), 8 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 5d4ae6f67d972..6505ee3fc6fd4 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,11 +40,11 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, int num_elems) { +ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int num_elems) { return p_output; } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, int num_elems) { +ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int num_elems) { return p_output; } @@ -54,11 +54,13 @@ ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elem template -ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const T* p_input, int num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( + [[maybe_unused]] const T* p_input, [[maybe_unused]] int num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int num_elems) { + [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, + [[maybe_unused]] int num_elems) { return nullptr; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 00cc0900e9577..d3da791be81fe 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -15,11 +15,11 @@ namespace onnxruntime { namespace { -ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, int64_t num_elems) { +ORT_FORCEINLINE double* OnlyCreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int64_t num_elems) { return p_output; } -ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, int64_t num_elems) { +ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int64_t num_elems) { return p_output; } @@ -29,11 +29,13 @@ ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t template -ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const T* p_input, int64_t num_elems); +ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( + [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - const std::enable_if_t || std::is_same_v, T>* p_input, int64_t num_elems) { + [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, + [[maybe_unused]] int64_t num_elems) { return nullptr; } From 19d225a017d7ed83587f94945ccde47f340bc1f1 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 07:12:57 -0700 Subject: [PATCH 13/16] Fix bug --- onnxruntime/contrib_ops/cpu/skip_layer_norm.cc | 2 +- onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc 
b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 6505ee3fc6fd4..c963668d58e6e 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -224,7 +224,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { } } - if (std::is_same_v) { + if (std::is_same_v) { ConvertFloatBufferToMLFloat16( reinterpret_cast(output_buffer), reinterpret_cast(p_output), hidden_size); delete[] output_buffer; diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index d3da791be81fe..50fa0d55af270 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -240,7 +240,7 @@ Status LayerNormImpl::ComputeWithoutContext( } } - if (std::is_same_v) { + if (std::is_same_v) { ConvertFloatBufferToMLFloat16( reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); delete[] output_buffer; From 05b5037b410d800959d4daaa45be3d0bf21521fd Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 12:14:33 -0700 Subject: [PATCH 14/16] separate MLFloat16 implementation in skip_layer_norm --- .../contrib_ops/cpu/skip_layer_norm.cc | 252 +++++++++--------- 1 file changed, 125 insertions(+), 127 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index c963668d58e6e..a4c7a19dfb5c9 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -40,50 +40,138 @@ REGISTER_KERNEL_TYPED(MLFloat16) namespace { -ORT_FORCEINLINE double* CreateBufferIfMLFloat16(double* p_output, [[maybe_unused]] int num_elems) { - return p_output; -} +template || std::is_same_v, void>> +void ComputeJob( + const T* input_data, + const T* skip_data, + const T* gamma_data, + const T* beta_data, + const T* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + T* output_data, + T* skip_input_bias_add_output_data +) { + auto offset = task_idx * hidden_size; + const T* p_input = input_data + offset; + const T* p_skip = skip_data + (offset % skip_size); + T* p_output = output_data + offset; + T* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; + + T mean(0.0f); + T mean_square(0.0f); + + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + T val = p_input[h] + p_skip[h]; + + if (nullptr != bias_data) { + val += bias_data[h]; + } + + if (nullptr != p_skip_input_bias_add_output) { + p_skip_input_bias_add_output[h] = val; + } + + p_output[h] = val; + mean += val; + mean_square += val * val; + } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(float* p_output, [[maybe_unused]] int num_elems) { - return p_output; -} + mean = mean / hidden_size; + if (simplified) { + mean_square = sqrt(mean_square / hidden_size + epsilon); + } else { + mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); + } -ORT_FORCEINLINE float* CreateBufferIfMLFloat16(MLFloat16* p_output, int num_elems) { - return p_output == nullptr ? 
nullptr : new float[num_elems]; + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + if (simplified) { + p_output[h] = p_output[h] / mean_square * gamma_data[h]; + } else if (nullptr == beta_data) { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h]; + } else { + p_output[h] = (p_output[h] - mean) / mean_square * gamma_data[h] + beta_data[h]; + } + } } +void ComputeJob( + const MLFloat16* input_data, + const MLFloat16* skip_data, + const MLFloat16* gamma_data, + const MLFloat16* beta_data, + const MLFloat16* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + MLFloat16* output_data, + MLFloat16* skip_input_bias_add_output_data +) { + auto offset = task_idx * hidden_size; + const MLFloat16* p_input = input_data + offset; + const MLFloat16* p_skip = skip_data + (offset % skip_size); + MLFloat16* p_output = output_data + offset; + MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; + + float mean(0.0f); + float mean_square(0.0f); + + std::vector float_input(hidden_size); + MlasConvertHalfToFloatBuffer(p_input, &float_input[0], hidden_size); + std::vector float_skip(hidden_size); + MlasConvertHalfToFloatBuffer(p_skip, &float_skip[0], hidden_size); + std::vector float_bias; + if (bias_data != nullptr) { + float_bias.resize(hidden_size); + MlasConvertHalfToFloatBuffer(bias_data, &float_bias[0], hidden_size); + } + + std::vector float_output(hidden_size); -template -ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] int num_elems); + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + float val = float_input[h] + float_skip[h]; -template -ORT_FORCEINLINE std::shared_ptr> ConvertHalfToFloatBufferIfNeeded( - [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] int num_elems) { - return nullptr; -} + if (nullptr != bias_data) { + val += float_bias[h]; + } -template<> -std::shared_ptr> ConvertHalfToFloatBufferIfNeeded(const MLFloat16* p_input, int num_elems) { - if (!p_input) { - return nullptr; + float_output[h] = val; + mean += val; + mean_square += val * val; } - // Efficiently convert all the MLFloat16 values to floats. 
- std::shared_ptr> vec = std::make_shared>(num_elems); - MlasConvertHalfToFloatBuffer(p_input, &(*vec)[0], num_elems); - - return vec; -} + if (nullptr != p_skip_input_bias_add_output) { + MlasConvertFloatToHalfBuffer(&float_output[0], p_skip_input_bias_add_output, hidden_size); + } + mean = mean / hidden_size; + if (simplified) { + mean_square = sqrt(mean_square / hidden_size + epsilon); + } else { + mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); + } -void ConvertFloatBufferToMLFloat16(const float* output_buffer, MLFloat16* p_output, int num_elems) { - if (!output_buffer || !p_output) { - return; + std::vector float_gamma(hidden_size); + MlasConvertHalfToFloatBuffer(gamma_data, &float_gamma[0], hidden_size); + std::vector float_beta(hidden_size); + MlasConvertHalfToFloatBuffer(beta_data, &float_beta[0], hidden_size); + + for (decltype(hidden_size) h = 0; h < hidden_size; h++) { + if (simplified) { + float_output[h] = float_output[h] / mean_square * float_gamma[h]; + } else if (nullptr == beta_data) { + float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h]; + } else { + float_output[h] = (float_output[h] - mean) / mean_square * float_gamma[h] + float_beta[h]; + } } - MlasConvertFloatToHalfBuffer(output_buffer, p_output, num_elems); + MlasConvertFloatToHalfBuffer(&float_output[0], p_output, hidden_size); } } // namespace @@ -104,8 +192,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* beta = p_ctx->Input(3); const Tensor* bias = p_ctx->Input(4); Tensor* output = p_ctx->Output(0, input->Shape()); - // For inferencing, we support one more optional output which is the sum - // of the input and skip tensors + // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); const auto& input_dims = input->Shape().GetDims(); @@ -130,105 +217,16 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { T* output_data = output->MutableData(); - // For inferencing, we support one more optional output which is the sum - // of the input and skip tensors - T* skip_input_bias_add_output_data = skip_input_bias_add_output != nullptr ? skip_input_bias_add_output->MutableData() : nullptr; + // For inferencing, we support one more optional output which is the sum of the input and skip tensors + T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData(); - const auto& skip_size = skip->Shape().Size(); + const int64_t& skip_size = skip->Shape().Size(); concurrency::ThreadPool::TryBatchParallelFor( p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { - auto offset = task_idx * hidden_size; - - const T* p_input = input_data + offset; - const T* p_skip = skip_data + (offset % skip_size); - T* p_output = output_data + offset; - T* p_skip_input_bias_add_output_data = skip_input_bias_add_output_data != nullptr ? skip_input_bias_add_output_data + offset : nullptr; - - using DoubleOrFloat = typename std::conditional< - std::is_same::value, // If T is double - double, // Use double - float // Otherwise, use float (covers float and MLFloat16) - >::type; - - DoubleOrFloat mean(0.0f); - DoubleOrFloat mean_square(0.0f); - - std::shared_ptr> float_input = ConvertHalfToFloatBufferIfNeeded(p_input, hidden_size); - const DoubleOrFloat* converted_input = - float_input == nullptr - ? 
reinterpret_cast(p_input) - : reinterpret_cast(&(*float_input)[0]); - std::shared_ptr> float_skip = ConvertHalfToFloatBufferIfNeeded(p_skip, hidden_size); - const DoubleOrFloat* converted_skip = - float_skip == nullptr - ? reinterpret_cast(p_skip) - : reinterpret_cast(&(*float_skip)[0]); - std::shared_ptr> float_bias = ConvertHalfToFloatBufferIfNeeded(bias_data, hidden_size); - const DoubleOrFloat* converted_bias = - float_bias == nullptr - ? reinterpret_cast(bias_data) - : reinterpret_cast(&(*float_bias)[0]); - - // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. - // If T is MLFloat16, then we allocate hidden_size floats in output_buffer. - DoubleOrFloat* output_buffer = static_cast(CreateBufferIfMLFloat16(p_output, hidden_size)); - - for (size_t h = 0; h < static_cast(hidden_size); h++) { - DoubleOrFloat val = converted_input[h] + converted_skip[h]; - - if (nullptr != bias_data) { - val += converted_bias[h]; - } - - output_buffer[h] = val; - mean += val; - mean_square += val * val; - - if (nullptr != p_skip_input_bias_add_output_data && (std::is_same_v || std::is_same_v)) { - p_skip_input_bias_add_output_data[h] = *(reinterpret_cast(&val)); - } - } - - if (nullptr != p_skip_input_bias_add_output_data && std::is_same_v) { - ConvertFloatBufferToMLFloat16(reinterpret_cast(output_buffer), - reinterpret_cast(p_skip_input_bias_add_output_data), - hidden_size); - } - - mean = mean / hidden_size; - if (simplified) { - mean_square = sqrt(mean_square / hidden_size + epsilon_); - } else { - mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon_); - } - - std::shared_ptr> float_gamma = ConvertHalfToFloatBufferIfNeeded(gamma_data, hidden_size); - const DoubleOrFloat* converted_gamma = - float_gamma == nullptr - ? reinterpret_cast(gamma_data) - : reinterpret_cast(&(*float_gamma)[0]); - std::shared_ptr> float_beta = ConvertHalfToFloatBufferIfNeeded(beta_data, hidden_size); - const DoubleOrFloat* converted_beta = - float_beta == nullptr - ? 
reinterpret_cast(beta_data) - : reinterpret_cast(&(*float_beta)[0]); - for (size_t h = 0; h < static_cast(hidden_size); h++) { - if (simplified) { - output_buffer[h] = output_buffer[h] / mean_square * converted_gamma[h]; - } else if (nullptr == beta_data) { - output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h]; - } else { - output_buffer[h] = (output_buffer[h] - mean) / mean_square * converted_gamma[h] + converted_beta[h]; - } - } - - if (std::is_same_v) { - ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), reinterpret_cast(p_output), hidden_size); - delete[] output_buffer; - } + ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, + simplified, output_data, skip_input_bias_add_output_data); }, 0); From ab2e5f2e4b286f48de6278441649be408bd4bc95 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 12:39:18 -0700 Subject: [PATCH 15/16] fix linter issues --- .../contrib_ops/cpu/skip_layer_norm.cc | 56 ++++++++--------- .../core/providers/cpu/nn/layer_norm_impl.cc | 62 +++++++++---------- .../core/providers/cpu/nn/layer_norm_impl.h | 34 +++++----- .../microbenchmark/layer_normalization.cc | 23 +++---- 4 files changed, 82 insertions(+), 93 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index a4c7a19dfb5c9..66ca8c4dfd37f 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -37,24 +37,22 @@ REGISTER_KERNEL_TYPED(float) REGISTER_KERNEL_TYPED(double) REGISTER_KERNEL_TYPED(MLFloat16) - namespace { template || std::is_same_v, void>> void ComputeJob( - const T* input_data, - const T* skip_data, - const T* gamma_data, - const T* beta_data, - const T* bias_data, - ptrdiff_t task_idx, - int hidden_size, - int64_t skip_size, - float epsilon, - bool simplified, - T* output_data, - T* skip_input_bias_add_output_data -) { + const T* input_data, + const T* skip_data, + const T* gamma_data, + const T* beta_data, + const T* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + T* output_data, + T* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const T* p_input = input_data + offset; const T* p_skip = skip_data + (offset % skip_size); @@ -99,19 +97,18 @@ void ComputeJob( } void ComputeJob( - const MLFloat16* input_data, - const MLFloat16* skip_data, - const MLFloat16* gamma_data, - const MLFloat16* beta_data, - const MLFloat16* bias_data, - ptrdiff_t task_idx, - int hidden_size, - int64_t skip_size, - float epsilon, - bool simplified, - MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data -) { + const MLFloat16* input_data, + const MLFloat16* skip_data, + const MLFloat16* gamma_data, + const MLFloat16* beta_data, + const MLFloat16* bias_data, + ptrdiff_t task_idx, + int hidden_size, + int64_t skip_size, + float epsilon, + bool simplified, + MLFloat16* output_data, + MLFloat16* skip_input_bias_add_output_data) { auto offset = task_idx * hidden_size; const MLFloat16* p_input = input_data + offset; const MLFloat16* p_skip = skip_data + (offset % skip_size); @@ -174,8 +171,7 @@ void ComputeJob( MlasConvertFloatToHalfBuffer(&float_output[0], p_output, hidden_size); } -} // namespace - +} // namespace template SkipLayerNorm::SkipLayerNorm(const OpKernelInfo& op_kernel_info) @@ -226,7 +222,7 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { 
p_ctx->GetOperatorThreadPool(), static_cast(task_count), [&](ptrdiff_t task_idx) { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, - simplified, output_data, skip_input_bias_add_output_data); + simplified, output_data, skip_input_bias_add_output_data); }, 0); diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 50fa0d55af270..28ff0420a7323 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -27,19 +27,18 @@ ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t return p_output == nullptr ? nullptr : new float[num_elems]; } - template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); + [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] int64_t num_elems) { + [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, + [[maybe_unused]] int64_t num_elems) { return nullptr; } -template<> +template <> std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int64_t num_elems) { if (!p_input) { return nullptr; @@ -52,7 +51,6 @@ std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeededGetOperatorThreadPool(); return ComputeWithoutContext(X_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, - Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified); + Y_data, mean_data, inv_std_dev_data, thread_pool, axis, epsilon, simplified); } Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { @@ -156,19 +152,19 @@ Status LayerNormImpl::Compute(OpKernelContext* p_ctx) const { template Status LayerNormImpl::ComputeWithoutContext( - const T* X_data, - const TensorShape& x_shape, - const T* scale_data, - const TensorShape& scale_shape, - const T* bias_data, - const TensorShape& bias_shape, - T* Y_data, - U* mean_data, - U* inv_std_dev_data, - onnxruntime::concurrency::ThreadPool* thread_pool, - int64_t axis, - float epsilon, - bool simplified) const { + const T* X_data, + const TensorShape& x_shape, + const T* scale_data, + const TensorShape& scale_shape, + const T* bias_data, + const TensorShape& bias_shape, + T* Y_data, + U* mean_data, + U* inv_std_dev_data, + onnxruntime::concurrency::ThreadPool* thread_pool, + int64_t axis, + float epsilon, + bool simplified) const { int64_t norm_count = x_shape.SizeToDimension(onnxruntime::narrow(axis)); int64_t norm_size = x_shape.SizeFromDimension(onnxruntime::narrow(axis)); @@ -198,9 +194,9 @@ Status LayerNormImpl::ComputeWithoutContext( std::shared_ptr> float_input = ConvertMLFloat16ToFloatBufferIfNeeded(p_input, norm_size); const DoubleOrFloat* converted_input = - float_input == nullptr - ? reinterpret_cast(p_input) - : reinterpret_cast(&(*float_input)[0]); + float_input == nullptr + ? reinterpret_cast(p_input) + : reinterpret_cast(&(*float_input)[0]); // If T is float or double, then output_buffer will be the same as p_output, so we don't allocate new memory. // If T is MLFloat16, then we allocate norm_size floats in output_buffer. 
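Note: the hunks above are indentation-only and do not change behavior. The surrounding ComputeWithoutContext code selects an intermediate DoubleOrFloat type with std::conditional, so MLFloat16 inputs are accumulated in float while float and double pass through untouched. A minimal, self-contained sketch of that type-selection pattern follows; DoubleOrFloat matches the alias used in the code, but HalfLike is only an illustrative stand-in for MLFloat16, and main() exists just to exercise the selection.

#include <cstdio>
#include <type_traits>

// Illustrative stand-in for onnxruntime's MLFloat16; only used to drive the type selection.
struct HalfLike { unsigned short bits; };

// Pick the accumulation type: double stays double, everything else (float, half) uses float.
template <typename T>
using DoubleOrFloat = typename std::conditional<
    std::is_same<T, double>::value,
    double,
    float>::type;

int main() {
  static_assert(std::is_same<DoubleOrFloat<double>, double>::value, "double stays double");
  static_assert(std::is_same<DoubleOrFloat<float>, float>::value, "float stays float");
  static_assert(std::is_same<DoubleOrFloat<HalfLike>, float>::value, "half accumulates in float");
  std::puts("DoubleOrFloat selection behaves as expected");
  return 0;
}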
@@ -221,14 +217,14 @@ Status LayerNormImpl::ComputeWithoutContext( std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded(scale_data, norm_size); const DoubleOrFloat* converted_scale = - float_scale == nullptr - ? reinterpret_cast(scale_data) - : reinterpret_cast(&(*float_scale)[0]); + float_scale == nullptr + ? reinterpret_cast(scale_data) + : reinterpret_cast(&(*float_scale)[0]); std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded(bias_data, norm_size); const DoubleOrFloat* converted_bias = - float_bias == nullptr - ? reinterpret_cast(bias_data) - : reinterpret_cast(&(*float_bias)[0]); + float_bias == nullptr + ? reinterpret_cast(bias_data) + : reinterpret_cast(&(*float_bias)[0]); for (int64_t h = 0; h < norm_size; h++) { if (simplified) { @@ -242,7 +238,7 @@ Status LayerNormImpl::ComputeWithoutContext( if (std::is_same_v) { ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); + reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); delete[] output_buffer; } diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h index aa876357ed3c8..64e1c2ba2f902 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.h @@ -17,19 +17,19 @@ class LayerNormImpl : public OpKernel { // This method was created so that it can be called directly from `test/onnx/microbenchmark/layer_normalization.cc`. template Status ComputeWithoutContext( - const T* X_data, - const TensorShape& x_shape, - const T* scale_data, - const TensorShape& scale_shape, - const T* bias_data, - const TensorShape& bias_shape, - T* Y_data, - U* mean_data, - U* inv_std_dev, - onnxruntime::concurrency::ThreadPool* thread_pool, - int64_t axis, - float epsilon, - bool simplified) const; + const T* X_data, + const TensorShape& x_shape, + const T* scale_data, + const TensorShape& scale_shape, + const T* bias_data, + const TensorShape& bias_shape, + T* Y_data, + U* mean_data, + U* inv_std_dev, + onnxruntime::concurrency::ThreadPool* thread_pool, + int64_t axis, + float epsilon, + bool simplified) const; private: template @@ -38,16 +38,16 @@ class LayerNormImpl : public OpKernel { template struct SrcDispatcher { Status operator()(const LayerNormImpl* p_instance, OpKernelContext* p_ctx, int64_t orig_axis, - float epsilon, bool simplified, bool contrib_op) const { + float epsilon, bool simplified, bool contrib_op) const { // the contrib op kernel was always registered with the same type for all constraints. // our implementation of the onnx op only supports 'float' as the U constraint. 
- #if !defined(DISABLE_CONTRIB_OPS) +#if !defined(DISABLE_CONTRIB_OPS) if (contrib_op) { return p_instance->ComputeImpl(p_ctx, orig_axis, epsilon, simplified); } else - #else +#else ORT_UNUSED_PARAMETER(contrib_op); - #endif +#endif { return p_instance->ComputeImpl(p_ctx, orig_axis, epsilon, simplified); } diff --git a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc index 5c7bd5716832a..4660cb85a43f1 100644 --- a/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc +++ b/onnxruntime/test/onnx/microbenchmark/layer_normalization.cc @@ -33,23 +33,22 @@ static const size_t num_elems = dims[0] * dims[1] * dims[2]; static const std::vector float_vals(num_elems, 1.0f); static const std::vector MLFloat16_vals(num_elems, MLFloat16(1.0f)); -} // namespace +} // namespace template const T* getVector(); template <> const float* getVector() { - return float_vals.data(); + return float_vals.data(); } template <> const MLFloat16* getVector() { - return MLFloat16_vals.data(); + return MLFloat16_vals.data(); } - -template +template static void BM_LayerNormalization(benchmark::State& state) { bool simplified = false; const float epsilon = 1e-05f; @@ -69,7 +68,7 @@ static void BM_LayerNormalization(benchmark::State& state) { ConfigOptions config_options; OpKernelInfo op_kernel_info(node, kernel_def, *execution_provider, constant_initialized_tensors, mlvalue_name_idx_map, - data_transfer_mgr, allocators, config_options); + data_transfer_mgr, allocators, config_options); LayerNormImpl layer_norm_impl(op_kernel_info); @@ -88,21 +87,19 @@ static void BM_LayerNormalization(benchmark::State& state) { OrtThreadPoolParams tp_params; tp_params.name = ORT_TSTR("intra-op"); std::unique_ptr thread_pool = concurrency::CreateThreadPool( - &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); + &Env::Default(), tp_params, concurrency::ThreadPoolType::INTRA_OP); for (auto _ : state) { auto status = layer_norm_impl.ComputeWithoutContext(x_data, x_shape, scale_data, scale_shape, bias_data, bias_shape, - Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); + Y_data, mean_data, inv_std_dev_data, thread_pool.get(), axis, epsilon, simplified); - if (! 
status.IsOK()) - { - std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; - break; + if (!status.IsOK()) { + std::cout << "ComputeWithoutContext status not OK: " << status.ErrorMessage() << std::endl; + break; } } } - BENCHMARK(BM_LayerNormalization) ->Arg(1) ->UseRealTime() From 63e9644ce4016dbb77608dcc550397b157f55ac1 Mon Sep 17 00:00:00 2001 From: Alex Marin Date: Tue, 1 Oct 2024 14:03:54 -0700 Subject: [PATCH 16/16] fix precision warning --- .../core/providers/cpu/nn/layer_norm_impl.cc | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc index 28ff0420a7323..3259d0b67ef92 100644 --- a/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc +++ b/onnxruntime/core/providers/cpu/nn/layer_norm_impl.cc @@ -29,17 +29,17 @@ ORT_FORCEINLINE float* OnlyCreateBufferIfMLFloat16(MLFloat16* p_output, int64_t template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( - [[maybe_unused]] const T* p_input, [[maybe_unused]] int64_t num_elems); + [[maybe_unused]] const T* p_input, [[maybe_unused]] size_t num_elems); template ORT_FORCEINLINE std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded( [[maybe_unused]] const std::enable_if_t || std::is_same_v, T>* p_input, - [[maybe_unused]] int64_t num_elems) { + [[maybe_unused]] size_t num_elems) { return nullptr; } template <> -std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, int64_t num_elems) { +std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded(const MLFloat16* p_input, size_t num_elems) { if (!p_input) { return nullptr; } @@ -51,7 +51,7 @@ std::shared_ptr> ConvertMLFloat16ToFloatBufferIfNeeded> float_input = ConvertMLFloat16ToFloatBufferIfNeeded(p_input, norm_size); + std::shared_ptr> float_input = ConvertMLFloat16ToFloatBufferIfNeeded( + p_input, static_cast(norm_size)); const DoubleOrFloat* converted_input = float_input == nullptr ? reinterpret_cast(p_input) @@ -215,12 +216,14 @@ Status LayerNormImpl::ComputeWithoutContext( mean_square = sqrt(mean_square / norm_size - mean * mean + epsilon); } - std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded(scale_data, norm_size); + std::shared_ptr> float_scale = ConvertMLFloat16ToFloatBufferIfNeeded( + scale_data, static_cast(norm_size)); const DoubleOrFloat* converted_scale = float_scale == nullptr ? reinterpret_cast(scale_data) : reinterpret_cast(&(*float_scale)[0]); - std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded(bias_data, norm_size); + std::shared_ptr> float_bias = ConvertMLFloat16ToFloatBufferIfNeeded( + bias_data, static_cast(norm_size)); const DoubleOrFloat* converted_bias = float_bias == nullptr ? reinterpret_cast(bias_data) @@ -238,7 +241,9 @@ Status LayerNormImpl::ComputeWithoutContext( if (std::is_same_v) { ConvertFloatBufferToMLFloat16( - reinterpret_cast(output_buffer), reinterpret_cast(p_output), norm_size); + reinterpret_cast(output_buffer), + reinterpret_cast(p_output), + static_cast(norm_size)); delete[] output_buffer; }
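Note: the final patch only widens or narrows the size arguments (int64_t vs size_t) passed to the conversion helpers, so the normalization math itself is unchanged across the series. For reference, a self-contained sketch of the per-row computation that ComputeJob and ComputeWithoutContext perform is below; it covers the float path only, LayerNormRow is an illustrative name rather than one from the patches, and the MLAS half/float conversions, the optional bias input, and the null-beta branch are deliberately left out.

#include <cmath>
#include <cstdio>
#include <vector>

// Normalize one row of values in place, mirroring the loop structure used above:
// accumulate mean and mean of squares, then scale and shift each element.
void LayerNormRow(std::vector<float>& row, const std::vector<float>& gamma,
                  const std::vector<float>& beta, float epsilon, bool simplified) {
  const size_t hidden_size = row.size();
  float mean = 0.0f;
  float mean_square = 0.0f;
  for (size_t h = 0; h < hidden_size; ++h) {
    mean += row[h];
    mean_square += row[h] * row[h];
  }
  mean = mean / hidden_size;
  mean_square = simplified
                    ? std::sqrt(mean_square / hidden_size + epsilon)
                    : std::sqrt(mean_square / hidden_size - mean * mean + epsilon);
  for (size_t h = 0; h < hidden_size; ++h) {
    if (simplified) {
      row[h] = row[h] / mean_square * gamma[h];
    } else {
      row[h] = (row[h] - mean) / mean_square * gamma[h] + beta[h];
    }
  }
}

int main() {
  std::vector<float> row = {1.0f, 2.0f, 3.0f, 4.0f};
  std::vector<float> gamma(4, 1.0f);
  std::vector<float> beta(4, 0.0f);
  LayerNormRow(row, gamma, beta, 1e-5f, /*simplified=*/false);
  for (float v : row) std::printf("%f\n", v);  // roughly -1.34, -0.45, 0.45, 1.34
  return 0;
}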