From ebd0f51287ad3ea0c8d91ee899b9edfcbc351c8e Mon Sep 17 00:00:00 2001
From: hong <43953930+phlrain@users.noreply.github.com>
Date: Thu, 3 Mar 2022 09:32:42 +0800
Subject: [PATCH] Move bn to pten (#39347)

* add bn cpu version; test=develop
* move batch norm to pten
* move batch norm to pten; test=develop
* fix bug; test=develop
* fix func::transpose dependency bug; test=develop
* fix compile bugs; test=develop
* fix use_op batch_norm bug; test=develop
* fix cudnn bn add relu test; test=develop
* fix pten context build and double grad bug; test=develop
* remove useless code; test=develop
* add batch norm gpu fp16 support; test=develop
* fix test bn op bug; test=develop
* remove output dtype set; test=develop
* fix bug; test=develop
* fix bug; test=develop
* fix apply pass to program bug; test=develop
* revert to develop; test=develop
* fix rocm bug; test=develop
* revert operator to develop; test=develop
* fix pre_commit; test=develop
* fix static check error; test=develop
* resolve conflict; test=develop
* analyze batch norm bug
* revert batch norm op
* resolve conflict
* fix nan inf and speed bug; test=develop
* fix bug; test=develop
* fix error; test=develop
* test expand op; test=develop
* fix bug; test=develop
* resolve conflict
* resolve conflict; test=develop
* polish code; test=develop
* polish code; test=develop
* change mutable data to ctx alloc; test=develop
* make format same with ci; test=develop
* fix format error with ci; test=develop
---
 .../mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc | 9 +-
 paddle/fluid/framework/operator.cc | 2 -
 paddle/fluid/operators/batch_norm_op.cc | 12 -
 paddle/fluid/operators/batch_norm_op.cu | 1322 -----------------
 .../operators/fused/cudnn_bn_add_relu_test.cc | 2 +-
 paddle/fluid/operators/inplace_abn_op.cc | 83 +-
 paddle/fluid/operators/inplace_abn_op.cu | 81 +-
 paddle/fluid/operators/norm_utils.cu.h | 47 +-
 paddle/phi/kernels/batch_norm_grad_kernel.h | 90 ++
 paddle/phi/kernels/batch_norm_kernel.h | 43 +
 .../phi/kernels/cpu/batch_norm_grad_kernel.cc | 674 +++++++++
 paddle/phi/kernels/cpu/batch_norm_kernel.cc | 204 +++
 .../phi/kernels/gpu/batch_norm_grad_kernel.cu | 1038 +++++++++++++
 paddle/phi/kernels/gpu/batch_norm_kernel.cu | 680 +++++++++
 paddle/phi/kernels/gpu/batch_norm_utils.h | 142 ++
 paddle/phi/ops/compat/batch_norm_sig.cc | 89 ++
 .../dygraph_to_static/test_mobile_net.py | 1 +
 .../unittests/test_apply_pass_to_program.py | 1 +
 .../tests/unittests/test_batch_norm_op.py | 16 +-
 .../tests/unittests/test_batch_norm_op_v2.py | 14 +-
 .../fluid/tests/unittests/test_conv2d_op.py | 2 +
 .../tests/unittests/test_expand_v2_op.py | 1 +
 .../tests/unittests/test_inplace_abn_op.py | 9 +-
 .../tests/unittests/test_norm_nn_grad.py | 2 +
 .../unittests/test_program_prune_backward.py | 2 +
 .../fluid/tests/unittests/test_reshape_op.py | 1 +
 26 files changed, 3175 insertions(+), 1392 deletions(-)
 create mode 100644 paddle/phi/kernels/batch_norm_grad_kernel.h
 create mode 100644 paddle/phi/kernels/batch_norm_kernel.h
 create mode 100644 paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc
 create mode 100644 paddle/phi/kernels/cpu/batch_norm_kernel.cc
 create mode 100644 paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/batch_norm_kernel.cu
 create mode 100644 paddle/phi/kernels/gpu/batch_norm_utils.h
 create mode 100644 paddle/phi/ops/compat/batch_norm_sig.cc

diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 96aa95bde3374..11190309814e7
100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -12,12 +12,13 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include #include +#include -#include #include -#include -#include + +#include "gtest/gtest.h" #include "paddle/fluid/framework/ir/graph_traits.h" #include "paddle/fluid/framework/ir/mkldnn/conv_elementwise_add_mkldnn_fuse_pass.h" #include "paddle/fluid/framework/ir/pass_tester_helper.h" @@ -25,7 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/place.h" -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN); USE_OP(conv2d_transpose); USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 6414dd455db4f..8ebc64e5f2cb2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2215,8 +2215,6 @@ void OperatorWithKernel::BuildPhiKernelContext( vector_int_attr.end()); pt_kernel_context->EmplaceBackAttr(vector_int64_attr); } - // TODO(YuanRisheng) Need support vector attr - } else if (attr_defs[i].type_index == std::type_index(typeid(std::vector))) { const auto& vector_int_attr = BOOST_GET_CONST(std::vector, attr); diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 949cf021cf0fa..174207deb08b8 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -1289,15 +1289,3 @@ REGISTER_OPERATOR(batch_norm_grad, ops::BatchNormGradOp, ops::BatchNormDoubleGradMaker); REGISTER_OPERATOR(batch_norm_grad_grad, ops::BatchNormDoubleGradOp, ops::BatchNormDoubleGradOpInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CPU_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index d59396db1517f..a19b087245a89 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -41,1327 +41,5 @@ using CudnnDataType = platform::CudnnDataType; template using BatchNormParamType = typename CudnnDataType::BatchNormParamType; -template -static __global__ void BNForwardInference( - const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - int num = N * C * HxW; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? 
i / HxW % C : i % C; - BatchNormParamType x_sub_mean = - static_cast>(x[i]) - mean[c]; - BatchNormParamType inv_var = 1 / sqrt(variance[c] + epsilon); - y[i] = static_cast(scale[c] * x_sub_mean * inv_var + bias[c]); - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining( - const T *x, const BatchNormParamType *scale, - const BatchNormParamType *bias, const int C, const int N, const int HxW, - const double epsilon, double exponentialAverageFactor, T *y, - BatchNormParamType *mean, BatchNormParamType *variance, - BatchNormParamType *save_mean, - BatchNormParamType *save_inv_variance) { - int outer_size = C; - int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType variance_val; - __shared__ BatchNormParamType inv_var_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - variance_val = x_square_sum / inner_size - mean_val * mean_val; - inv_var_val = 1 / sqrt(variance_val + epsilon); - - if (save_mean && save_inv_variance) { - save_mean[i] = mean_val; - save_inv_variance[i] = inv_var_val; - } - mean[i] = (1 - exponentialAverageFactor) * mean_val + - exponentialAverageFactor * mean[i]; - variance[i] = (1 - exponentialAverageFactor) * variance_val + - exponentialAverageFactor * variance[i]; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_sub_mean = - static_cast>(x[index]) - mean_val; - y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; - } - } -} - -template -class BatchNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - float momentum = ctx.Attr("momentum"); - const bool is_test = ctx.Attr("is_test"); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool trainable_stats = ctx.Attr("trainable_statistics"); - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - bool test_mode = is_test && (!trainable_stats); - - // Get the size for each dimension. 
- // NCHW [batch_size, in_channels, in_height, in_width] - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5" - "But received: the size of input's dimensions is [%d]", - x_dims.size())); - - auto *y = ctx.Output("Y"); - y->mutable_data(ctx.GetPlace()); - - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - auto dtype = platform::CudnnDataType::type; - -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - test_mode || - (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); - - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_y(y->type()); - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, y, - &transformed_y); - } else { - transformed_x.ShareDataWith(*x); - transformed_y.ShareDataWith(*y); - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - - VLOG(3) << "Setting descriptors."; - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * D * C, 1, W * D * C, D * C, C}; - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// Note: PERSISTENT not implemented for inference -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor( -// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - // Note: PERSISTENT not implemented for inference - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnDeriveBNTensorDescriptor( - bn_param_desc_, data_desc_, - test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); -#endif - - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto &dev_ctx = ctx.template device_context(); - - auto handle = dev_ctx.cudnn_handle(); - - // Now, depending on whether we are running test or not, we have two paths. - // It is training mode when it's not reference AND not using pre-trained - // model. - bool training = !test_mode && !use_global_stats; - if (!training) { - // only when test we use input to do computation. - const auto *est_mean = ctx.Input("Mean"); - const auto *est_var = ctx.Input("Variance"); - // Run inference mode. - PADDLE_ENFORCE_EQ( - est_mean->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of mean's dimensions must equal to 1." - "But received: the size of mean's dimensions mean is [%d]," - "the dimensions of mean is [%s].", - est_mean->dims().size(), est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of variance's dimensions must equal to 1." - "But received: the size of variance's dimensions is [%d]," - "the dimensions of variance is [%s].", - est_var->dims().size(), est_var->dims())); - PADDLE_ENFORCE_EQ( - est_mean->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of mean must equal to the number of " - "Channels, which is [%d]. But received: the first dimension" - "of mean is [%d], the dimensions of mean is [%s].", - C, est_mean->dims()[0], est_mean->dims())); - PADDLE_ENFORCE_EQ( - est_var->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of variance must equal to the number" - "of Channels, which is [%d]. 
But received: the first dimension of" - "variance is [%d], the dimensions of variance is [%s].", - C, est_var->dims()[0], est_var->dims())); - -#ifdef PADDLE_WITH_HIP - const int block_size = 256; - const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; - if (compute_format == DataLayout::kNCHW) { - BNForwardInference< - T, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } else { - BNForwardInference< - T, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - est_mean->template data>(), - est_var->template data>(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, transformed_y.template data()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardInference( -// handle, miopenBNSpatial, -// const_cast( -// static_cast(CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// const_cast(static_cast( -// bias->template data>())), -// const_cast(static_cast( -// est_mean->template data>())), -// const_cast(static_cast( -// est_var->template data>())), -// epsilon)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardInference( - handle, - // Note: PERSISTENT not implemented for inference - CUDNN_BATCHNORM_SPATIAL, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), - est_mean->template data>(), - est_var->template data>(), epsilon)); -#endif - } else { - // if MomentumTensor is set, use MomentumTensor value, momentum - // is only used in this training branch - if (ctx.HasInput("MomentumTensor")) { - const auto *mom_tensor = ctx.Input("MomentumTensor"); - Tensor mom_cpu; - paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), - &mom_cpu); - momentum = mom_cpu.data()[0]; - } - - // Run training mode. - // obtain running mean and running inv var, and there is no need - // to initialize them. - - auto *mean_out = ctx.Output("MeanOut"); - auto *variance_out = ctx.Output("VarianceOut"); - mean_out->mutable_data>(ctx.GetPlace()); - variance_out->mutable_data>(ctx.GetPlace()); - - auto *saved_mean = ctx.Output("SavedMean"); - auto *saved_variance = ctx.Output("SavedVariance"); - saved_mean->mutable_data>(ctx.GetPlace()); - saved_variance->mutable_data>(ctx.GetPlace()); - - if ((N * H * W * D) == 1) { - // Only 1 element in normalization dimension, - // skip the batch norm calculation, let y = x. - framework::TensorCopy(*x, ctx.GetPlace(), y); - } else { - double this_factor = 1. - momentum; - - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - size_t reserve_space_size = 0; - void *reserve_space_ptr = nullptr; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - // Create reserve space and workspace for batch norm. - // Create tensor for each batchnorm op, it will be used in the - // backward. 
Thus this tensor shouldn't be temp. - auto *reserve_space = ctx.Output("ReserveSpace"); - PADDLE_ENFORCE_NOT_NULL( - reserve_space, - platform::errors::NotFound( - "The argument ReserveSpace of batch_norm op is not found.")); - - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*zDesc=*/nullptr, - /*yDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - // -------------- cudnn batchnorm reserve space -------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - /*handle=*/handle, - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*activationDesc=*/nullptr, - /*xDesc=*/data_desc_, - /*sizeInBytes=*/&reserve_space_size)); - - reserve_space_ptr = reserve_space->mutable_data( - ctx.GetPlace(), transformed_x.type(), reserve_space_size); - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTrainingEx( - handle, mode_, CUDNN_BATCHNORM_OPS_BN, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), nullptr, nullptr, data_desc_, - transformed_y.template data(), bn_param_desc_, - scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()), - nullptr, workspace_ptr, workspace_size, reserve_space_ptr, - reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - const int num = transformed_x.numel(); - const int block = 256; - const int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - const int grid = std::min(C, max_blocks); - if (compute_format == DataLayout::kNCHW) { - BNForwardTraining< - T, block, - DataLayout::kNCHW><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } else { - BNForwardTraining< - T, block, - DataLayout::kNHWC><<>>( - transformed_x.template data(), - scale->template data>(), - bias->template data>(), C, N, H * W * D, - epsilon, this_factor, transformed_y.template data(), - mean_out->template data>(), - variance_out->template data>(), - saved_mean->template data>(), - saved_variance->template data>()); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationForwardTraining( -// handle, mode_, const_cast(static_cast( -// CudnnDataType::kOne())), -// const_cast( -// static_cast(CudnnDataType::kZero())), -// data_desc_, -// static_cast(transformed_x.template data()), -// data_desc_, -// static_cast( -// transformed_y.template mutable_data(ctx.GetPlace())), -// bn_param_desc_, -// const_cast(static_cast( -// scale->template data>())), -// 
const_cast(static_cast( -// bias->template data>())), -// this_factor, -// static_cast( -// mean_out->template mutable_data>( -// ctx.GetPlace())), -// static_cast(variance_out->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())), -// epsilon, -// static_cast( -// saved_mean->template mutable_data>( -// ctx.GetPlace())), -// static_cast(saved_variance->template mutable_data< -// BatchNormParamType>(ctx.GetPlace())))); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationForwardTraining( - handle, mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_y.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - bias->template data>(), this_factor, - mean_out->template mutable_data>( - ctx.GetPlace()), - variance_out->template mutable_data>( - ctx.GetPlace()), - epsilon, - saved_mean->template mutable_data>( - ctx.GetPlace()), - saved_variance->template mutable_data>( - ctx.GetPlace()))); -#endif - } - } - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_y, y); - } -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( - const T *dy, const T *x, const BatchNormParamType *mean, - const BatchNormParamType *variance, const double epsilon, const int N, - const int C, const int HxW, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); - BatchNormParamType mean_i = mean[i]; - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - ds_sum += static_cast>(dy[index]) * - (static_cast>(x[index]) - mean_i); - db_sum += static_cast>(dy[index]); - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale[i] = ds_sum * inv_var_i; - dbias[i] = db_sum; - } - __syncthreads(); - } -} - -template -static __global__ void KeBNBackwardData(const T *dy, - const BatchNormParamType *scale, - const BatchNormParamType *variance, - const double epsilon, const int C, - const int HxW, const int num, T *dx) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? i / HxW % C : i % C; - BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); - dx[i] = static_cast(static_cast>(dy[i]) * - scale[c] * inv_var); - } -} - -template -static __global__ void KeBNRestoreData(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, - double epsilon, int C, int M, - const int num, const T *y) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - int stride = blockDim.x * gridDim.x; - for (int i = gid; i < num; i += stride) { - const int c = layout == framework::DataLayout::kNCHW ? (i / M) % C : i % C; - auto y_i = static_cast>(y[i]); - auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; - x[i] = static_cast(x_i); - } -} - -template -class InplaceHelper { - public: - void operator()(const framework::DataLayout layout, T *x, - const BatchNormParamType *scale, - const BatchNormParamType *bias, - const BatchNormParamType *mean, - const BatchNormParamType *variance, double epsilon, int C, - int M, const int num, const T *y, int grid2, const int block, - const gpuStream_t &stream) { - PADDLE_ENFORCE_EQ(x, y, platform::errors::InvalidArgument( - "X and Y should be inplaced in inplace mode")); - KeBNRestoreData<<>>( - layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); - } -}; - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( - const T *dy, const T *x, const BatchNormParamType *scale, - const BatchNormParamType *saved_mean, - const BatchNormParamType *saved_inv_variance, const int C, const int N, - const int HxW, const double epsilon, T *dx, BatchNormParamType *dscale, - BatchNormParamType *dbias) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage ds_storage; - __shared__ typename BlockReduce::TempStorage db_storage; - __shared__ typename BlockReduce::TempStorage mean_storage; - __shared__ typename BlockReduce::TempStorage variance_storeage; - __shared__ BatchNormParamType inv_var_val; - __shared__ BatchNormParamType mean_val; - __shared__ BatchNormParamType dscale_val; - __shared__ BatchNormParamType dbias_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType ds_sum = static_cast>(0); - BatchNormParamType db_sum = static_cast>(0); - - if (saved_mean && saved_inv_variance) { - if (threadIdx.x == 0) { - inv_var_val = saved_inv_variance[i]; - mean_val = saved_mean[i]; - } - } else { - BatchNormParamType x_sum = static_cast>(0); - BatchNormParamType x_square_sum = - static_cast>(0); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - 
const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType x_i = - static_cast>(x[index]); - x_sum += x_i; - x_square_sum += x_i * x_i; - } - x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); - x_square_sum = - BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); - if (threadIdx.x == 0) { - mean_val = x_sum / inner_size; - inv_var_val = - 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); - } - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - ds_sum += - dy_i * (static_cast>(x[index]) - mean_val); - db_sum += dy_i; - } - ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); - db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); - if (threadIdx.x == 0) { - dscale_val = ds_sum * inv_var_val; - dbias_val = db_sum; - dscale[i] = dscale_val; - dbias[i] = dbias_val; - } - __syncthreads(); - - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = scale[i] * inv_var_val * - (static_cast>(dy[index]) - - dbias_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_val) * - inv_var_val * dscale_val / inner_size); - } - } -} - -template -static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( - const T *dy, const BatchNormParamType *scale, - const BatchNormParamType *mean, const T *x, - const BatchNormParamType *variance, const int C, const int N, - const int HxW, T *dx) { - const int outer_size = C; - const int inner_size = N * HxW; - typedef cub::BlockReduce, BlockDim> BlockReduce; - __shared__ typename BlockReduce::TempStorage dy_storage; - __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; - __shared__ BatchNormParamType dy_sum_val; - __shared__ BatchNormParamType dy_x_sub_mean_sum_val; - - for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { - BatchNormParamType inv_var_i = variance[i]; - BatchNormParamType mean_i = mean[i]; - BatchNormParamType dy_sum = static_cast>(0); - BatchNormParamType dy_x_sub_mean_sum = - static_cast>(0); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? (j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - BatchNormParamType dy_i = - static_cast>(dy[index]); - dy_sum += dy_i; - dy_x_sub_mean_sum += - dy_i * (static_cast>(x[index]) - mean_i); - } - - dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); - dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) - .Reduce(dy_x_sub_mean_sum, cub::Sum()); - - if (threadIdx.x == 0) { - dy_sum_val = dy_sum; - dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; - } - __syncthreads(); - for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { - const int index = layout == framework::DataLayout::kNCHW - ? 
(j / HxW * C + i) * HxW + j % HxW - : j * outer_size + i; - dx[index] = - (static_cast>(dy[index]) - - dy_sum_val / static_cast>(inner_size) - - (static_cast>(x[index]) - mean_i) * - dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * - scale[i] * inv_var_i; - } - } -} - -template -class BatchNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_gpu_place(ctx.GetPlace()), true, - platform::errors::InvalidArgument("It must use CUDAPlace.")); - double epsilon = static_cast(ctx.Attr("epsilon")); - const std::string data_layout_str = ctx.Attr("data_layout"); - bool use_global_stats = ctx.Attr("use_global_stats"); - - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - // batch_norm with inplace as false will take X as grad input, which - // is same as cuDNN batch_norm backward calculation, batch_norm - // with inplace as true only take Y as input and X should be calculate - // by inverse operation of batch_norm on Y - const Tensor *x; - bool is_inplace; - if (ctx.HasInput("Y")) { - x = ctx.Input("Y"); - is_inplace = true; - if (d_x) { - PADDLE_ENFORCE_EQ(d_x, d_y, - platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD not inplace in inplace mode")); - } - } else { - x = ctx.Input("X"); - is_inplace = false; - if (d_x) { - PADDLE_ENFORCE_NE( - d_x, d_y, platform::errors::InvalidArgument( - "X@GRAD and Y@GRAD inplaced in non-inplace mode")); - } - } - - const bool is_test = ctx.Attr("is_test"); - use_global_stats = is_test || use_global_stats; - - const auto &x_dims = x->dims(); - - PADDLE_ENFORCE_EQ( - x_dims.size() >= 2 && x_dims.size() <= 5, true, - platform::errors::InvalidArgument( - "The size of input's dimensions should be between 2 and 5." - "But received: the size of input's dimensions is [%d]," - "the dimensions of input is [%s]", - x_dims.size(), x_dims)); - int N, C, H, W, D; - ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); - - // init output - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - } - - if (d_scale && d_bias) { - d_scale->mutable_data>(ctx.GetPlace()); - d_bias->mutable_data>(ctx.GetPlace()); - } - PADDLE_ENFORCE_EQ( - scale->dims().size(), 1UL, - platform::errors::InvalidArgument( - "The size of scale's dimensions must equal to 1. But received: " - "the size of scale's dimensions is [%d], the dimensions of scale " - "is [%s].", - scale->dims().size(), scale->dims())); - PADDLE_ENFORCE_EQ( - scale->dims()[0], C, - platform::errors::InvalidArgument( - "The first dimension of scale must equal to Channels[%d]. But " - "received: the first dimension of scale is [%d]", - C, scale->dims()[0])); - - auto dtype = platform::CudnnDataType::type; - const auto *reserve_space = ctx.Input("ReserveSpace"); -#ifdef PADDLE_WITH_HIP - auto compute_format = data_layout == DataLayout::kNHWC ? 
DataLayout::kNHWC - : DataLayout::kNCHW; - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// HIP do not support compute format of NHWC -// auto compute_format = DataLayout::kNCHW; -#else - const bool fast_nhwc_batch_norm = - dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent && - reserve_space != nullptr; - auto compute_format = - fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC - ? DataLayout::kNHWC - : DataLayout::kNCHW; -#endif - - Tensor transformed_x(x->type()); - Tensor transformed_d_y(d_y->type()); - Tensor transformed_d_x; - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW && x_dims.size() > 2) { - VLOG(3) << "Transform input tensor from NHWC to NCHW."; - ResizeToChannelFirst(ctx, x, - &transformed_x); - TransToChannelFirst(ctx, x, - &transformed_x); - ResizeToChannelFirst(ctx, d_y, - &transformed_d_y); - TransToChannelFirst(ctx, d_y, - &transformed_d_y); - if (d_x) { - ResizeToChannelFirst(ctx, d_x, - &transformed_d_x); - } - } else { - transformed_x.ShareDataWith(*x); - transformed_d_y.ShareDataWith(*d_y); - if (d_x) { - transformed_d_x.ShareDataWith(*d_x); - } - } - - std::vector dims; - std::vector strides; - if (compute_format == DataLayout::kNCHW) { - dims = {N, C, H, W, D}; - strides = {C * H * W * D, H * W * D, W * D, D, 1}; - } else { - dims = {N, C, H, W, D}; - strides = {H * W * C * D, 1, W * D * C, D * C, C}; - } - - auto &dev_ctx = ctx.template device_context(); - const int num = transformed_x.numel(); -#ifdef HIPCC - const int block = 256; -#else - const int block = 512; -#endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - const int max_blocks = std::max(max_threads / block, 1); - int grid1 = (num + block - 1) / block; - int grid2 = std::min(C, max_blocks); - auto stream = dev_ctx.stream(); - InplaceHelper inplace_functor; - - if (!use_global_stats) { - if ((N * H * W * D) == 1) { - if (d_x) { - framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - } - phi::funcs::SetConstant> - functor; - functor(dev_ctx, d_scale, static_cast>(0)); - functor(dev_ctx, d_bias, static_cast>(0)); - return; - } - -// ------------------- cudnn descriptors --------------------- -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// miopenTensorDescriptor_t data_desc_; -// miopenTensorDescriptor_t bn_param_desc_; -// miopenBatchNormMode_t mode_; - -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); -#else - cudnnTensorDescriptor_t data_desc_; - cudnnTensorDescriptor_t bn_param_desc_; - cudnnBatchNormMode_t mode_; - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); -#endif - if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { - LOG(ERROR) << "Provided epsilon is smaller than " - << "CUDNN_BN_MIN_EPSILON. 
Setting it to " - << "CUDNN_BN_MIN_EPSILON instead."; - } - epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// mode_ = miopenBNSpatial; -#elif CUDNN_VERSION_MIN(7, 0, 1) - if (FLAGS_cudnn_batchnorm_spatial_persistent) { - mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#else - if (H == 1 && W == 1) { - mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; - } else { - mode_ = CUDNN_BATCHNORM_SPATIAL; - } -#endif // CUDNN_VERSION_MIN(7, 0, 1) - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( -// data_desc_, CudnnDataType::type, -// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), -// const_cast(strides.data()))); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, -// data_desc_, mode_)); -#else - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetTensorNdDescriptor( - data_desc_, CudnnDataType::type, - x_dims.size() > 3 ? x_dims.size() : 4, dims.data(), strides.data())); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDeriveBNTensorDescriptor(bn_param_desc_, - data_desc_, mode_)); -#endif - - const auto *saved_mean = ctx.Input("SavedMean"); - const auto *saved_var = ctx.Input("SavedVariance"); - const auto *saved_mean_data = - saved_mean->template data>(); - const auto *saved_var_data = - saved_var->template data>(); - - if (is_inplace) { - inplace_functor(compute_format, transformed_x.data(), - scale->template data>(), - bias->template data>(), - saved_mean_data, saved_var_data, epsilon, C, H * W * D, - num, transformed_x.data(), grid2, block, stream); - } - - // This branch calls CUDNN APIs - if (d_x && d_scale && d_bias) { - bool called = false; -#if CUDNN_VERSION_MIN(7, 4, 1) - called = true; - size_t workspace_size = 0; - void *workspace_ptr = nullptr; - Tensor workspace_tensor; - auto reserve_space_size = reserve_space->memory_size(); - // --------------- cudnn batchnorm workspace --------------- - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload:: - cudnnGetBatchNormalizationBackwardExWorkspaceSize( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, - /*xDesc=*/data_desc_, - /*yDesc=*/data_desc_, - /*dyDesc=*/data_desc_, - /*dzDesc=*/nullptr, - /*dxDesc=*/data_desc_, - /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, - /*activationDesc=*/nullptr, - /*sizeInBytes=*/&workspace_size)); - - workspace_ptr = workspace_tensor.mutable_data( - ctx.GetPlace(), transformed_x.type(), workspace_size); - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackwardEx( - /*handle=*/dev_ctx.cudnn_handle(), - /*mode=*/mode_, - /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, - /*alphaDataDiff=*/CudnnDataType::kOne(), - /*betaDataDiff=*/CudnnDataType::kZero(), - /*alphaParamDiff=*/CudnnDataType::kOne(), - /*betaParamDiff=*/CudnnDataType::kZero(), - /*xDesc=*/data_desc_, - /*xData=*/transformed_x.template data(), - /*yDesc=*/nullptr, - /*yData=*/nullptr, - /*dyDesc=*/data_desc_, - /*dyData=*/transformed_d_y.template data(), - /*dzDesc=*/nullptr, - /*dzData=*/nullptr, - /*dxDesc=*/data_desc_, - /*dxData=*/transformed_d_x.template mutable_data( - ctx.GetPlace()), - /*dBnScaleBiasDesc=*/bn_param_desc_, - /*bnScaleData=*/scale->template data>(), - 
/*bnBiasData=*/nullptr, - /*dBnScaleData=*/d_scale - ->template mutable_data>( - ctx.GetPlace()), - /*dBnBiasData=*/d_bias - ->template mutable_data>( - ctx.GetPlace()), - /*epsilon=*/epsilon, - /*savedMean=*/saved_mean_data, - /*savedInvVariance=*/saved_var_data, - /*activationDesc=*/nullptr, - /*workspace=*/workspace_ptr, - /*workSpaceSizeInBytes=*/workspace_size, - /*reserveSpace=*/const_cast( - reserve_space->template data()), - /*reserveSpaceSizeInBytes=*/reserve_space_size)); -#endif // CUDNN_VERSION_MIN(7, 4, 1) - if (!called) { -#ifdef PADDLE_WITH_HIP - if (compute_format == DataLayout::kNCHW) { - BNBackward< - T, block, - DataLayout::kNCHW><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } else { - BNBackward< - T, block, - DataLayout::kNHWC><<>>( - transformed_d_y.template data(), - transformed_x.template data(), - scale->template data>(), saved_mean_data, - saved_var_data, C, N, H * W * D, epsilon, - transformed_d_x.template data(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace())); - } - -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenBatchNormalizationBackward( -// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), -// CudnnDataType::kZero(), CudnnDataType::kOne(), -// CudnnDataType::kZero(), data_desc_, -// transformed_x.template data(), data_desc_, -// transformed_d_y.template data(), data_desc_, -// transformed_d_x.template mutable_data(ctx.GetPlace()), -// bn_param_desc_, scale->template data>(), -// d_scale->template mutable_data>( -// ctx.GetPlace()), -// d_bias->template mutable_data>( -// ctx.GetPlace()), -// epsilon, saved_mean_data, saved_var_data)); -#else - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnBatchNormalizationBackward( - dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), - CudnnDataType::kZero(), CudnnDataType::kOne(), - CudnnDataType::kZero(), data_desc_, - transformed_x.template data(), data_desc_, - transformed_d_y.template data(), data_desc_, - transformed_d_x.template mutable_data(ctx.GetPlace()), - bn_param_desc_, scale->template data>(), - d_scale->template mutable_data>( - ctx.GetPlace()), - d_bias->template mutable_data>( - ctx.GetPlace()), - epsilon, saved_mean_data, saved_var_data)); -#endif - } - - if (data_layout == DataLayout::kNHWC && - compute_format == DataLayout::kNCHW) { - VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; - TransToChannelLast( - ctx, &transformed_d_x, d_x); - } - } else { - // This branch call CUDA kernels - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - BNBackwardData<<< - grid2, block, 0, dev_ctx.stream()>>>( - d_y->data(), scale->data>(), - saved_mean_data, x->data(), saved_var_data, C, N, H * W * D, - d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< 
- T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), saved_mean_data, saved_var_data, - epsilon, N, C, H * W * D, - d_scale->data>(), - d_bias->data>()); - } - } - } - -#ifdef PADDLE_WITH_HIP -// TODO(wangran16): wait for MIOpen to improve the performance of BN -// clean when exit. -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); -// PADDLE_ENFORCE_GPU_SUCCESS( -// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); -#else - // clean when exit. - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); -#endif - } else { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); - - const auto *running_mean_data = - running_mean->template data>(); - const auto *running_var_data = - running_var->template data>(); - - if (is_inplace) { - auto px = *x; - inplace_functor(data_layout, px.mutable_data(ctx.GetPlace()), - scale->template data>(), - bias->template data>(), - running_mean_data, running_var_data, epsilon, C, - H * W * D, num, x->data(), grid2, block, stream); - } - - if (compute_format == DataLayout::kNCHW) { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNCHW><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNCHW><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } else { - if (d_x) { - KeBNBackwardData< - T, framework::DataLayout::kNHWC><<>>( - d_y->data(), scale->data>(), - running_var_data, epsilon, C, H * W, num, d_x->data()); - } - if (d_scale && d_bias) { - KeBNBackwardScaleBias< - T, block, - framework::DataLayout::kNHWC><<>>( - d_y->data(), x->data(), running_mean_data, running_var_data, - epsilon, N, C, H * W * D, d_scale->data>(), - d_bias->data>()); - } - } - } - } -}; - -template -class BatchNormDoubleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *X = ctx.Input("X"); - const auto *Scale = ctx.Input("Scale"); - const auto *dY = ctx.Input("DY"); - const auto *Saved_mean = ctx.Input("SavedMean"); - const auto *Saved_variance = ctx.Input("SavedVariance"); - const double epsilon = static_cast(ctx.Attr("epsilon")); - const bool use_global_stats = ctx.Attr("use_global_stats"); - const bool is_test = ctx.Attr("is_test"); - - PADDLE_ENFORCE_EQ( - is_test, false, - platform::errors::InvalidArgument( - "`is_test = True` CANNOT be used in train program. 
If " - "you want to use global status in pre_train model, " - "please set `use_global_stats = True`")); - - const std::string data_layout_str = ctx.Attr("data_layout"); - const DataLayout data_layout = - framework::StringToDataLayout(data_layout_str); - - const auto *ddX = ctx.Input("DDX"); - const auto *ddScale = ctx.Input("DDScale"); - const auto *ddBias = ctx.Input("DDBias"); - - auto *dX = ctx.Output("DX"); - auto *dScale = ctx.Output("DScale"); - auto *ddY = ctx.Output("DDY"); - - NormDoubleGradFunctor( - ctx, data_layout, X, Scale, dY, Saved_mean, Saved_variance, epsilon, - use_global_stats, ddX, ddScale, ddBias, dX, dScale, ddY); - } -}; - } // namespace operators } // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -#ifdef PADDLE_WITH_HIP -// MIOPEN do not support double -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel); -#else -REGISTER_OP_CUDA_KERNEL( - batch_norm, ops::BatchNormKernel, - ops::BatchNormKernel, - ops::BatchNormKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad, ops::BatchNormGradKernel, - ops::BatchNormGradKernel, - ops::BatchNormGradKernel); -REGISTER_OP_CUDA_KERNEL( - batch_norm_grad_grad, - ops::BatchNormDoubleGradKernel, - ops::BatchNormDoubleGradKernel); -#endif diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 6119af18ce153..b3ac3606eaf8e 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -32,7 +32,7 @@ namespace platform = paddle::platform; namespace op = paddle::operators; using Tensor = paddle::framework::Tensor; -USE_OP(batch_norm); +USE_OP_ITSELF(batch_norm); USE_CUDA_ONLY_OP(fused_bn_add_activation); USE_CUDA_ONLY_OP(fused_bn_add_activation_grad); diff --git a/paddle/fluid/operators/inplace_abn_op.cc b/paddle/fluid/operators/inplace_abn_op.cc index e0779249c41ad..7f5136969980b 100644 --- a/paddle/fluid/operators/inplace_abn_op.cc +++ b/paddle/fluid/operators/inplace_abn_op.cc @@ -17,6 +17,8 @@ #include #include #include "paddle/fluid/operators/batch_norm_op.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { @@ -202,8 +204,7 @@ class InplaceABNOpGradMaker : public framework::SingleGradOpMaker { }; template -class InplaceABNKernel - : public paddle::operators::BatchNormKernel { +class InplaceABNKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* x = ctx.Input("X"); @@ -213,7 +214,33 @@ class InplaceABNKernel auto activation = GetInplaceABNActivationType(ctx.Attr("activation")); auto& place = *ctx.template device_context().eigen_device(); - BatchNormKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = 
ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); auto cur_y = EigenVector::Flatten(*y); InplaceABNActivation functor; @@ -222,8 +249,7 @@ class InplaceABNKernel }; template -class InplaceABNGradKernel - : public paddle::operators::BatchNormGradKernel { +class InplaceABNGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Input("Y"); @@ -244,7 +270,52 @@ class InplaceABNGradKernel InplaceABNActivation functor; functor.GradCompute(ctx, activation, place, cur_y, cur_y, cur_dy, cur_dy); - BatchNormGradKernel::Compute(ctx); + // BatchNormGradKernel::Compute(ctx); + + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } }; diff --git a/paddle/fluid/operators/inplace_abn_op.cu b/paddle/fluid/operators/inplace_abn_op.cu index be7a7bd71711e..db8f8c72d13f8 100644 --- a/paddle/fluid/operators/inplace_abn_op.cu +++ b/paddle/fluid/operators/inplace_abn_op.cu @@ -15,14 +15,15 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/batch_norm_op.h" #include "paddle/fluid/operators/inplace_abn_op.h" #include "paddle/fluid/operators/sync_batch_norm_op.cu.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" namespace paddle { namespace operators { template class InplaceABNKernel - : public paddle::operators::SyncBatchNormKernel, - public paddle::operators::BatchNormKernel { + : public paddle::operators::SyncBatchNormKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* y = ctx.Output("Y"); @@ -36,7 +37,33 @@ class InplaceABNKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormKernel::Compute(ctx); } else { - BatchNormKernel::Compute(ctx); + // BatchNormKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* mean = ctx.Input("Mean"); + auto* variance = ctx.Input("Variance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* mean_out = ctx.Output("MeanOut"); + auto* variance_out = ctx.Output("VarianceOut"); + auto* saved_mean = ctx.Output("SavedMean"); + auto* saved_variance = ctx.Output("SavedVariance"); + auto* reserve_space = ctx.Output("ReserveSpace"); + + auto& dev_ctx = ctx.device_context(); + phi::BatchNormKernel( + static_cast::TYPE&>(dev_ctx), + *x, *scale, *bias, *mean, *variance, momentum, epsilon, data_layout, + is_test, use_global_stats, trainable_statistics, fuse_with_relu, y, + mean_out, variance_out, saved_mean, saved_variance, reserve_space); } auto cur_y = EigenVector::Flatten(*y); @@ -49,8 +76,7 @@ class InplaceABNKernel // https://kevinzakka.github.io/2016/09/14/batch_normalization/ template class InplaceABNGradKernel - : public paddle::operators::SyncBatchNormGradKernel, - public paddle::operators::BatchNormGradKernel { + : public paddle::operators::SyncBatchNormGradKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { const auto* y = ctx.Input("Y"); @@ -74,7 +100,50 @@ class InplaceABNGradKernel if (ctx.Attr("use_sync_bn")) { SyncBatchNormGradKernel::Compute(ctx); } else { - BatchNormGradKernel::Compute(ctx); + auto* scale = ctx.Input("Scale"); + auto* bias = ctx.Input("Bias"); + auto* saved_mean = ctx.Input("SavedMean"); + auto* saved_variance = ctx.Input("SavedVariance"); + + auto momentum = ctx.Attr("momentum"); + auto epsilon = ctx.Attr("epsilon"); + auto data_layout = ctx.Attr("data_layout"); + auto is_test = ctx.Attr("is_test"); + auto use_global_stats = ctx.Attr("use_global_stats"); + auto trainable_statistics = ctx.Attr("trainable_statistics"); + auto fuse_with_relu = ctx.Attr("fuse_with_relu"); + + auto* scale_grad = ctx.Output(framework::GradVarName("Scale")); + auto* bias_grad = ctx.Output(framework::GradVarName("Bias")); + + auto* reserve_space = ctx.Input("ReserveSpace"); + auto* mean = ctx.Input("ReserveSpace"); + auto* variance = ctx.Input("ReserveSpace"); + + paddle::optional space_opt = paddle::none; + paddle::optional mean_opt = paddle::none; + paddle::optional variance_opt = paddle::none; + + if (reserve_space != nullptr) { + space_opt = *reserve_space; + } + + if (mean != nullptr) { + mean_opt = *mean; + } + + if (variance != nullptr) { + variance_opt = *variance; + } + + auto& dev_ctx = 
ctx.device_context(); + phi::BatchNormGradRawKernel( + static_cast::TYPE&>(dev_ctx), + *d_y, *y, *scale, *bias, *saved_mean, *saved_variance, space_opt, + mean_opt, variance_opt, momentum, epsilon, data_layout, is_test, + use_global_stats, trainable_statistics, fuse_with_relu, true, d_x, + scale_grad, bias_grad); } } }; diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index c400a8f4239a6..0ed1f2719de25 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -389,11 +389,12 @@ __global__ void DoubleGradComputeDDYWithGlobal( } template -void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, +void NormDoubleGradFunctor(const DeviceContext &ctx, const DataLayout data_layout, const Tensor *X, const Tensor *Scale, const Tensor *dY, const Tensor *Saved_mean, - const Tensor *Saved_variance, const double epsilon, + const Tensor *Saved_variance, const Tensor *Mean, + const Tensor *Variance, const double epsilon, const bool use_global_stats, const Tensor *ddX, const Tensor *ddScale, const Tensor *ddBias, Tensor *dX, Tensor *dScale, Tensor *ddY) { @@ -404,8 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddscale_data = (ddScale == nullptr ? nullptr : ddScale->data()); const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); - auto &dev_ctx = ctx.template device_context(); - phi::funcs::SetConstant set_constant; + phi::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] @@ -416,7 +416,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, Tensor scale_tmp; if (!Scale) { scale_tmp.mutable_data({C}, ctx.GetPlace()); - set_constant(dev_ctx, &scale_tmp, static_cast(1)); + set_constant(ctx, &scale_tmp, static_cast(1)); } const T *scale_data = Scale ? 
Scale->data() : scale_tmp.data(); #ifdef __HIPCC__ @@ -424,15 +424,15 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, #else const int block = 512; #endif - int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + int max_threads = ctx.GetMaxPhysicalThreadCount(); const int max_blocks = std::max(max_threads / block, 1); int grid = std::min(C, max_blocks); int grid1 = (num + block - 1) / block; const T *mean_data, *variance_data; if (use_global_stats) { - const auto *running_mean = ctx.Input("Mean"); - const auto *running_var = ctx.Input("Variance"); + const auto *running_mean = Mean; + const auto *running_var = Variance; const auto *running_mean_data = running_mean->template data(); const auto *running_var_data = running_var->template data(); mean_data = running_mean_data; @@ -440,34 +440,35 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } else { const T *smean_data = Saved_mean->data(); const T *svariance_data = Saved_variance->data(); + mean_data = smean_data; variance_data = svariance_data; } if (dX) { T *dx_data = dX->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dX, static_cast(0)); + set_constant(ctx, dX, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } else { DoubleGradComputeDXWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( dy_data, ddscale_data, variance_data, epsilon, C, sample_size, num, dx_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDX< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } else { DoubleGradComputeDX< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, scale_data, ddscale_data, N, C, sample_size, epsilon, dx_data); } @@ -475,28 +476,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (dScale) { T *dscale_data = dScale->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, dScale, static_cast(0)); + set_constant(ctx, dScale, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } else { DoubleGradComputeDScaleWithGlobal< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( ddx_data, variance_data, dy_data, epsilon, N, C, sample_size, dscale_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDScale< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } else { DoubleGradComputeDScale< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddx_data, dy_data, N, C, sample_size, epsilon, dscale_data); } @@ -504,28 +505,28 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, } if (ddY) { T *ddy_data = ddY->mutable_data(ctx.GetPlace()); - set_constant(dev_ctx, ddY, static_cast(0)); + set_constant(ctx, ddY, static_cast(0)); if (use_global_stats) { if (data_layout == DataLayout::kNHWC) { 
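          // Note on the use_global_stats launches below: with running
          // statistics the second-order forward output reduces to an
          // element-wise expression (matching the math comments in the CPU
          // double-grad kernel later in this patch), roughly
          //   inv_std = 1 / sqrt(running_var + epsilon)
          //   ddy     = scale * ddx * inv_std
          //           + ddscale * (x - mean) * inv_std
          //           + ddbias
          // so no per-channel block reduction is required and the 1-D
          // element-wise grid configuration can be used for these kernels.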
DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNHWC><<>>( + T, DataLayout::kNHWC><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } else { DoubleGradComputeDDYWithGlobal< - T, DataLayout::kNCHW><<>>( + T, DataLayout::kNCHW><<>>( ddx_data, scale_data, mean_data, variance_data, x_data, ddbias_data, ddscale_data, epsilon, C, sample_size, num, ddy_data); } } else { if (data_layout == DataLayout::kNHWC) { DoubleGradComputeDDY< - T, block, DataLayout::kNHWC><<>>( + T, block, DataLayout::kNHWC><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } else { DoubleGradComputeDDY< - T, block, DataLayout::kNCHW><<>>( + T, block, DataLayout::kNCHW><<>>( x_data, mean_data, variance_data, ddscale_data, ddbias_data, ddx_data, scale_data, N, C, sample_size, epsilon, ddy_data); } diff --git a/paddle/phi/kernels/batch_norm_grad_kernel.h b/paddle/phi/kernels/batch_norm_grad_kernel.h new file mode 100644 index 0000000000000..c15dbd2f63f58 --- /dev/null +++ b/paddle/phi/kernels/batch_norm_grad_kernel.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
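// The NormDoubleGradFunctor hunk above now receives the DeviceContext and the
// running Mean/Variance tensors as explicit arguments instead of pulling them
// out of a framework::ExecutionContext, so the same functor can be driven from
// the new phi kernels. A minimal standalone sketch of its launch-configuration
// arithmetic follows; C, num and max_threads are hypothetical example values
// (max_threads stands in for ctx.GetMaxPhysicalThreadCount()).
#include <algorithm>
#include <cstdio>

int main() {
  const int block = 512;             // 256 under __HIPCC__ in the functor
  const int C = 64;                  // hypothetical channel count
  const int num = 8 * 64 * 32 * 32;  // hypothetical N * C * H * W element count
  const int max_threads = 4096;      // hypothetical device thread capacity

  const int max_blocks = std::max(max_threads / block, 1);
  const int grid = std::min(C, max_blocks);     // per-channel reduction kernels
  const int grid1 = (num + block - 1) / block;  // element-wise kernels
  std::printf("grid=%d grid1=%d\n", grid, grid1);
  return 0;
}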
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BatchNormGradRawKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +template +void BatchNormDoubleGradKernel(const Context& dev_ctx, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* y_grad_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/batch_norm_kernel.h b/paddle/phi/kernels/batch_norm_kernel.h new file mode 100644 index 0000000000000..7ddf32e27c7d7 --- /dev/null +++ b/paddle/phi/kernels/batch_norm_kernel.h @@ -0,0 +1,43 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
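// The declarations above expose the backward pass in two layers: a raw kernel
// with an explicit is_inplace flag and a thin wrapper without it, both taking
// ReserveSpace and the running Mean/Variance as optional inputs. A minimal
// standalone sketch of the optional-input pattern used by the callers in
// inplace_abn_op follows; std::optional and the Tensor struct here are
// illustrative stand-ins for paddle::optional and phi::DenseTensor.
#include <optional>
#include <vector>

struct Tensor {
  std::vector<float> data;
};

// Wrap a nullable input only when it was actually fed, mirroring the
// space_opt / mean_opt / variance_opt handling earlier in this patch.
std::optional<Tensor> WrapIfPresent(const Tensor* maybe) {
  std::optional<Tensor> opt;  // starts empty, the analogue of paddle::none
  if (maybe != nullptr) {
    opt = *maybe;
  }
  return opt;
}

int main() {
  Tensor reserve_space;
  auto space_opt = WrapIfPresent(&reserve_space);  // engaged
  auto mean_opt = WrapIfPresent(nullptr);          // stays empty
  return (space_opt.has_value() && !mean_opt.has_value()) ? 0 : 1;
}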
+ +#pragma once + +#include +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void BatchNormKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); + +} // namespace phi diff --git a/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc new file mode 100644 index 0000000000000..de2343a384a5b --- /dev/null +++ b/paddle/phi/kernels/cpu/batch_norm_grad_kernel.cc @@ -0,0 +1,674 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +namespace phi { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +void BatchNormGradRawKernel(const Context& ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + const auto* d_y = &y_grad; + + DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + auto* d_x = x_grad; + auto* d_scale = scale_grad; + auto* d_bias = bias_grad; + + use_global_stats = is_test || use_global_stats; + + // batch_norm with inplace as false will take X as grad input, which + // is same as cuDNN batch_norm backward calculation, batch_norm + // with inplace as true only take Y as input and X should be calculate + // by inverse operation of batch_norm on Y + + if (is_inplace) { + if (d_x) { + PADDLE_ENFORCE_EQ(d_x, + d_y, + phi::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + } + } else { + if (d_x) { + PADDLE_ENFORCE_NE(d_x, + d_y, + phi::errors::InvalidArgument( + "X@GRAD and Y@GRAD inplaced in non-inplace mode")); + } + } + + // Get the size for each dimension. 
+ // NCHW [batch_size, in_channels, in_height, in_width] + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be larger than 1." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be less than 6." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + const int N = x_dims[0]; + const int C = (data_layout == DataLayout::kNCHW ? x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x.numel() / N / C; + + // input dimension is 2 and the format is NCHW. The input can be regarded as + // NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + + // init output + if (d_x) { + ctx.template Alloc(d_x); + } + + const T* mean_data = saved_mean.data(); + const T* inv_var_data = saved_variance.data(); + DenseTensor inv_var_tensor; + if (use_global_stats) { + const auto* running_mean = mean.get_ptr(); + const auto* running_variance = variance.get_ptr(); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + T* running_inv_var_data = ctx.template Alloc(&inv_var_tensor); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + ConstEigenVectorArrayMap scale_arr(scale.data(), C); + ConstEigenVectorArrayMap bias_arr(bias.data(), C); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + T* d_bias_data = nullptr; + T* d_scale_data = nullptr; + if (d_scale && d_bias) { + d_bias_data = ctx.template Alloc(d_bias); + d_scale_data = ctx.template Alloc(d_scale); + } + + // d_bias = np.sum(d_y, axis=0) + // d_scale = np.sum((X - mean) / inv_std * dy, axis=0) + // d_x = (1. / N) * scale * inv_var * (N * d_y - np.sum(d_y, axis=0) + // - (X - mean) * inv_var * inv_var * np.sum(d_y * (X - mean), axis=0)) + EigenVectorArrayMap d_bias_arr(d_bias_data, C); + EigenVectorArrayMap d_scale_arr(d_scale_data, C); + + if (d_scale && d_bias) { + d_bias_arr.setZero(); + d_scale_arr.setZero(); + } + + if (d_x && (N * sample_size) == 1 && !use_global_stats) { + paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + return; + } + + int scale_coefff = use_global_stats ? 
1 : N * sample_size; + const auto scale_inv_var_nhw = scale_arr * inv_var_arr / scale_coefff; + + DenseTensor dy_sum; + dy_sum.Resize({C}); + auto dy_sum_data = ctx.template Alloc(&dy_sum); + EigenVectorArrayMap dy_sum_arr(dy_sum_data, C); + + DenseTensor dy_mul_x_sub_mean_mul_invstd_sum; + dy_mul_x_sub_mean_mul_invstd_sum.Resize({C}); + auto dy_mul_x_sub_mean_mul_invstd_sum_data = + ctx.template Alloc(&dy_mul_x_sub_mean_mul_invstd_sum); + EigenVectorArrayMap dy_mul_x_sub_mean_mul_invstd_sum_arr( + dy_mul_x_sub_mean_mul_invstd_sum_data, C); + + dy_sum_arr.setZero(); + dy_mul_x_sub_mean_mul_invstd_sum_arr.setZero(); + + // inplace calculation + // Y: ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + // X: (y - bias) / scale / (inv_var) + est_mean + // formula transform ====> + // (y - bias) / (scale * inv_var) + est_mean + switch (data_layout) { + case DataLayout::kNCHW: { + if (is_inplace) { + auto px = x; + EigenArrayMap x_data(ctx.template Alloc(&px), sample_size, N * C); + ConstEigenArrayMap y_data(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + x_data.col(nc) = (y_data.col(nc) - bias_arr(nc % C)) / + scale_inv_var_nhw(nc % C) / scale_coefff + + mean_arr(nc % C); + } + } + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + ConstEigenArrayMap d_y_arr(d_y->data(), sample_size, N * C); + + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + dy_sum_arr(c) += d_y_arr.col(nc).sum(); + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) += + ((x_arr.col(nc) - mean_arr(c)) * inv_var_arr(c) * d_y_arr.col(nc)) + .sum(); + } + + if (d_scale && d_bias) { + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; + } + + if (d_x) { + EigenArrayMap d_x_arr( + ctx.template Alloc(d_x), sample_size, N * C); + if (!use_global_stats) { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) = + scale_inv_var_nhw(c) * + (d_y_arr.col(nc) * N * sample_size - dy_sum_arr(c) - + (x_arr.col(nc) - mean_arr[c]) * + dy_mul_x_sub_mean_mul_invstd_sum_arr(c) * inv_var_arr(c)); + } + } else { + for (int nc = 0; nc < N * C; ++nc) { + int c = nc % C; + d_x_arr.col(nc) = scale_inv_var_nhw(c) * d_y_arr.col(nc); + } + } + } + break; + } + case DataLayout::kNHWC: { + if (is_inplace) { + auto px = x; + EigenArrayMap x_data(ctx.template Alloc(&px), C, N * sample_size); + ConstEigenArrayMap y_data(x.data(), C, N * sample_size); + for (int nhw = 0; nhw < N * sample_size; nhw++) { + x_data.col(nhw) = + (y_data.col(nhw) - bias_arr) / scale_inv_var_nhw / scale_coefff + + mean_arr; + } + } + ConstEigenArrayMap x_arr(x.data(), C, N * sample_size); + ConstEigenArrayMap d_y_arr(d_y->data(), C, N * sample_size); + + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + dy_sum_arr += d_y_arr.col(nhw); + dy_mul_x_sub_mean_mul_invstd_sum_arr += + (x_arr.col(nhw) - mean_arr) * inv_var_arr * d_y_arr.col(nhw); + } + + if (d_scale && d_bias) { + d_bias_arr = dy_sum_arr; + d_scale_arr = dy_mul_x_sub_mean_mul_invstd_sum_arr; + } + + if (d_x) { + EigenArrayMap d_x_arr( + ctx.template Alloc(d_x), C, N * sample_size); + if (!use_global_stats) { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) = + scale_inv_var_nhw * + (d_y_arr.col(nhw) * N * sample_size - dy_sum_arr - + (x_arr.col(nhw) - mean_arr) * + dy_mul_x_sub_mean_mul_invstd_sum_arr * inv_var_arr); + } + } else { + for (int nhw = 0; nhw < N * sample_size; ++nhw) { + d_x_arr.col(nhw) = scale_inv_var_nhw * d_y_arr.col(nhw); 
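            // Worked form of the two dX branches above, with
            // inv_std = 1 / sqrt(var + epsilon) and m = N * sample_size:
            //   training:
            //     dx = (scale * inv_std / m) *
            //          (m * dy - sum(dy)
            //           - (x - mean) * inv_std^2 * sum(dy * (x - mean)))
            //   use_global_stats:
            //     dx = dy * scale * inv_std
            // The statement above is the global-stats case: scale_inv_var_nhw
            // already folds scale * inv_std, and scale_coefff is 1 when
            // use_global_stats is true.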
+ } + } + } + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", + data_layout_str)); + } +} + +template +void BatchNormGradKernel(const Context& dev_ctx, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + BatchNormGradRawKernel(dev_ctx, + y_grad, + x, + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + false, + x_grad, + scale_grad, + bias_grad); +} + +template +void BatchNormDoubleGradKernel(const Context& ctx, + const DenseTensor& x_grad_grad, + const DenseTensor& scale_grad_grad, + const DenseTensor& bias_grad_grad, + const DenseTensor& y_grad, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* y_grad_grad) { + const auto* X = &x; + const auto* Scale = &scale; + const auto* dY = &y_grad; + const auto* Saved_mean = &saved_mean; + const auto* Saved_variance = &saved_variance; + + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const auto data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto* ddX = &x_grad_grad; + const auto* ddScale = &scale_grad_grad; + const auto* ddBias = &bias_grad_grad; + + auto* dX = x_grad; + auto* dScale = scale_grad; + auto* ddY = y_grad_grad; + ctx.template Alloc(dX); + ctx.template Alloc(ddY); + + const auto& x_dims = X->dims(); + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = X->numel() / C; + phi::funcs::SetConstant set_constant; + + const T* mean_data = Saved_mean->data(); + const T* inv_var_data = Saved_variance->data(); + + DenseTensor inv_var_tensor; + if (use_global_stats) { + const auto* running_mean = mean.get_ptr(); + const auto* running_variance = variance.get_ptr(); + mean_data = running_mean->data(); + inv_var_tensor.Resize({C}); + + T* running_inv_var_data = ctx.template Alloc(&inv_var_tensor); + EigenVectorArrayMap inv_var_tmp(running_inv_var_data, C); + ConstEigenVectorArrayMap var_arr(running_variance->data(), C); + + inv_var_tmp = (var_arr + epsilon).sqrt().inverse(); + inv_var_data = running_inv_var_data; + } + + // transpose NCHW -> NHWC for easy calculate + DenseTensor transformed_x(X->type()); + DenseTensor transformed_dy(dY->type()); + DenseTensor transformed_ddx(ddX->type()); + + DenseTensor transformed_dx(dX->type()); + DenseTensor transformed_ddy(ddY->type()); + if (data_layout == DataLayout::kNCHW && x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + // Input Tensor + ResizeToChannelLast(ctx, X, &transformed_x); + TransToChannelLast(ctx, X, &transformed_x); + ResizeToChannelLast(ctx, dY, &transformed_dy); + TransToChannelLast(ctx, dY, &transformed_dy); + ResizeToChannelLast(ctx, ddX, &transformed_ddx); + TransToChannelLast(ctx, ddX, &transformed_ddx); + // Output Tensor + ResizeToChannelLast(ctx, dX, &transformed_dx); + ResizeToChannelLast(ctx, ddY, &transformed_ddy); + } else { + transformed_x.ShareDataWith(*X); + transformed_dy.ShareDataWith(*dY); + transformed_ddx.ShareDataWith(*ddX); + + transformed_dx.ShareDataWith(*dX); + transformed_ddy.ShareDataWith(*ddY); + } + + ConstEigenArrayMap x_arr(transformed_x.data(), C, sample_size); + ConstEigenVectorArrayMap mean_arr(mean_data, C); + ConstEigenVectorArrayMap inv_var_arr(inv_var_data, C); + + Tensor mean_tile; + mean_tile.Resize({C, sample_size}); + EigenArrayMap mean_tile_data( + ctx.template Alloc(&mean_tile), C, sample_size); + + DenseTensor inv_var_tile; + inv_var_tile.Resize({C, sample_size}); + EigenArrayMap inv_var_tile_data( + ctx.template Alloc(&inv_var_tile), C, sample_size); + + mean_tile_data = mean_arr.replicate(1, sample_size); + inv_var_tile_data = inv_var_arr.replicate(1, sample_size); + + DenseTensor Scale_data; + if (!Scale) { + Scale_data.Resize({C}); + ctx.template Alloc(&Scale_data); + set_constant(ctx, &Scale_data, static_cast(1)); + } + ConstEigenVectorArrayMap scale_arr( + Scale ? 
Scale->data() : Scale_data.data(), C); + + Tensor scale_tile; + scale_tile.Resize({C, sample_size}); + EigenArrayMap scale_tile_data( + ctx.template Alloc(&scale_tile), C, sample_size); + scale_tile_data = scale_arr.replicate(1, sample_size); + + ConstEigenArrayMap dy_arr(transformed_dy.data(), C, sample_size); + ConstEigenArrayMap ddx_arr(transformed_ddx.data(), C, sample_size); + + DenseTensor x_sub_mean_mul_invstd; + x_sub_mean_mul_invstd.Resize({C, sample_size}); + + EigenArrayMap x_sub_mean_mul_invstd_arr( + ctx.template Alloc(&x_sub_mean_mul_invstd), C, sample_size); + x_sub_mean_mul_invstd_arr = (x_arr - mean_tile_data) * inv_var_tile_data; + + if (dX) { + ctx.template Alloc(dX); + EigenArrayMap dx_arr( + ctx.template Alloc(&transformed_dx), C, sample_size); + dx_arr.setZero(); + if (use_global_stats) { + // math: dx = (ddscale * dy) * inv_var + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr = dy_arr * ddscale_tile_data * inv_var_tile_data; + } + } else { + // math: dx = scale * ((x - mean) * inv_var / NxHxW * (np.mean(ddx, + // axis=(n,h,w)) * + // np.sum(dy, axis=(n,h,w)) - + // np.sum(dy * ddx, axis=(n,h,w)) + 3 * np.mean(dy * (x - + // mean), + // axis=(n,h,w)) * inv_var.pow(2) * + // np.sum(ddx * (x - mean), axis=(n,h,w))) + inv_var.pow(3) / + // NxHxW * + // np.sum(ddx * (x - mean)) * + // (np.mean(dy, axis=(n,h,w)) - dy) + inv_var.pow(3) / NxHxW * + // np.sum(dy, + // axis=(n,h,w)) * (x - mean) * + // (np.mean(ddx, axis=(n,h,w)) - ddx)) + ddr * (dy * inv_var - + // inv_var + // * + // np.mean(dy, axis=(n,h,w)) - + // inv_var.pow(3) * (x - mean) * np.mean(dy * (x - mean), + // axis=(n,h,w))) + + if (ddX) { + dx_arr += + (x_sub_mean_mul_invstd_arr * inv_var_tile_data * inv_var_tile_data / + sample_size) + .colwise() * + (ddx_arr.rowwise().sum() * dy_arr.rowwise().sum() / sample_size - + (dy_arr * ddx_arr).rowwise().sum() + + 3. 
* (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (ddx_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * (dy_arr.rowwise().sum() / sample_size - dy_arr); + + dx_arr += (inv_var_tile_data * inv_var_tile_data).colwise() * + (dy_arr * x_sub_mean_mul_invstd_arr).rowwise().sum() / + sample_size * + (ddx_arr.rowwise().sum() / sample_size - ddx_arr); + + dx_arr = scale_tile_data * dx_arr; + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + dx_arr += + (dy_arr * inv_var_tile_data - + (dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size) * + inv_var_tile_data - + x_sub_mean_mul_invstd_arr * inv_var_tile_data * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size) * + ddscale_tile_data; + } + } + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst(ctx, &transformed_dx, dX); + } + } + if (dScale) { + EigenVectorArrayMap dscale_arr(ctx.template Alloc(dScale), C); + dscale_arr.setZero(); + if (use_global_stats) { + // math: dscale = np.sum(ddx * dy, axis=(n,h,w)) * inv_var + if (ddX) { + dscale_arr = (ddx_arr * dy_arr * inv_var_tile_data).rowwise().sum(); + } + } else { + // math: dscale = inv_var * (dy - np.mean(dy, axis=(n,h,w) - (x-mean) * + // inv_var.pow(2) * np.mean(dy * (x-mean), axis=(n,h,w)))) * + // ddx + if (ddX) { + Tensor first_grad; + first_grad.Resize({C, sample_size}); + EigenArrayMap first_grad_arr( + ctx.template Alloc(&first_grad), C, sample_size); + first_grad_arr.setZero(); + + first_grad_arr += + inv_var_tile_data * + (dy_arr - + dy_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (dy_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + dscale_arr = (first_grad_arr * ddx_arr).rowwise().sum(); + } + } + } + + if (ddY) { + ctx.template Alloc(ddY); + EigenArrayMap ddy_arr( + ctx.template Alloc(&transformed_ddy), C, sample_size); + ddy_arr.setZero(); + if (use_global_stats) { + // math: ddy = r * ddx * inv_var + ddbias + + // ddscale * (x - mean) * inv_var + if (ddX) { + ddy_arr = scale_tile_data * ddx_arr * inv_var_tile_data; + } + } else { + // math: ddy = (x - mean) * inv_var * ddscale + ddbias + + // scale * inv_var * (ddx - (x - mean) * inv_var.pow(2) * + // np.mean(ddx * (x - mean), axis=(n,h,w))) + if (ddX) { + ddy_arr += + scale_tile_data * inv_var_tile_data * + (ddx_arr - + ddx_arr.rowwise().sum().replicate(1, sample_size) / sample_size - + x_sub_mean_mul_invstd_arr * + (ddx_arr * x_sub_mean_mul_invstd_arr) + .rowwise() + .sum() + .replicate(1, sample_size) / + sample_size); + } + } + if (ddScale) { + ConstEigenVectorArrayMap ddscale_arr(ddScale->data(), C); + Tensor ddscale_tile; + ddscale_tile.Resize({C, sample_size}); + EigenArrayMap ddscale_tile_data( + ctx.template Alloc(&ddscale_tile), C, sample_size); + ddscale_tile_data = ddscale_arr.replicate(1, sample_size); + + ddy_arr += x_sub_mean_mul_invstd_arr * ddscale_tile_data; + } + + if (ddBias) { + ConstEigenVectorArrayMap ddbias_arr(ddBias->data(), C); + Tensor ddbias_tile; + 
ddbias_tile.Resize({C, sample_size}); + EigenArrayMap ddbias_tile_data( + ctx.template Alloc(&ddbias_tile), C, sample_size); + ddbias_tile_data = ddbias_arr.replicate(1, sample_size); + + ddy_arr += ddbias_tile_data; + } + + if (data_layout == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NHWC to NCHW"; + TransToChannelFirst(ctx, &transformed_ddy, ddY); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm_grad, CPU, ALL_LAYOUT, phi::BatchNormGradKernel, float, double) { +} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + CPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL(batch_norm_grad_grad, + CPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/batch_norm_kernel.cc b/paddle/phi/kernels/cpu/batch_norm_kernel.cc new file mode 100644 index 0000000000000..743128e8dea99 --- /dev/null +++ b/paddle/phi/kernels/cpu/batch_norm_kernel.cc @@ -0,0 +1,204 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/framework/tensor_util.h" + +namespace phi { + +template +using EigenArrayMap = + Eigen::Map>; +template +using ConstEigenArrayMap = + Eigen::Map>; +template +using EigenVectorArrayMap = Eigen::Map>; +template +using ConstEigenVectorArrayMap = + Eigen::Map>; + +template +void BatchNormKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { + bool test_mode = is_test && (!trainable_statistics); + + bool global_stats = test_mode || use_global_stats; + + auto data_layout = paddle::framework::StringToDataLayout(data_layout_str); + + const auto& x_dims = x.dims(); + PADDLE_ENFORCE_GE( + x_dims.size(), + 2, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be larger than 1." + "But received: the size of input X's dimensions is [%d]", + x_dims.size())); + PADDLE_ENFORCE_LE( + x_dims.size(), + 5, + phi::errors::InvalidArgument( + "The size of input X's dimensions should be less than 6." + "But received: the size of input X's dimensionss is [%d]", + x_dims.size())); + const int N = x_dims[0]; + const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] + : x_dims[x_dims.size() - 1]); + const int sample_size = x.numel() / N / C; + + // alloc memory + ctx.template Alloc(y); + ctx.template Alloc(mean_out); + ctx.template Alloc(variance_out); + ctx.template Alloc(saved_mean); + ctx.template Alloc(saved_variance); + + // input dimension is 2 and the format is NCHW. The input can be regarded + // as NHWC format + if (x_dims.size() == 2 && data_layout == DataLayout::kNCHW) { + data_layout = DataLayout::kNHWC; + } + + if (!global_stats) { + // saved_xx is use just in this batch of data + EigenVectorArrayMap saved_mean_e(ctx.template Alloc(saved_mean), C); + EigenVectorArrayMap saved_variance_e( + ctx.template Alloc(saved_variance), C); + saved_mean_e.setZero(); + saved_variance_e.setZero(); + + EigenVectorArrayMap running_mean_arr(ctx.template Alloc(mean_out), C); + EigenVectorArrayMap running_var_arr(ctx.template Alloc(variance_out), + C); + + if ((N * sample_size) == 1) { + // Only 1 element in normalization dimension, + // we skip the batch norm calculation, let y = x. + paddle::framework::TensorCopy(x, ctx.GetPlace(), y); + return; + } + + switch (data_layout) { + case DataLayout::kNCHW: { + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + saved_mean_e(nc % C) += x_arr.col(nc).sum(); + } + saved_mean_e /= N * sample_size; + for (int nc = 0; nc < N * C; ++nc) { + saved_variance_e(nc % C) += + (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm(); + } + saved_variance_e /= N * sample_size; + break; + } + case DataLayout::kNHWC: { + ConstEigenArrayMap x_arr(x.data(), C, N * sample_size); + for (int i = 0; i < N * sample_size; ++i) { + saved_mean_e += x_arr.col(i); + } + saved_mean_e /= N * sample_size; + for (int i = 0; i < N * sample_size; ++i) { + saved_variance_e += + (x_arr.col(i) - saved_mean_e) * (x_arr.col(i) - saved_mean_e); + } + saved_variance_e /= N * sample_size; + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %s", + data_layout_str)); + } + + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used in this training branch + + running_mean_arr = + running_mean_arr * momentum + saved_mean_e * (1. - momentum); + running_var_arr = + running_var_arr * momentum + saved_variance_e * (1. - momentum); + } + + // use SavedMean and SavedVariance to do normalize + Eigen::Array inv_std(C); + if (global_stats) { + ConstEigenVectorArrayMap var_arr(variance.data(), C); + inv_std = (var_arr + epsilon).sqrt().inverse(); + } else { + EigenVectorArrayMap saved_inv_std(saved_variance->data(), C); + // inverse SavedVariance first, gradient will use it too. + saved_inv_std = (saved_inv_std + epsilon).inverse().sqrt(); + inv_std = saved_inv_std; + } + ConstEigenVectorArrayMap mean_arr( + global_stats ? 
mean.data() : saved_mean->data(), C); + + // ((x - est_mean) * (inv_var) * scale + bias + // formula transform ====> + // (x * inv_var * scale) + (bias - est_mean * inv_var * scale) + ConstEigenVectorArrayMap scale_arr(scale.data(), C); + ConstEigenVectorArrayMap bias_arr(bias.data(), C); + Eigen::Array new_scale = inv_std * scale_arr; + Eigen::Array new_bias = + bias_arr - mean_arr * inv_std * scale_arr; + + switch (data_layout) { + case DataLayout::kNCHW: { + EigenArrayMap y_arr(ctx.template Alloc(y), sample_size, N * C); + ConstEigenArrayMap x_arr(x.data(), sample_size, N * C); + for (int nc = 0; nc < N * C; ++nc) { + y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C); + } + break; + } + case DataLayout::kNHWC: { + EigenArrayMap(ctx.template Alloc(y), C, N * sample_size) = + (ConstEigenArrayMap(x.data(), C, N * sample_size).colwise() * + new_scale) + .colwise() + + new_bias; + break; + } + default: + PADDLE_THROW(phi::errors::InvalidArgument("Unknown storage order: %d", + data_layout)); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + batch_norm, CPU, ALL_LAYOUT, phi::BatchNormKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu new file mode 100644 index 0000000000000..2c9ee5ede0103 --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_grad_kernel.cu @@ -0,0 +1,1038 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
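// A minimal standalone sketch of the per-channel affine fold used by the CPU
// forward kernel registered above: normalization is rewritten as
//   y = x * new_scale + new_bias,
//   new_scale = scale / sqrt(var + epsilon),
//   new_bias  = bias - mean * new_scale,
// and the running statistics are blended with momentum. std::vector and the
// literal values are illustrative stand-ins, not the kernel's real inputs.
#include <cmath>
#include <cstdio>
#include <vector>

void BatchNormChannel(const std::vector<float>& x, float mean, float var,
                      float scale, float bias, float eps,
                      std::vector<float>* y) {
  const float new_scale = scale / std::sqrt(var + eps);
  const float new_bias = bias - mean * new_scale;
  y->resize(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    (*y)[i] = x[i] * new_scale + new_bias;
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f}, y;
  // Batch statistics of x: mean 2.5, biased variance 1.25.
  BatchNormChannel(x, 2.5f, 1.25f, /*scale=*/1.f, /*bias=*/0.f, 1e-5f, &y);
  // Running-statistics update as in the kernel:
  //   running = running * momentum + batch_stat * (1 - momentum)
  const float momentum = 0.9f;
  const float running_mean = 0.f * momentum + 2.5f * (1.f - momentum);
  std::printf("y[0]=%f running_mean=%f\n", y[0], running_mean);
  return 0;
}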
+ +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" + +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" + +#include "paddle/fluid/operators/norm_utils.cu.h" +#include "paddle/fluid/operators/norm_utils.h" + +#include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/operators/layout_utils.h" +#include "paddle/fluid/platform/enforce.h" + +#include "paddle/fluid/platform/flags.h" +#include "paddle/phi/kernels/gpu/batch_norm_utils.h" + +#ifdef __HIPCC__ +#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) +#else +#define LAUNCH_BOUNDS(BlockDim) +#endif + +DECLARE_bool(cudnn_batchnorm_spatial_persistent); +namespace phi { + +template +using CudnnDataType = paddle::platform::CudnnDataType; +template +using BatchNormParamType = typename CudnnDataType::BatchNormParamType; + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void KeBNBackwardScaleBias( + const T *dy, + const T *x, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + const double epsilon, + const int N, + const int C, + const int HxW, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + BatchNormParamType inv_var_i = 1.0 / sqrt(variance[i] + epsilon); + BatchNormParamType mean_i = mean[i]; + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + ds_sum += static_cast>(dy[index]) * + (static_cast>(x[index]) - mean_i); + db_sum += static_cast>(dy[index]); + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale[i] = ds_sum * inv_var_i; + dbias[i] = db_sum; + } + __syncthreads(); + } +} + +template +static __global__ void KeBNBackwardData(const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *variance, + const double epsilon, + const int C, + const int HxW, + const int num, + T *dx) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C; + BatchNormParamType inv_var = 1.0 / sqrt(variance[c] + epsilon); + dx[i] = static_cast(static_cast>(dy[i]) * + scale[c] * inv_var); + } +} + +template +static __global__ void KeBNRestoreData(const phi::DataLayout layout, + T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + double epsilon, + int C, + int M, + const int num, + const T *y) { + int gid = blockIdx.x * blockDim.x + threadIdx.x; + int stride = blockDim.x * gridDim.x; + for (int i = gid; i < num; i += stride) { + const int c = layout == phi::DataLayout::kNCHW ? 
(i / M) % C : i % C; + auto y_i = static_cast>(y[i]); + auto x_i = (y_i - bias[c]) / scale[c] / variance[c] + mean[c]; + x[i] = static_cast(x_i); + } +} + +template +class InplaceHelper { + public: + void operator()(const phi::DataLayout layout, + T *x, + const BatchNormParamType *scale, + const BatchNormParamType *bias, + const BatchNormParamType *mean, + const BatchNormParamType *variance, + double epsilon, + int C, + int M, + const int num, + const T *y, + int grid2, + const int block, + const gpuStream_t &stream) { + PADDLE_ENFORCE_EQ(x, + y, + phi::errors::InvalidArgument( + "X and Y should be inplaced in inplace mode")); + KeBNRestoreData<<>>( + layout, x, scale, bias, mean, variance, epsilon, C, M, num, y); + } +}; + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackward( + const T *dy, + const T *x, + const BatchNormParamType *scale, + const BatchNormParamType *saved_mean, + const BatchNormParamType *saved_inv_variance, + const int C, + const int N, + const int HxW, + const double epsilon, + T *dx, + BatchNormParamType *dscale, + BatchNormParamType *dbias) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage ds_storage; + __shared__ typename BlockReduce::TempStorage db_storage; + __shared__ typename BlockReduce::TempStorage mean_storage; + __shared__ typename BlockReduce::TempStorage variance_storeage; + __shared__ BatchNormParamType inv_var_val; + __shared__ BatchNormParamType mean_val; + __shared__ BatchNormParamType dscale_val; + __shared__ BatchNormParamType dbias_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType ds_sum = static_cast>(0); + BatchNormParamType db_sum = static_cast>(0); + + if (saved_mean && saved_inv_variance) { + if (threadIdx.x == 0) { + inv_var_val = saved_inv_variance[i]; + mean_val = saved_mean[i]; + } + } else { + BatchNormParamType x_sum = static_cast>(0); + BatchNormParamType x_square_sum = + static_cast>(0); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = + static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + inv_var_val = + 1 / sqrt(x_square_sum / inner_size - mean_val * mean_val + epsilon); + } + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + ds_sum += + dy_i * (static_cast>(x[index]) - mean_val); + db_sum += dy_i; + } + ds_sum = BlockReduce(ds_storage).Reduce(ds_sum, cub::Sum()); + db_sum = BlockReduce(db_storage).Reduce(db_sum, cub::Sum()); + if (threadIdx.x == 0) { + dscale_val = ds_sum * inv_var_val; + dbias_val = db_sum; + dscale[i] = dscale_val; + dbias[i] = dbias_val; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? 
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = scale[i] * inv_var_val * + (static_cast>(dy[index]) - + dbias_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_val) * + inv_var_val * dscale_val / inner_size); + } + } +} + +template +static __global__ LAUNCH_BOUNDS(BlockDim) void BNBackwardData( + const T *dy, + const BatchNormParamType *scale, + const BatchNormParamType *mean, + const T *x, + const BatchNormParamType *variance, + const int C, + const int N, + const int HxW, + T *dx) { + const int outer_size = C; + const int inner_size = N * HxW; + typedef cub::BlockReduce, BlockDim> BlockReduce; + __shared__ typename BlockReduce::TempStorage dy_storage; + __shared__ typename BlockReduce::TempStorage dy_x_sub_mean_storage; + __shared__ BatchNormParamType dy_sum_val; + __shared__ BatchNormParamType dy_x_sub_mean_sum_val; + + for (int i = blockIdx.x; i < outer_size; i += gridDim.x) { + BatchNormParamType inv_var_i = variance[i]; + BatchNormParamType mean_i = mean[i]; + BatchNormParamType dy_sum = static_cast>(0); + BatchNormParamType dy_x_sub_mean_sum = + static_cast>(0); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType dy_i = + static_cast>(dy[index]); + dy_sum += dy_i; + dy_x_sub_mean_sum += + dy_i * (static_cast>(x[index]) - mean_i); + } + + dy_sum = BlockReduce(dy_storage).Reduce(dy_sum, cub::Sum()); + dy_x_sub_mean_sum = BlockReduce(dy_x_sub_mean_storage) + .Reduce(dy_x_sub_mean_sum, cub::Sum()); + + if (threadIdx.x == 0) { + dy_sum_val = dy_sum; + dy_x_sub_mean_sum_val = dy_x_sub_mean_sum; + } + __syncthreads(); + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + dx[index] = + (static_cast>(dy[index]) - + dy_sum_val / static_cast>(inner_size) - + (static_cast>(x[index]) - mean_i) * + dy_x_sub_mean_sum_val * inv_var_i * inv_var_i / inner_size) * + scale[i] * inv_var_i; + } + } +} + +template +void BatchNormGradRawKernel(const Context &ctx, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + bool is_inplace, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + double epsilon = static_cast(epsilon_f); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const auto *d_y = &y_grad; + + auto *d_x = x_grad; + auto *d_scale = scale_grad; + auto *d_bias = bias_grad; + + use_global_stats = is_test || use_global_stats; + + const auto &x_dims = x.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5." 
+ "But received: the size of input's dimensions is [%d]," + "the dimensions of input is [%s]", + x_dims.size(), + x_dims)); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + // init output + if (d_x) { + ctx.template Alloc(d_x); + } + + if (d_scale && d_bias) { + d_scale->mutable_data>(ctx.GetPlace()); + d_bias->mutable_data>(ctx.GetPlace()); + } + + PADDLE_ENFORCE_EQ( + scale.dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of scale's dimensions must equal to 1. But received: " + "the size of scale's dimensions is [%d], the dimensions of scale " + "is [%s].", + scale.dims().size(), + scale.dims())); + PADDLE_ENFORCE_EQ( + scale.dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of scale must equal to Channels[%d]. But " + "received: the first dimension of scale is [%d]", + C, + scale.dims()[0])); + + auto dtype = paddle::platform::CudnnDataType::type; +#ifdef PADDLE_WITH_HIP + auto compute_format = + data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; +#else + const bool fast_nhwc_batch_norm = dtype == CUDNN_DATA_HALF && + FLAGS_cudnn_batchnorm_spatial_persistent && + (reserve_space.get_ptr() != nullptr); + auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC + ? DataLayout::kNHWC + : DataLayout::kNCHW; +#endif + + DenseTensor transformed_x(x.type()); + DenseTensor transformed_d_y(d_y->type()); + DenseTensor transformed_d_x; + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &x, &transformed_x); + TransToChannelFirst(ctx, &x, &transformed_x); + ResizeToChannelFirst(ctx, d_y, &transformed_d_y); + TransToChannelFirst(ctx, d_y, &transformed_d_y); + if (d_x) { + ResizeToChannelFirst(ctx, d_x, &transformed_d_x); + } + } else { + transformed_x.ShareDataWith(x); + transformed_d_y.ShareDataWith(*d_y); + if (d_x) { + transformed_d_x.ShareDataWith(*d_x); + } + } + + std::vector dims; + std::vector strides; + if (compute_format == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * C * D, 1, W * D * C, D * C, C}; + } + + const int num = transformed_x.numel(); +#ifdef HIPCC + const int block = 256; +#else + const int block = 512; +#endif + int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + int grid1 = (num + block - 1) / block; + int grid2 = std::min(C, max_blocks); + auto stream = ctx.stream(); + InplaceHelper inplace_functor; + + if (!use_global_stats) { + if ((N * H * W * D) == 1) { + if (d_x) { + paddle::framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); + } + phi::funcs::SetConstant> functor; + functor(ctx, d_scale, static_cast>(0)); + functor(ctx, d_bias, static_cast>(0)); + return; + } + +// ------------------- cudnn descriptors --------------------- +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// 
platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor( + &bn_param_desc_)); +#endif + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; +#elif CUDNN_VERSION_MIN(7, 0, 1) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#endif // CUDNN_VERSION_MIN(7, 0, 1) + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor(bn_param_desc_, +// data_desc_, mode_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? 
x_dims.size() : 4, + dims.data(), + strides.data())); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, data_desc_, mode_)); +#endif + + const auto *saved_mean_data = + saved_mean.template data>(); + const auto *saved_var_data = + saved_variance.template data>(); + + if (is_inplace) { + inplace_functor(compute_format, + transformed_x.data(), + scale.template data>(), + bias.template data>(), + saved_mean_data, + saved_var_data, + epsilon, + C, + H * W * D, + num, + transformed_x.data(), + grid2, + block, + stream); + } + + // This branch calls CUDNN APIs + if (d_x && d_scale && d_bias) { + bool called = false; +#if CUDNN_VERSION_MIN(7, 4, 1) + called = true; + size_t workspace_size = 0; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + auto reserve_space_size = reserve_space->memory_size(); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationBackwardExWorkspaceSize( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*yDesc=*/data_desc_, + /*dyDesc=*/data_desc_, + /*dzDesc=*/nullptr, + /*dxDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + workspace_ptr = workspace_tensor.mutable_data( + ctx.GetPlace(), transformed_x.type(), workspace_size); + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackwardEx( + /*handle=*/ctx.cudnn_handle(), + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*alphaDataDiff=*/CudnnDataType::kOne(), + /*betaDataDiff=*/CudnnDataType::kZero(), + /*alphaParamDiff=*/CudnnDataType::kOne(), + /*betaParamDiff=*/CudnnDataType::kZero(), + /*xDesc=*/data_desc_, + /*xData=*/transformed_x.template data(), + /*yDesc=*/nullptr, + /*yData=*/nullptr, + /*dyDesc=*/data_desc_, + /*dyData=*/transformed_d_y.template data(), + /*dzDesc=*/nullptr, + /*dzData=*/nullptr, + /*dxDesc=*/data_desc_, + /*dxData=*/ctx.template Alloc(&transformed_d_x), + /*dBnScaleBiasDesc=*/bn_param_desc_, + /*bnScaleData=*/scale.template data>(), + /*bnBiasData=*/nullptr, + /*dBnScaleData=*/d_scale + ->template mutable_data>( + ctx.GetPlace()), + /*dBnBiasData=*/d_bias + ->template mutable_data>( + ctx.GetPlace()), + /*epsilon=*/epsilon, + /*savedMean=*/saved_mean_data, + /*savedInvVariance=*/saved_var_data, + /*activationDesc=*/nullptr, + /*workspace=*/workspace_ptr, + /*workSpaceSizeInBytes=*/workspace_size, + /*reserveSpace=*/const_cast( + reserve_space->template data()), + /*reserveSpaceSizeInBytes=*/reserve_space_size)); +#endif // CUDNN_VERSION_MIN(7, 4, 1) + if (!called) { +#ifdef PADDLE_WITH_HIP + if (compute_format == DataLayout::kNCHW) { + BNBackward<<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } else { + BNBackward<<>>( + transformed_d_y.template data(), + transformed_x.template data(), + scale.template data>(), + saved_mean_data, + saved_var_data, + C, + N, + H * W * D, + epsilon, + transformed_d_x.template data(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace())); + } + +// TODO(wangran16): wait for MIOpen to improve the 
performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationBackward( +// dev_ctx.cudnn_handle(), mode_, CudnnDataType::kOne(), +// CudnnDataType::kZero(), CudnnDataType::kOne(), +// CudnnDataType::kZero(), data_desc_, +// transformed_x.template data(), data_desc_, +// transformed_d_y.template data(), data_desc_, +// transformed_d_x.template mutable_data(ctx.GetPlace()), +// bn_param_desc_, scale->template data>(), +// d_scale->template mutable_data>( +// ctx.GetPlace()), +// d_bias->template mutable_data>( +// ctx.GetPlace()), +// epsilon, saved_mean_data, saved_var_data)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationBackward( + ctx.cudnn_handle(), + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + transformed_d_y.template data(), + data_desc_, + ctx.template Alloc(&transformed_d_x), + bn_param_desc_, + scale.template data>(), + d_scale->template mutable_data>( + ctx.GetPlace()), + d_bias->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean_data, + saved_var_data)); +#endif + } + + if (data_layout == DataLayout::kNHWC && + compute_format == DataLayout::kNCHW) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + TransToChannelLast(ctx, &transformed_d_x, d_x); + } + } else { + // This branch call CUDA kernels + if (compute_format == DataLayout::kNCHW) { + if (d_x) { + BNBackwardData< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + scale.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + N, + H * W * D, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + x.data(), + saved_mean_data, + saved_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + BNBackwardData< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + scale.data>(), + saved_mean_data, + x.data(), + saved_var_data, + C, + N, + H * W * D, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + x.data(), + saved_mean_data, + saved_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } + } + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#else + // clean when exit. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor( + bn_param_desc_)); +#endif + } else { + const auto *running_mean = mean.get_ptr(); + const auto *running_var = variance.get_ptr(); + + const auto *running_mean_data = + running_mean->template data>(); + const auto *running_var_data = + running_var->template data>(); + + if (is_inplace) { + auto px = x; + inplace_functor(data_layout, + ctx.template Alloc(&px), + scale.template data>(), + bias.template data>(), + running_mean_data, + running_var_data, + epsilon, + C, + H * W * D, + num, + x.data(), + grid2, + block, + stream); + } + + if (compute_format == DataLayout::kNCHW) { + if (d_x) { + KeBNBackwardData<<>>( + d_y->data(), + scale.data>(), + running_var_data, + epsilon, + C, + H * W, + num, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNCHW><<>>( + d_y->data(), + x.data(), + running_mean_data, + running_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } else { + if (d_x) { + KeBNBackwardData<<>>( + d_y->data(), + scale.data>(), + running_var_data, + epsilon, + C, + H * W, + num, + d_x->data()); + } + if (d_scale && d_bias) { + KeBNBackwardScaleBias< + T, + block, + phi::DataLayout::kNHWC><<>>( + d_y->data(), + x.data(), + running_mean_data, + running_var_data, + epsilon, + N, + C, + H * W * D, + d_scale->data>(), + d_bias->data>()); + } + } + } +} + +template +void BatchNormGradKernel(const Context &dev_ctx, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional reserve_space, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string &data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *bias_grad) { + BatchNormGradRawKernel(dev_ctx, + y_grad, + x, + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + false, + x_grad, + scale_grad, + bias_grad); +} + +template +void BatchNormDoubleGradKernel(const Context &ctx, + const DenseTensor &x_grad_grad, + const DenseTensor &scale_grad_grad, + const DenseTensor &bias_grad_grad, + const DenseTensor &y_grad, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &saved_mean, + const DenseTensor &saved_variance, + paddle::optional mean, + paddle::optional variance, + float momentum, + float epsilon, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *x_grad, + DenseTensor *scale_grad, + DenseTensor *y_grad_grad) { + PADDLE_ENFORCE_EQ(is_test, + false, + phi::errors::InvalidArgument( + "`is_test = True` CANNOT be used in train program. 
If " + "you want to use global status in pre_train model, " + "please set `use_global_stats = True`")); + + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + + const DenseTensor *running_mean = nullptr; + const DenseTensor *running_variance = nullptr; + if (use_global_stats) { + running_mean = mean.get_ptr(); + running_variance = variance.get_ptr(); + } + paddle::operators::NormDoubleGradFunctor(ctx, + data_layout, + &x, + &scale, + &y_grad, + &saved_mean, + &saved_variance, + running_mean, + running_variance, + epsilon, + use_global_stats, + &x_grad_grad, + &scale_grad_grad, + &bias_grad_grad, + x_grad, + scale_grad, + y_grad_grad); +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + phi::dtype::float16) {} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(batch_norm_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormGradKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +PD_REGISTER_KERNEL(batch_norm_grad_raw, + GPU, + ALL_LAYOUT, + phi::BatchNormGradRawKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +#endif + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm_grad_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} + +#else +PD_REGISTER_KERNEL(batch_norm_grad_grad, + GPU, + ALL_LAYOUT, + phi::BatchNormDoubleGradKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/gpu/batch_norm_kernel.cu b/paddle/phi/kernels/gpu/batch_norm_kernel.cu new file mode 100644 index 0000000000000..6ad12245d2a45 --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_kernel.cu @@ -0,0 +1,680 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
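+
+// GPU forward batch_norm kernel for the phi library. The inference path calls
+// cudnnBatchNormalizationForwardInference (or the raw BNForwardInference CUDA
+// kernel on HIP builds); the training path prefers the cuDNN >= 7.4.1 "Ex"
+// API with a reserve space, and otherwise falls back to
+// cudnnBatchNormalizationForwardTraining or the BNForwardTraining kernel.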
+
+#ifdef __NVCC__
+#include "cub/cub.cuh"
+#endif
+#ifdef __HIPCC__
+#include <hipcub/hipcub.hpp>
+namespace cub = hipcub;
+#endif
+
+#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/batch_norm_kernel.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
+
+#include "paddle/fluid/operators/norm_utils.cu.h"
+#include "paddle/fluid/operators/norm_utils.h"
+
+#include "paddle/fluid/framework/data_layout.h"
+#include "paddle/fluid/operators/layout_utils.h"
+#include "paddle/fluid/platform/enforce.h"
+
+#include "paddle/fluid/platform/flags.h"
+#include "paddle/phi/kernels/gpu/batch_norm_utils.h"
+
+#ifdef __HIPCC__
+#define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim)
+#else
+#define LAUNCH_BOUNDS(BlockDim)
+#endif
+
+DECLARE_bool(cudnn_batchnorm_spatial_persistent);
+
+namespace phi {
+
+template <typename T>
+using CudnnDataType = paddle::platform::CudnnDataType<T>;
+template <typename T>
+using BatchNormParamType = typename CudnnDataType<T>::BatchNormParamType;
+
+template <typename T, phi::DataLayout layout>
+static __global__ void BNForwardInference(const T *x,
+                                          const BatchNormParamType<T> *mean,
+                                          const BatchNormParamType<T> *variance,
+                                          const BatchNormParamType<T> *scale,
+                                          const BatchNormParamType<T> *bias,
+                                          const int C,
+                                          const int N,
+                                          const int HxW,
+                                          const double epsilon,
+                                          T *y) {
+  int gid = blockIdx.x * blockDim.x + threadIdx.x;
+  int stride = blockDim.x * gridDim.x;
+  int num = N * C * HxW;
+  for (int i = gid; i < num; i += stride) {
+    const int c = layout == phi::DataLayout::kNCHW ? i / HxW % C : i % C;
+    BatchNormParamType<T> x_sub_mean =
+        static_cast<BatchNormParamType<T>>(x[i]) - mean[c];
+    BatchNormParamType<T> inv_var = 1 / sqrt(variance[c] + epsilon);
+    y[i] = static_cast<T>(scale[c] * x_sub_mean * inv_var + bias[c]);
+  }
+}
+
+template <typename T, int BlockDim, phi::DataLayout layout>
+static __global__ LAUNCH_BOUNDS(BlockDim) void BNForwardTraining(
+    const T *x,
+    const BatchNormParamType<T> *scale,
+    const BatchNormParamType<T> *bias,
+    const int C,
+    const int N,
+    const int HxW,
+    const double epsilon,
+    double exponentialAverageFactor,
+    T *y,
+    BatchNormParamType<T> *mean,
+    BatchNormParamType<T> *variance,
+    BatchNormParamType<T> *save_mean,
+    BatchNormParamType<T> *save_inv_variance) {
+  int outer_size = C;
+  int inner_size = N * HxW;
+  typedef cub::BlockReduce<BatchNormParamType<T>, BlockDim> BlockReduce;
+  __shared__ typename BlockReduce::TempStorage mean_storage;
+  __shared__ typename BlockReduce::TempStorage variance_storeage;
+  __shared__ BatchNormParamType<T> mean_val;
+  __shared__ BatchNormParamType<T> variance_val;
+  __shared__ BatchNormParamType<T> inv_var_val;
+
+  for (int i = blockIdx.x; i < outer_size; i += gridDim.x) {
+    BatchNormParamType<T> x_sum = static_cast<BatchNormParamType<T>>(0);
+    BatchNormParamType<T> x_square_sum = static_cast<BatchNormParamType<T>>(0);
+
+    for (int j = threadIdx.x; j < inner_size; j += blockDim.x) {
+      const int index = layout == phi::DataLayout::kNCHW
+                            ?
(j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_i = static_cast>(x[index]); + x_sum += x_i; + x_square_sum += x_i * x_i; + } + x_sum = BlockReduce(mean_storage).Reduce(x_sum, cub::Sum()); + x_square_sum = + BlockReduce(variance_storeage).Reduce(x_square_sum, cub::Sum()); + if (threadIdx.x == 0) { + mean_val = x_sum / inner_size; + variance_val = x_square_sum / inner_size - mean_val * mean_val; + inv_var_val = 1 / sqrt(variance_val + epsilon); + + if (save_mean && save_inv_variance) { + save_mean[i] = mean_val; + save_inv_variance[i] = inv_var_val; + } + mean[i] = (1 - exponentialAverageFactor) * mean_val + + exponentialAverageFactor * mean[i]; + variance[i] = (1 - exponentialAverageFactor) * variance_val + + exponentialAverageFactor * variance[i]; + } + __syncthreads(); + + for (int j = threadIdx.x; j < inner_size; j += blockDim.x) { + const int index = layout == phi::DataLayout::kNCHW + ? (j / HxW * C + i) * HxW + j % HxW + : j * outer_size + i; + BatchNormParamType x_sub_mean = + static_cast>(x[index]) - mean_val; + y[index] = scale[i] * x_sub_mean * inv_var_val + bias[i]; + } + } +} + +template +void BatchNormKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &scale, + const DenseTensor &bias, + const DenseTensor &mean, + const DenseTensor &variance, + float momentum, + float epsilon_f, + const std::string &data_layout_str, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + DenseTensor *y, + DenseTensor *mean_out, + DenseTensor *variance_out, + DenseTensor *saved_mean, + DenseTensor *saved_variance, + DenseTensor *reserve_space) { + double epsilon = epsilon_f; + const bool trainable_stats = trainable_statistics; + const DataLayout data_layout = + paddle::framework::StringToDataLayout(data_layout_str); + bool test_mode = is_test && (!trainable_stats); + + // Get the size for each dimension. + // NCHW [batch_size, in_channels, in_height, in_width] + const auto &x_dims = x.dims(); + PADDLE_ENFORCE_EQ( + x_dims.size() >= 2 && x_dims.size() <= 5, + true, + phi::errors::InvalidArgument( + "The size of input's dimensions should be between 2 and 5" + "But received: the size of input's dimensions is [%d]", + x_dims.size())); + + ctx.template Alloc(y); + int N, C, H, W, D; + paddle::operators::ExtractNCWHD(x_dims, data_layout, &N, &C, &H, &W, &D); + + auto dtype = paddle::platform::CudnnDataType::type; + +#ifdef PADDLE_WITH_HIP + auto compute_format = + data_layout == DataLayout::kNHWC ? DataLayout::kNHWC : DataLayout::kNCHW; + +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// HIP do not support compute format of NHWC +// auto compute_format = DataLayout::kNCHW; +#else + const bool fast_nhwc_batch_norm = + test_mode || + (dtype == CUDNN_DATA_HALF && FLAGS_cudnn_batchnorm_spatial_persistent); + + auto compute_format = fast_nhwc_batch_norm && data_layout == DataLayout::kNHWC + ? 
DataLayout::kNHWC + : DataLayout::kNCHW; +#endif + + DenseTensor transformed_x(x.type()); + DenseTensor transformed_y(y->type()); + + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform input tensor from NHWC to NCHW."; + ResizeToChannelFirst(ctx, &x, &transformed_x); + TransToChannelFirst(ctx, &x, &transformed_x); + ResizeToChannelFirst(ctx, y, &transformed_y); + } else { + transformed_x.ShareDataWith(x); + transformed_y.ShareDataWith(*y); + } + +// ------------------- cudnn descriptors --------------------- +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// miopenTensorDescriptor_t data_desc_; +// miopenTensorDescriptor_t bn_param_desc_; +// miopenBatchNormMode_t mode_; + +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenCreateTensorDescriptor(&bn_param_desc_)); +#else + cudnnTensorDescriptor_t data_desc_; + cudnnTensorDescriptor_t bn_param_desc_; + cudnnBatchNormMode_t mode_; + + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnCreateTensorDescriptor(&bn_param_desc_)); +#endif + + if (epsilon <= CUDNN_BN_MIN_EPSILON - FLT_EPSILON) { + LOG(ERROR) << "Provided epsilon is smaller than " + << "CUDNN_BN_MIN_EPSILON. Setting it to " + << "CUDNN_BN_MIN_EPSILON instead."; + } + epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON); + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// mode_ = miopenBNSpatial; +#elif CUDNN_VERSION_MIN(7, 0, 1) + if (FLAGS_cudnn_batchnorm_spatial_persistent) { + mode_ = CUDNN_BATCHNORM_SPATIAL_PERSISTENT; + } else if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#else + if (H == 1 && W == 1) { + mode_ = CUDNN_BATCHNORM_PER_ACTIVATION; + } else { + mode_ = CUDNN_BATCHNORM_SPATIAL; + } +#endif // CUDNN_VERSION_MIN(7, 0, 1) + + VLOG(3) << "Setting descriptors."; + std::vector dims; + std::vector strides; + if (compute_format == DataLayout::kNCHW) { + dims = {N, C, H, W, D}; + strides = {C * H * W * D, H * W * D, W * D, D, 1}; + } else { + dims = {N, C, H, W, D}; + strides = {H * W * D * C, 1, W * D * C, D * C, C}; + } + +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSetTensorDescriptor( +// data_desc_, CudnnDataType::type, +// x_dims.size() > 3 ? x_dims.size() : 4, const_cast(dims.data()), +// const_cast(strides.data()))); +// Note: PERSISTENT not implemented for inference +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDeriveBNTensorDescriptor( +// bn_param_desc_, data_desc_, test_mode ? miopenBNSpatial : mode_)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnSetTensorNdDescriptor( + data_desc_, + CudnnDataType::type, + x_dims.size() > 3 ? x_dims.size() : 4, + dims.data(), + strides.data())); + // Note: PERSISTENT not implemented for inference + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDeriveBNTensorDescriptor( + bn_param_desc_, + data_desc_, + test_mode ? CUDNN_BATCHNORM_SPATIAL : mode_)); +#endif + + auto handle = ctx.cudnn_handle(); + + // Now, depending on whether we are running test or not, we have two paths. 
+ // It is training mode when it's not reference AND not using pre-trained + // model. + bool training = !test_mode && !use_global_stats; + if (!training) { + // only when test we use input to do computation. + const auto *est_mean = &mean; + const auto *est_var = &variance; + // Run inference mode. + PADDLE_ENFORCE_EQ( + est_mean->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of mean's dimensions must equal to 1." + "But received: the size of mean's dimensions mean is [%d]," + "the dimensions of mean is [%s].", + est_mean->dims().size(), + est_mean->dims())); + PADDLE_ENFORCE_EQ( + est_var->dims().size(), + 1UL, + phi::errors::InvalidArgument( + "The size of variance's dimensions must equal to 1." + "But received: the size of variance's dimensions is [%d]," + "the dimensions of variance is [%s].", + est_var->dims().size(), + est_var->dims())); + PADDLE_ENFORCE_EQ( + est_mean->dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of mean must equal to the number of " + "Channels, which is [%d]. But received: the first dimension" + "of mean is [%d], the dimensions of mean is [%s].", + C, + est_mean->dims()[0], + est_mean->dims())); + PADDLE_ENFORCE_EQ( + est_var->dims()[0], + C, + phi::errors::InvalidArgument( + "The first dimension of variance must equal to the number" + "of Channels, which is [%d]. But received: the first dimension of" + "variance is [%d], the dimensions of variance is [%s].", + C, + est_var->dims()[0], + est_var->dims())); + +#ifdef PADDLE_WITH_HIP + const int block_size = 256; + const int grid_size = (N * C * H * W * D + block_size - 1) / block_size; + if (compute_format == DataLayout::kNCHW) { + BNForwardInference< + T, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } else { + BNForwardInference< + T, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + est_mean->template data>(), + est_var->template data>(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + transformed_y.template data()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardInference( +// handle, miopenBNSpatial, +// const_cast( +// static_cast(CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// const_cast(static_cast( +// est_mean->template data>())), +// const_cast(static_cast( +// est_var->template data>())), +// epsilon)); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardInference( + handle, + // Note: PERSISTENT not implemented for inference + CUDNN_BATCHNORM_SPATIAL, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + est_mean->template data>(), + est_var->template data>(), + epsilon)); +#endif + } else { + // if MomentumTensor is set, use MomentumTensor value, momentum + // is only used 
in this training branch + + // need to solve here + // if (ctx.HasInput("MomentumTensor")) { + // const auto *mom_tensor = MomentumTensor; + // DenseTensor mom_cpu; + // paddle::framework::TensorCopySync(*mom_tensor, platform::CPUPlace(), + // &mom_cpu); + // momentum = mom_cpu.data()[0]; + // } + + // Run training mode. + // obtain running mean and running inv var, and there is no need + // to initialize them. + mean_out->mutable_data>(ctx.GetPlace()); + variance_out->mutable_data>(ctx.GetPlace()); + + saved_mean->mutable_data>(ctx.GetPlace()); + saved_variance->mutable_data>(ctx.GetPlace()); + + if ((N * H * W * D) == 1) { + // Only 1 element in normalization dimension, + // skip the batch norm calculation, let y = x. + paddle::framework::TensorCopy(x, ctx.GetPlace(), y); + } else { + double this_factor = 1. - momentum; + + bool called = false; +#if CUDNN_VERSION_MIN(7, 4, 1) + called = true; + size_t workspace_size = 0; + size_t reserve_space_size = 0; + void *reserve_space_ptr = nullptr; + void *workspace_ptr = nullptr; + DenseTensor workspace_tensor; + // Create reserve space and workspace for batch norm. + // Create tensor for each batchnorm op, it will be used in the + // backward. Thus this tensor shouldn't be temp. + // auto *reserve_space = ctx.Output("ReserveSpace"); + PADDLE_ENFORCE_NOT_NULL( + reserve_space, + phi::errors::NotFound( + "The argument ReserveSpace of batch_norm op is not found.")); + // --------------- cudnn batchnorm workspace --------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationForwardTrainingExWorkspaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnIps=*/CUDNN_BATCHNORM_OPS_BN, + /*xDesc=*/data_desc_, + /*zDesc=*/nullptr, + /*yDesc=*/data_desc_, + /*bnScaleBiasMeanVarDesc=*/bn_param_desc_, + /*activationDesc=*/nullptr, + /*sizeInBytes=*/&workspace_size)); + + // -------------- cudnn batchnorm reserve space -------------- + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload:: + cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + /*handle=*/handle, + /*mode=*/mode_, + /*bnOps=*/CUDNN_BATCHNORM_OPS_BN, + /*activationDesc=*/nullptr, + /*xDesc=*/data_desc_, + /*sizeInBytes=*/&reserve_space_size)); + + reserve_space_ptr = reserve_space->mutable_data( + ctx.GetPlace(), transformed_x.type(), reserve_space_size); + workspace_ptr = workspace_tensor.mutable_data( + ctx.GetPlace(), transformed_x.type(), workspace_size); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTrainingEx( + handle, + mode_, + CUDNN_BATCHNORM_OPS_BN, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + nullptr, + nullptr, + data_desc_, + transformed_y.template data(), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()), + nullptr, + workspace_ptr, + workspace_size, + reserve_space_ptr, + reserve_space_size)); +#endif // CUDNN_VERSION_MIN(7, 4, 1) + if (!called) { +#ifdef PADDLE_WITH_HIP + const int num = transformed_x.numel(); + const int block = 256; + const int max_threads = ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(max_threads / block, 1); + const int grid = std::min(C, max_blocks); + if (compute_format == DataLayout::kNCHW) { + BNForwardTraining< + T, + 
block, + DataLayout::kNCHW><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } else { + BNForwardTraining< + T, + block, + DataLayout::kNHWC><<>>( + transformed_x.template data(), + scale.template data>(), + bias.template data>(), + C, + N, + H * W * D, + epsilon, + this_factor, + transformed_y.template data(), + mean_out->template data>(), + variance_out->template data>(), + saved_mean->template data>(), + saved_variance->template data>()); + } +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenBatchNormalizationForwardTraining( +// handle, mode_, const_cast(static_cast( +// CudnnDataType::kOne())), +// const_cast( +// static_cast(CudnnDataType::kZero())), +// data_desc_, +// static_cast(transformed_x.template data()), +// data_desc_, +// static_cast( +// transformed_y.template mutable_data(ctx.GetPlace())), +// bn_param_desc_, +// const_cast(static_cast( +// scale->template data>())), +// const_cast(static_cast( +// bias->template data>())), +// this_factor, +// static_cast( +// mean_out->template mutable_data>( +// ctx.GetPlace())), +// static_cast(variance_out->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())), +// epsilon, +// static_cast( +// saved_mean->template mutable_data>( +// ctx.GetPlace())), +// static_cast(saved_variance->template mutable_data< +// BatchNormParamType>(ctx.GetPlace())))); +#else + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnBatchNormalizationForwardTraining( + handle, + mode_, + CudnnDataType::kOne(), + CudnnDataType::kZero(), + data_desc_, + transformed_x.template data(), + data_desc_, + ctx.template Alloc(&transformed_y), + bn_param_desc_, + scale.template data>(), + bias.template data>(), + this_factor, + mean_out->template mutable_data>( + ctx.GetPlace()), + variance_out->template mutable_data>( + ctx.GetPlace()), + epsilon, + saved_mean->template mutable_data>( + ctx.GetPlace()), + saved_variance->template mutable_data>( + ctx.GetPlace()))); +#endif + } + } + } + + if (data_layout == DataLayout::kNHWC && compute_format == DataLayout::kNCHW && + x_dims.size() > 2) { + VLOG(3) << "Transform batchnorm output from NCHW to NHWC"; + TransToChannelLast(ctx, &transformed_y, y); + } +#ifdef PADDLE_WITH_HIP +// TODO(wangran16): wait for MIOpen to improve the performance of BN +// clean when exit. +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(data_desc_)); +// PADDLE_ENFORCE_GPU_SUCCESS( +// platform::dynload::miopenDestroyTensorDescriptor(bn_param_desc_)); +#else + // clean when exit. 
+ PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(data_desc_)); + PADDLE_ENFORCE_GPU_SUCCESS( + paddle::platform::dynload::cudnnDestroyTensorDescriptor(bn_param_desc_)); +#endif +} + +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(batch_norm, + GPU, + ALL_LAYOUT, + phi::BatchNormKernel, + float, + double, + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} + +#endif diff --git a/paddle/phi/kernels/gpu/batch_norm_utils.h b/paddle/phi/kernels/gpu/batch_norm_utils.h new file mode 100644 index 0000000000000..c9c62026edfa7 --- /dev/null +++ b/paddle/phi/kernels/gpu/batch_norm_utils.h @@ -0,0 +1,142 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +using Tensor = DenseTensor; + +template +inline void ResizeToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[4]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + in_dims_vec[4] = input->dims()[3]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + + } else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[3]; + in_dims_vec[2] = input->dims()[1]; + in_dims_vec[3] = input->dims()[2]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } +} + +template +inline void ResizeToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[4]; + in_dims_vec[4] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + + 
} else if (dim == 2) { + // input + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[3]; + in_dims_vec[3] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } else if (dim == 1) { + transformed_input->Resize(input->dims()); + + auto in_dims_vec = phi::vectorize(input->dims()); + in_dims_vec[1] = input->dims()[2]; + in_dims_vec[2] = input->dims()[1]; + transformed_input->Resize(phi::make_ddim(in_dims_vec)); + context.template Alloc(transformed_input); + } +} + +template +inline void TransToChannelFirst(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + VLOG(5) << "Why am I called?"; + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 4, 1, 2, 3}; + funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 3, 1, 2}; + funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +template +inline void TransToChannelLast(const DeviceContext& context, + const Tensor* input, + Tensor* transformed_input) { + int dim = input->dims().size() - 2; + if (dim == 3) { + std::vector axis{0, 2, 3, 4, 1}; + funcs::Transpose trans5; + trans5(context, *input, transformed_input, axis); + + } else if (dim == 2) { + std::vector axis{0, 2, 3, 1}; + funcs::Transpose trans4; + trans4(context, *input, transformed_input, axis); + } else if (dim == 1) { + std::vector axis{0, 2, 1}; + funcs::Transpose trans3; + trans3(context, *input, transformed_input, axis); + } +} + +} // namespace phi diff --git a/paddle/phi/ops/compat/batch_norm_sig.cc b/paddle/phi/ops/compat/batch_norm_sig.cc new file mode 100644 index 0000000000000..011d4c12ecefc --- /dev/null +++ b/paddle/phi/ops/compat/batch_norm_sig.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
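+
+// Argument mappings from the legacy fluid batch_norm operators to the phi
+// kernel signatures: forward ("batch_norm"), backward ("batch_norm_grad"),
+// and double backward ("batch_norm_grad_grad"). Each mapping lists the op's
+// input, attribute, and output names in the order the phi kernels expect.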
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature BatchNormOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("batch_norm", + {"X", "Scale", "Bias", "Mean", "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"Y", + "MeanOut", + "VarianceOut", + "SavedMean", + "SavedVariance", + "ReserveSpace"}); +} + +KernelSignature BatchNormGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "batch_norm_grad", + {GradVarName("Y"), + "X", + "Scale", + "Bias", + "SavedMean", + "SavedVariance", + "ReserveSpace", + "Mean", + "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {GradVarName("X"), GradVarName("Scale"), GradVarName("Bias")}); +} + +KernelSignature BatchNormGradGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("batch_norm_grad_grad", + {"DDX", + "DDScale", + "DDBias", + "DY", + "X", + "Scale", + "SavedMean", + "SavedVariance", + "Mean", + "Variance"}, + {"momentum", + "epsilon", + "data_layout", + "is_test", + "use_global_stats", + "trainable_statistics", + "fuse_with_relu"}, + {"DX", "DScale", "DDY"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(batch_norm, phi::BatchNormOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad, + phi::BatchNormGradOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(batch_norm_grad_grad, + phi::BatchNormGradGradOpArgumentMapping); diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index 30c1955adcf9f..c6f491a5484d9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -520,6 +520,7 @@ def predict_static(args, data): paddle.enable_static() exe = fluid.Executor(args.place) # load inference model + [inference_program, feed_target_names, fetch_targets] = fluid.io.load_inference_model( args.model_save_dir, diff --git a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py index 4552d600bafd7..2b281d7d6f7c5 100644 --- a/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py +++ b/python/paddle/fluid/tests/unittests/test_apply_pass_to_program.py @@ -162,6 +162,7 @@ def test_main(self): for k, v in self.get_strategy().items(): setattr(build_strategy, k, v) self.check_before_applied(main2, startup2) + apply_build_strategy(main2, startup2, build_strategy, {"use_cuda": self.use_cuda}) self.check_after_applied(main2, startup2) diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py index cce13a8bf3b74..b02df024518a8 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py @@ -320,7 +320,7 @@ def check_with_place(self, place, data_layout, dtype, shape): def test_check_output(self): places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: @@ -342,13 +342,13 @@ def setUp(self): def test_check_output(self): places = [] - if core.is_compiled_with_cuda() and 
core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): places.append(place) - for place in places: - for data_format in ["NCHW", "NHWC"]: + #for data_format in ["NCHW", "NHWC"]: + for data_format in ["NCHW"]: self.check_with_place(place, data_format, self.dtype, [2, 3, 4, 5]) self.check_with_place(place, data_format, self.dtype, [2, 3]) @@ -517,7 +517,7 @@ def test_with_place(place, data_layout, shape): places = [core.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(core.CUDAPlace(0)) for place in places: @@ -657,7 +657,7 @@ def test_errors(self): class TestDygraphBatchNormTrainableStats(unittest.TestCase): def test_dygraph(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: shape = [4, 10, 4, 4] @@ -678,7 +678,7 @@ def compute(x, is_test, trainable_statistics): def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -716,4 +716,6 @@ def test_reservespace(self): if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py index 6a6f85a483206..c9abac8fb7946 100644 --- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py @@ -28,7 +28,7 @@ class TestBatchNorm(unittest.TestCase): def test_name(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: with fluid.dygraph.guard(p): @@ -36,7 +36,7 @@ def test_name(self): def test_error(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: #paddle.disable_static() @@ -83,7 +83,7 @@ def error3d(): def test_dygraph(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: shape = [4, 10, 4, 4] @@ -135,7 +135,7 @@ def compute_v4(x): def test_static(self): places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): places.append(fluid.CUDAPlace(0)) for p in places: exe = fluid.Executor(p) @@ -177,7 +177,7 @@ def setUp(self): else: paddle.set_default_dtype("float64") self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): self.places.append(fluid.CUDAPlace(0)) def tearDown(self): @@ -247,7 +247,7 @@ def test_3d(self): class TestBatchNormUseGlobalStats(unittest.TestCase): def setUp(self): self.places = [fluid.CPUPlace()] - if core.is_compiled_with_cuda() and core.op_support_gpu("batch_norm"): + if core.is_compiled_with_cuda(): self.places.append(fluid.CUDAPlace(0)) self.init_test() @@ -300,4 +300,6 @@ def init_test(self): if __name__ == '__main__': + import paddle + 
paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index 8ea4e369d3236..826f886dab172 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle import paddle.fluid.core as core @@ -1001,4 +1002,5 @@ def init_paddings(self): TestWithDilation_AsyPadding, grad_check=False) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index aee6ca249f535..a204c26c1b823 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -231,4 +231,5 @@ def test_api(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py index 077496200d988..67f6b91021472 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_abn_op.py @@ -23,6 +23,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid import compiler import paddle.fluid.unique_name as unique_name +import paddle class TestInplaceANBOpTraining(unittest.TestCase): @@ -138,14 +139,14 @@ def compare(self, place, layout, only_forward, activation, alpha, use_cuda): outs[0].name if not only_forward else None, build_strategy=build_strategy, exec_strategy=exec_strategy) - bn_fetches = exe.run(program=comp_prog1, + bn_fetches = exe.run(program=main, feed={'input': data}, fetch_list=fetch_name) fetch_outs.append(bn_fetches) fetch_names.append(fetch_name) - for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs + - fetch_names)): + for bn_val, inplace_abn_val, name1, name2 in zip(*( + fetch_outs + fetch_names)): self.assertTrue( np.allclose( bn_val, inplace_abn_val, atol=1e-2), @@ -156,6 +157,7 @@ def compare(self, place, layout, only_forward, activation, alpha, use_cuda): def test_op(self): use_cudas = [False, True] if core.is_compiled_with_cuda() else [False] + #use_cudas = [False] for use_cuda in use_cudas: place = core.CUDAPlace(0) if use_cuda else core.CPUPlace() layouts = ["NCHW", "NHWC"] @@ -186,4 +188,5 @@ def test_all_branches(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py index fe8c181b79049..49fe397644dc6 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_norm_nn_grad.py @@ -21,6 +21,7 @@ import paddle.fluid.layers as layers import paddle.fluid.core as core import gradient_checker +import paddle from decorator_helper import prog_scope @@ -167,4 +168,5 @@ def init_test(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index b01c7cf179955..a1a3b31a9766e 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -24,6 +24,7 @@ import seresnext_net from 
test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
 from fake_reader import fake_imdb_reader
+import paddle
 def lstm_net(use_feed):
@@ -309,4 +310,5 @@ def program_scope_guard(self):
 if __name__ == '__main__':
+    paddle.enable_static()
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reshape_op.py b/python/paddle/fluid/tests/unittests/test_reshape_op.py
index c860d6972fb76..40481b097827c 100755
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
@@ -507,4 +507,5 @@ def test_reshape_zero_tensor_error(self):
 if __name__ == "__main__":
+    paddle.enable_static()
     unittest.main()