From df0e74dba0fcbb894eeefa727d7a8a4d50025ccb Mon Sep 17 00:00:00 2001 From: chengduoZH Date: Mon, 5 Feb 2018 11:28:22 +0800 Subject: [PATCH] unifid GPU and CPU implementation --- paddle/operators/layer_norm_op.cc | 185 ------------------ paddle/operators/layer_norm_op.h | 2 +- .../v2/fluid/tests/test_layer_norm_op.py | 4 +- 3 files changed, 4 insertions(+), 187 deletions(-) diff --git a/paddle/operators/layer_norm_op.cc b/paddle/operators/layer_norm_op.cc index 910b8ec0a4d5a..76d5d571c31c0 100644 --- a/paddle/operators/layer_norm_op.cc +++ b/paddle/operators/layer_norm_op.cc @@ -21,13 +21,6 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; using DataLayout = framework::DataLayout; -template -using EigenMatrixMapRowMajor = Eigen::Map< - Eigen::Matrix>; -template -using ConstEigenMatrixMapRowMajor = Eigen::Map< - const Eigen::Matrix>; - class LayerNormOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -115,75 +108,6 @@ Layer Norm has been implemented as discussed in the paper: } }; -template -class LayerNormKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const float epsilon = ctx.Attr("epsilon"); - const auto *scale = ctx.Input("Scale"); - const auto *bias = ctx.Input("Bias"); - const auto *x = ctx.Input("X"); - const auto &x_dims = x->dims(); - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - - auto *output = ctx.Output("Y"); - auto *mean = ctx.Output("Mean"); - auto *var = ctx.Output("Variance"); - output->mutable_data(ctx.GetPlace()); - mean->mutable_data(ctx.GetPlace()); - var->mutable_data(ctx.GetPlace()); - - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - auto input_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - - auto mean_map = EigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = EigenMatrixMapRowMajor(var->data(), left, 1); - auto output_map = EigenMatrixMapRowMajor(output->data(), left, right); - - auto squre = [](T ele) { return ele * ele; }; - auto add_epslion = [epsilon](T ele) { return ele + epsilon; }; - - mean_map = input_map.rowwise().mean(); - var_map = (input_map - mean_map.replicate(1, right)) - .unaryExpr(squre) - .rowwise() - .mean() - .unaryExpr(add_epslion); - - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // TODO(zcd): Some thinking about output_map, is it appropriate that - // `output_map` and `input_map` point to the same memory. - auto inv_std = var_map.unaryExpr(inv_std_func); - if (scale && bias) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) + - bias_map.replicate(left, 1); - } else if (scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)); - } else if (bias) { - auto bias_map = ConstEigenMatrixMapRowMajor(bias->data(), 1, right); - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)) + - bias_map.replicate(left, 1); - } else { - output_map = (input_map - mean_map.replicate(1, right)) - .cwiseProduct(inv_std.replicate(1, right)); - } - } -}; - class LayerNormGradOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -236,115 +160,6 @@ class LayerNormGradOp : public framework::OperatorWithKernel { } }; -template -class LayerNormGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - const auto *x = ctx.Input("X"); - const auto *mean = ctx.Input("Mean"); - const auto *var = ctx.Input("Variance"); - const auto *scale = ctx.Input("Scale"); - const auto *d_y = ctx.Input(framework::GradVarName("Y")); - - const auto &x_dims = x->dims(); - - const auto begin_norm_axis = ctx.Attr("begin_norm_axis"); - auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); - int left = static_cast(matrix_dim[0]); - int right = static_cast(matrix_dim[1]); - - // init output - auto *d_x = ctx.Output(framework::GradVarName("X")); - auto *d_scale = ctx.Output(framework::GradVarName("Scale")); - auto *d_bias = ctx.Output(framework::GradVarName("Bias")); - - auto x_map = ConstEigenMatrixMapRowMajor(x->data(), left, right); - auto d_y_map = ConstEigenMatrixMapRowMajor(d_y->data(), left, right); - auto mean_map = ConstEigenMatrixMapRowMajor(mean->data(), left, 1); - auto var_map = ConstEigenMatrixMapRowMajor(var->data(), left, 1); - - if (d_bias) { - d_bias->mutable_data(ctx.GetPlace()); - auto d_bias_map = EigenMatrixMapRowMajor(d_bias->data(), 1, right); - d_bias_map = d_y_map.colwise().sum(); - } - if (d_scale) { - d_scale->mutable_data(ctx.GetPlace()); - auto d_scale_map = - EigenMatrixMapRowMajor(d_scale->data(), 1, right); - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - // There are two equation to compute d_scale. One uses "Y" and the other - // does not use "Y" - d_scale_map = - ((x_map - mean_map.replicate(1, right)) - .cwiseProduct( - var_map.unaryExpr(inv_std_func).replicate(1, right)) - .cwiseProduct(d_y_map)) - .colwise() - .sum(); - } - - if (d_x) { - d_x->mutable_data(ctx.GetPlace()); - auto d_x_map = EigenMatrixMapRowMajor(d_x->data(), left, right); - auto triple_product_func = [](T ele) { return ele * ele * ele; }; - auto inv_std_func = [](T ele) { return std::sqrt(1 / ele); }; - - auto inv_std_map = var_map.unaryExpr(inv_std_func).eval(); - // TODO(zcd): these code can be refined - if (d_scale) { - auto scale_map = - ConstEigenMatrixMapRowMajor(scale->data(), 1, right); - // dy_dx - auto dx_end = - inv_std_map.replicate(1, right).cwiseProduct(d_y_map).cwiseProduct( - scale_map.replicate(left, 1)); - - // dy_dmean_dx - auto dx_mean = - (T(-1.0) / right) * dx_end.rowwise().sum().replicate(1, right); - - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(scale_map.replicate(left, 1)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = inv_std_map.unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } else { - // dy_dx - auto dx_end = inv_std_map.replicate(1, right).cwiseProduct(d_y_map); - - // dy_dmean_dx - auto dx_mean = - (T(-1.0) / right) * dx_end.rowwise().sum().replicate(1, right); - - // dy_var_dx - auto dvar_end_part = (x_map - mean_map.replicate(1, right)) - .cwiseProduct(d_y_map) - .rowwise() - .sum(); - auto dvar_end = inv_std_map.unaryExpr(triple_product_func) - .cwiseProduct(dvar_end_part) - .replicate(1, right); - auto dx_var = - (T(-1.0) / right) * - (x_map - mean_map.replicate(1, right)).cwiseProduct(dvar_end); - - d_x_map = dx_end + dx_mean + dx_var; - } - } - } -}; - } // namespace operators } // namespace paddle diff --git a/paddle/operators/layer_norm_op.h b/paddle/operators/layer_norm_op.h index 2de58186fbd84..608447b1ff846 100644 --- a/paddle/operators/layer_norm_op.h +++ b/paddle/operators/layer_norm_op.h @@ -31,7 +31,7 @@ template struct DivAndSqrtFunctor { explicit DivAndSqrtFunctor(T epsilon) { epsilon_ = epsilon; } inline HOSTDEVICE T operator()(T a, T b) const { - return a / (sqrt(b) + epsilon_); + return a / (sqrt(b + epsilon_)); } private: diff --git a/python/paddle/v2/fluid/tests/test_layer_norm_op.py b/python/paddle/v2/fluid/tests/test_layer_norm_op.py index f456b1194c52e..4460ffaf9c469 100644 --- a/python/paddle/v2/fluid/tests/test_layer_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_layer_norm_op.py @@ -20,6 +20,8 @@ from paddle.v2.fluid.op import Operator from paddle.v2.fluid.framework import grad_var_name +np.random.random(123) + def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1): x_shape = x.shape @@ -148,7 +150,7 @@ def test_with_place(place, shape, begin_norm_axis=1): x_shape = shape D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1) scale_shape = [D] - np.random.random(123) + x_val = np.random.random_sample(x_shape).astype(np.float32) scale_val = np.random.random_sample(scale_shape).astype(np.float32) bias_val = np.random.random_sample(scale_shape).astype(np.float32)