diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
index 4797b0e7154e0..afe8e6bf18014 100644
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -184,4 +184,6 @@ endif()
 
 if(WITH_ASCEND_CL)
 cc_test(gelu_op_npu_test SRCS gelu_op_npu_test.cc DEPS op_registry gelu_op scope device_context enforce executor)
+cc_test(mean_op_npu_test SRCS mean_op_npu_test.cc DEPS op_registry mean_op scope device_context enforce executor)
 endif()
+
diff --git a/paddle/fluid/operators/mean_op_npu.cc b/paddle/fluid/operators/mean_op_npu.cc
new file mode 100644
index 0000000000000..f7dba26604964
--- /dev/null
+++ b/paddle/fluid/operators/mean_op_npu.cc
@@ -0,0 +1,135 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/mean_op.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/operators/npu_op_runner.h"
+
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class MeanNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* x = ctx.Input<Tensor>("X");
+    auto* out = ctx.Output<Tensor>("Out");
+
+    auto reduce_ndim = x->dims().size();
+    std::vector<int> axes;
+    for (auto i = 0; i < reduce_ndim; ++i) {
+      axes.push_back(i);
+    }
+
+    framework::NPUAttributeMap attr_input = {
+        {"keep_dims", false},
+        {"axes", axes}};
+
+    std::vector<int64_t> out_dims;
+    out_dims.push_back(1);
+    out->Resize(framework::make_ddim(out_dims));
+    out->mutable_data<T>(ctx.GetPlace());
+
+    Tensor reduced_out(x->type());
+    std::vector<int64_t> reduced_dout_dims;
+    reduced_dout_dims.push_back(1);
+    reduced_out.Resize(framework::make_ddim(reduced_dout_dims));
+    reduced_out.mutable_data<T>(ctx.GetPlace());
+
+    auto runner = NpuOpRunner("ReduceMeanD",
+                              {*x},
+                              {*out},
+                              attr_input);
+
+    auto stream =
+        ctx.template device_context<
+                paddle::platform::NPUDeviceContext>()
+            .stream();
+    runner.Run(stream);
+  }
+};
+
+
+template <typename DeviceContext, typename T>
+class MeanGradNPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto stream =
+        context.template device_context<
+                paddle::platform::NPUDeviceContext>()
+            .stream();
+
+    auto grad = context.Input<Tensor>(framework::GradVarName("Out"));
+
+    PADDLE_ENFORCE_EQ(grad->numel(), 1,
+                      platform::errors::InvalidArgument(
+                          "Mean Gradient Input Tensor len should be 1. "
+                          "But received Out@Grad's elements num is %d.",
+                          grad->numel()));
+
+    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
+    IG->mutable_data<T>(context.GetPlace());
+
+    // ones
+    Tensor ones(grad->type());
+    std::vector<int64_t> dout_dims;
+    for (auto i = 0; i < IG->dims().size(); ++i) {
+      dout_dims.push_back(IG->dims()[i]);
+    }
+    ones.Resize(framework::make_ddim(dout_dims));
+    ones.mutable_data<T>(context.GetPlace());
+    auto runner_ones = NpuOpRunner("OnesLike", {*IG}, {ones}, {});
+    runner_ones.Run(stream);
+
+    // means
+    Tensor mean_tensor(grad->type());
+    mean_tensor.Resize({1});
+    mean_tensor.mutable_data<T>(context.GetPlace());
+    std::vector<T> mean_vec;
+    mean_vec.push_back(1.0 / static_cast<T>(IG->numel()));
+    framework::TensorFromVector(mean_vec,
+                                context.device_context(),
+                                &mean_tensor);
+
+    // means mul ones
+    Tensor mean_ma(grad->type());
+    mean_ma.Resize(framework::make_ddim(dout_dims));
+    mean_ma.mutable_data<T>(context.GetPlace());
+    auto runner_mul_1 = NpuOpRunner("Mul", {mean_tensor, ones}, {mean_ma}, {});
+    runner_mul_1.Run(stream);
+
+    // and mul grad
+    auto runner_mul_2 = NpuOpRunner("Mul", {mean_ma, *grad}, {*IG}, {});
+    runner_mul_2.Run(stream);
+  }
+};
+
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+REGISTER_OP_NPU_KERNEL(
+    mean,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, int>,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, float>,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, double>,
+    ops::MeanNPUKernel<plat::NPUDeviceContext, plat::float16>)
+
+
+REGISTER_OP_NPU_KERNEL(
+    mean_grad,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, int>,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, float>,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, double>,
+    ops::MeanGradNPUKernel<plat::NPUDeviceContext, plat::float16>)
diff --git a/paddle/fluid/operators/mean_op_npu_test.cc b/paddle/fluid/operators/mean_op_npu_test.cc
new file mode 100644
index 0000000000000..ab9517b79318d
--- /dev/null
+++ b/paddle/fluid/operators/mean_op_npu_test.cc
@@ -0,0 +1,139 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif
+
+#include <string>
+#include <thread>  // NOLINT
+#include <vector>
+
+#include "gtest/gtest.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/operators/dropout_op.h"
+#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/fluid/string/printf.h"
+
+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
+USE_OP(mean);
+USE_OP_DEVICE_KERNEL(mean, NPU);
+USE_OP(mean_grad);
+USE_OP_DEVICE_KERNEL(mean_grad, NPU);
+
+template <typename T>
+void Compare(f::Scope* scope, const p::DeviceContext& ctx,
+             std::string op_type) {
+  // init
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+
+  std::vector<T> init;
+  init.push_back(static_cast<T>(1.0));
+  init.push_back(static_cast<T>(2.0));
+  init.push_back(static_cast<T>(3.0));
+  init.push_back(static_cast<T>(4.0));
+
+  TensorFromVector(init, ctx, tensor_x);
+  tensor_x->Resize({4});
+
+  ctx.Wait();
+
+  auto place = ctx.GetPlace();
+  auto out = scope->Var("Out");
+  auto tensor_out = out->GetMutable<f::LoDTensor>();
+
+  auto op = f::OpRegistry::CreateOp(op_type,
+                                    {{"X", {"X"}}},
+                                    {{"Out", {"Out"}}},
+                                    {});
+
+  op->Run(*scope, place);
+
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_out, ctx, &out_vec);
+
+  ctx.Wait();
+
+  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)1);
+  EXPECT_EQ((float)out_vec[0], (float)2.5);
+}
+
+template <typename T>
+void CompareGrad(f::Scope* scope, const p::DeviceContext& ctx,
+                 std::string op_type) {
+  // init
+  auto dout = scope->Var("DOut");
+  auto tensor_dout = dout->GetMutable<f::LoDTensor>();
+  float dvalue = 2.0;
+  tensor_dout->Resize({1});
+  std::vector<T> init_dout;
+  init_dout.push_back(static_cast<T>(dvalue));
+  TensorFromVector(init_dout, ctx, tensor_dout);
+  ctx.Wait();
+
+  auto x = scope->Var("X");
+  auto tensor_x = x->GetMutable<f::LoDTensor>();
+  tensor_x->Resize({4});
+
+  auto dx = scope->Var("DX");
+  auto tensor_dx = dx->GetMutable<f::LoDTensor>();
+  tensor_dx->Resize({4});
+
+  ctx.Wait();
+
+  auto op = f::OpRegistry::CreateOp(op_type,
+                                    {{"Out@GRAD", {"DOut"}},
+                                     {"X", {"X"}}},
+                                    {{"X@GRAD", {"DX"}}},
+                                    {});
+
+  auto place = ctx.GetPlace();
+  op->Run(*scope, place);
+
+  std::vector<T> out_vec;
+  TensorToVector(*tensor_dx, ctx, &out_vec);
+
+  ctx.Wait();
+
+  EXPECT_EQ((uint32_t)out_vec.size(), (uint32_t)4);
+  EXPECT_EQ((float)out_vec[0], (float)1.0 / dvalue);
+  EXPECT_EQ((float)out_vec[1], (float)1.0 / dvalue);
+  EXPECT_EQ((float)out_vec[2], (float)1.0 / dvalue);
+  EXPECT_EQ((float)out_vec[3], (float)1.0 / dvalue);
+}
+
+TEST(mean, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<float>(&scope, ctx, "mean");
+}
+
+TEST(mean, NPU_fp16) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  Compare<p::float16>(&scope, ctx, "mean");
+}
+
+
+TEST(mean_grad, NPU_fp32) {
+  f::Scope scope;
+  p::NPUDeviceContext ctx(p::NPUPlace(0));
+  CompareGrad<float>(&scope, ctx, "mean_grad");
+}
diff --git a/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
new file mode 100644
index 0000000000000..abee0de2dea2c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/npu/test_mean_op_npu.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import unittest
+import sys
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import core
+
+paddle.enable_static()
+SEED = 2021
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMean(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "mean"
+        self.init_dtype()
+
+        x = np.random.random([3, 3]).astype(self.dtype)
+        self.inputs = {'X': x}
+
+        self.attrs = {}
+        np_out = np.mean(x)
+        self.outputs = {'Out': np_out}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.__class__.no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMeanFP16(OpTest):
+    def setUp(self):
+        self.set_npu()
+        self.place = paddle.NPUPlace(0)
+        self.op_type = "mean"
+        self.init_dtype()
+
+        x = np.random.random([3, 3]).astype(self.dtype)
+        self.inputs = {'X': x}
+
+        self.attrs = {}
+        np_out = np.mean(x)
+        self.outputs = {'Out': np_out}
+
+    def set_npu(self):
+        self.__class__.use_npu = True
+        self.__class__.no_need_check_grad = True
+
+    def init_dtype(self):
+        self.dtype = np.float16
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, check_dygraph=False)
+
+
+
+@unittest.skipIf(not paddle.is_compiled_with_npu(),
+                 "core is not compiled with NPU")
+class TestMeanNet(unittest.TestCase):
+    def _test(self, run_npu=True):
+        main_prog = paddle.static.Program()
+        startup_prog = paddle.static.Program()
+        main_prog.random_seed = SEED
+        startup_prog.random_seed = SEED
+        np.random.seed(SEED)
+
+        a_np = np.random.random(size=(32, 32)).astype('float32')
+        b_np = np.random.random(size=(32, 32)).astype('float32')
+        label_np = np.random.randint(2, size=(32, 1)).astype('int64')
+
+        with paddle.static.program_guard(main_prog, startup_prog):
+            a = paddle.static.data(name="a", shape=[32, 32], dtype='float32')
+            b = paddle.static.data(name="b", shape=[32, 32], dtype='float32')
+            label = paddle.static.data(
+                name="label", shape=[32, 1], dtype='int64')
+
+            c = paddle.multiply(a, b)
+            d = paddle.sqrt(c)
+
+            fc_1 = fluid.layers.fc(input=d, size=128)
+            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
+
+            cost = fluid.layers.cross_entropy(input=prediction, label=label)
+            loss = fluid.layers.mean(cost)
+            sgd = fluid.optimizer.SGD(learning_rate=0.01)
+            sgd.minimize(loss)
+
+        if run_npu:
+            place = paddle.NPUPlace(0)
+        else:
+            place = paddle.CPUPlace()
+
+        exe = paddle.static.Executor(place)
+        exe.run(startup_prog)
+
+        print("Start run on {}".format(place))
+        for epoch in range(100):
+
+            pred_res, loss_res = exe.run(
+                main_prog,
+                feed={"a": a_np,
+                      "b": b_np,
+                      "label": label_np},
+                fetch_list=[prediction, loss])
+            if epoch % 10 == 0:
+                print("Epoch {} | Prediction[0]: {}, Loss: {}".format(
+                    epoch, pred_res[0], loss_res))
+
+        return pred_res, loss_res
+
+    def test_npu(self):
+        cpu_pred, cpu_loss = self._test(False)
+        npu_pred, npu_loss = self._test(True)
+
+        self.assertTrue(np.allclose(npu_pred, cpu_pred))
+        self.assertTrue(np.allclose(npu_loss, cpu_loss))
+
+
+if __name__ == '__main__':
+    unittest.main()