From 6a5b1809c3449e834eaffe681350fa4ba9baa067 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 06:32:45 +0000 Subject: [PATCH 01/28] add mul --- paddle/fluid/operators/mul_op_npu.cc | 137 +++++++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 paddle/fluid/operators/mul_op_npu.cc diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc new file mode 100644 index 0000000000000..7656d6b9d8ad7 --- /dev/null +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -0,0 +1,137 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_ASCEND_CL +#include +#include + +#include "paddle/fluid/operators/mul_op.h" +#include "paddle/fluid/operators/npu_op_runner.h" + +namespace paddle { +namespace operators { + +template +class MulNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* out = ctx.Output("Out"); + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + auto stream = + ctx.template device_context() + .stream(); + if (x_num_col_dims == 1 && y_num_col_dims == 1) { + if (x->dims().size() == 2 && y->dims().size() == 2) { + auto runner = + NpuOpRunner("MatMul", {*x, *y}, {*out}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + out->mutable_data(ctx.GetPlace()); + + runner.Run(stream); + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // flatten + Tensor tmp_flatten(x->type()); + int64_t size = x->dims()[1] * x->dims()[2]; + std::vector vec_flatten; + vec_flatten.push_back(size); + tmp_flatten.Resize(framework::make_ddim(vec_flatten)); + tmp_flatten.mutable_data(ctx.GetPlace()); + auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, {}); + runner_flatten.Run(stream); + // matmul + auto runner_matmul = + NpuOpRunner("MatMul", {tmp_flatten, *y}, {*out}, {}); + runner_matmul.Run(stream); + } + } + // to do other + } +}; + +template +class MulGradNPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + int x_num_col_dims = ctx.Attr("x_num_col_dims"); + int y_num_col_dims = ctx.Attr("y_num_col_dims"); + auto stream = + ctx.template device_context() + .stream(); + if (x_num_col_dims == 1 && y_num_col_dims == 1) { + if (x->dims().size() == 2 && y->dims().size() == 2) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {*x, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // flatten + Tensor tmp_flatten(x->type()); + int64_t size = x->dims()[1] * x->dims()[2]; + std::vector vec_flatten; + vec_flatten.push_back(size); + tmp_flatten.Resize(framework::make_ddim(vec_flatten)); + tmp_flatten.mutable_data(ctx.GetPlace()); + auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, {}); + runner_flatten.Run(stream); + // matmul + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + // to do shape==2 + + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {tmp_flatten, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_NPU_KERNEL( + mul, ops::MulNPUKernel, + ops::MulNPUKernel); +REGISTER_OP_NPU_KERNEL( + mul_grad, ops::MulGradNPUKernel, + ops::MulGradNPUKernel); +#endif From a5f17fb72722c247bd9290f4294992858e54ceea Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 06:44:00 +0000 Subject: [PATCH 02/28] add test mul --- .../tests/unittests/npu/test_mul_op_npu.py | 138 ++++++++++++++++++ 1 file changed, 138 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py new file mode 100644 index 0000000000000..182bd92f041a7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -0,0 +1,138 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import unittest +import sys +sys.path.append("..") +from op_test import OpTest +import paddle +import paddle.fluid as fluid + +paddle.enable_static() +SEED = 2021 + + +class TestMul(OpTest): + def config(self): + self.x_shape = (32, 5) + self.y_shape = (5, 100) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shpe).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} + + def set_npu(self): + self.__class__.use_npu = True + self.__class__.no_need_check_grad = True + + def init_dtype(self): + self.dtype = np.float32 + + def test_check_output(self): + self.check_output_with_place(self.place, check_dygraph=False, atol=1e-5) + + + # +class TestMulFP16(TestMatMul): + """ + case 2 + """ + + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3)).astype('float32') + b_np = np.random.random(size=(2, 3)).astype('float32') + c_np = np.random.random(size=(3, 2)).astype('float32') + d_np = np.random.random(size=(3, 2)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') + c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') + d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2) + + fc_1 = fluid.layers.fc(input=result, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +if __name__ == '__main__': + unittest.main() From 7e738ddfe59b552e39bdaa8ebacd3d73d491103c Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 07:37:55 +0000 Subject: [PATCH 03/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 5 +- .../tests/unittests/npu/test_mul_op_npu.py | 71 +------------------ 2 files changed, 3 insertions(+), 73 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 7656d6b9d8ad7..5d86b15148b02 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef PADDLE_WITH_ASCEND_CL #include #include @@ -36,10 +35,10 @@ class MulNPUKernel : public framework::OpKernel { .stream(); if (x_num_col_dims == 1 && y_num_col_dims == 1) { if (x->dims().size() == 2 && y->dims().size() == 2) { + out->mutable_data(ctx.GetPlace()); auto runner = NpuOpRunner("MatMul", {*x, *y}, {*out}, {{"transpose_x1", false}, {"transpose_x2", false}}); - out->mutable_data(ctx.GetPlace()); runner.Run(stream); } else if (x->dims().size() == 3 && y->dims().size() == 2) { @@ -52,6 +51,7 @@ class MulNPUKernel : public framework::OpKernel { tmp_flatten.mutable_data(ctx.GetPlace()); auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, {}); runner_flatten.Run(stream); + out->mutable_data(ctx.GetPlace()); // matmul auto runner_matmul = NpuOpRunner("MatMul", {tmp_flatten, *y}, {*out}, {}); @@ -134,4 +134,3 @@ REGISTER_OP_NPU_KERNEL( mul_grad, ops::MulGradNPUKernel, ops::MulGradNPUKernel); -#endif diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index 182bd92f041a7..1057f1ffdc24b 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -39,7 +39,7 @@ def setUp(self): self.config() np.random.seed(SEED) self.inputs = { - 'X': np.random.random(self.x_shpe).astype(self.dtype), + 'X': np.random.random(self.x_shape).astype(self.dtype), 'Y': np.random.random(self.y_shape).astype(self.dtype) } self.outputs = {'Out': np.dot(self.inputs['X'], self.inputs['Y'])} @@ -65,74 +65,5 @@ def init_dtype(self): self.dtype = np.float16 -@unittest.skipIf(not paddle.is_compiled_with_npu(), - "core is not compiled with NPU") -class TestMulNet(unittest.TestCase): - def _test(self, run_npu=True): - main_prog = paddle.static.Program() - startup_prog = paddle.static.Program() - main_prog.random_seed = SEED - startup_prog.random_seed = SEED - np.random.seed(SEED) - - a_np = np.random.random(size=(2, 3)).astype('float32') - b_np = np.random.random(size=(2, 3)).astype('float32') - c_np = np.random.random(size=(3, 2)).astype('float32') - d_np = np.random.random(size=(3, 2)).astype('float32') - label_np = np.random.randint(2, size=(2, 1)).astype('int64') - - with paddle.static.program_guard(main_prog, startup_prog): - a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') - b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') - c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') - d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') - label = paddle.static.data( - name="label", shape=[2, 1], dtype='int64') - - sum_1 = paddle.add(a, b) - sum_2 = paddle.add(c, d) - result = paddle.fluid.layers.mul(sum_1, sum_2) - - fc_1 = fluid.layers.fc(input=result, size=8) - prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') - - cost = fluid.layers.cross_entropy(input=prediction, label=label) - loss = fluid.layers.reduce_mean(cost) - sgd = fluid.optimizer.SGD(learning_rate=0.01) - sgd.minimize(loss) - - if run_npu: - place = paddle.NPUPlace(0) - else: - place = paddle.CPUPlace() - exe = paddle.static.Executor(place) - exe.run(startup_prog) - - print("Start run on {}".format(place)) - for epoch in range(100): - - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "c": c_np, - "d": d_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) - - return pred_res, loss_res - - def test_npu(self): - cpu_pred, cpu_loss = self._test(False) - npu_pred, npu_loss = self._test(True) - - self.assertTrue(np.allclose(npu_pred, cpu_pred)) - self.assertTrue(np.allclose(npu_loss, cpu_loss)) - - if __name__ == '__main__': unittest.main() From 1ac720e823c46dbd9d397b22a7c67b611d1e55e3 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 07:43:41 +0000 Subject: [PATCH 04/28] fix --- .../tests/unittests/npu/test_mul_op_npu.py | 71 ++++++++++++++++++- 1 file changed, 70 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index 1057f1ffdc24b..76af2febf378e 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -56,7 +56,7 @@ def test_check_output(self): # -class TestMulFP16(TestMatMul): +class TestMulFP16(TestMul): """ case 2 """ @@ -65,5 +65,74 @@ def init_dtype(self): self.dtype = np.float16 +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3)).astype('float32') + b_np = np.random.random(size=(2, 3)).astype('float32') + c_np = np.random.random(size=(3, 2)).astype('float32') + d_np = np.random.random(size=(3, 2)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3], dtype='float32') + c = paddle.static.data(name="c", shape=[3, 2], dtype='float32') + d = paddle.static.data(name="d", shape=[3, 2], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2) + + fc_1 = fluid.layers.fc(input=result, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + if __name__ == '__main__': unittest.main() From 13f40687e8f8359abf1e5c29c5c17f262d5cab23 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 08:02:42 +0000 Subject: [PATCH 05/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 64 +++++++++++++++++----------- 1 file changed, 40 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 5d86b15148b02..d08bddfe18dc4 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -33,6 +33,14 @@ class MulNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); + PADDLE_ENFORCE_EQ(x_num_col_dims, 1, + platform::errors::InvalidArgument( + "now only support x_num_col_dims == 1: but got %d", + x_num_col_dims)); + PADDLE_ENFORCE_EQ(y_num_col_dims, 1, + platform::errors::InvalidArgument( + "now only support y_num_col_dims == 1: but got %d", + y_num_col_dims)); if (x_num_col_dims == 1 && y_num_col_dims == 1) { if (x->dims().size() == 2 && y->dims().size() == 2) { out->mutable_data(ctx.GetPlace()); @@ -78,19 +86,23 @@ class MulGradNPUKernel : public framework::OpKernel { .stream(); if (x_num_col_dims == 1 && y_num_col_dims == 1) { if (x->dims().size() == 2 && y->dims().size() == 2) { - dx->mutable_data(ctx.GetPlace()); - auto runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - - runner_dx.Run(stream); - - dy->mutable_data(ctx.GetPlace()); - auto runner_dy = - NpuOpRunner("MatMul", {*x, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); - - runner_dy.Run(stream); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + } + + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {*x, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } } else if (x->dims().size() == 3 && y->dims().size() == 2) { // flatten Tensor tmp_flatten(x->type()); @@ -102,20 +114,24 @@ class MulGradNPUKernel : public framework::OpKernel { auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, {}); runner_flatten.Run(stream); // matmul - dx->mutable_data(ctx.GetPlace()); - auto runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, - {{"transpose_x1", false}, {"transpose_x2", true}}); - - runner_dx.Run(stream); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = + NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + + runner_dx.Run(stream); + } // to do shape==2 - dy->mutable_data(ctx.GetPlace()); - auto runner_dy = - NpuOpRunner("MatMul", {tmp_flatten, *dout}, {*dy}, - {{"transpose_x1", true}, {"transpose_x2", false}}); + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {tmp_flatten, *dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); + runner_dy.Run(stream); + } } } } From 03018e49d78fea772fb68cfaaf3442b3a8770985 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:04:19 +0000 Subject: [PATCH 06/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index d08bddfe18dc4..76e0116b69156 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -53,18 +53,22 @@ class MulNPUKernel : public framework::OpKernel { // flatten Tensor tmp_flatten(x->type()); int64_t size = x->dims()[1] * x->dims()[2]; - std::vector vec_flatten; - vec_flatten.push_back(size); - tmp_flatten.Resize(framework::make_ddim(vec_flatten)); - tmp_flatten.mutable_data(ctx.GetPlace()); - auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, {}); - runner_flatten.Run(stream); - out->mutable_data(ctx.GetPlace()); + x->resize(x->dims()[0], size) + // std::vector vec_flatten; + // vec_flatten.push_back(size); + // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); + // tmp_flatten.mutable_data(ctx.GetPlace()); + // auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, + // {}); + // runner_flatten.Run(stream); + out->mutable_data(ctx.GetPlace()); // matmul auto runner_matmul = - NpuOpRunner("MatMul", {tmp_flatten, *y}, {*out}, {}); - runner_matmul.Run(stream); - } + NpuOpRunner("MatMul", x, *y + }, {*out}, {}); + runner_matmul.Run(stream); + } else { + PADDLE_THROW(platform::errors::InvalidArgument("not suppert dims")); } // to do other } From 526502f3cfcda3b68d7808645bf790ae2c63827d Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:17:17 +0000 Subject: [PATCH 07/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 76e0116b69156..0e9372f1439c4 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -63,14 +63,13 @@ class MulNPUKernel : public framework::OpKernel { // runner_flatten.Run(stream); out->mutable_data(ctx.GetPlace()); // matmul - auto runner_matmul = - NpuOpRunner("MatMul", x, *y - }, {*out}, {}); - runner_matmul.Run(stream); - } else { - PADDLE_THROW(platform::errors::InvalidArgument("not suppert dims")); + auto runner_matmul = NpuOpRunner("MatMul", {x, *y}, {*out}, {}); + runner_matmul.Run(stream); + } else { + PADDLE_THROW(platform::errors::InvalidArgument("not suppert dims")); + } + // to do other } - // to do other } }; From dc5bb810d9ef5236690132b39968e5bb64c65176 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:19:15 +0000 Subject: [PATCH 08/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 0e9372f1439c4..13290c95b5770 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -53,15 +53,15 @@ class MulNPUKernel : public framework::OpKernel { // flatten Tensor tmp_flatten(x->type()); int64_t size = x->dims()[1] * x->dims()[2]; - x->resize(x->dims()[0], size) - // std::vector vec_flatten; - // vec_flatten.push_back(size); - // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); - // tmp_flatten.mutable_data(ctx.GetPlace()); - // auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, - // {}); - // runner_flatten.Run(stream); - out->mutable_data(ctx.GetPlace()); + x->resize(x->dims()[0], size); + // std::vector vec_flatten; + // vec_flatten.push_back(size); + // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); + // tmp_flatten.mutable_data(ctx.GetPlace()); + // auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, + // {}); + // runner_flatten.Run(stream); + out->mutable_data(ctx.GetPlace()); // matmul auto runner_matmul = NpuOpRunner("MatMul", {x, *y}, {*out}, {}); runner_matmul.Run(stream); From ef215a14237221df11a031bc3022a9dbe156f827 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:24:19 +0000 Subject: [PATCH 09/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 13290c95b5770..2ea89e6b8ac49 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -53,7 +53,7 @@ class MulNPUKernel : public framework::OpKernel { // flatten Tensor tmp_flatten(x->type()); int64_t size = x->dims()[1] * x->dims()[2]; - x->resize(x->dims()[0], size); + x->Resize(paddle::framework::make_ddim({x->dims()[0], size})); // std::vector vec_flatten; // vec_flatten.push_back(size); // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); From 3baf8be500d70f9438bec9deb7097ae4b8aae892 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:32:05 +0000 Subject: [PATCH 10/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 2ea89e6b8ac49..0febccd7bd706 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -51,9 +51,10 @@ class MulNPUKernel : public framework::OpKernel { runner.Run(stream); } else if (x->dims().size() == 3 && y->dims().size() == 2) { // flatten - Tensor tmp_flatten(x->type()); + // Tensor tmp_flatten(x->type()); int64_t size = x->dims()[1] * x->dims()[2]; - x->Resize(paddle::framework::make_ddim({x->dims()[0], size})); + int64_t f_dim = x->dims()[0]; + x->Resize(paddle::framework::make_ddim({f_dim, size})); // std::vector vec_flatten; // vec_flatten.push_back(size); // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); @@ -63,7 +64,7 @@ class MulNPUKernel : public framework::OpKernel { // runner_flatten.Run(stream); out->mutable_data(ctx.GetPlace()); // matmul - auto runner_matmul = NpuOpRunner("MatMul", {x, *y}, {*out}, {}); + auto runner_matmul = NpuOpRunner("MatMul", {*x, *y}, {*out}, {}); runner_matmul.Run(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument("not suppert dims")); From 8ccc80997c15a8822a7471c32ce13ec02f0f00c0 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:40:09 +0000 Subject: [PATCH 11/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 0febccd7bd706..0ad4f9296d71f 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -52,9 +52,12 @@ class MulNPUKernel : public framework::OpKernel { } else if (x->dims().size() == 3 && y->dims().size() == 2) { // flatten // Tensor tmp_flatten(x->type()); - int64_t size = x->dims()[1] * x->dims()[2]; - int64_t f_dim = x->dims()[0]; - x->Resize(paddle::framework::make_ddim({f_dim, size})); + int64_t sec_dim = x->dims()[1] * x->dims()[2]; + int64_t first_dim = x->dims()[0]; + std::vector vec_dim; + vec_dim.push_back(first_dim); + vec_dim.push_back(sec_dim); + x->Resize(framework::make_ddim(vec_dim)); // std::vector vec_flatten; // vec_flatten.push_back(size); // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); From 7ab7407d16ce5f2e90cb02c8cfd9f935f3667643 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 09:52:24 +0000 Subject: [PATCH 12/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 0ad4f9296d71f..87efa94bac089 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -67,7 +67,9 @@ class MulNPUKernel : public framework::OpKernel { // runner_flatten.Run(stream); out->mutable_data(ctx.GetPlace()); // matmul - auto runner_matmul = NpuOpRunner("MatMul", {*x, *y}, {*out}, {}); + auto runner_matmul = + NpuOpRunner("MatMul", {*x, *y}, {*out}, + {{"transpose_x1", false}, {"transpose_x2", false}}); runner_matmul.Run(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument("not suppert dims")); From 987b54e16b0ee86325bc8793646441d057c90e60 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Fri, 12 Mar 2021 10:19:41 +0000 Subject: [PATCH 13/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 87efa94bac089..bf768b39eccd6 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -50,14 +50,16 @@ class MulNPUKernel : public framework::OpKernel { runner.Run(stream); } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // flatten - // Tensor tmp_flatten(x->type()); + // reshape + Tensor tmp_x(x->type()); int64_t sec_dim = x->dims()[1] * x->dims()[2]; int64_t first_dim = x->dims()[0]; - std::vector vec_dim; - vec_dim.push_back(first_dim); - vec_dim.push_back(sec_dim); - x->Resize(framework::make_ddim(vec_dim)); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); // std::vector vec_flatten; // vec_flatten.push_back(size); // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); @@ -68,7 +70,7 @@ class MulNPUKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); // matmul auto runner_matmul = - NpuOpRunner("MatMul", {*x, *y}, {*out}, + NpuOpRunner("MatMul", {tmp_x, *y}, {*out}, {{"transpose_x1", false}, {"transpose_x2", false}}); runner_matmul.Run(stream); } else { From 85effc75dac07eeccc28923a07ac7e7406da20ef Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 14 Mar 2021 08:02:15 +0000 Subject: [PATCH 14/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 65 ++++++++++++++++--- .../tests/unittests/npu/test_mul_op_npu.py | 51 +++++++++++++++ 2 files changed, 106 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index bf768b39eccd6..5363d0d54d706 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -58,25 +58,70 @@ class MulNPUKernel : public framework::OpKernel { tmp_x.mutable_data(ctx.GetPlace()); framework::TensorCopy( *x, ctx.GetPlace(), - ctx.template device_context(), tmp_x); + ctx.template device_context(), &tmp_x); tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - // std::vector vec_flatten; - // vec_flatten.push_back(size); - // tmp_flatten.Resize(framework::make_ddim(vec_flatten)); - // tmp_flatten.mutable_data(ctx.GetPlace()); - // auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, - // {}); - // runner_flatten.Run(stream); out->mutable_data(ctx.GetPlace()); // matmul - auto runner_matmul = + auto runner = NpuOpRunner("MatMul", {tmp_x, *y}, {*out}, {{"transpose_x1", false}, {"transpose_x2", false}}); - runner_matmul.Run(stream); + runner.Run(stream); } else { PADDLE_THROW(platform::errors::InvalidArgument("not suppert dims")); } // to do other + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] + PADDLE_ENFORCE_EQ(x_num_col_dims, 2, + platform::errors::InvalidArgument( + "now only support x_num_col_dims == 2: but got %d", + x_num_col_dims)); + // flatten => x.shape=[6, 4] + Tensor tmp_matmul(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_matmul.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_matmul.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_matmul.Resize(framework::make_ddim({first_dim, sec_dim})); + + // matmul [6,4] , [4, 5] => [6, 5] + Tensor tmp_x1(x->type()); + tmp_x1.Resize(framework::make_ddim({first_dim, y->dims()[1]})); + tmp_x1.mutable_data(ctx.GetPlace()); + + auto runner_matmul = + NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul}, + {{"transpose_x1", false}, {"transpose_x2", false}}); + + runner_matmul.Run(stream); + // transpose [6, 5] => [5, 6] + Tensor tmp_trans(x->type()); + tmp_trans.Resize(framework::make_ddim({y->dims()[1], first_dim})); + tmp_trans.mutable_data(ctx.GetPlace()); + auto runner_trans = NpuOpRunner("TransposeD", {tmp_matmul}, {tmp_trans}, + {{"perm", {1, 0}}}); + runner_trans.Run(stream); + // reshape [5, 6] => [5, 2, 3] + Tensor tmp_re(x->type()); + int64_t re_first_dim = y->dims()[1]; + int64_t re_sec_dim = x->dims()[0]; + int64_t re_third_dim = x->dims()[1]; + tmp_re.Resize( + framework::make_ddim({re_first_dim, re_sec_dim, re_third_dim})); + tmp_re.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + tmp_trans, ctx.GetPlace(), + ctx.template device_context(), &tmp_re); + tmp_re.Resize( + framework::make_ddim({re_first_dim, re_sec_dim, re_third_dim})); + + // transpose [5, 2, 3] => [2, 3, 5] + auto runner_trans_final = + NpuOpRunner("TransposeD", {tmp_re}, {*out}, {{"perm", {1, 2, 0}}}); + runner_trans_final.Run(stream); } } }; diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index 76af2febf378e..9a82487105a04 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -65,6 +65,57 @@ def init_dtype(self): self.dtype = np.float16 +class TestMul3(TestMul): + """ + case 3 + """ + + def config(self): + self.x_shape = (2, 2, 5) + self.y_shape = (10, 5) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.outputs = { + 'Out': np.dot(self.inputs['X'].reshape(2, 10), self.inputs['Y']) + } + + +class TestMul4(TestMul): + """ + case 4 + """ + + def config(self): + self.x_shape = (2, 3, 4) + self.y_shape = (4, 5) + + def setUp(self): + self.set_npu() + self.op_type = "mul" + self.place = paddle.NPUPlace(0) + self.init_dtype() + self.config() + np.random.seed(SEED) + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype), + 'Y': np.random.random(self.y_shape).astype(self.dtype) + } + self.attrs = {"x_num_col_dims": 2} + self.outputs = { + 'Out': np.dot(self.inputs['X'].reshape(6, 4), self.inputs['Y']) + } + + @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") class TestMulNet(unittest.TestCase): From 9fac20ac7c38bce8a09df93f73d54efe3962ae8e Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Sun, 14 Mar 2021 08:09:21 +0000 Subject: [PATCH 15/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 5363d0d54d706..3e61bf4644b5d 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -77,20 +77,20 @@ class MulNPUKernel : public framework::OpKernel { "now only support x_num_col_dims == 2: but got %d", x_num_col_dims)); // flatten => x.shape=[6, 4] - Tensor tmp_matmul(x->type()); + Tensor tmp_x(x->type()); int64_t first_dim = x->dims()[0] * x->dims()[1]; int64_t sec_dim = x->dims()[2]; - tmp_matmul.Resize(framework::make_ddim({first_dim, sec_dim})); - tmp_matmul.mutable_data(ctx.GetPlace()); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); framework::TensorCopy( *x, ctx.GetPlace(), ctx.template device_context(), &tmp_x); - tmp_matmul.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); // matmul [6,4] , [4, 5] => [6, 5] - Tensor tmp_x1(x->type()); - tmp_x1.Resize(framework::make_ddim({first_dim, y->dims()[1]})); - tmp_x1.mutable_data(ctx.GetPlace()); + Tensor tmp_matmul(x->type()); + tmp_matmul.Resize(framework::make_ddim({first_dim, y->dims()[1]})); + tmp_matmul.mutable_data(ctx.GetPlace()); auto runner_matmul = NpuOpRunner("MatMul", {tmp_x, *y}, {tmp_matmul}, @@ -101,8 +101,9 @@ class MulNPUKernel : public framework::OpKernel { Tensor tmp_trans(x->type()); tmp_trans.Resize(framework::make_ddim({y->dims()[1], first_dim})); tmp_trans.mutable_data(ctx.GetPlace()); + std::vector vec_trans = {1, 0}; auto runner_trans = NpuOpRunner("TransposeD", {tmp_matmul}, {tmp_trans}, - {{"perm", {1, 0}}}); + {{"perm", vec_trans}}); runner_trans.Run(stream); // reshape [5, 6] => [5, 2, 3] Tensor tmp_re(x->type()); @@ -119,8 +120,9 @@ class MulNPUKernel : public framework::OpKernel { framework::make_ddim({re_first_dim, re_sec_dim, re_third_dim})); // transpose [5, 2, 3] => [2, 3, 5] - auto runner_trans_final = - NpuOpRunner("TransposeD", {tmp_re}, {*out}, {{"perm", {1, 2, 0}}}); + std::vector vec_trans_final = {1, 2, 0}; + auto runner_trans_final = NpuOpRunner("TransposeD", {tmp_re}, {*out}, + {{"perm", vec_trans_final}}); runner_trans_final.Run(stream); } } From 159e7fb6ee89d07c082e370a306942121fa66d2f Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 05:20:11 +0000 Subject: [PATCH 16/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 8 -------- 1 file changed, 8 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 3e61bf4644b5d..d434752a2f7b9 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -33,14 +33,6 @@ class MulNPUKernel : public framework::OpKernel { auto stream = ctx.template device_context() .stream(); - PADDLE_ENFORCE_EQ(x_num_col_dims, 1, - platform::errors::InvalidArgument( - "now only support x_num_col_dims == 1: but got %d", - x_num_col_dims)); - PADDLE_ENFORCE_EQ(y_num_col_dims, 1, - platform::errors::InvalidArgument( - "now only support y_num_col_dims == 1: but got %d", - y_num_col_dims)); if (x_num_col_dims == 1 && y_num_col_dims == 1) { if (x->dims().size() == 2 && y->dims().size() == 2) { out->mutable_data(ctx.GetPlace()); From 73af19d4356c20f2ad00ed3d2eaeed6e83926311 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 06:03:15 +0000 Subject: [PATCH 17/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 1 + python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index d434752a2f7b9..aab04306a93e0 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -112,6 +112,7 @@ class MulNPUKernel : public framework::OpKernel { framework::make_ddim({re_first_dim, re_sec_dim, re_third_dim})); // transpose [5, 2, 3] => [2, 3, 5] + out->mutable_data(ctx.GetPlace()); std::vector vec_trans_final = {1, 2, 0}; auto runner_trans_final = NpuOpRunner("TransposeD", {tmp_re}, {*out}, {{"perm", vec_trans_final}}); diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index 9a82487105a04..a62ac41ca51fb 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -111,9 +111,7 @@ def setUp(self): 'Y': np.random.random(self.y_shape).astype(self.dtype) } self.attrs = {"x_num_col_dims": 2} - self.outputs = { - 'Out': np.dot(self.inputs['X'].reshape(6, 4), self.inputs['Y']) - } + self.outputs = {'Out': np.matmul(self.inputs['X'], self.inputs['Y'])} @unittest.skipIf(not paddle.is_compiled_with_npu(), From a355422b95d25fa36b120391b5d72fc868cf5ecf Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 07:31:50 +0000 Subject: [PATCH 18/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 23 +++---- .../tests/unittests/npu/test_mul_op_npu.py | 63 +++++++++++++++++++ 2 files changed, 75 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index aab04306a93e0..0c67337a73381 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -155,30 +155,31 @@ class MulGradNPUKernel : public framework::OpKernel { runner_dy.Run(stream); } } else if (x->dims().size() == 3 && y->dims().size() == 2) { - // flatten - Tensor tmp_flatten(x->type()); - int64_t size = x->dims()[1] * x->dims()[2]; - std::vector vec_flatten; - vec_flatten.push_back(size); - tmp_flatten.Resize(framework::make_ddim(vec_flatten)); - tmp_flatten.mutable_data(ctx.GetPlace()); - auto runner_flatten = NpuOpRunner("Flatten", {*x}, {tmp_flatten}, {}); - runner_flatten.Run(stream); + // flatten => x.shape=[6, 4] // matmul if (dx) { + // to do : why dout.dims=2 dx->mutable_data(ctx.GetPlace()); auto runner_dx = NpuOpRunner("MatMul", {*dout, *y}, {*dx}, {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_dx.Run(stream); } // to do shape==2 if (dy) { + Tensor tmp_x(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); dy->mutable_data(ctx.GetPlace()); auto runner_dy = - NpuOpRunner("MatMul", {tmp_flatten, *dout}, {*dy}, + NpuOpRunner("MatMul", {tmp_x, *dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); runner_dy.Run(stream); diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index a62ac41ca51fb..aa2489837c5aa 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -183,5 +183,68 @@ def test_npu(self): self.assertTrue(np.allclose(npu_loss, cpu_loss)) +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet3_2(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3, 4)).astype('float32') + b_np = np.random.random(size=(2, 3, 4)).astype('float32') + c_np = np.random.random(size=(4, 5)).astype('float32') + d_np = np.random.random(size=(4, 5)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32') + c = paddle.static.data(name="c", shape=[4, 5], dtype='float32') + d = paddle.static.data(name="d", shape=[4, 5], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2) + result_re = paddle.reshape(2, 15) + + fc_1 = fluid.layers.fc(input=result_re, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + if __name__ == '__main__': unittest.main() From 506e46ab7812463ddcadeb6da787631eb50f5446 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 09:59:03 +0000 Subject: [PATCH 19/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 81 +++++++++++++++++-- .../tests/unittests/npu/test_mul_op_npu.py | 80 +++++++++++++++++- 2 files changed, 153 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 0c67337a73381..3a6182937c9ec 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -158,16 +158,24 @@ class MulGradNPUKernel : public framework::OpKernel { // flatten => x.shape=[6, 4] // matmul if (dx) { - // to do : why dout.dims=2 - dx->mutable_data(ctx.GetPlace()); - auto runner_dx = - NpuOpRunner("MatMul", {*dout, *y}, {*dx}, + // to do : why dout.dims!=2 + // matmul [2, 5] * [12, 5] => [2, 12] + Tensor tmp_matmul(y->type()); + tmp_matmul.Resize(framework::make_ddim({dout->dims[0], y->dims[1]})); + tmp_matmul.mutable_data(ctx.GetPlace()); + auto runner_matmul = + NpuOpRunner("MatMul", {*dout, *y}, {tmp_matmul}, {{"transpose_x1", false}, {"transpose_x2", true}}); - runner_dx.Run(stream); + runner_matmul.Run(stream); + // reshape [2, 12] => [2, 3, 4] + dx->mutable_data(ctx.GetPlace(), x->type()); + framework::TensorCopy( + tmp_matmul, ctx.GetPlace(), + ctx.template device_context(), dx); } - // to do shape==2 if (dy) { + // flatten Tensor tmp_x(x->type()); int64_t first_dim = x->dims()[0] * x->dims()[1]; int64_t sec_dim = x->dims()[2]; @@ -185,6 +193,67 @@ class MulGradNPUKernel : public framework::OpKernel { runner_dy.Run(stream); } } + } else if (x->dims().size() == 3 && y->dims().size() == 2) { + // for example: x.shape=[2, 3, 4] y.shape=[4, 5], expect [2, 3, 5] + PADDLE_ENFORCE_EQ(x_num_col_dims, 2, + platform::errors::InvalidArgument( + "now only support x_num_col_dims == 2: but got %d", + x_num_col_dims)); + // flatten => x.shape=[6, 4] + Tensor tmp_x(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + // flatten dout + Tensor tmp_dout(x->type()); + int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; + int64_t dout_sec_dim = dout->dims()[2]; + tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); + tmp_dout.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *dout, ctx.GetPlace(), + ctx.template device_context(), &tmp_dout); + tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); + + // unsqueeze => x.shape=[1, 4, 5] + Tensor tmp_y(y->type()); + int64_t first_dim_y = 1; + int64_t sec_dim_y = y->dims()[0]; + int64_t third_dim_y = y->dims()[1]; + tmp_y.Resize(framework::make_ddim({first_dim_y, sec_dim_y, third_dim_y})); + tmp_y.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *y, ctx.GetPlace(), + ctx.template device_context(), &tmp_y); + tmp_y.Resize(framework::make_ddim({first_dim_y, sec_dim_y, third_dim_y})); + // TileWithAxis + Tensor tmp_tile(x->type()); + tmp_tile.Resize( + framework::make_ddim({dout->dims()[0], y->dims()[0], y->dims()[1]})); + tmp_tile.mutable_data(ctx.GetPlace()); + auto runner_tile = NpuOpRunner("TileWithAxis", {tmp_y}, {tmp_tile}, + {{"axis", 0}, {"tiles", dout->dims()[0]}}); + runner_tile.Run(stream); + if (dx) { + dx->mutable_data(ctx.GetPlace()); + auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, tmp_tile}, {*dx}, + {{"adj_x1", false}, {"adj_x2", true}}); + + runner_dx.Run(stream); + } + if (dy) { + dy->mutable_data(ctx.GetPlace()); + auto runner_dy = + NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, + {{"transpose_x1", true}, {"transpose_x2", false}}); + + runner_dy.Run(stream); + } } } }; diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index aa2489837c5aa..b1f4760b3544c 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -186,6 +186,75 @@ def test_npu(self): @unittest.skipIf(not paddle.is_compiled_with_npu(), "core is not compiled with NPU") class TestMulNet3_2(unittest.TestCase): + def _test(self, run_npu=True): + main_prog = paddle.static.Program() + startup_prog = paddle.static.Program() + main_prog.random_seed = SEED + startup_prog.random_seed = SEED + np.random.seed(SEED) + + a_np = np.random.random(size=(2, 3, 4)).astype('float32') + b_np = np.random.random(size=(2, 3, 4)).astype('float32') + c_np = np.random.random(size=(12, 5)).astype('float32') + d_np = np.random.random(size=(12, 5)).astype('float32') + label_np = np.random.randint(2, size=(2, 1)).astype('int64') + + with paddle.static.program_guard(main_prog, startup_prog): + a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32') + b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32') + c = paddle.static.data(name="c", shape=[4, 5], dtype='float32') + d = paddle.static.data(name="d", shape=[4, 5], dtype='float32') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64') + + sum_1 = paddle.add(a, b) + sum_2 = paddle.add(c, d) + result = paddle.fluid.layers.mul(sum_1, sum_2) + + fc_1 = fluid.layers.fc(input=result_re, size=8) + prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') + + cost = fluid.layers.cross_entropy(input=prediction, label=label) + loss = fluid.layers.reduce_mean(cost) + sgd = fluid.optimizer.SGD(learning_rate=0.01) + sgd.minimize(loss) + + if run_npu: + place = paddle.NPUPlace(0) + else: + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + exe.run(startup_prog) + + print("Start run on {}".format(place)) + for epoch in range(100): + + pred_res, loss_res = exe.run(main_prog, + feed={ + "a": a_np, + "b": b_np, + "c": c_np, + "d": d_np, + "label": label_np + }, + fetch_list=[prediction, loss]) + if epoch % 10 == 0: + print("Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res)) + + return pred_res, loss_res + + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + + +@unittest.skipIf(not paddle.is_compiled_with_npu(), + "core is not compiled with NPU") +class TestMulNet3_2_xc2(unittest.TestCase): def _test(self, run_npu=True): main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -209,8 +278,8 @@ def _test(self, run_npu=True): sum_1 = paddle.add(a, b) sum_2 = paddle.add(c, d) - result = paddle.fluid.layers.mul(sum_1, sum_2) - result_re = paddle.reshape(2, 15) + result = paddle.fluid.layers.mul(sum_1, sum_2, x_num_col_dims=2) + result_re = paddle.reshape(result, shape=[2, 15]) fc_1 = fluid.layers.fc(input=result_re, size=8) prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') @@ -245,6 +314,13 @@ def _test(self, run_npu=True): return pred_res, loss_res + def test_npu(self): + cpu_pred, cpu_loss = self._test(False) + npu_pred, npu_loss = self._test(True) + + self.assertTrue(np.allclose(npu_pred, cpu_pred)) + self.assertTrue(np.allclose(npu_loss, cpu_loss)) + if __name__ == '__main__': unittest.main() From b130c7b9092b237b8c05f15188e1295a123d0208 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 11:11:03 +0000 Subject: [PATCH 20/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 3a6182937c9ec..3ecc462601022 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -161,7 +161,8 @@ class MulGradNPUKernel : public framework::OpKernel { // to do : why dout.dims!=2 // matmul [2, 5] * [12, 5] => [2, 12] Tensor tmp_matmul(y->type()); - tmp_matmul.Resize(framework::make_ddim({dout->dims[0], y->dims[1]})); + tmp_matmul.Resize( + framework::make_ddim({dout->dims()[0], y->dims()[1]})); tmp_matmul.mutable_data(ctx.GetPlace()); auto runner_matmul = NpuOpRunner("MatMul", {*dout, *y}, {tmp_matmul}, From 90218869a83a39aec55c2a89084d136086198a0d Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 11:15:49 +0000 Subject: [PATCH 21/28] fix --- .../fluid/tests/unittests/npu/test_mul_op_npu.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index b1f4760b3544c..7457142b39d65 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -157,7 +157,7 @@ def _test(self, run_npu=True): exe = paddle.static.Executor(place) exe.run(startup_prog) - print("Start run on {}".format(place)) + print("TestMulNet Start run on {} . ".format(place)) for epoch in range(100): pred_res, loss_res = exe.run(main_prog, @@ -202,8 +202,8 @@ def _test(self, run_npu=True): with paddle.static.program_guard(main_prog, startup_prog): a = paddle.static.data(name="a", shape=[2, 3, 4], dtype='float32') b = paddle.static.data(name="b", shape=[2, 3, 4], dtype='float32') - c = paddle.static.data(name="c", shape=[4, 5], dtype='float32') - d = paddle.static.data(name="d", shape=[4, 5], dtype='float32') + c = paddle.static.data(name="c", shape=[12, 5], dtype='float32') + d = paddle.static.data(name="d", shape=[12, 5], dtype='float32') label = paddle.static.data( name="label", shape=[2, 1], dtype='int64') @@ -226,7 +226,7 @@ def _test(self, run_npu=True): exe = paddle.static.Executor(place) exe.run(startup_prog) - print("Start run on {}".format(place)) + print("testMulNet3_2 tart run on {}".format(place)) for epoch in range(100): pred_res, loss_res = exe.run(main_prog, @@ -296,7 +296,7 @@ def _test(self, run_npu=True): exe = paddle.static.Executor(place) exe.run(startup_prog) - print("Start run on {}".format(place)) + print("TestMulNet3_2_xc2. Start run on {}".format(place)) for epoch in range(100): pred_res, loss_res = exe.run(main_prog, From e4a6a1d7b54f8c80de13ed6068cd23e01bd81391 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 11:18:21 +0000 Subject: [PATCH 22/28] fix --- python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py index 7457142b39d65..e65a3dac73928 100644 --- a/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py +++ b/python/paddle/fluid/tests/unittests/npu/test_mul_op_npu.py @@ -211,7 +211,7 @@ def _test(self, run_npu=True): sum_2 = paddle.add(c, d) result = paddle.fluid.layers.mul(sum_1, sum_2) - fc_1 = fluid.layers.fc(input=result_re, size=8) + fc_1 = fluid.layers.fc(input=result, size=8) prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax') cost = fluid.layers.cross_entropy(input=prediction, label=label) From 63a28cfbe81ad9909554320d44a8dce92aae3822 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 11:28:08 +0000 Subject: [PATCH 23/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 3ecc462601022..5b740830e99d0 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -162,7 +162,7 @@ class MulGradNPUKernel : public framework::OpKernel { // matmul [2, 5] * [12, 5] => [2, 12] Tensor tmp_matmul(y->type()); tmp_matmul.Resize( - framework::make_ddim({dout->dims()[0], y->dims()[1]})); + framework::make_ddim({dout->dims()[0], y->dims()[0]})); tmp_matmul.mutable_data(ctx.GetPlace()); auto runner_matmul = NpuOpRunner("MatMul", {*dout, *y}, {tmp_matmul}, From b4d87c8976351f07a78786c51e37db2e18295b4c Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Mon, 15 Mar 2021 11:41:02 +0000 Subject: [PATCH 24/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 5b740830e99d0..836168f734ee6 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -178,8 +178,8 @@ class MulGradNPUKernel : public framework::OpKernel { if (dy) { // flatten Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; + int64_t sec_dim = x->dims()[1] * x->dims()[2]; + int64_t first_dim = x->dims()[0]; tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); tmp_x.mutable_data(ctx.GetPlace()); framework::TensorCopy( From 5908a0875f542e9dd8c4b253e0e20cff78c18c89 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Tue, 16 Mar 2021 04:27:05 +0000 Subject: [PATCH 25/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 91 +++++++++------------------- 1 file changed, 29 insertions(+), 62 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 836168f734ee6..fd99b83ae9121 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -89,34 +89,11 @@ class MulNPUKernel : public framework::OpKernel { {{"transpose_x1", false}, {"transpose_x2", false}}); runner_matmul.Run(stream); - // transpose [6, 5] => [5, 6] - Tensor tmp_trans(x->type()); - tmp_trans.Resize(framework::make_ddim({y->dims()[1], first_dim})); - tmp_trans.mutable_data(ctx.GetPlace()); - std::vector vec_trans = {1, 0}; - auto runner_trans = NpuOpRunner("TransposeD", {tmp_matmul}, {tmp_trans}, - {{"perm", vec_trans}}); - runner_trans.Run(stream); - // reshape [5, 6] => [5, 2, 3] - Tensor tmp_re(x->type()); - int64_t re_first_dim = y->dims()[1]; - int64_t re_sec_dim = x->dims()[0]; - int64_t re_third_dim = x->dims()[1]; - tmp_re.Resize( - framework::make_ddim({re_first_dim, re_sec_dim, re_third_dim})); - tmp_re.mutable_data(ctx.GetPlace()); + // reshape [6, 5] => [2, 3, 5] + out->mutable_data(ctx.GetPlace(), out->type()); framework::TensorCopy( - tmp_trans, ctx.GetPlace(), - ctx.template device_context(), &tmp_re); - tmp_re.Resize( - framework::make_ddim({re_first_dim, re_sec_dim, re_third_dim})); - - // transpose [5, 2, 3] => [2, 3, 5] - out->mutable_data(ctx.GetPlace()); - std::vector vec_trans_final = {1, 2, 0}; - auto runner_trans_final = NpuOpRunner("TransposeD", {tmp_re}, {*out}, - {{"perm", vec_trans_final}}); - runner_trans_final.Run(stream); + tmp_matmul, ctx.GetPlace(), + ctx.template device_context(), out); } } }; @@ -200,17 +177,7 @@ class MulGradNPUKernel : public framework::OpKernel { platform::errors::InvalidArgument( "now only support x_num_col_dims == 2: but got %d", x_num_col_dims)); - // flatten => x.shape=[6, 4] - Tensor tmp_x(x->type()); - int64_t first_dim = x->dims()[0] * x->dims()[1]; - int64_t sec_dim = x->dims()[2]; - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - tmp_x.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *x, ctx.GetPlace(), - ctx.template device_context(), &tmp_x); - tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); - // flatten dout + // tmp_dout both used by dx and dy Tensor tmp_dout(x->type()); int64_t dout_first_dim = dout->dims()[0] * dout->dims()[1]; int64_t dout_sec_dim = dout->dims()[2]; @@ -221,33 +188,33 @@ class MulGradNPUKernel : public framework::OpKernel { ctx.template device_context(), &tmp_dout); tmp_dout.Resize(framework::make_ddim({dout_first_dim, dout_sec_dim})); - // unsqueeze => x.shape=[1, 4, 5] - Tensor tmp_y(y->type()); - int64_t first_dim_y = 1; - int64_t sec_dim_y = y->dims()[0]; - int64_t third_dim_y = y->dims()[1]; - tmp_y.Resize(framework::make_ddim({first_dim_y, sec_dim_y, third_dim_y})); - tmp_y.mutable_data(ctx.GetPlace()); - framework::TensorCopy( - *y, ctx.GetPlace(), - ctx.template device_context(), &tmp_y); - tmp_y.Resize(framework::make_ddim({first_dim_y, sec_dim_y, third_dim_y})); - // TileWithAxis - Tensor tmp_tile(x->type()); - tmp_tile.Resize( - framework::make_ddim({dout->dims()[0], y->dims()[0], y->dims()[1]})); - tmp_tile.mutable_data(ctx.GetPlace()); - auto runner_tile = NpuOpRunner("TileWithAxis", {tmp_y}, {tmp_tile}, - {{"axis", 0}, {"tiles", dout->dims()[0]}}); - runner_tile.Run(stream); if (dx) { - dx->mutable_data(ctx.GetPlace()); - auto runner_dx = NpuOpRunner("BatchMatMul", {*dout, tmp_tile}, {*dx}, - {{"adj_x1", false}, {"adj_x2", true}}); - - runner_dx.Run(stream); + // tmp_dout * y [6,5] * [4,5] => [6, 4] + Tensor tmp_matmul(y->type()); + tmp_matmul.Resize(framework::make_ddim({dout_first_dim, y->dims()[0]})); + tmp_matmul.mutable_data(ctx.GetPlace()); + auto runner_matmul = + NpuOpRunner("MatMul", {tmp_dout, *y}, {tmp_matmul}, + {{"transpose_x1", false}, {"transpose_x2", true}}); + runner_matmul.Run(stream); + // reshape [6,4] => [2, 3, 4] + dx->mutable_data(ctx.GetPlace(), x->type()); + framework::TensorCopy( + tmp_matmul, ctx.GetPlace(), + ctx.template device_context(), dx); } if (dy) { + // flatten => x.shape=[6, 4] + Tensor tmp_x(x->type()); + int64_t first_dim = x->dims()[0] * x->dims()[1]; + int64_t sec_dim = x->dims()[2]; + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + tmp_x.mutable_data(ctx.GetPlace()); + framework::TensorCopy( + *x, ctx.GetPlace(), + ctx.template device_context(), &tmp_x); + tmp_x.Resize(framework::make_ddim({first_dim, sec_dim})); + // mamtul [6,4] [6,5] =>[4,5] dy->mutable_data(ctx.GetPlace()); auto runner_dy = NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, From 3fdc1d8c98fa116ae22e32745f5da273d1148b1e Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Tue, 16 Mar 2021 05:11:08 +0000 Subject: [PATCH 26/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index fd99b83ae9121..7bd2508154713 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -135,7 +135,6 @@ class MulGradNPUKernel : public framework::OpKernel { // flatten => x.shape=[6, 4] // matmul if (dx) { - // to do : why dout.dims!=2 // matmul [2, 5] * [12, 5] => [2, 12] Tensor tmp_matmul(y->type()); tmp_matmul.Resize( @@ -219,7 +218,6 @@ class MulGradNPUKernel : public framework::OpKernel { auto runner_dy = NpuOpRunner("MatMul", {tmp_x, tmp_dout}, {*dy}, {{"transpose_x1", true}, {"transpose_x2", false}}); - runner_dy.Run(stream); } } From bccb99a017e332becce6e1b3623ada4531bc8288 Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Tue, 16 Mar 2021 05:41:06 +0000 Subject: [PATCH 27/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index 7bd2508154713..a5f5034cd7402 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -90,7 +90,9 @@ class MulNPUKernel : public framework::OpKernel { runner_matmul.Run(stream); // reshape [6, 5] => [2, 3, 5] - out->mutable_data(ctx.GetPlace(), out->type()); + out.Resize( + framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); + out->mutable_data(ctx.GetPlace(), x->type()); framework::TensorCopy( tmp_matmul, ctx.GetPlace(), ctx.template device_context(), out); From c95312c557b71b87c74aae23c9dc540eff63baaf Mon Sep 17 00:00:00 2001 From: frankwhzhang Date: Tue, 16 Mar 2021 05:52:22 +0000 Subject: [PATCH 28/28] fix --- paddle/fluid/operators/mul_op_npu.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/mul_op_npu.cc b/paddle/fluid/operators/mul_op_npu.cc index a5f5034cd7402..cf057cc339c62 100644 --- a/paddle/fluid/operators/mul_op_npu.cc +++ b/paddle/fluid/operators/mul_op_npu.cc @@ -90,12 +90,14 @@ class MulNPUKernel : public framework::OpKernel { runner_matmul.Run(stream); // reshape [6, 5] => [2, 3, 5] - out.Resize( + (*out).Resize( framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); out->mutable_data(ctx.GetPlace(), x->type()); framework::TensorCopy( tmp_matmul, ctx.GetPlace(), ctx.template device_context(), out); + (*out).Resize( + framework::make_ddim({x->dims()[0], x->dims()[1], y->dims()[1]})); } } }; @@ -205,7 +207,7 @@ class MulGradNPUKernel : public framework::OpKernel { ctx.template device_context(), dx); } if (dy) { - // flatten => x.shape=[6, 4] + // flatten x.shape [2,3,4] => [6, 4] Tensor tmp_x(x->type()); int64_t first_dim = x->dims()[0] * x->dims()[1]; int64_t sec_dim = x->dims()[2];