add unittest for new matmul_v2 kernel

PaddlePaddle · Oct 28, 2021 · 7758f14 · 7758f14 · paddle-bot-old · Oct 28, 2021
1 parent 0fb60d0
commit 7758f14
Show file tree

Hide file tree

Showing 7 changed files with 178 additions and 11 deletions.
diff --git a/paddle/fluid/operators/matmul_v2_op.h b/paddle/fluid/operators/matmul_v2_op.h
@@ -25,6 +25,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/complex_functors.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
 
+// Only headers in the paddle/pten/api dirs can be included here
+#include "paddle/pten/api/include/core.h"
+#include "paddle/pten/api/include/linalg.h"
+#include "paddle/pten/hapi/lib/utils/tensor_utils.h"
+
 #if defined(__NVCC__) || defined(__HIPCC__)
 #include "paddle/fluid/operators/reduce_ops/cub_reduce.h"
 #endif
@@ -380,15 +385,17 @@ class MatMulV2Kernel : public framework::OpKernel<T> {
     auto* Out = ctx.Output<Tensor>("Out");
     bool trans_x = ctx.Attr<bool>("trans_x");
     bool trans_y = ctx.Attr<bool>("trans_y");
-    PADDLE_ENFORCE_NE(framework::product(X->dims()), 0,
-                      platform::errors::InvalidArgument(
-                          "The Input(X) dims size must not be equal 0,"
-                          " but reviced dims size is 0. "));
-    PADDLE_ENFORCE_NE(framework::product(Y->dims()), 0,
-                      platform::errors::InvalidArgument(
-                          "The Input(Y) dims size must not be equal 0,"
-                          " but reviced dims size is 0. "));
-    MatMulFunction<DeviceContext, T>(X, Y, Out, trans_x, trans_y, ctx);
+
+    auto& dev_ctx = ctx.device_context<DeviceContext>();
+    Out->mutable_data<T>(X->place());
+
+    auto pt_x = paddle::experimental::MakePtenDenseTensor(*X);
+    auto pt_y = paddle::experimental::MakePtenDenseTensor(*Y);
+    auto pt_out = paddle::experimental::MakePtenDenseTensor(*Out);
+
+    // call new kernel
+    pten::Matmul<T>(dev_ctx, *pt_x.get(), *pt_y.get(), trans_x, trans_y,
+                    pt_out.get());
   }
 };
 

diff --git a/paddle/pten/hapi/lib/linalg.cc b/paddle/pten/hapi/lib/linalg.cc
@@ -83,6 +83,8 @@ Tensor matmul(const Tensor& x,
   auto dense_y = std::dynamic_pointer_cast<pten::DenseTensor>(y.impl());
   kernel_context.EmplaceBackInput(dense_x);
   kernel_context.EmplaceBackInput(dense_y);
+  kernel_context.EmplaceBackAttr(transpose_x);
+  kernel_context.EmplaceBackAttr(transpose_y);
   // TODO(chenweihang): add transform impl
 
   // 4. InferShape

diff --git a/paddle/pten/kernels/cuda/linalg.cu b/paddle/pten/kernels/cuda/linalg.cu
@@ -56,6 +56,7 @@ void Matmul(const CUDAContext& dev_ctx,
 
 PT_REGISTER_MODULE(LinalgCUDA);
 
+using float16 = paddle::platform::float16;
 using complex64 = ::paddle::platform::complex<float>;
 using complex128 = ::paddle::platform::complex<double>;
 
@@ -76,5 +77,6 @@ PT_REGISTER_KERNEL("matmul_v2",
                    pten::Matmul,
                    float,
                    double,
+                   float16,
                    complex64,
                    complex128) {}
diff --git a/paddle/pten/kernels/functions/math/matmul_func.h b/paddle/pten/kernels/functions/math/matmul_func.h
@@ -328,7 +328,6 @@ void MatMulFunction(const DeviceContext& dev_ctx,
                        x_broadcast_dims.data(),
                        y_broadcast_dims.data(),
                        out_broadcast_dims.data());
-
   out_broadcast_dims[ndim - 2] = M;
   out_broadcast_dims[ndim - 1] = N;
 

diff --git a/paddle/pten/tests/CMakeLists.txt b/paddle/pten/tests/CMakeLists.txt
@@ -5,6 +5,7 @@ cc_test(dense_tensor_test SRCS dense_tensor_test.cc DEPS dense_tensor)
 cc_test(kernel_factory_test SRCS kernel_factory_test.cc DEPS kernel_factory)
 cc_test(test_mean_api SRCS test_mean_api.cc DEPS math_api pten_hapi_utils)
 cc_test(test_dot_api SRCS test_dot_api.cc DEPS linalg_api pten_hapi_utils)
+cc_test(test_matmul_api SRCS test_matmul_api.cc DEPS linalg_api pten_hapi_utils)
 cc_test(test_fill_api SRCS test_fill_api.cc DEPS creation_api pten_hapi_utils)
 cc_test(test_copy_api SRCS test_copy_api.cc DEPS utils_cpu pten_hapi_utils)
 cc_test(test_flatten_api SRCS test_flatten_api.cc DEPS utils_cpu manipulation_api pten_hapi_utils)
diff --git a/paddle/pten/tests/test_fill_api.cc b/paddle/pten/tests/test_fill_api.cc
@@ -32,7 +32,6 @@ using DDim = paddle::framework::DDim;
 
 // TODO(chenweihang): Remove this test after the API is used in the dygraph
 TEST(API, full_like) {
-  // 1. create tensor
   // 1. create tensor
   const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
       paddle::platform::CPUPlace());

diff --git a/paddle/pten/tests/test_matmul_api.cc b/paddle/pten/tests/test_matmul_api.cc
@@ -0,0 +1,157 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <memory>
+
+#include "paddle/pten/hapi/include/linalg.h"
+
+#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
+#include "paddle/pten/hapi/lib/utils/allocator.h"
+#include "paddle/pten/kernels/cuda/utils.h"
+
+PT_DECLARE_MODULE(LinalgCPU);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PT_DECLARE_MODULE(LinalgCUDA);
+#endif
+
+namespace framework = paddle::framework;
+using DDim = paddle::framework::DDim;
+
+// CPU smoke test for paddle::experimental::matmul: multiplies a 3x3 tensor of
+// ones by a 3x3 tensor of twos (no transposition) and expects a 3x3
+// FLOAT32/NCHW result whose entries are all 6.
+TEST(API, matmul_cpu) {
+  // 1. create the CPU input tensors
+  const auto cpu_alloc =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          paddle::platform::CPUPlace());
+
+  auto lhs = std::make_shared<pten::DenseTensor>(
+      cpu_alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+  auto* lhs_data = lhs->mutable_data<float>();
+
+  auto rhs = std::make_shared<pten::DenseTensor>(
+      cpu_alloc,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+  auto* rhs_data = rhs->mutable_data<float>();
+
+  // Fill lhs with 1 and rhs with 2, so each output entry is 3 * (1 * 2) = 6.
+  const size_t numel = 9;
+  for (size_t idx = 0; idx != numel; ++idx) {
+    lhs_data[idx] = 1.0;
+    rhs_data[idx] = 2.0;
+  }
+  const std::vector<float> expected(numel, 6.0);
+
+  paddle::experimental::Tensor x(lhs);
+  paddle::experimental::Tensor y(rhs);
+
+  // 2. run the API under test
+  auto result = paddle::experimental::matmul(x, y, false, false);
+
+  // 3. verify the output metadata first, then the values
+  ASSERT_EQ(result.shape().size(), 2);
+  ASSERT_EQ(result.shape()[0], 3);
+  ASSERT_EQ(result.shape()[1], 3);
+  ASSERT_EQ(result.numel(), 9);
+  ASSERT_EQ(result.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(result.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(result.initialized(), true);
+
+  auto result_impl =
+      std::dynamic_pointer_cast<pten::DenseTensor>(result.impl());
+  const float* result_data = result_impl->data<float>();
+
+  for (size_t idx = 0; idx != numel; ++idx) {
+    ASSERT_NEAR(expected[idx], result_data[idx], 1e-6f);
+  }
+}
+
+// The CUDA test needs a GPU place, allocator and device context, so it is
+// compiled only under the same condition that guards
+// PT_DECLARE_MODULE(LinalgCUDA) in this file.
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+// GPU smoke test for paddle::experimental::matmul: the inputs are staged on
+// the CPU, copied into CUDA-allocated tensors, multiplied (no transposition),
+// and the result is copied back to a CPU tensor for comparison. With x filled
+// with 1 and y filled with 2, every entry of the 3x3 output must be 6.
+TEST(API, matmul_cuda) {
+  // Prepare CPU Dense Tensor
+  const auto alloc_cpu =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          paddle::platform::CPUPlace());
+  auto ref_x = std::make_shared<pten::DenseTensor>(
+      alloc_cpu,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto* ref_x_data = ref_x->mutable_data<float>();
+
+  auto ref_y = std::make_shared<pten::DenseTensor>(
+      alloc_cpu,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+  auto* ref_y_data = ref_y->mutable_data<float>();
+
+  for (size_t i = 0; i < 9; ++i) {
+    ref_x_data[i] = 1.0;
+    ref_y_data[i] = 2.0;
+  }
+  // Expected value of every output entry: 3 * (1 * 2).
+  std::vector<float> sum(9, 6.0);
+
+  // 1. create tensor
+  const auto alloc_cuda =
+      std::make_shared<paddle::experimental::DefaultAllocator>(
+          paddle::platform::CUDAPlace());
+  auto dense_x = std::make_shared<pten::DenseTensor>(
+      alloc_cuda,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto dense_y = std::make_shared<pten::DenseTensor>(
+      alloc_cuda,
+      pten::DenseTensorMeta(pten::DataType::FLOAT32,
+                            framework::make_ddim({3, 3}),
+                            pten::DataLayout::NCHW));
+
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto place = paddle::platform::CUDAPlace();
+  auto* dev_ctx = pool.GetByPlace(place);
+
+  // Copy the CPU-staged inputs into the CUDA-allocated tensors.
+  pten::Copy(*dev_ctx, *ref_x.get(), dense_x.get());
+  pten::Copy(*dev_ctx, *ref_y.get(), dense_y.get());
+
+  paddle::experimental::Tensor x(dense_x);
+  paddle::experimental::Tensor y(dense_y);
+
+  // 2. test API
+  auto out = paddle::experimental::matmul(x, y, false, false);
+
+  // 3. check result
+  ASSERT_EQ(out.shape().size(), 2);
+  ASSERT_EQ(out.shape()[0], 3);
+  ASSERT_EQ(out.shape()[1], 3);
+  ASSERT_EQ(out.numel(), 9);
+  ASSERT_EQ(out.type(), pten::DataType::FLOAT32);
+  ASSERT_EQ(out.layout(), pten::DataLayout::NCHW);
+  ASSERT_EQ(out.initialized(), true);
+
+  auto dense_out = std::dynamic_pointer_cast<pten::DenseTensor>(out.impl());
+
+  // Copy the result into a CPU-allocated tensor so its values can be read
+  // from host code.
+  auto ref_out = std::make_shared<pten::DenseTensor>(
+      alloc_cpu,
+      pten::DenseTensorMeta(
+          pten::DataType::FLOAT32, out.shape(), pten::DataLayout::NCHW));
+
+  pten::Copy(*dev_ctx, *dense_out.get(), ref_out.get());
+
+  for (size_t i = 0; i < 9; i++) {
+    ASSERT_NEAR(sum[i], ref_out->data<float>()[i], 1e-6f);
+  }
+}
+#endif