From 136e4de03ca9ec10f7f14f3d2dc9cf83259461d7 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 27 Jan 2021 16:56:20 +0800
Subject: [PATCH 01/83] fix compile error: include dynamic_loader.h in custom_operator.cc

---
 paddle/fluid/framework/custom_operator.cc | 1 +
 1 file changed, 1 insertion(+)
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 88dce234c828d..41d8cbf5a5377 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -31,6 +31,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/platform/dynload/dynamic_loader.h"
 
 namespace paddle {
 namespace framework {

From 2e4fc0d1ab1a08b40d25d699f43d3f9e9b10417d Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 29 Jan 2021 15:40:17 +0800
Subject: [PATCH 02/83] wrap framework tensor with LoDTensor

---
 paddle/fluid/extension/include/device.h   |   6 +-
 paddle/fluid/extension/include/dtype.h    |  12 +-
 paddle/fluid/extension/include/tensor.h   |  73 ++++++++-
 paddle/fluid/extension/src/tensor.cc      | 175 ++++++++++++++++++++++
 paddle/fluid/framework/custom_operator.cc |  27 ++--
 paddle/fluid/inference/CMakeLists.txt     |   1 +
 6 files changed, 271 insertions(+), 23 deletions(-)
 create mode 100644 paddle/fluid/extension/src/tensor.cc

diff --git a/paddle/fluid/extension/include/device.h b/paddle/fluid/extension/include/device.h
index 53181e658d689..8b28499b69095 100644
--- a/paddle/fluid/extension/include/device.h
+++ b/paddle/fluid/extension/include/device.h
@@ -14,12 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/platform/place.h"
-
 namespace paddle {
 
-using CPUPlace = platform::CPUPlace;
-using CUDAPlace = platform::CUDAPlace;
-using XPUPlace = platform::XPUPlace;
+enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index 5cc88f854e2a4..bc649897aa702 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -18,9 +18,13 @@ limitations under the License. */
 
 namespace paddle {
 
-using bfloat16 = platform::bfloat16;
-using float16 = platform::float16;
-using complex64 = platform::complex64;
-using complex128 = platform::complex128;
+enum PaddleDType {
+        FLOAT32,
+        INT64,
+        INT32,
+        UINT8,
+        INT8,
+        // TODO(Superjomn) support more data types if needed.
+    };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 339da0edfa338..1cf0fca639095 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -14,11 +14,78 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/extension/include/device.h"
+#include "paddle/fluid/extension/include/dtype.h"
 
 namespace paddle {
 
-using Tensor = framework::Tensor;
+class CustomTensor{
+public:
+    explicit CustomTensor(void* raw_tensor) : tensor_{raw_tensor}{};
+    /// \brief Reset the shape of the tensor.
+    /// Generally it's only used for the input tensor.
+    /// Reshape must be called before calling mutable_data() or copy_from_cpu()
+    /// \param shape The shape to set.
+    void Reshape(const std::vector<int>& shape);
+
+    /// \brief Get the memory pointer in CPU or GPU with specific data type.
+    /// Please Reshape the tensor first before call this.
+    /// It's usually used to get input data pointer.
+    /// \param place The place of the tensor.
+    template <typename T>
+    T* mutable_data(PaddlePlace place);
+
+    /// \brief Get the memory pointer directly.
+    /// It's usually used to get the output data pointer.
+    /// \param[out] place To get the device type of the tensor.
+    /// \param[out] size To get the data size of the tensor.
+    /// \return The tensor data buffer pointer.
+    template <typename T>
+    T* data(PaddlePlace* place, int* size) const;
+
+    /// \brief Copy the host memory to tensor data.
+    /// It's usually used to set the input tensor data.
+    /// \param data The pointer of the data, from which the tensor will copy.
+    template <typename T>
+    void copy_from_cpu(const T* data);
+
+    /// \brief Copy the tensor data to the host memory.
+    /// It's usually used to get the output tensor data.
+    /// \param[out] data The tensor will copy the data to the address.
+    template <typename T>
+    void copy_to_cpu(T* data);
+
+    /// \brief Return the shape of the Tensor.
+    std::vector<int> shape() const;
+
+    /// \brief Set lod info of the tensor.
+    /// More about LOD can be seen here:
+    ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+    /// \param x the lod info.
+    void SetLoD(const std::vector<std::vector<size_t>>& x);
+    /// \brief Return the lod info of the tensor.
+    std::vector<std::vector<size_t>> lod() const;
+
+    void SetPlace(PaddlePlace place) {
+        place_ = place;
+    }
+
+    /// \brief Return the data type of the tensor.
+    /// It's usually used to get the output tensor data type.
+    /// \return The data type of the tensor.
+    PaddleDType type() const;
+
+
+    /// \brief Share data with another tensor.
+    /// Use this to pass tensor from op to op
+    /// \return void.
+    void ShareDataWith(void* tensor_out);
+
+private:
+    mutable std::shared_ptr<void> tensor_;
+    PaddlePlace place_;
+    PaddleDType dtype_;
+    int device_num_{};
+};
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
new file mode 100644
index 0000000000000..63de6c16b4a0f
--- /dev/null
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -0,0 +1,175 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/extension/include/all.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/memory/memcpy.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+
+#define GET_CASTED_TENSOR                               \
+  if (!tensor_) {                                       \
+    tensor_ = std::make_shared<framework::LoDTensor>(); \
+  }                                                     \
+  auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
+
+void CustomTensor::Reshape(const std::vector<int> &shape) {
+    GET_CASTED_TENSOR
+    tensor->Resize(framework::make_ddim(shape));
+}
+
+
+template <typename T>
+T *CustomTensor::mutable_data(PaddlePlace place) {
+    GET_CASTED_TENSOR
+    PADDLE_ENFORCE_GT(
+            tensor->numel(), 0,
+            platform::errors::PreconditionNotMet(
+                    "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
+                    "&shape)"
+                    "function before retrieving mutable_data from input tensor."));
+    switch (static_cast<int>(place)) {
+        case static_cast<int>(PaddlePlace::kCPU): {
+            return tensor->mutable_data<T>(platform::CPUPlace());
+        }
+        case static_cast<int>(PaddlePlace::kGPU): {
+            return tensor->mutable_data<T>(platform::CUDAPlace(device_num_));
+        }
+        default:
+            PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
+                                                       static_cast<int>(place)));
+    }
+}
+
+template <typename T>
+T *CustomTensor::data(PaddlePlace *place, int *size) const {
+    GET_CASTED_TENSOR;
+    auto *res = tensor->data<T>();
+
+    if (platform::is_cpu_place(tensor->place())) {
+        *place = PaddlePlace::kCPU;
+    } else if (platform::is_gpu_place(tensor->place())) {
+        *place = PaddlePlace::kGPU;
+    } else {
+        *place = PaddlePlace::kUNK;
+    }
+
+    *size = tensor->numel();
+    return res;
+}
+
+PaddleDType CustomTensor::type() const {
+    GET_CASTED_TENSOR;
+    auto type = tensor->type();
+    if (type == framework::proto::VarType::FP32) {
+        return PaddleDType::FLOAT32;
+    } else if (type == framework::proto::VarType::INT64) {
+        return PaddleDType::INT64;
+    } else if (type == framework::proto::VarType::INT32) {
+        return PaddleDType::INT32;
+    } else if (type == framework::proto::VarType::UINT8) {
+        return PaddleDType::UINT8;
+    }
+    return PaddleDType::FLOAT32;
+}
+
+template <typename T>
+void CustomTensor::copy_from_cpu(const T *data) {
+    GET_CASTED_TENSOR;
+    PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                      platform::errors::PreconditionNotMet(
+                              "You should call ZeroCopyTensor::Reshape(const "
+                              "std::vector<int> &shape)"
+                              "function before copying data from cpu."));
+    size_t ele_size = tensor->numel() * sizeof(T);
+
+    if (place_ == PaddlePlace::kCPU) {
+        auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
+        std::memcpy(static_cast<void *>(t_data), data, ele_size);
+    } else {
+#ifdef PADDLE_WITH_CUDA
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+platform::CUDAPlace gpu_place(device_);
+auto *t_data = tensor->mutable_data<T>(gpu_place);
+auto *dev_ctx =
+    static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+
+memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+             data, ele_size, dev_ctx->stream());
+#else
+        PADDLE_THROW(platform::errors::Unavailable(
+                "Not compiled with CUDA, should not reach here."));
+#endif
+    }
+}
+
+template <typename T>
+void CustomTensor::copy_to_cpu(T *data) {
+    GET_CASTED_TENSOR;
+    auto ele_num = tensor->numel();
+    auto *t_data = tensor->data<T>();
+    auto t_place = tensor->place();
+
+    if (platform::is_cpu_place(t_place)) {
+        std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+    } else {
+#ifdef PADDLE_WITH_CUDA
+        platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
+auto *dev_ctx =
+    static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
+             t_data, ele_num * sizeof(T), dev_ctx->stream());
+
+cudaStreamSynchronize(dev_ctx->stream());
+#else
+        PADDLE_THROW(platform::errors::Unavailable(
+                "Not compile with CUDA, should not reach here."));
+#endif
+    }
+}
+
+std::vector<int> CustomTensor::shape() const {
+    GET_CASTED_TENSOR
+    return framework::vectorize<int>(tensor->dims());
+}
+
+void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+    GET_CASTED_TENSOR;
+    framework::LoD lod;
+    for (auto &level : x) {
+        lod.emplace_back(level);
+    }
+    tensor->set_lod(lod);
+}
+
+std::vector<std::vector<size_t>> CustomTensor::lod() const {
+    GET_CASTED_TENSOR;
+    std::vector<std::vector<size_t>> res;
+    for (auto &level : tensor->lod()) {
+        res.emplace_back(level);
+    }
+    return res;
+}
+
+
+void CustomTensor::ShareDataWith(void* out_data){
+    auto out_data_tmp = framework::LoDTensor();
+    out_data_tmp = *static_cast<framework::LoDTensor*>(tensor_.get());
+    static_cast<framework::LoDTensor*>(out_data)->ShareDataWith(out_data_tmp);
+}
+
+}  // namespace paddle
+
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 41d8cbf5a5377..fc646387afcf5 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -32,6 +32,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
+#include "paddle/fluid/extension/include/tensor.h"
 
 namespace paddle {
 namespace framework {
@@ -60,30 +61,31 @@ template <typename Func>
 static void CallKernelFunc(const framework::ExecutionContext& ctx, Func&& func);
 
 template <>
-void CallKernelFunc<const std::function<std::vector<Tensor>(const Tensor&)>&>(
+void CallKernelFunc<const std::function<std::vector<CustomTensor>(const CustomTensor&)>&>(
     const framework::ExecutionContext& ctx,
-    const std::function<std::vector<Tensor>(const Tensor&)>& func) {
+    const std::function<std::vector<CustomTensor>(const CustomTensor&)>& func) {
   VLOG(0) << "start run in CallKernelFunc";
-  auto* x = ctx.Input<Tensor>(detail::kCustomOpInputPrefix + std::to_string(0));
+  const Tensor* x = ctx.Input<Tensor>(detail::kCustomOpInputPrefix + std::to_string(0));
   PADDLE_ENFORCE_NOT_NULL(x, "input x is nullptr.");
   PADDLE_ENFORCE(x->IsInitialized(), "input x is not initialized.");
-
+  CustomTensor ct = CustomTensor((void *) x);
+  VLOG(0) << "Get ZeroCopyTensor in";
   VLOG(0) << "run forward func in CallKernelFunc";
-  auto outs = func(*x);
+  auto outs = func(ct);
 
   VLOG(0) << "share output in CallKernelFunc";
   auto true_outs = ctx.MultiOutput<Tensor>(detail::kCustomOpOutputPrefix);
   for (size_t i = 0; i < true_outs.size(); ++i) {
-    (true_outs)[i]->ShareDataWith(outs.at(i));
+    outs.at(i).ShareDataWith((true_outs)[i]);
   }
 }
 
 template <>
 void CallKernelFunc<const std::function<
-    std::vector<Tensor>(const Tensor&, const Tensor&, const Tensor&)>&>(
+    std::vector<CustomTensor>(const CustomTensor&, const CustomTensor&, const CustomTensor&)>&>(
     const framework::ExecutionContext& ctx,
-    const std::function<std::vector<Tensor>(const Tensor&, const Tensor&,
-                                            const Tensor&)>& func) {
+    const std::function<std::vector<CustomTensor>(const CustomTensor&, const CustomTensor&,
+                                            const CustomTensor&)>& func) {
   std::vector<const Tensor*> ins;
   for (auto name : ctx.InNameList()) {
     VLOG(0) << "input name: " << name;
@@ -94,14 +96,17 @@ void CallKernelFunc<const std::function<
   }
 
   VLOG(0) << "run forward func in CallKernelFunc";
-  auto outs = func(*ins[0], *ins[1], *ins[2]);
+  CustomTensor ct1 = CustomTensor((void*)ins[0]);
+  CustomTensor ct2 = CustomTensor((void*)ins[1]);
+  CustomTensor ct3 = CustomTensor((void*)ins[2]);
+  auto outs = func(ct1, ct2, ct3);
 
   VLOG(0) << "share output in CallKernelFunc";
   auto out_names = ctx.OutNameList();
   PADDLE_ENFORCE_EQ(out_names.size(), 1, "only can hold 1 out in custom op.");
   auto true_outs = ctx.MultiOutput<Tensor>(out_names[0]);
   for (size_t i = 0; i < true_outs.size(); ++i) {
-    (true_outs)[i]->ShareDataWith(outs.at(i));
+      outs.at(i).ShareDataWith((true_outs)[i]);
   }
 }
 
diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt
index fb55d5463621e..c966aaf94df43 100644
--- a/paddle/fluid/inference/CMakeLists.txt
+++ b/paddle/fluid/inference/CMakeLists.txt
@@ -74,6 +74,7 @@ set(SHARED_INFERENCE_SRCS
     ${CMAKE_CURRENT_SOURCE_DIR}/api/analysis_predictor.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/api/details/zero_copy_tensor.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/utils/io_utils.cc
+    ${CMAKE_CURRENT_SOURCE_DIR}/../extension/src/tensor.cc
     ${mkldnn_quantizer_src_file})
 
 # shared inference library deps

From 618d9177af7361daf42a359d8d2604e95d960883 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 29 Jan 2021 17:19:21 +0800
Subject: [PATCH 03/83] fix compile error: move CustomTensor(void*) ctor into tensor.cc

---
 paddle/fluid/extension/include/tensor.h | 2 +-
 paddle/fluid/extension/src/tensor.cc    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 1cf0fca639095..bdb64a56c86f4 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -21,7 +21,7 @@ namespace paddle {
 
 class CustomTensor{
 public:
-    explicit CustomTensor(void* raw_tensor) : tensor_{raw_tensor}{};
+    explicit CustomTensor(void* raw_tensor);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
     /// Reshape must be called before calling mutable_data() or copy_from_cpu()
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 63de6c16b4a0f..d668e580701a7 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -30,6 +30,7 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
     tensor->Resize(framework::make_ddim(shape));
 }
 
+CustomTensor::CustomTensor(void* raw_tensor) : tensor_(static_cast<framework::LoDTensor*>(raw_tensor)){}
 
 template <typename T>
 T *CustomTensor::mutable_data(PaddlePlace place) {

From d49f476068bf9526d256fb606c60991454de7eea Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 29 Jan 2021 17:23:29 +0800
Subject: [PATCH 04/83] fix compile error: register op kernels with CustomTensor signatures

---
 paddle/fluid/framework/custom_operator.cc | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index fc646387afcf5..96825d6d757aa 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -279,10 +279,10 @@ void LoadCustomOperator(const std::string& dso_name) {
     // 2. register op
     RegisterOperator(pair.first, forward_in_num);
     // 3. register op kernel
-    RegisterOperatorKernel<std::function<std::vector<Tensor>(const Tensor&)>>(
+    RegisterOperatorKernel<std::function<std::vector<CustomTensor>(const CustomTensor&)>>(
         pair.first, &forward_func);
-    RegisterOperatorKernel<std::function<std::vector<Tensor>(
-        const Tensor&, const Tensor&, const Tensor&)>>(pair.first + "_grad",
+    RegisterOperatorKernel<std::function<std::vector<CustomTensor>(
+        const CustomTensor&, const CustomTensor&, const CustomTensor&)>>(pair.first + "_grad",
                                                        &backward_func);
   }
 }

From d14cd5183f54cbd963d6b1aae5a2a3e366fad714 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 29 Jan 2021 17:38:39 +0800
Subject: [PATCH 05/83] fix compile error: wrap op inputs in data-sharing Tensor copies

---
 paddle/fluid/framework/custom_operator.cc | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 96825d6d757aa..1fd91a4498255 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -68,7 +68,9 @@ void CallKernelFunc<const std::function<std::vector<CustomTensor>(const CustomTe
   const Tensor* x = ctx.Input<Tensor>(detail::kCustomOpInputPrefix + std::to_string(0));
   PADDLE_ENFORCE_NOT_NULL(x, "input x is nullptr.");
   PADDLE_ENFORCE(x->IsInitialized(), "input x is not initialized.");
-  CustomTensor ct = CustomTensor((void *) x);
+  auto custom_use_input = framework::Tensor();
+  custom_use_input.ShareDataWith(*x);
+  CustomTensor ct = CustomTensor((void *) (&custom_use_input));
   VLOG(0) << "Get ZeroCopyTensor in";
   VLOG(0) << "run forward func in CallKernelFunc";
   auto outs = func(ct);
@@ -86,19 +88,24 @@ void CallKernelFunc<const std::function<
     const framework::ExecutionContext& ctx,
     const std::function<std::vector<CustomTensor>(const CustomTensor&, const CustomTensor&,
                                             const CustomTensor&)>& func) {
-  std::vector<const Tensor*> ins;
+  std::vector<Tensor> ins;
   for (auto name : ctx.InNameList()) {
     VLOG(0) << "input name: " << name;
     auto* x = ctx.Input<Tensor>(name);
+    auto custom_use_input = framework::Tensor();
+    custom_use_input.ShareDataWith(*x);
+
     PADDLE_ENFORCE_NOT_NULL(x, "input %s is nullptr.", name);
     PADDLE_ENFORCE(x->IsInitialized(), "input %s is not initialized.", name);
-    ins.emplace_back(x);
+    ins.emplace_back(custom_use_input);
   }
 
   VLOG(0) << "run forward func in CallKernelFunc";
-  CustomTensor ct1 = CustomTensor((void*)ins[0]);
-  CustomTensor ct2 = CustomTensor((void*)ins[1]);
-  CustomTensor ct3 = CustomTensor((void*)ins[2]);
+
+  CustomTensor ct1 = CustomTensor((void*)(&ins[0]));
+  CustomTensor ct2 = CustomTensor((void*)(&ins[1]));
+  CustomTensor ct3 = CustomTensor((void*)(&ins[2]));
+
   auto outs = func(ct1, ct2, ct3);
 
   VLOG(0) << "share output in CallKernelFunc";

From 4e719fcf8a715665df69a7242c5afedb31bc0faf Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 29 Jan 2021 17:53:56 +0800
Subject: [PATCH 06/83] fix compile error: add extension tensor.cc to custom_operator DEPS

---
 paddle/fluid/framework/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index fee811d428a57..40ce6c668b646 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -159,7 +159,7 @@ cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
 
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
-cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context)
+cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context ../extension/src/tensor.cc)
 
 if(WITH_PYTHON)
   py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)

From 87059c5046d481e001f5d1f6221996f50d81f01e Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 29 Jan 2021 18:47:57 +0800
Subject: [PATCH 07/83] fix compile error: build extension tensor.cc as custom_tensor library

---
 paddle/fluid/framework/CMakeLists.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 40ce6c668b646..bbb11831147b8 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -159,7 +159,8 @@ cc_test(op_call_stack_test SRCS op_call_stack_test.cc DEPS op_call_stack)
 
 nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
-cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context ../extension/src/tensor.cc)
+cc_library(custom_tensor SRCS ../extension/src/tensor.cc DEPS lod_tensor)
+cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context custom_tensor)
 
 if(WITH_PYTHON)
   py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto)

From 2ef89a5cd3564a35e25c993a4fab21a96b96219f Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 11:28:08 +0800
Subject: [PATCH 08/83] add CustomTensor default constructor

---
 paddle/fluid/extension/include/tensor.h | 3 +++
 paddle/fluid/extension/src/tensor.cc    | 8 ++++----
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index bdb64a56c86f4..e796eadbedb51 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -21,6 +21,9 @@ namespace paddle {
 
 class CustomTensor{
 public:
+    /// \brief Construct a CustomTensor for CustomOp.
+    /// Generally it's only used for user to create CustomTensor.
+    explicit CustomTensor();
     explicit CustomTensor(void* raw_tensor);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index d668e580701a7..296adc56acb1c 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -29,7 +29,7 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
     tensor->Resize(framework::make_ddim(shape));
 }
-
+CustomTensor::CustomTensor():tensor_(nullptr){};
 CustomTensor::CustomTensor(void* raw_tensor) : tensor_(static_cast<framework::LoDTensor*>(raw_tensor)){}
 
 template <typename T>
@@ -167,9 +167,9 @@ std::vector<std::vector<size_t>> CustomTensor::lod() const {
 
 
 void CustomTensor::ShareDataWith(void* out_data){
-    auto out_data_tmp = framework::LoDTensor();
-    out_data_tmp = *static_cast<framework::LoDTensor*>(tensor_.get());
-    static_cast<framework::LoDTensor*>(out_data)->ShareDataWith(out_data_tmp);
+    static_cast<framework::LoDTensor*>(out_data)
+    ->ShareDataWith(
+            *static_cast<framework::LoDTensor*>(tensor_.get()));
 }
 
 }  // namespace paddle

From f217ccb990182b7447605f0628da6c5420719826 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 11:48:45 +0800
Subject: [PATCH 09/83] add size() for CustomTensor

---
 paddle/fluid/extension/include/tensor.h | 5 +++++
 paddle/fluid/extension/src/tensor.cc    | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index e796eadbedb51..067352d0fafd2 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -84,6 +84,11 @@ class CustomTensor{
     /// \return void.
     void ShareDataWith(void* tensor_out);
 
+    /// \brief Get the size of current tensor.
+    /// Use this method to get the size of tensor
+    /// \return int64_t.
+    int64_t size();
+
 private:
     mutable std::shared_ptr<void> tensor_;
     PaddlePlace place_;
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 296adc56acb1c..6a7d934918577 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -172,5 +172,10 @@ void CustomTensor::ShareDataWith(void* out_data){
             *static_cast<framework::LoDTensor*>(tensor_.get()));
 }
 
+int64_t CustomTensor::size() {
+    GET_CASTED_TENSOR;
+    return tensor->numel();
+}
+
 }  // namespace paddle
 

From 7f9f1cd652c1ff70297941522a3509ba904190a3 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 12:04:30 +0800
Subject: [PATCH 10/83] make size const for CustomTensor

---
 paddle/fluid/extension/include/tensor.h | 2 +-
 paddle/fluid/extension/src/tensor.cc    | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 067352d0fafd2..bea3a7776e7fd 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -87,7 +87,7 @@ class CustomTensor{
     /// \brief Get the size of current tensor.
     /// Use this method to get the size of tensor
     /// \return int64_t.
-    int64_t size();
+    int64_t size() const;
 
 private:
     mutable std::shared_ptr<void> tensor_;
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 6a7d934918577..2005c8c31b9ec 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -172,7 +172,7 @@ void CustomTensor::ShareDataWith(void* out_data){
             *static_cast<framework::LoDTensor*>(tensor_.get()));
 }
 
-int64_t CustomTensor::size() {
+int64_t CustomTensor::size() const{
     GET_CASTED_TENSOR;
     return tensor->numel();
 }

From 8102863e6c214e281f5c68fab5f8b93edb8f94d6 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 15:50:15 +0800
Subject: [PATCH 11/83] refactor place-related API around PlaceType and PaddlePlace

---
 paddle/fluid/extension/include/device.h | 12 ++++-
 paddle/fluid/extension/include/tensor.h | 17 +++----
 paddle/fluid/extension/src/tensor.cc    | 60 ++++++++++++++-----------
 3 files changed, 51 insertions(+), 38 deletions(-)

diff --git a/paddle/fluid/extension/include/device.h b/paddle/fluid/extension/include/device.h
index 8b28499b69095..2a384e07e5e1d 100644
--- a/paddle/fluid/extension/include/device.h
+++ b/paddle/fluid/extension/include/device.h
@@ -16,6 +16,16 @@ limitations under the License. */
 
 namespace paddle {
 
-enum class PaddlePlace { kUNK = -1, kCPU, kGPU };
+enum class PlaceType { kUNK = -1, kCPU, kGPU };
+
+template<typename T>
+class PaddlePlace {
+public:
+    explicit PaddlePlace(PlaceType pc) : pc_(pc){}
+    const PlaceType& GetPlace() const { return pc_; };
+
+protected:
+    PlaceType pc_;
+};
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index bea3a7776e7fd..f625e71835a46 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -36,15 +36,13 @@ class CustomTensor{
     /// It's usually used to get input data pointer.
     /// \param place The place of the tensor.
     template <typename T>
-    T* mutable_data(PaddlePlace place);
+    T* mutable_data(const PaddlePlace& place);
 
     /// \brief Get the memory pointer directly.
     /// It's usually used to get the output data pointer.
-    /// \param[out] place To get the device type of the tensor.
-    /// \param[out] size To get the data size of the tensor.
     /// \return The tensor data buffer pointer.
     template <typename T>
-    T* data(PaddlePlace* place, int* size) const;
+    T* data() const;
 
     /// \brief Copy the host memory to tensor data.
     /// It's usually used to set the input tensor data.
@@ -69,10 +67,6 @@ class CustomTensor{
     /// \brief Return the lod info of the tensor.
     std::vector<std::vector<size_t>> lod() const;
 
-    void SetPlace(PaddlePlace place) {
-        place_ = place;
-    }
-
     /// \brief Return the data type of the tensor.
     /// It's usually used to get the output tensor data type.
     /// \return The data type of the tensor.
@@ -89,11 +83,14 @@ class CustomTensor{
     /// \return int64_t.
     int64_t size() const;
 
+    /// \brief Get the place of current tensor.
+    /// Use this method to get the place of tensor
+    /// \return Place.
+    const PaddlePlace& place();
+
 private:
     mutable std::shared_ptr<void> tensor_;
     PaddlePlace place_;
-    PaddleDType dtype_;
-    int device_num_{};
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 2005c8c31b9ec..877518e3c1a19 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
-
+#include "paddle/fluid/platform/gpu_info.h"
 namespace paddle {
 
 #define GET_CASTED_TENSOR                               \
@@ -29,45 +29,39 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
     tensor->Resize(framework::make_ddim(shape));
 }
-CustomTensor::CustomTensor():tensor_(nullptr){};
-CustomTensor::CustomTensor(void* raw_tensor) : tensor_(static_cast<framework::LoDTensor*>(raw_tensor)){}
+CustomTensor::CustomTensor():tensor_(std::make_shared<framework::LoDTensor>()), place_(PlaceType::kUNK){};
+CustomTensor::CustomTensor(void* raw_tensor) : tensor_(static_cast<framework::LoDTensor*>(raw_tensor)), place_(PlaceType::kUNK){}
 
 template <typename T>
-T *CustomTensor::mutable_data(PaddlePlace place) {
+T *CustomTensor::mutable_data(const PaddlePlace& place) {
     GET_CASTED_TENSOR
+    place_ = place;
     PADDLE_ENFORCE_GT(
             tensor->numel(), 0,
             platform::errors::PreconditionNotMet(
                     "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
                     "&shape)"
                     "function before retrieving mutable_data from input tensor."));
-    switch (static_cast<int>(place)) {
-        case static_cast<int>(PaddlePlace::kCPU): {
+    switch (static_cast<int>(place.GetPlace())) {
+        case static_cast<int>(PlaceType::kCPU): {
             return tensor->mutable_data<T>(platform::CPUPlace());
         }
-        case static_cast<int>(PaddlePlace::kGPU): {
-            return tensor->mutable_data<T>(platform::CUDAPlace(device_num_));
+        case static_cast<int>(PlaceType::kGPU): {
+#ifdef PADDLE_WITH_CUDA
+            int device_num = platform::GetCurrentDeviceId();
+            return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
+#endif
         }
         default:
             PADDLE_THROW(platform::errors::Unavailable("Unsupported place: %d",
-                                                       static_cast<int>(place)));
+                                                       static_cast<int>(place.GetPlace())));
     }
 }
 
 template <typename T>
-T *CustomTensor::data(PaddlePlace *place, int *size) const {
+T *CustomTensor::data() const {
     GET_CASTED_TENSOR;
     auto *res = tensor->data<T>();
-
-    if (platform::is_cpu_place(tensor->place())) {
-        *place = PaddlePlace::kCPU;
-    } else if (platform::is_gpu_place(tensor->place())) {
-        *place = PaddlePlace::kGPU;
-    } else {
-        *place = PaddlePlace::kUNK;
-    }
-
-    *size = tensor->numel();
     return res;
 }
 
@@ -96,16 +90,17 @@ void CustomTensor::copy_from_cpu(const T *data) {
                               "function before copying data from cpu."));
     size_t ele_size = tensor->numel() * sizeof(T);
 
-    if (place_ == PaddlePlace::kCPU) {
+    if (place_.GetPlace() == PlaceType::kCPU) {
         auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
         std::memcpy(static_cast<void *>(t_data), data, ele_size);
     } else {
 #ifdef PADDLE_WITH_CUDA
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-platform::CUDAPlace gpu_place(device_);
-auto *t_data = tensor->mutable_data<T>(gpu_place);
-auto *dev_ctx =
-    static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    int device_num = platform::GetCurrentDeviceId();
+    platform::CUDAPlace gpu_place(device_num);
+    auto *t_data = tensor->mutable_data<T>(gpu_place);
+    auto *dev_ctx =
+        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
 
 memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
              data, ele_size, dev_ctx->stream());
@@ -165,6 +160,18 @@ std::vector<std::vector<size_t>> CustomTensor::lod() const {
     return res;
 }
 
+const PaddlePlace& CustomTensor::place() {
+    GET_CASTED_TENSOR;
+    if(platform::is_cpu_place(tensor->place())){
+        place_ = PaddlePlace(PlaceType::kCPU);
+    }else if(platform::is_gpu_place(tensor->place())){
+        place_ = PaddlePlace(PlaceType::kGPU);
+    }else{
+        PADDLE_THROW("Current CustomTensor hold unsupported Place Type, Please Init it"
+                     "with Place::kCPU or Place::kGPU");
+    }
+    return place_;
+}
 
 void CustomTensor::ShareDataWith(void* out_data){
     static_cast<framework::LoDTensor*>(out_data)
@@ -176,6 +183,5 @@ int64_t CustomTensor::size() const{
     GET_CASTED_TENSOR;
     return tensor->numel();
 }
-
 }  // namespace paddle
 

From c5b3b5c4467e91086874ae00e026ce833bf0e5d2 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 17:45:11 +0800
Subject: [PATCH 12/83] fix compile error

---
 paddle/fluid/extension/include/op_functor.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/extension/include/op_functor.h b/paddle/fluid/extension/include/op_functor.h
index 4d6a3fc6d201a..b80b97975e8c5 100644
--- a/paddle/fluid/extension/include/op_functor.h
+++ b/paddle/fluid/extension/include/op_functor.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "all.h"
 namespace paddle {
 
-using Tensor = CustomTensor;
+using Tensor = paddle::CustomTensor;
 
 using FuncInfo = std::pair<size_t, size_t>;
 using TraitsFunc = FuncInfo (*)();

From c67e36f2afa055f03e1b840ea634bbe68d724147 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 17:48:46 +0800
Subject: [PATCH 13/83] fix compile error

---
 paddle/fluid/extension/include/op_functor.h | 17 ++++++++---------
 1 file changed, 8 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/extension/include/op_functor.h b/paddle/fluid/extension/include/op_functor.h
index b80b97975e8c5..4021418eb7997 100644
--- a/paddle/fluid/extension/include/op_functor.h
+++ b/paddle/fluid/extension/include/op_functor.h
@@ -24,13 +24,12 @@ limitations under the License. */
 #include <vector>
 
 #include "all.h"
-namespace paddle {
 
-using Tensor = paddle::CustomTensor;
+namespace paddle {
 
 using FuncInfo = std::pair<size_t, size_t>;
 using TraitsFunc = FuncInfo (*)();
-using ComputeFunc = std::vector<Tensor> (*)(std::vector<const Tensor*> inputs,
+using ComputeFunc = std::vector<CustomTensor> (*)(std::vector<const CustomTensor*> inputs,
                                             std::vector<boost::any> attrs);
 // key std::string means data type, replace by enum DataType later
 using ComputeFuncMap = std::unordered_map<std::string, ComputeFunc>;
@@ -74,7 +73,7 @@ struct ComputeFuncImpl;
 
 template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
 struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
-  static Return Compute(std::vector<const Tensor*> inputs,
+  static Return Compute(std::vector<const CustomTensor*> inputs,
                         std::vector<boost::any> attrs) {
     return ComputeCallHelper<Args..., TypeTag<int>>::template Compute<0, 0>(
         inputs, attrs);
@@ -85,14 +84,14 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   struct ComputeCallHelper;
 
   template <typename... Tail>
-  struct ComputeCallHelper<const Tensor&, Tail...> {
+  struct ComputeCallHelper<const CustomTensor&, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const Tensor*> inputs,
+    static Return Compute(std::vector<const CustomTensor*> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       static_assert(attr_idx == 0,
                     "Input tensor should appear before attributes.");
-      const Tensor& arg = *(inputs[in_idx]);
+      const CustomTensor& arg = *(inputs[in_idx]);
       return ComputeCallHelper<Tail...>::template Compute<in_idx + 1, attr_idx>(
           inputs, attrs, pargs..., arg);
     }
@@ -101,7 +100,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<int, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const Tensor*> inputs,
+    static Return Compute(std::vector<const CustomTensor*> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       try {
@@ -120,7 +119,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename T>
   struct ComputeCallHelper<TypeTag<T>> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const Tensor*> inputs,
+    static Return Compute(std::vector<const CustomTensor*> inputs,
                           std::vector<boost::any> attrs, const Args&... args) {
       return impl_fn(args...);
     }

From bb4c29573e500ec6e395c248ee560f993fd510aa Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 17:52:26 +0800
Subject: [PATCH 14/83] fix compile error

---
 paddle/fluid/extension/include/op_functor.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/extension/include/op_functor.h b/paddle/fluid/extension/include/op_functor.h
index 4021418eb7997..c5a2f854bf4f3 100644
--- a/paddle/fluid/extension/include/op_functor.h
+++ b/paddle/fluid/extension/include/op_functor.h
@@ -23,8 +23,8 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "all.h"
-
+#include "paddle/fluid/extension/include/tensor.h"
+#include "paddle/fluid/extension/include/device.h"
 namespace paddle {
 
 using FuncInfo = std::pair<size_t, size_t>;

From d416fdbfb43a6b8a4558b2a559fd51296fe2cc40 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 17:58:19 +0800
Subject: [PATCH 15/83] fix compile error

---
 paddle/fluid/framework/custom_operator.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index f447999a71a2c..fa04e125996e8 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -88,17 +88,17 @@ static void RunComputeFunc(const framework::ExecutionContext& ctx,
     PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
-    auto custom_use_input = framework::LoDTensor();
+    auto custom_use_input = framework::Tensor();
     custom_use_input.ShareDataWith(*x);
-    ins.emplace_back(custom_use_input);
+    ins.push_back(custom_use_input);
   }
 
-  std::vector<const CustomTensor> custom_use_ins;
+  std::vector<CustomTensor> custom_use_ins;
   std::vector<const CustomTensor*> custom_use_ins_ptr;
   for(auto tensor : ins){
       auto custom_tensor = CustomTensor((void*)(&tensor));
-      custom_use_ins.emplace_back(custom_tensor);
-      custom_use_ins_ptr.emplace_back(&custom_tensor);
+      custom_use_ins.push_back(custom_tensor);
+      custom_use_ins_ptr.push_back(&custom_tensor);
   }
   std::vector<boost::any> attrs;
 

From 8cc60ec9ab5d9925a3c45fd57f179440e8ed73f8 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 18:00:18 +0800
Subject: [PATCH 16/83] fix compile error

---
 paddle/fluid/framework/custom_operator.cc | 48 +++++++++++------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index fa04e125996e8..21096123297a6 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -79,7 +79,7 @@ static std::type_index StringToDataType(const std::string& str) {
 static void RunComputeFunc(const framework::ExecutionContext& ctx,
                            paddle::ComputeFunc func) {
   VLOG(0) << "Before run ComputeFunc.";
-  std::vector<const framework::Tensor> ins;
+  std::vector<framework::Tensor> ins;
   for (auto name : ctx.InNameList()) {
     VLOG(0) << "input name: " << name;
     auto* x = ctx.Input<Tensor>(name);
@@ -93,29 +93,29 @@ static void RunComputeFunc(const framework::ExecutionContext& ctx,
     ins.push_back(custom_use_input);
   }
 
-  std::vector<CustomTensor> custom_use_ins;
-  std::vector<const CustomTensor*> custom_use_ins_ptr;
-  for(auto tensor : ins){
-      auto custom_tensor = CustomTensor((void*)(&tensor));
-      custom_use_ins.push_back(custom_tensor);
-      custom_use_ins_ptr.push_back(&custom_tensor);
-  }
-  std::vector<boost::any> attrs;
-
-  VLOG(0) << "Run ComputeFunc.";
-
-  auto outs = func(custom_use_ins_ptr, attrs);
-
-  VLOG(0) << "Share outputs into ExecutionContext.";
-  auto out_name = ctx.OutNameList();
-  PADDLE_ENFORCE_EQ(
-      out_name.size(), 1UL,
-      platform::errors::InvalidArgument(
-          "Custom operator can only hold 1 output as vector<Tensor>."));
-  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-  for (size_t i = 0; i < true_outs.size(); ++i) {
-      outs.at(i).ShareDataWith((true_outs)[i]);
-  }
+//  std::vector<CustomTensor> custom_use_ins;
+//  std::vector<const CustomTensor*> custom_use_ins_ptr;
+//  for(auto tensor : ins){
+//      auto custom_tensor = CustomTensor((void*)(&tensor));
+//      custom_use_ins.push_back(custom_tensor);
+//      custom_use_ins_ptr.push_back(&custom_tensor);
+//  }
+//  std::vector<boost::any> attrs;
+//
+//  VLOG(0) << "Run ComputeFunc.";
+//
+//  auto outs = func(custom_use_ins_ptr, attrs);
+//
+//  VLOG(0) << "Share outputs into ExecutionContext.";
+//  auto out_name = ctx.OutNameList();
+//  PADDLE_ENFORCE_EQ(
+//      out_name.size(), 1UL,
+//      platform::errors::InvalidArgument(
+//          "Custom operator can only hold 1 output as vector<Tensor>."));
+//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+//  for (size_t i = 0; i < true_outs.size(); ++i) {
+//      outs.at(i).ShareDataWith((true_outs)[i]);
+//  }
 }
 
 //////////////////// Operator Define /////////////////

From 1dccc2d979eb36ea4f765b165c5b92c104093ec0 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 18:08:00 +0800
Subject: [PATCH 17/83] fix compile error

---
 paddle/fluid/extension/include/op_functor.h | 12 ++++++------
 paddle/fluid/extension/src/tensor.cc        |  1 +
 paddle/fluid/framework/custom_operator.cc   | 12 +++++-------
 3 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/extension/include/op_functor.h b/paddle/fluid/extension/include/op_functor.h
index c5a2f854bf4f3..7af7eba23f250 100644
--- a/paddle/fluid/extension/include/op_functor.h
+++ b/paddle/fluid/extension/include/op_functor.h
@@ -29,7 +29,7 @@ namespace paddle {
 
 using FuncInfo = std::pair<size_t, size_t>;
 using TraitsFunc = FuncInfo (*)();
-using ComputeFunc = std::vector<CustomTensor> (*)(std::vector<const CustomTensor*> inputs,
+using ComputeFunc = std::vector<CustomTensor> (*)(std::vector<const CustomTensor> inputs,
                                             std::vector<boost::any> attrs);
 // key std::string means data type, replace by enum DataType later
 using ComputeFuncMap = std::unordered_map<std::string, ComputeFunc>;
@@ -73,7 +73,7 @@ struct ComputeFuncImpl;
 
 template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
 struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
-  static Return Compute(std::vector<const CustomTensor*> inputs,
+  static Return Compute(std::vector<const CustomTensor> inputs,
                         std::vector<boost::any> attrs) {
     return ComputeCallHelper<Args..., TypeTag<int>>::template Compute<0, 0>(
         inputs, attrs);
@@ -86,12 +86,12 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<const CustomTensor&, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const CustomTensor*> inputs,
+    static Return Compute(std::vector<const CustomTensor> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       static_assert(attr_idx == 0,
                     "Input tensor should appear before attributes.");
-      const CustomTensor& arg = *(inputs[in_idx]);
+      const CustomTensor& arg = inputs[in_idx];
       return ComputeCallHelper<Tail...>::template Compute<in_idx + 1, attr_idx>(
           inputs, attrs, pargs..., arg);
     }
@@ -100,7 +100,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<int, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const CustomTensor*> inputs,
+    static Return Compute(std::vector<const CustomTensor> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       try {
@@ -119,7 +119,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename T>
   struct ComputeCallHelper<TypeTag<T>> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const CustomTensor*> inputs,
+    static Return Compute(std::vector<const CustomTensor> inputs,
                           std::vector<boost::any> attrs, const Args&... args) {
       return impl_fn(args...);
     }
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 26907dafddcca..015868c39f4e6 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -32,6 +32,7 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
 CustomTensor::CustomTensor(PaddlePlace place):
         tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
+
 CustomTensor::CustomTensor(void* raw_tensor) :
         tensor_(static_cast<framework::LoDTensor*>(raw_tensor)),
         place_(PlaceType::kUNK){}
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 21096123297a6..9d2e1f8a60443 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -93,13 +93,11 @@ static void RunComputeFunc(const framework::ExecutionContext& ctx,
     ins.push_back(custom_use_input);
   }
 
-//  std::vector<CustomTensor> custom_use_ins;
-//  std::vector<const CustomTensor*> custom_use_ins_ptr;
-//  for(auto tensor : ins){
-//      auto custom_tensor = CustomTensor((void*)(&tensor));
-//      custom_use_ins.push_back(custom_tensor);
-//      custom_use_ins_ptr.push_back(&custom_tensor);
-//  }
+  std::vector<const CustomTensor> custom_use_ins;
+  for(auto tensor : ins){
+      auto custom_tensor = CustomTensor((void*)(&tensor));
+      custom_use_ins.push_back(custom_tensor);
+  }
 //  std::vector<boost::any> attrs;
 //
 //  VLOG(0) << "Run ComputeFunc.";

From 4b304f2b4566be88d465233880438d6e7acc99a2 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 18:10:11 +0800
Subject: [PATCH 18/83] fix compile error

---
 paddle/fluid/framework/custom_operator.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 9d2e1f8a60443..c7c48d9127844 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -93,7 +93,7 @@ static void RunComputeFunc(const framework::ExecutionContext& ctx,
     ins.push_back(custom_use_input);
   }
 
-  std::vector<const CustomTensor> custom_use_ins;
+  std::vector<CustomTensor> custom_use_ins;
   for(auto tensor : ins){
       auto custom_tensor = CustomTensor((void*)(&tensor));
       custom_use_ins.push_back(custom_tensor);

From dcda6cdf8da950cba764c59383f1280266404dd1 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 18:12:13 +0800
Subject: [PATCH 19/83] fix compile error

---
 paddle/fluid/extension/include/op_functor.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/extension/include/op_functor.h b/paddle/fluid/extension/include/op_functor.h
index 7af7eba23f250..5f710fba09bcb 100644
--- a/paddle/fluid/extension/include/op_functor.h
+++ b/paddle/fluid/extension/include/op_functor.h
@@ -29,7 +29,7 @@ namespace paddle {
 
 using FuncInfo = std::pair<size_t, size_t>;
 using TraitsFunc = FuncInfo (*)();
-using ComputeFunc = std::vector<CustomTensor> (*)(std::vector<const CustomTensor> inputs,
+using ComputeFunc = std::vector<CustomTensor> (*)(std::vector<CustomTensor> inputs,
                                             std::vector<boost::any> attrs);
 // key std::string means data type, replace by enum DataType later
 using ComputeFuncMap = std::unordered_map<std::string, ComputeFunc>;
@@ -73,7 +73,7 @@ struct ComputeFuncImpl;
 
 template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
 struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
-  static Return Compute(std::vector<const CustomTensor> inputs,
+  static Return Compute(std::vector<CustomTensor> inputs,
                         std::vector<boost::any> attrs) {
     return ComputeCallHelper<Args..., TypeTag<int>>::template Compute<0, 0>(
         inputs, attrs);
@@ -86,7 +86,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<const CustomTensor&, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const CustomTensor> inputs,
+    static Return Compute(std::vector<CustomTensor> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       static_assert(attr_idx == 0,
@@ -100,7 +100,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<int, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const CustomTensor> inputs,
+    static Return Compute(std::vector<CustomTensor> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       try {
@@ -119,7 +119,7 @@ struct ComputeFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename T>
   struct ComputeCallHelper<TypeTag<T>> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const CustomTensor> inputs,
+    static Return Compute(std::vector<CustomTensor> inputs,
                           std::vector<boost::any> attrs, const Args&... args) {
       return impl_fn(args...);
     }

From bec954f45ac36624392d3899531aabd3d881e116 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 18:16:01 +0800
Subject: [PATCH 20/83] fix compile error

---
 paddle/fluid/framework/custom_operator.cc | 38 +++++++++++------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index c7c48d9127844..cdf78077550b6 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -94,26 +94,26 @@ static void RunComputeFunc(const framework::ExecutionContext& ctx,
   }
 
   std::vector<CustomTensor> custom_use_ins;
-  for(auto tensor : ins){
-      auto custom_tensor = CustomTensor((void*)(&tensor));
-      custom_use_ins.push_back(custom_tensor);
+  custom_use_ins.reserve(ins.size());
+for(auto tensor : ins){
+      custom_use_ins.emplace_back(CustomTensor((void*)(&tensor)));
+  }
+  std::vector<boost::any> attrs;
+
+  VLOG(0) << "Run ComputeFunc.";
+
+  auto outs = func(custom_use_ins, attrs);
+
+  VLOG(0) << "Share outputs into ExecutionContext.";
+  auto out_name = ctx.OutNameList();
+  PADDLE_ENFORCE_EQ(
+      out_name.size(), 1UL,
+      platform::errors::InvalidArgument(
+          "Custom operator can only hold 1 output as vector<Tensor>."));
+  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+  for (size_t i = 0; i < true_outs.size(); ++i) {
+      outs.at(i).ShareDataWith((true_outs)[i]);
   }
-//  std::vector<boost::any> attrs;
-//
-//  VLOG(0) << "Run ComputeFunc.";
-//
-//  auto outs = func(custom_use_ins_ptr, attrs);
-//
-//  VLOG(0) << "Share outputs into ExecutionContext.";
-//  auto out_name = ctx.OutNameList();
-//  PADDLE_ENFORCE_EQ(
-//      out_name.size(), 1UL,
-//      platform::errors::InvalidArgument(
-//          "Custom operator can only hold 1 output as vector<Tensor>."));
-//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-//  for (size_t i = 0; i < true_outs.size(); ++i) {
-//      outs.at(i).ShareDataWith((true_outs)[i]);
-//  }
 }
 
 //////////////////// Operator Define /////////////////

From 2c5edac997f3cbd177550be9dfd87ea64a9ede5a Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 19:55:09 +0800
Subject: [PATCH 21/83] fix compile error

---
 paddle/fluid/extension/include/all.h      | 3 ++-
 paddle/fluid/framework/custom_operator.cc | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/extension/include/all.h b/paddle/fluid/extension/include/all.h
index c2bc3e28631cc..5b820e5788019 100644
--- a/paddle/fluid/extension/include/all.h
+++ b/paddle/fluid/extension/include/all.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #endif
 
 #include "paddle/fluid/extension/include/device.h"
+#include "paddle/fluid/extension/include/tensor.h"
 #include "paddle/fluid/extension/include/dtype.h"
 #include "paddle/fluid/extension/include/op_functor.h"
-#include "paddle/fluid/extension/include/tensor.h"
+
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index cdf78077550b6..dc3ba2fc7b87e 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -95,7 +95,7 @@ static void RunComputeFunc(const framework::ExecutionContext& ctx,
 
   std::vector<CustomTensor> custom_use_ins;
   custom_use_ins.reserve(ins.size());
-for(auto tensor : ins){
+  for(auto tensor : ins){
       custom_use_ins.emplace_back(CustomTensor((void*)(&tensor)));
   }
   std::vector<boost::any> attrs;

From 6990b999792054cb39a2d2f10b89a144f1b13212 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 20:10:35 +0800
Subject: [PATCH 22/83] fix compile error

---
 paddle/fluid/extension/include/op_functor.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/extension/include/op_functor.h b/paddle/fluid/extension/include/op_functor.h
index 5f710fba09bcb..47b5e2f9a7fac 100644
--- a/paddle/fluid/extension/include/op_functor.h
+++ b/paddle/fluid/extension/include/op_functor.h
@@ -25,6 +25,7 @@ limitations under the License. */
 
 #include "paddle/fluid/extension/include/tensor.h"
 #include "paddle/fluid/extension/include/device.h"
+
 namespace paddle {
 
 using FuncInfo = std::pair<size_t, size_t>;
@@ -329,15 +330,15 @@ struct OpKernelFuncRegistrar : public Registrar {
   }
 
 #define ADD_FORWARD_CPU_KERNEL(op_type, ...) \
-  ADD_KERNEL(op_type, true, CPU, ::paddle::platform::CPUPlace(), __VA_ARGS__)
+  ADD_KERNEL(op_type, true, CPU, ::paddle::PaddlePlace(paddle::PlaceType::kCPU), __VA_ARGS__)
 
 #define ADD_BACKWARD_CPU_KERNEL(op_type, ...) \
-  ADD_KERNEL(op_type, false, CPU, ::paddle::platform::CPUPlace(), __VA_ARGS__)
+  ADD_KERNEL(op_type, false, CPU, ::paddle::PaddlePlace(paddle::PlaceType::kCPU), __VA_ARGS__)
 
 #define ADD_FORWARD_CUDA_KERNEL(op_type, ...) \
-  ADD_KERNEL(op_type, true, CUDA, ::paddle::platform::CUDAPlace(), __VA_ARGS__)
+  ADD_KERNEL(op_type, true, CUDA, ::paddle::PaddlePlace(paddle::PlaceType::kGPU), __VA_ARGS__)
 
 #define ADD_BACKWARD_CUDA_KERNEL(op_type, ...) \
-  ADD_KERNEL(op_type, false, CUDA, ::paddle::platform::CUDAPlace(), __VA_ARGS__)
+  ADD_KERNEL(op_type, false, CUDA, ::paddle::PaddlePlace(paddle::PlaceType::kGPU), __VA_ARGS__)
 
 }  // namespace paddle

From 55b6a13b4c2fba9608970a1cba21b94f8e5326fb Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 1 Feb 2021 20:40:46 +0800
Subject: [PATCH 23/83] temporarily disable throw on DSO load failure (debug)

---
 paddle/fluid/platform/dynload/dynamic_loader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index e713054468905..17e5d7bc8274f 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -246,7 +246,7 @@ static inline void* GetDsoHandleFromSearchPath(
 #else
     auto errorno = GetLastError();
 #endif  // !_WIN32
-    if (throw_on_error) {
+    if (false) {
       // NOTE: Special error report case, no need to change its format
       PADDLE_THROW(
           platform::errors::PreconditionNotMet(error_msg, dso_name, errorno));

From abaa67ec086b8f8baa7a1a4ebf1c309f7a0263e9 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 11:26:08 +0800
Subject: [PATCH 24/83] restore throw_on_error check in GetDsoHandleFromSearchPath

---
 paddle/fluid/platform/dynload/dynamic_loader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index 17e5d7bc8274f..e713054468905 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -246,7 +246,7 @@ static inline void* GetDsoHandleFromSearchPath(
 #else
     auto errorno = GetLastError();
 #endif  // !_WIN32
-    if (false) {
+    if (throw_on_error) {
       // NOTE: Special error report case, no need to change its format
       PADDLE_THROW(
           platform::errors::PreconditionNotMet(error_msg, dso_name, errorno));

From f8b23d43c1186bcf92e1cb2e2e020f52cd38eb36 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 13:48:59 +0800
Subject: [PATCH 25/83] add VLOG(0) trace points to dynamic_loader DSO search (debug)

---
 paddle/fluid/platform/dynload/dynamic_loader.cc | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index e713054468905..a8fd906ff0573 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -150,6 +150,7 @@ static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
                                                  const std::string& dso_name,
                                                  int dynload_flags) {
   void* dso_handle = nullptr;
+    VLOG(0)<<"im here2";
   if (!spec_path.empty()) {
     // search xxx.so from custom path
     VLOG(3) << "Try to find library: " << dso_name
@@ -157,6 +158,7 @@ static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
     std::string dso_path = join(spec_path, dso_name);
     dso_handle = dlopen(dso_path.c_str(), dynload_flags);
   }
+    VLOG(0)<<"im here3";
   return dso_handle;
 }
 
@@ -164,7 +166,9 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
                                                 int dynload_flags) {
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   // and /usr/local/lib path
+    VLOG(0)<<"im here8";
   void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
+    VLOG(0)<<"im here9";
   VLOG(3) << "Try to find library: " << dso_path
           << " from default system path.";
 
@@ -204,18 +208,24 @@ static inline void* GetDsoHandleFromSearchPath(
   std::vector<std::string> dso_names = split(dso_name, ";");
   void* dso_handle = nullptr;
   for (auto dso : dso_names) {
+      VLOG(0)<<"im here1";
     // 1. search in user config path by FLAGS
     dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
+      VLOG(0)<<"im here4";
     // 2. search in extra paths
     if (nullptr == dso_handle) {
       for (auto path : extra_paths) {
+          VLOG(0)<<"im here5";
         VLOG(3) << "extra_paths: " << path;
         dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
       }
+        VLOG(0)<<"im here6";
     }
     // 3. search in system default path
     if (nullptr == dso_handle) {
+        VLOG(0)<<"im here7";
       dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
+        VLOG(0)<<"im here10";
     }
     if (nullptr != dso_handle) break;
   }

From ce4ecd084d26c9a6fe6679b0513cbe87b12df763 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 14:01:08 +0800
Subject: [PATCH 26/83] fix compile error

---
 paddle/fluid/platform/dynload/dynamic_loader.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index a8fd906ff0573..f7e306046895b 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -169,7 +169,7 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
     VLOG(0)<<"im here8";
   void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
     VLOG(0)<<"im here9";
-  VLOG(3) << "Try to find library: " << dso_path
+    VLOG(0) << "Try to find library: " << dso_path
           << " from default system path.";
 
 // TODO(chenweihang): This path is used to search which libs?

From 0bb004c8d306a995190ae656a62638b3b5e5725b Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 14:22:46 +0800
Subject: [PATCH 27/83] fix compile error

---
 paddle/fluid/framework/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index bbb11831147b8..c6934796e50c5 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -327,7 +327,7 @@ set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_prot
 
 cc_library(paddle_framework DEPS ${FLUID_FRAMEWORK_MODULES})
 cc_library(paddle_framework_shared
-    SHARED SRCS executor.cc operator.cc custom_operator.cc
+    SHARED SRCS executor.cc operator.cc custom_operator.cc ../extension/src/tensor.cc
     ${CMAKE_CURRENT_SOURCE_DIR}/c/c_api.cc
     ${CMAKE_SOURCE_DIR}/paddle/fluid/imperative/layer.cc
     DEPS ${FLUID_FRAMEWORK_MODULES})

From 33ad438a9c4c76f1e78479df341e4355263ce5fb Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 14:46:13 +0800
Subject: [PATCH 28/83] fix compile error

---
 paddle/fluid/extension/src/tensor.cc            |  1 +
 paddle/fluid/platform/dynload/dynamic_loader.cc | 12 +-----------
 2 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 015868c39f4e6..cfd4db1c8d1f2 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -29,6 +29,7 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
     tensor->Resize(framework::make_ddim(shape));
 }
+
 CustomTensor::CustomTensor(PaddlePlace place):
         tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc
index f7e306046895b..2e3f1f53d26b3 100644
--- a/paddle/fluid/platform/dynload/dynamic_loader.cc
+++ b/paddle/fluid/platform/dynload/dynamic_loader.cc
@@ -150,7 +150,6 @@ static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
                                                  const std::string& dso_name,
                                                  int dynload_flags) {
   void* dso_handle = nullptr;
-    VLOG(0)<<"im here2";
   if (!spec_path.empty()) {
     // search xxx.so from custom path
     VLOG(3) << "Try to find library: " << dso_name
@@ -158,7 +157,6 @@ static inline void* GetDsoHandleFromSpecificPath(const std::string& spec_path,
     std::string dso_path = join(spec_path, dso_name);
     dso_handle = dlopen(dso_path.c_str(), dynload_flags);
   }
-    VLOG(0)<<"im here3";
   return dso_handle;
 }
 
@@ -166,10 +164,8 @@ static inline void* GetDsoHandleFromDefaultPath(const std::string& dso_path,
                                                 int dynload_flags) {
   // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
   // and /usr/local/lib path
-    VLOG(0)<<"im here8";
   void* dso_handle = dlopen(dso_path.c_str(), dynload_flags);
-    VLOG(0)<<"im here9";
-    VLOG(0) << "Try to find library: " << dso_path
+    VLOG(3) << "Try to find library: " << dso_path
           << " from default system path.";
 
 // TODO(chenweihang): This path is used to search which libs?
@@ -208,24 +204,18 @@ static inline void* GetDsoHandleFromSearchPath(
   std::vector<std::string> dso_names = split(dso_name, ";");
   void* dso_handle = nullptr;
   for (auto dso : dso_names) {
-      VLOG(0)<<"im here1";
     // 1. search in user config path by FLAGS
     dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags);
-      VLOG(0)<<"im here4";
     // 2. search in extra paths
     if (nullptr == dso_handle) {
       for (auto path : extra_paths) {
-          VLOG(0)<<"im here5";
         VLOG(3) << "extra_paths: " << path;
         dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags);
       }
-        VLOG(0)<<"im here6";
     }
     // 3. search in system default path
     if (nullptr == dso_handle) {
-        VLOG(0)<<"im here7";
       dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags);
-        VLOG(0)<<"im here10";
     }
     if (nullptr != dso_handle) break;
   }

From 2e433cc0de8c09b1f533ee22fd3959e44c0b913f Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 16:11:59 +0800
Subject: [PATCH 29/83] fix compile error

---
 paddle/fluid/extension/src/tensor.cc | 41 ++++++++++++++++++++++++++--
 1 file changed, 38 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index cfd4db1c8d1f2..e2f2cce2d7c03 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -50,7 +50,7 @@ T *CustomTensor::mutable_data() {
     PADDLE_ENFORCE_GT(
             tensor->numel(), 0,
             platform::errors::PreconditionNotMet(
-                    "You should call ZeroCopyTensor::Reshape(const std::vector<int> "
+                    "You should call CustomTensor::Reshape(const std::vector<int> "
                     "&shape)"
                     "function before retrieving mutable_data from input tensor."));
     switch (static_cast<int>(place_.GetPlace())) {
@@ -68,7 +68,7 @@ T *CustomTensor::mutable_data() {
                                                    static_cast<int>(place_.GetPlace())));
     }
 }
-
+    
 template <typename T>
 T *CustomTensor::data() const {
     GET_CASTED_TENSOR;
@@ -96,7 +96,7 @@ void CustomTensor::copy_from_cpu(const T *data) {
     GET_CASTED_TENSOR;
     PADDLE_ENFORCE_GE(tensor->numel(), 0,
                       platform::errors::PreconditionNotMet(
-                              "You should call ZeroCopyTensor::Reshape(const "
+                              "You should call CustomTensor::Reshape(const "
                               "std::vector<int> &shape)"
                               "function before copying data from cpu."));
     size_t ele_size = tensor->numel() * sizeof(T);
@@ -148,6 +148,41 @@ cudaStreamSynchronize(dev_ctx->stream());
     }
 }
 
+template  void CustomTensor::copy_from_cpu<float>(const float *data);
+template  void CustomTensor::copy_from_cpu<double>(const double *data);
+template  void CustomTensor::copy_from_cpu<int64_t>(const int64_t *data);
+template  void CustomTensor::copy_from_cpu<int32_t>(const int32_t *data);
+template  void CustomTensor::copy_from_cpu<uint8_t>(const uint8_t *data);
+template  void CustomTensor::copy_from_cpu<int8_t>(const int8_t *data);
+
+template  void CustomTensor::copy_to_cpu<float>(float *data);
+template  void CustomTensor::copy_to_cpu<double>(double *data);
+template  void CustomTensor::copy_to_cpu<int64_t>(int64_t *data);
+template  void CustomTensor::copy_to_cpu<int32_t>(int32_t *data);
+template  void CustomTensor::copy_to_cpu<uint8_t>(uint8_t *data);
+template  void CustomTensor::copy_to_cpu<int8_t>(int8_t *data);
+
+template  float *CustomTensor::data<float>() const;
+template  double *CustomTensor::data<double>() const;
+template  int64_t *CustomTensor::data<int64_t>() const;
+template  int32_t *CustomTensor::data<int32_t>() const;
+template  uint8_t *CustomTensor::data<uint8_t>() const;
+template  int8_t *CustomTensor::data<int8_t>() const;
+
+template  float *CustomTensor::mutable_data<float>();
+template  double *CustomTensor::mutable_data<double>();
+template  int64_t *CustomTensor::mutable_data<int64_t>();
+template  int32_t *CustomTensor::mutable_data<int32_t>();
+template  uint8_t *CustomTensor::mutable_data<uint8_t>();
+template  int8_t *CustomTensor::mutable_data<int8_t>();
+
+template  float *CustomTensor::mutable_data<float>(const PaddlePlace& place);
+template  double *CustomTensor::mutable_data<double>(const PaddlePlace& place);
+template  int64_t *CustomTensor::mutable_data<int64_t>(const PaddlePlace& place);
+template  int32_t *CustomTensor::mutable_data<int32_t>(const PaddlePlace& place);
+template  uint8_t *CustomTensor::mutable_data<uint8_t>(const PaddlePlace& place);
+template  int8_t *CustomTensor::mutable_data<int8_t>(const PaddlePlace& place);
+
 std::vector<int> CustomTensor::shape() const {
     GET_CASTED_TENSOR
     return framework::vectorize<int>(tensor->dims());

From 6c1752e9f52920216cb676abc3dc35ea632db8fe Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 17:34:34 +0800
Subject: [PATCH 30/83] make place const

---
 paddle/fluid/extension/include/tensor.h   | 4 ++--
 paddle/fluid/extension/src/tensor.cc      | 2 +-
 paddle/fluid/framework/custom_operator.cc | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index a871c6d7c407b..34e0ca60e2467 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -93,11 +93,11 @@ class CustomTensor{
     /// \brief Get the place of current tensor.
     /// Use this method to get the place of tensor
     /// \return Place.
-    const PaddlePlace& place();
+    const PaddlePlace& place() const;
 
 private:
     mutable std::shared_ptr<void> tensor_;
-    PaddlePlace place_;
+    mutable PaddlePlace place_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index e2f2cce2d7c03..a3e748db25ee0 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -206,7 +206,7 @@ std::vector<std::vector<size_t>> CustomTensor::lod() const {
     return res;
 }
 
-const PaddlePlace& CustomTensor::place() {
+const PaddlePlace& CustomTensor::place() const {
     GET_CASTED_TENSOR;
     if(platform::is_cpu_place(tensor->place())){
         place_ = PaddlePlace(PlaceType::kCPU);
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index e4e291ae5b6a8..9a6e3dc5c95bc 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -295,7 +295,7 @@ void RegisterOperator(const std::string& name, size_t input_num,
 }
 
 
-platform::Place PaddlePlaceToPlatformPlace(PaddlePlace pc){
+platform::Place PaddlePlaceToPlatformPlace(const PaddlePlace& pc){
     if(pc.GetPlace() == PlaceType::kCPU){
         return platform::Place(platform::CPUPlace());
     }else if(pc.GetPlace() == PlaceType::kGPU){

From a4d190b2cbcf54691150d002124c38dc34e66daf Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Tue, 2 Feb 2021 17:45:44 +0800
Subject: [PATCH 31/83] make Tensor copy

---
 paddle/fluid/extension/include/op_function.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/extension/include/op_function.h b/paddle/fluid/extension/include/op_function.h
index 5f685597324eb..eedbceda90036 100644
--- a/paddle/fluid/extension/include/op_function.h
+++ b/paddle/fluid/extension/include/op_function.h
@@ -88,7 +88,7 @@ struct KernelFuncImpl;
 
 template <typename Return, typename... Args, Return (*impl_fn)(Args...)>
 struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
-  static Return Compute(std::vector<const Tensor*> inputs,
+  static Return Compute(std::vector<Tensor> inputs,
                         std::vector<boost::any> attrs) {
     return ComputeCallHelper<Args..., TypeTag<int>>::template Compute<0, 0>(
         inputs, attrs);
@@ -102,12 +102,12 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<const Tensor&, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const Tensor*> inputs,
+    static Return Compute(std::vector<Tensor> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       static_assert(attr_idx == 0,
                     "Input tensor should appear before attributes.");
-      const Tensor& arg = *(inputs[in_idx]);
+      const Tensor& arg = inputs[in_idx];
       return ComputeCallHelper<Tail...>::template Compute<in_idx + 1, attr_idx>(
           inputs, attrs, pargs..., arg);
     }
@@ -117,7 +117,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename... Tail>
   struct ComputeCallHelper<int, Tail...> {
     template <int in_idx, int attr_idx, typename... PreviousArgs>
-    static Return Compute(std::vector<const Tensor*> inputs,
+    static Return Compute(std::vector<Tensor> inputs,
                           std::vector<boost::any> attrs,
                           const PreviousArgs&... pargs) {
       try {
@@ -137,7 +137,7 @@ struct KernelFuncImpl<Return (*)(Args...), impl_fn> {
   template <typename T>
   struct ComputeCallHelper<TypeTag<T>> {
     template <int in_idx, int attr_idx>
-    static Return Compute(std::vector<const Tensor*> inputs,
+    static Return Compute(std::vector<Tensor> inputs,
                           std::vector<boost::any> attrs, const Args&... args) {
       return impl_fn(args...);
     }

From b9dde0aeb147edb1cdab8a672c1087e18a4c2260 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 03:29:17 +0000
Subject: [PATCH 32/83] debug CustomTensor core

---
 paddle/fluid/extension/include/dispatch.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h
index 5434b2fabf1ec..336aca591457e 100644
--- a/paddle/fluid/extension/include/dispatch.h
+++ b/paddle/fluid/extension/include/dispatch.h
@@ -21,6 +21,7 @@ namespace paddle {
 #define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
   case enum_type: {                                                       \
     using HINT = type;                                                    \
+    std::cout<<"im here3"<<std::endl;                                     \
     return __VA_ARGS__();                                                 \
   }
 
@@ -30,6 +31,7 @@ namespace paddle {
 #define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
   [&] {                                                                      \
     const auto& dtype = TYPE;                                                \
+    std::cout<<"im here2"<<std::endl;                                        \
     switch (dtype) {                                                         \
       PD_PRIVATE_CASE_TYPE(NAME, ::paddle::framework::proto::VarType::FP32,  \
                            float, __VA_ARGS__)                               \

From 219746ae4e6cf343e1f334304fd77045c149bfd5 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 03:37:03 +0000
Subject: [PATCH 33/83] debug CustomTensor core

---
 paddle/fluid/extension/include/dispatch.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h
index 336aca591457e..2f4e843b5cac8 100644
--- a/paddle/fluid/extension/include/dispatch.h
+++ b/paddle/fluid/extension/include/dispatch.h
@@ -19,7 +19,9 @@ limitations under the License. */
 namespace paddle {
 
 #define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
+  std::cout<<"im here2.1"<<std::endl;                                                                        \
   case enum_type: {                                                       \
+    std::cout<<"im here2.2"<<std::endl;\
     using HINT = type;                                                    \
     std::cout<<"im here3"<<std::endl;                                     \
     return __VA_ARGS__();                                                 \

From bedd624810690a005d67e0a00e9e93ae96137c72 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 04:11:25 +0000
Subject: [PATCH 34/83] debug CustomTensor core

---
 paddle/fluid/extension/include/dispatch.h | 11 ++++-------
 paddle/fluid/extension/include/dtype.h    |  1 +
 paddle/fluid/extension/src/tensor.cc      |  2 ++
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h
index 2f4e843b5cac8..a3dd3e9e01119 100644
--- a/paddle/fluid/extension/include/dispatch.h
+++ b/paddle/fluid/extension/include/dispatch.h
@@ -19,12 +19,10 @@ limitations under the License. */
 namespace paddle {
 
 #define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \
-  std::cout<<"im here2.1"<<std::endl;                                                                        \
   case enum_type: {                                                       \
-    std::cout<<"im here2.2"<<std::endl;\
     using HINT = type;                                                    \
-    std::cout<<"im here3"<<std::endl;                                     \
-    return __VA_ARGS__();                                                 \
+    __VA_ARGS__();                                                        \
+    break;                                                                \
   }
 
 #define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \
@@ -33,11 +31,10 @@ namespace paddle {
 #define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...)                          \
   [&] {                                                                      \
     const auto& dtype = TYPE;                                                \
-    std::cout<<"im here2"<<std::endl;                                        \
     switch (dtype) {                                                         \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::framework::proto::VarType::FP32,  \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::PaddleDType::FLOAT32,             \
                            float, __VA_ARGS__)                               \
-      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::framework::proto::VarType::FP64,  \
+      PD_PRIVATE_CASE_TYPE(NAME, ::paddle::PaddleDType::FLOAT64,             \
                            double, __VA_ARGS__)                              \
       default:                                                               \
         throw std::runtime_error("function not implemented for this type."); \
diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index bc649897aa702..7c809907b7ecf 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -20,6 +20,7 @@ namespace paddle {
 
 enum PaddleDType {
         FLOAT32,
+        FLOAT64,
         INT64,
         INT32,
         UINT8,
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index a3e748db25ee0..d033f60bac14b 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -87,6 +87,8 @@ PaddleDType CustomTensor::type() const {
         return PaddleDType::INT32;
     } else if (type == framework::proto::VarType::UINT8) {
         return PaddleDType::UINT8;
+    } else if (type == framework::proto::VarType::FP64){
+        return PaddleDType::FLOAT64;
     }
     return PaddleDType::FLOAT32;
 }

From a148ea224d7068a54a4ce7f964e4be9f2c43a214 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 05:07:33 +0000
Subject: [PATCH 35/83] debug CustomTensor core

---
 paddle/fluid/framework/custom_operator.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 9a6e3dc5c95bc..13719b589008c 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -102,6 +102,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
           "Custom operator can only hold 1 output as vector<Tensor>."));
   auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
   for (size_t i = 0; i < true_outs.size(); ++i) {
+      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
       outs.at(i).ShareDataWith((true_outs)[i]);
   }
 }

From 1757e3ac2b0ea911a656e942102458a9b02cab3b Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 05:14:59 +0000
Subject: [PATCH 36/83] debug CustomTensor core

---
 paddle/fluid/framework/custom_operator.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 13719b589008c..2e1d1adf96daf 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -100,11 +100,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
       out_name.size(), 1UL,
       platform::errors::InvalidArgument(
           "Custom operator can only hold 1 output as vector<Tensor>."));
-  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-  for (size_t i = 0; i < true_outs.size(); ++i) {
-      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
-      outs.at(i).ShareDataWith((true_outs)[i]);
-  }
+//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+//  for (size_t i = 0; i < true_outs.size(); ++i) {
+//      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
+//      outs.at(i).ShareDataWith((true_outs)[i]);
+//  }
 }
 
 //////////////////// Operator Define /////////////////

From 1815a0fae9241eab11ebc8c1da25f543bfd28b1b Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 05:45:22 +0000
Subject: [PATCH 37/83] debug CustomTensor core

---
 paddle/fluid/framework/custom_operator.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 2e1d1adf96daf..13719b589008c 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -100,11 +100,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
       out_name.size(), 1UL,
       platform::errors::InvalidArgument(
           "Custom operator can only hold 1 output as vector<Tensor>."));
-//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-//  for (size_t i = 0; i < true_outs.size(); ++i) {
-//      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
-//      outs.at(i).ShareDataWith((true_outs)[i]);
-//  }
+  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+  for (size_t i = 0; i < true_outs.size(); ++i) {
+      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
+      outs.at(i).ShareDataWith((true_outs)[i]);
+  }
 }
 
 //////////////////// Operator Define /////////////////

From b1e94cd0544948fd1312fa8fb52414fea7b65989 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 05:51:36 +0000
Subject: [PATCH 38/83] debug CustomTensor core

---
 paddle/fluid/framework/custom_operator.cc | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 13719b589008c..2e1d1adf96daf 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -100,11 +100,11 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
       out_name.size(), 1UL,
       platform::errors::InvalidArgument(
           "Custom operator can only hold 1 output as vector<Tensor>."));
-  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-  for (size_t i = 0; i < true_outs.size(); ++i) {
-      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
-      outs.at(i).ShareDataWith((true_outs)[i]);
-  }
+//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+//  for (size_t i = 0; i < true_outs.size(); ++i) {
+//      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
+//      outs.at(i).ShareDataWith((true_outs)[i]);
+//  }
 }
 
 //////////////////// Operator Define /////////////////

From dbd0e17841e9a3c8e21dc469234e87fdc24b05a9 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 05:59:17 +0000
Subject: [PATCH 39/83] debug CustomTensor core

---
 paddle/fluid/framework/custom_operator.cc | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 2e1d1adf96daf..a907941163724 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -100,11 +100,12 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
       out_name.size(), 1UL,
       platform::errors::InvalidArgument(
           "Custom operator can only hold 1 output as vector<Tensor>."));
-//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-//  for (size_t i = 0; i < true_outs.size(); ++i) {
-//      outs.at(i).SetLoD(std::vector<std::vector<size_t>>());
-//      outs.at(i).ShareDataWith((true_outs)[i]);
-//  }
+  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+  for (size_t i = 0; i < true_outs.size(); ++i) {
+      auto tmp = std::vector<std::vector<size_t>>({{1}});
+      outs.at(i).SetLoD(tmp);
+      outs.at(i).ShareDataWith((true_outs)[i]);
+  }
 }
 
 //////////////////// Operator Define /////////////////

From 984d11f7367d272d1a92db01ce4f65df6cc9a602 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 06:31:05 +0000
Subject: [PATCH 40/83] debug CustomTensor core

---
 paddle/fluid/extension/include/tensor.h   | 14 +++----
 paddle/fluid/extension/src/tensor.cc      | 46 +++++++++++------------
 paddle/fluid/framework/custom_operator.cc | 13 ++++++-
 3 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 34e0ca60e2467..6d2ef2a3e6675 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -66,13 +66,13 @@ class CustomTensor{
     /// \brief Return the shape of the Tensor.
     std::vector<int> shape() const;
 
-    /// \brief Set lod info of the tensor.
-    /// More about LOD can be seen here:
-    ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
-    /// \param x the lod info.
-    void SetLoD(const std::vector<std::vector<size_t>>& x);
-    /// \brief Return the lod info of the tensor.
-    std::vector<std::vector<size_t>> lod() const;
+//    /// \brief Set lod info of the tensor.
+//    /// More about LOD can be seen here:
+//    ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+//    /// \param x the lod info.
+//    void SetLoD(const std::vector<std::vector<size_t>>& x);
+//    /// \brief Return the lod info of the tensor.
+//    std::vector<std::vector<size_t>> lod() const;
 
     /// \brief Return the data type of the tensor.
     /// It's usually used to get the output tensor data type.
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index d033f60bac14b..a192fbe024219 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -21,9 +21,9 @@ namespace paddle {
 
 #define GET_CASTED_TENSOR                               \
   if (!tensor_) {                                       \
-    tensor_ = std::make_shared<framework::LoDTensor>(); \
+    tensor_ = std::make_shared<framework::Tensor>(); \
   }                                                     \
-  auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
+  auto *tensor = static_cast<framework::Tensor *>(tensor_.get());
 
 void CustomTensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
@@ -31,11 +31,11 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
 }
 
 CustomTensor::CustomTensor(PaddlePlace place):
-        tensor_(std::make_shared<framework::LoDTensor>()),
+        tensor_(std::make_shared<framework::Tensor>()),
         place_(place){};
 
 CustomTensor::CustomTensor(void* raw_tensor) :
-        tensor_(static_cast<framework::LoDTensor*>(raw_tensor)),
+        tensor_(static_cast<framework::Tensor*>(raw_tensor)),
         place_(PlaceType::kUNK){}
 
 template <typename T>
@@ -190,23 +190,23 @@ std::vector<int> CustomTensor::shape() const {
     return framework::vectorize<int>(tensor->dims());
 }
 
-void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-    GET_CASTED_TENSOR;
-    framework::LoD lod;
-    for (auto &level : x) {
-        lod.emplace_back(level);
-    }
-    tensor->set_lod(lod);
-}
-
-std::vector<std::vector<size_t>> CustomTensor::lod() const {
-    GET_CASTED_TENSOR;
-    std::vector<std::vector<size_t>> res;
-    for (auto &level : tensor->lod()) {
-        res.emplace_back(level);
-    }
-    return res;
-}
+//void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+//    GET_CASTED_TENSOR;
+//    framework::LoD lod;
+//    for (auto &level : x) {
+//        lod.emplace_back(level);
+//    }
+//    tensor->set_lod(lod);
+//}
+//
+//std::vector<std::vector<size_t>> CustomTensor::lod() const {
+//    GET_CASTED_TENSOR;
+//    std::vector<std::vector<size_t>> res;
+//    for (auto &level : tensor->lod()) {
+//        res.emplace_back(level);
+//    }
+//    return res;
+//}
 
 const PaddlePlace& CustomTensor::place() const {
     GET_CASTED_TENSOR;
@@ -223,9 +223,9 @@ const PaddlePlace& CustomTensor::place() const {
 }
 
 void CustomTensor::ShareDataWith(void* out_data){
-    static_cast<framework::LoDTensor*>(out_data)
+    static_cast<framework::Tensor*>(out_data)
     ->ShareDataWith(
-            *static_cast<framework::LoDTensor*>(tensor_.get()));
+            *static_cast<framework::Tensor*>(tensor_.get()));
 }
 
 int64_t CustomTensor::size() const{
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index a907941163724..fbeb6c96fd02b 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -103,9 +103,20 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
   auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
   for (size_t i = 0; i < true_outs.size(); ++i) {
       auto tmp = std::vector<std::vector<size_t>>({{1}});
-      outs.at(i).SetLoD(tmp);
+//      outs.at(i).SetLoD(tmp);
       outs.at(i).ShareDataWith((true_outs)[i]);
   }
+//  auto out_name = ctx.OutNameList();
+//  PADDLE_ENFORCE_EQ(
+//      out_name.size(), 1UL,
+//      platform::errors::InvalidArgument(
+//          "Custom operator can only hold 1 output as vector<Tensor>."));
+//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
+//  for (size_t i = 0; i < true_outs.size(); ++i) {
+//      auto tmp = std::vector<std::vector<size_t>>({{1}});
+//      outs.at(i).SetLoD(tmp);
+//      outs.at(i).ShareDataWith((true_outs)[i]);
+//  }
 }
 
 //////////////////// Operator Define /////////////////

From 1d2eae7e4b3f355ba9edaf97ee502e1bf4526dd7 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 08:07:14 +0000
Subject: [PATCH 41/83] debug CustomTensor core

---
 paddle/fluid/extension/include/tensor.h |  5 ++++-
 paddle/fluid/extension/src/tensor.cc    | 30 ++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 6d2ef2a3e6675..764588e1eb840 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -25,6 +25,9 @@ class CustomTensor{
     /// Generally it's only used for user to create CustomTensor.
     explicit CustomTensor(PaddlePlace place);
     explicit CustomTensor(void* raw_tensor);
+    ~CustomTensor();
+    CustomTensor(const CustomTensor& origin);
+    CustomTensor& operator=(const CustomTensor& origin);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
     /// Reshape must be called before calling mutable_data() or copy_from_cpu()
@@ -96,7 +99,7 @@ class CustomTensor{
     const PaddlePlace& place() const;
 
 private:
-    mutable std::shared_ptr<void> tensor_;
+    mutable void* tensor_;
     mutable PaddlePlace place_;
 };
 
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index a192fbe024219..4a839621ec297 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -21,23 +21,47 @@ namespace paddle {
 
 #define GET_CASTED_TENSOR                               \
   if (!tensor_) {                                       \
-    tensor_ = std::make_shared<framework::Tensor>(); \
+    tensor_ = new framework::Tensor(); \
   }                                                     \
-  auto *tensor = static_cast<framework::Tensor *>(tensor_.get());
+  auto *tensor = static_cast<framework::Tensor *>(tensor_);
 
 void CustomTensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
     tensor->Resize(framework::make_ddim(shape));
 }
 
+CustomTensor::CustomTensor(const CustomTensor& origin) {
+    place_ = origin.place();
+    tensor_ = new framework::Tensor();
+    auto *tensor = static_cast<framework::Tensor *>(tensor_);
+    auto *origin_tensor = static_cast<framework::Tensor *>(origin.tensor_);
+    tensor->ShareDataWith(*origin_tensor);
+}
+
 CustomTensor::CustomTensor(PaddlePlace place):
-        tensor_(std::make_shared<framework::Tensor>()),
+        tensor_(new framework::Tensor()),
         place_(place){};
 
 CustomTensor::CustomTensor(void* raw_tensor) :
         tensor_(static_cast<framework::Tensor*>(raw_tensor)),
         place_(PlaceType::kUNK){}
 
+CustomTensor::~CustomTensor() {
+    delete static_cast<framework::Tensor*>(tensor_);
+    tensor_ = nullptr;
+}
+
+CustomTensor& CustomTensor::operator=(const CustomTensor& other){
+    if (this != &other) // not a self-assignment
+    {
+        other.place_ = place();
+        auto *tensor = static_cast<framework::Tensor *>(tensor_);
+        auto *other_tensor = static_cast<framework::Tensor *>(other.tensor_);
+        other_tensor->ShareDataWith(*tensor);
+    }
+    return *this;
+}
+
 template <typename T>
 T *CustomTensor::mutable_data(const PaddlePlace& place) {
     place_ = place;

From eda48e84eea35f14d8f9c146c5e11d8d01458c82 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 08:09:28 +0000
Subject: [PATCH 42/83] debug CustomTensor core

---
 paddle/fluid/extension/include/device.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/extension/include/device.h b/paddle/fluid/extension/include/device.h
index 28a2bd7931c3a..3acfa6348ab9d 100644
--- a/paddle/fluid/extension/include/device.h
+++ b/paddle/fluid/extension/include/device.h
@@ -20,6 +20,7 @@ enum class PlaceType { kUNK = -1, kCPU, kGPU };
 
 class PaddlePlace {
 public:
+    PaddlePlace() : pc_(PlaceType::kUNK){};
     explicit PaddlePlace(PlaceType pc) : pc_(pc){}
     const PlaceType& GetPlace() const { return pc_; };
 

From 284125ca46d2a879a8556df3089ee8acf83297b4 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 08:11:28 +0000
Subject: [PATCH 43/83] debug CustomTensor core

---
 paddle/fluid/extension/src/tensor.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 4a839621ec297..1381cf03be7a7 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -249,7 +249,7 @@ const PaddlePlace& CustomTensor::place() const {
 void CustomTensor::ShareDataWith(void* out_data){
     static_cast<framework::Tensor*>(out_data)
     ->ShareDataWith(
-            *static_cast<framework::Tensor*>(tensor_.get()));
+            *static_cast<framework::Tensor*>(tensor_));
 }
 
 int64_t CustomTensor::size() const{

From 0851daa8c40fcf9319bcd8c7e9d4930aa089dbee Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 08:21:56 +0000
Subject: [PATCH 44/83] debug CustomTensor core

---
 paddle/fluid/extension/src/tensor.cc | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 1381cf03be7a7..8a1edb72c7cab 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -42,9 +42,13 @@ CustomTensor::CustomTensor(PaddlePlace place):
         tensor_(new framework::Tensor()),
         place_(place){};
 
-CustomTensor::CustomTensor(void* raw_tensor) :
-        tensor_(static_cast<framework::Tensor*>(raw_tensor)),
-        place_(PlaceType::kUNK){}
+CustomTensor::CustomTensor(void* raw_tensor){
+    tensor_ = new framework::Tensor();
+    auto *tensor = static_cast<framework::Tensor *>(tensor_);
+    auto *src_tensor = static_cast<framework::Tensor *>(raw_tensor);
+    tensor->ShareDataWith(*src_tensor);
+    place_ = place();
+}
 
 CustomTensor::~CustomTensor() {
     delete static_cast<framework::Tensor*>(tensor_);

From ea98ccbaa50b485afe7d1f105bccfcf3fb0f37af Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 08:24:00 +0000
Subject: [PATCH 45/83] debug CustomTensor core

---
 paddle/fluid/framework/custom_operator.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index fbeb6c96fd02b..cd55d2e18418b 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -69,7 +69,7 @@ static T* DynLoad(void* handle, std::string name) {
 static void RunKernelFunc(const framework::ExecutionContext& ctx,
                           paddle::KernelFunc func) {
   VLOG(1) << "Custom Operator: Start run KernelFunc.";
-  std::vector<framework::Tensor> ins;
+  std::vector<const framework::Tensor*> ins;
   for (auto name : ctx.InNameList()) {
     VLOG(1) << "Custom Operator: input name - " << name;
     auto* x = ctx.Input<Tensor>(name);
@@ -78,15 +78,13 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
     PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
-    auto custom_use_input = framework::Tensor();
-    custom_use_input.ShareDataWith(*x);
-    ins.push_back(custom_use_input);
+    ins.push_back(x);
   }
 
   std::vector<CustomTensor> custom_use_ins;
   custom_use_ins.reserve(ins.size());
-  for(auto tensor : ins){
-      custom_use_ins.emplace_back(CustomTensor((void*)(&tensor)));
+  for(auto in : ins){
+      custom_use_ins.emplace_back(CustomTensor((void*)(in)));
   }
   std::vector<boost::any> attrs;
 

From e04bd30314f0f3a595fdc5b13cb047963fa7e9dc Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 08:45:10 +0000
Subject: [PATCH 46/83] remove additional header of framework

---
 paddle/fluid/extension/include/dispatch.h | 2 +-
 paddle/fluid/extension/include/dtype.h    | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h
index a3dd3e9e01119..53c1e95391765 100644
--- a/paddle/fluid/extension/include/dispatch.h
+++ b/paddle/fluid/extension/include/dispatch.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/extension/include/dtype.h"
 
 namespace paddle {
 
diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index 7c809907b7ecf..5cbecc0be52f7 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -14,8 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/fluid/framework/data_type.h"
-
 namespace paddle {
 
 enum PaddleDType {

From 1c0cd18ee2f1ea53c2e2bf940de972dde32e54ea Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 09:02:33 +0000
Subject: [PATCH 47/83] use back to shared ptr for custom tensor

---
 paddle/fluid/extension/include/dispatch.h |  1 +
 paddle/fluid/extension/include/tensor.h   | 19 +++---
 paddle/fluid/extension/src/tensor.cc      | 81 ++++++++---------------
 paddle/fluid/framework/custom_operator.cc |  6 +-
 4 files changed, 38 insertions(+), 69 deletions(-)

diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h
index 53c1e95391765..15a5e65f55b43 100644
--- a/paddle/fluid/extension/include/dispatch.h
+++ b/paddle/fluid/extension/include/dispatch.h
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/extension/include/dtype.h"
+#include "paddle/fluid/framework/data_type.h"
 
 namespace paddle {
 
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 764588e1eb840..34e0ca60e2467 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -25,9 +25,6 @@ class CustomTensor{
     /// Generally it's only used for user to create CustomTensor.
     explicit CustomTensor(PaddlePlace place);
     explicit CustomTensor(void* raw_tensor);
-    ~CustomTensor();
-    CustomTensor(const CustomTensor& origin);
-    CustomTensor& operator=(const CustomTensor& origin);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
     /// Reshape must be called before calling mutable_data() or copy_from_cpu()
@@ -69,13 +66,13 @@ class CustomTensor{
     /// \brief Return the shape of the Tensor.
     std::vector<int> shape() const;
 
-//    /// \brief Set lod info of the tensor.
-//    /// More about LOD can be seen here:
-//    ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
-//    /// \param x the lod info.
-//    void SetLoD(const std::vector<std::vector<size_t>>& x);
-//    /// \brief Return the lod info of the tensor.
-//    std::vector<std::vector<size_t>> lod() const;
+    /// \brief Set lod info of the tensor.
+    /// More about LOD can be seen here:
+    ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
+    /// \param x the lod info.
+    void SetLoD(const std::vector<std::vector<size_t>>& x);
+    /// \brief Return the lod info of the tensor.
+    std::vector<std::vector<size_t>> lod() const;
 
     /// \brief Return the data type of the tensor.
     /// It's usually used to get the output tensor data type.
@@ -99,7 +96,7 @@ class CustomTensor{
     const PaddlePlace& place() const;
 
 private:
-    mutable void* tensor_;
+    mutable std::shared_ptr<void> tensor_;
     mutable PaddlePlace place_;
 };
 
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 8a1edb72c7cab..8a690514a43cb 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -19,52 +19,23 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 namespace paddle {
 
-#define GET_CASTED_TENSOR                               \
-  if (!tensor_) {                                       \
-    tensor_ = new framework::Tensor(); \
-  }                                                     \
-  auto *tensor = static_cast<framework::Tensor *>(tensor_);
+#define GET_CASTED_TENSOR                                     \
+  if (!tensor_) {                                             \
+    tensor_ = std::make_shared<framework::LoDTensor>();          \
+  }                                                           \
+  auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
 
 void CustomTensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
     tensor->Resize(framework::make_ddim(shape));
 }
 
-CustomTensor::CustomTensor(const CustomTensor& origin) {
-    place_ = origin.place();
-    tensor_ = new framework::Tensor();
-    auto *tensor = static_cast<framework::Tensor *>(tensor_);
-    auto *origin_tensor = static_cast<framework::Tensor *>(origin.tensor_);
-    tensor->ShareDataWith(*origin_tensor);
-}
-
 CustomTensor::CustomTensor(PaddlePlace place):
-        tensor_(new framework::Tensor()),
+        tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
 
-CustomTensor::CustomTensor(void* raw_tensor){
-    tensor_ = new framework::Tensor();
-    auto *tensor = static_cast<framework::Tensor *>(tensor_);
-    auto *src_tensor = static_cast<framework::Tensor *>(raw_tensor);
-    tensor->ShareDataWith(*src_tensor);
-    place_ = place();
-}
-
-CustomTensor::~CustomTensor() {
-    delete static_cast<framework::Tensor*>(tensor_);
-    tensor_ = nullptr;
-}
+CustomTensor::CustomTensor(void* raw_tensor): tensor_(raw_tensor), place_(PlaceType::kUNK){}
 
-CustomTensor& CustomTensor::operator=(const CustomTensor& other){
-    if (this != &other) // not a self-assignment
-    {
-        other.place_ = place();
-        auto *tensor = static_cast<framework::Tensor *>(tensor_);
-        auto *other_tensor = static_cast<framework::Tensor *>(other.tensor_);
-        other_tensor->ShareDataWith(*tensor);
-    }
-    return *this;
-}
 
 template <typename T>
 T *CustomTensor::mutable_data(const PaddlePlace& place) {
@@ -218,23 +189,23 @@ std::vector<int> CustomTensor::shape() const {
     return framework::vectorize<int>(tensor->dims());
 }
 
-//void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-//    GET_CASTED_TENSOR;
-//    framework::LoD lod;
-//    for (auto &level : x) {
-//        lod.emplace_back(level);
-//    }
-//    tensor->set_lod(lod);
-//}
-//
-//std::vector<std::vector<size_t>> CustomTensor::lod() const {
-//    GET_CASTED_TENSOR;
-//    std::vector<std::vector<size_t>> res;
-//    for (auto &level : tensor->lod()) {
-//        res.emplace_back(level);
-//    }
-//    return res;
-//}
+void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+    GET_CASTED_TENSOR;
+    framework::LoD lod;
+    for (auto &level : x) {
+        lod.emplace_back(level);
+    }
+    tensor->set_lod(lod);
+}
+
+std::vector<std::vector<size_t>> CustomTensor::lod() const {
+    GET_CASTED_TENSOR;
+    std::vector<std::vector<size_t>> res;
+    for (auto &level : tensor->lod()) {
+        res.emplace_back(level);
+    }
+    return res;
+}
 
 const PaddlePlace& CustomTensor::place() const {
     GET_CASTED_TENSOR;
@@ -251,9 +222,9 @@ const PaddlePlace& CustomTensor::place() const {
 }
 
 void CustomTensor::ShareDataWith(void* out_data){
-    static_cast<framework::Tensor*>(out_data)
+    static_cast<framework::LoDTensor*>(out_data)
     ->ShareDataWith(
-            *static_cast<framework::Tensor*>(tensor_));
+            *static_cast<framework::LoDTensor*>(tensor_.get()));
 }
 
 int64_t CustomTensor::size() const{
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index cd55d2e18418b..5819ea6ce6013 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -78,7 +78,9 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
     PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
-    ins.push_back(x);
+    auto* tmp_in = new framework::Tensor();
+    tmp_in->ShareDataWith(*x);
+    ins.push_back(tmp_in);
   }
 
   std::vector<CustomTensor> custom_use_ins;
@@ -100,8 +102,6 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
           "Custom operator can only hold 1 output as vector<Tensor>."));
   auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
   for (size_t i = 0; i < true_outs.size(); ++i) {
-      auto tmp = std::vector<std::vector<size_t>>({{1}});
-//      outs.at(i).SetLoD(tmp);
       outs.at(i).ShareDataWith((true_outs)[i]);
   }
 //  auto out_name = ctx.OutNameList();

From aa09b08e1d75587f1a0dc0b0caf017f7cd3cb81d Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 09:13:03 +0000
Subject: [PATCH 48/83] use back to shared ptr for custom tensor

---
 paddle/fluid/extension/src/tensor.cc | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 8a690514a43cb..0738fbd0c15c7 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -34,7 +34,9 @@ CustomTensor::CustomTensor(PaddlePlace place):
         tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
 
-CustomTensor::CustomTensor(void* raw_tensor): tensor_(raw_tensor), place_(PlaceType::kUNK){}
+CustomTensor::CustomTensor(void* raw_tensor):
+    tensor_(static_cast<framework::LoDTensor*>(raw_tensor)),
+    place_(PlaceType::kUNK){}
 
 
 template <typename T>

From 330b650e3722b5caa9e0049888ec870b1608d1e9 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 09:17:26 +0000
Subject: [PATCH 49/83] use back to shared ptr for custom tensor

---
 paddle/fluid/extension/include/dtype.h  | 2 ++
 paddle/fluid/extension/include/tensor.h | 1 +
 2 files changed, 3 insertions(+)

diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index 5cbecc0be52f7..7c809907b7ecf 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/framework/data_type.h"
+
 namespace paddle {
 
 enum PaddleDType {
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 34e0ca60e2467..88083537474b8 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/extension/include/device.h"
 #include "paddle/fluid/extension/include/dtype.h"
+#include <memory>
 
 namespace paddle {
 

From 743a91f4ac9368ee036cb46d97a2ca729c2fb84d Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 09:32:55 +0000
Subject: [PATCH 50/83] use back to shared ptr for custom tensor

---
 paddle/fluid/framework/custom_operator.cc | 67 +++++++++++------------
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 5819ea6ce6013..b46f1d817a685 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -63,13 +63,39 @@ static T* DynLoad(void* handle, std::string name) {
 }  // namespace detail
 
 ////////////////// Kernel Define ////////////////////
+// convert PaddlePlace to platform::Place
+platform::Place PaddlePlaceToPlatformPlace(const PaddlePlace& pc){
+    if(pc.GetPlace() == PlaceType::kCPU){
+        return platform::Place(platform::CPUPlace());
+    }else if(pc.GetPlace() == PlaceType::kGPU){
+#ifdef PADDLE_WITH_CUDA
+        return platform::Place(
+            platform::CUDAPlace(platform::GetCurrentDeviceId()));
+#endif
+    }else{
+        PADDLE_THROW("Place for CustomOp is undefined in Paddle");
+    }
+    return platform::Place();
+}
 
+PaddlePlace PlatformPlaceToPaddlePlace(const platform::Place& pc){
+    if(platform::is_cpu_place(pc)){
+        return PaddlePlace(PlaceType::kCPU);
+    }else if(platform::is_gpu_place(pc)){
+#ifdef PADDLE_WITH_CUDA
+        return PaddlePlace(PlaceType::kGPU);
+#endif
+    }else{
+        PADDLE_THROW("Place for CustomOp is undefined in Paddle");
+    }
+    return PaddlePlace(PlaceType::kUNK);
+}
 // custom op kernel call function define
 
 static void RunKernelFunc(const framework::ExecutionContext& ctx,
                           paddle::KernelFunc func) {
   VLOG(1) << "Custom Operator: Start run KernelFunc.";
-  std::vector<const framework::Tensor*> ins;
+  std::vector<CustomTensor> custom_ins;
   for (auto name : ctx.InNameList()) {
     VLOG(1) << "Custom Operator: input name - " << name;
     auto* x = ctx.Input<Tensor>(name);
@@ -78,21 +104,16 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
     PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
-    auto* tmp_in = new framework::Tensor();
-    tmp_in->ShareDataWith(*x);
-    ins.push_back(tmp_in);
+    auto custom_in = CustomTensor(PlatformPlaceToPaddlePlace(x->place()));
+    custom_in.ShareDataWith((void *)x);
+    custom_ins.emplace_back(custom_in);
   }
 
-  std::vector<CustomTensor> custom_use_ins;
-  custom_use_ins.reserve(ins.size());
-  for(auto in : ins){
-      custom_use_ins.emplace_back(CustomTensor((void*)(in)));
-  }
   std::vector<boost::any> attrs;
 
   VLOG(0) << "Run ComputeFunc.";
 
-  auto outs = func(custom_use_ins, attrs);
+  auto outs = func(custom_ins, attrs);
 
   VLOG(1) << "Custom Operator: Share outputs into ExecutionContext.";
   auto out_name = ctx.OutNameList();
@@ -104,17 +125,6 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
   for (size_t i = 0; i < true_outs.size(); ++i) {
       outs.at(i).ShareDataWith((true_outs)[i]);
   }
-//  auto out_name = ctx.OutNameList();
-//  PADDLE_ENFORCE_EQ(
-//      out_name.size(), 1UL,
-//      platform::errors::InvalidArgument(
-//          "Custom operator can only hold 1 output as vector<Tensor>."));
-//  auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
-//  for (size_t i = 0; i < true_outs.size(); ++i) {
-//      auto tmp = std::vector<std::vector<size_t>>({{1}});
-//      outs.at(i).SetLoD(tmp);
-//      outs.at(i).ShareDataWith((true_outs)[i]);
-//  }
 }
 
 //////////////////// Operator Define /////////////////
@@ -306,21 +316,6 @@ void RegisterOperator(const std::string& name, size_t input_num,
 }
 
 
-platform::Place PaddlePlaceToPlatformPlace(const PaddlePlace& pc){
-    if(pc.GetPlace() == PlaceType::kCPU){
-        return platform::Place(platform::CPUPlace());
-    }else if(pc.GetPlace() == PlaceType::kGPU){
-#ifdef PADDLE_WITH_CUDA
-        return platform::Place(
-                platform::CUDAPlace(platform::GetCurrentDeviceId()));
-#endif
-    }else{
-        PADDLE_THROW("Place for CustomOp is undefined in Paddle");
-    }
-    return platform::Place();
-}
-
-
 void RegisterOperatorKernelWithPlace(const std::string& name,
                                      const paddle::KernelFunc& kernel_func,
                                      const proto::VarType::Type type,

From 9b8917b51925b1a6e2828dabad0ba02e5850d688 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 09:40:59 +0000
Subject: [PATCH 51/83] use back to shared ptr for custom tensor

---
 paddle/fluid/extension/src/tensor.cc | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 0738fbd0c15c7..4de21a3227211 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -224,9 +224,8 @@ const PaddlePlace& CustomTensor::place() const {
 }
 
 void CustomTensor::ShareDataWith(void* out_data){
-    static_cast<framework::LoDTensor*>(out_data)
-    ->ShareDataWith(
-            *static_cast<framework::LoDTensor*>(tensor_.get()));
+    GET_CASTED_TENSOR;
+    tensor->ShareDataWith(*static_cast<framework::LoDTensor*>(out_data));
 }
 
 int64_t CustomTensor::size() const{

From 627fa2e74648a839e2d395d4fbc06eaa438eb14a Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Wed, 3 Feb 2021 09:44:58 +0000
Subject: [PATCH 52/83] use back to shared ptr for custom tensor

---
 paddle/fluid/extension/include/tensor.h   | 9 +++++++--
 paddle/fluid/extension/src/tensor.cc      | 8 +++++++-
 paddle/fluid/framework/custom_operator.cc | 4 ++--
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 88083537474b8..05a5b9bd523c1 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -81,10 +81,15 @@ class CustomTensor{
     PaddleDType type() const;
 
 
-    /// \brief Share data with another tensor.
+    /// \brief Share data TO another tensor.
     /// Use this to pass tensor from op to op
     /// \return void.
-    void ShareDataWith(void* tensor_out);
+    void ShareDataTo(void* other);
+
+    /// \brief Share data FROM another tensor.
+    /// Use this to pass tensor from op to op
+    /// \return void.
+    void ShareDataFrom(void* other);
 
     /// \brief Get the size of current tensor.
     /// Use this method to get the size of tensor
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 4de21a3227211..c83d47cfc273b 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -223,7 +223,13 @@ const PaddlePlace& CustomTensor::place() const {
     return place_;
 }
 
-void CustomTensor::ShareDataWith(void* out_data){
+void CustomTensor::ShareDataTo(void* other){
+    static_cast<framework::LoDTensor*>(other)
+    ->ShareDataWith(
+            *static_cast<framework::LoDTensor*>(tensor_.get()));
+}
+
+void CustomTensor::ShareDataFrom(void* out_data){
     GET_CASTED_TENSOR;
     tensor->ShareDataWith(*static_cast<framework::LoDTensor*>(out_data));
 }
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index b46f1d817a685..03b8a28575561 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -105,7 +105,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
     auto custom_in = CustomTensor(PlatformPlaceToPaddlePlace(x->place()));
-    custom_in.ShareDataWith((void *)x);
+    custom_in.ShareDataFrom((void *)x);
     custom_ins.emplace_back(custom_in);
   }
 
@@ -123,7 +123,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
           "Custom operator can only hold 1 output as vector<Tensor>."));
   auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
   for (size_t i = 0; i < true_outs.size(); ++i) {
-      outs.at(i).ShareDataWith((true_outs)[i]);
+      outs.at(i).ShareDataTo((true_outs)[i]);
   }
 }
 

From 7ecffc010b93f4c5e0342c7f7516e3d855bf44f0 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 4 Feb 2021 03:08:05 +0000
Subject: [PATCH 53/83] add gpu test

---
 paddle/fluid/extension/include/op_function.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/extension/include/op_function.h b/paddle/fluid/extension/include/op_function.h
index eedbceda90036..9d1eac4293482 100644
--- a/paddle/fluid/extension/include/op_function.h
+++ b/paddle/fluid/extension/include/op_function.h
@@ -241,9 +241,9 @@ class OpFunctionMap {
   }
 
   void Insert(const std::string& op_type, const OpFunction& op_func) {
-    PADDLE_ENFORCE_NE(map_.find(op_type) != map_.end(), true,
-                      platform::errors::AlreadyExists(
-                          "Operator (%s) has been registered.", op_type));
+//    PADDLE_ENFORCE_NE(map_.find(op_type) != map_.end(), true,
+//                      platform::errors::AlreadyExists(
+//                          "Operator (%s) has been registered.", op_type));
     map_.insert({op_type, op_func});
   }
 

From 687c9ef1174cb788558e6c27a80ea007defc232e Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 4 Feb 2021 04:22:33 +0000
Subject: [PATCH 54/83] merge latest cwh code in

---
 paddle/fluid/extension/include/op_function.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/extension/include/op_function.h b/paddle/fluid/extension/include/op_function.h
index 534cc0106ccd4..2861e4498c596 100644
--- a/paddle/fluid/extension/include/op_function.h
+++ b/paddle/fluid/extension/include/op_function.h
@@ -23,6 +23,7 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 #include <sstream>
+#include <boost/any.hpp>
 #include "paddle/fluid/extension/include/tensor.h"
 #include "paddle/fluid/extension/include/device.h"
 

From a9cd76a224258bfdb65a2f24863349e06f32c98c Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 4 Feb 2021 09:55:21 +0000
Subject: [PATCH 55/83] adjust ut code of custom op

---
 .../fluid/tests/custom_op/relu_op_simple.cc   | 62 +++++++++----------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
index 4e0e51a0ffe39..6c2661cecffc4 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -37,24 +37,24 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
   }
 }
 
-std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
-  auto out = paddle::Tensor();
-  out.Resize(x.dims());
+std::vector<paddle::CustomTensor> relu_cpu_forward(const paddle::CustomTensor& x) {
+  auto out = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kCPU));
+  out.Reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
         relu_cpu_forward_kernel<data_t>(
-            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.numel());
+            x.data<data_t>(), out.mutable_data<data_t>(x.place()), x.size());
       }));
 
   return {out};
 }
 
-std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& grad_out,
-                                              const paddle::Tensor& out,
-                                              const paddle::Tensor& x) {
-  auto grad_x = paddle::Tensor();
-  grad_x.Resize(x.dims());
+std::vector<paddle::CustomTensor> relu_cpu_backward(const paddle::CustomTensor& grad_out,
+                                              const paddle::CustomTensor& out,
+                                              const paddle::CustomTensor& x) {
+  auto grad_x = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kCPU));
+  grad_x.Reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
@@ -67,33 +67,33 @@ std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& grad_out,
   return {grad_x};
 }
 
-std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
-std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& grad_out,
-                                               const paddle::Tensor& out,
-                                               const paddle::Tensor& x);
+std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor& x);
+std::vector<paddle::CustomTensor> relu_cuda_backward(const paddle::CustomTensor& grad_out,
+                                               const paddle::CustomTensor& out,
+                                               const paddle::CustomTensor& x);
 
-std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+std::vector<paddle::CustomTensor> ReluForward(const paddle::CustomTensor& x) {
   // TODO(chenweihang): Check Input
-  if (paddle::platform::is_cpu_place(x.place())) {
-    return relu_cpu_forward(x);
-  } else if (paddle::platform::is_gpu_place(x.place())) {
-    return relu_cuda_forward(x);
-  } else {
-    throw std::runtime_error("Not implemented.");
-  }
+    if (x.place().GetPlace() == paddle::PlaceType::kCPU) {
+        return relu_cpu_forward(x);
+    } else if (x.place().GetPlace() == paddle::PlaceType::kGPU) {
+        return relu_cuda_forward(x);
+    } else {
+        throw std::runtime_error("Not implemented.");
+    }
 }
 
-std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& grad_out,
-                                         const paddle::Tensor& out,
-                                         const paddle::Tensor& x) {
+std::vector<paddle::CustomTensor> ReluBackward(const paddle::CustomTensor& grad_out,
+                                         const paddle::CustomTensor& out,
+                                         const paddle::CustomTensor& x) {
   // TODO(chenweihang): Check Input
-  if (paddle::platform::is_cpu_place(x.place())) {
-    return relu_cpu_backward(grad_out, out, x);
-  } else if (paddle::platform::is_gpu_place(x.place())) {
-    return relu_cuda_backward(grad_out, out, x);
-  } else {
-    throw std::runtime_error("Not implemented.");
-  }
+    if (x.place().GetPlace() == paddle::PlaceType::kCPU) {
+        return relu_cpu_backward(grad_out, out, x);
+    } else if (x.place().GetPlace() == paddle::PlaceType::kGPU) {
+        return relu_cuda_backward(grad_out, out, x);
+    } else {
+        throw std::runtime_error("Not implemented.");
+    }
 }
 
 std::vector<std::vector<int64_t>> ReluInferShape(std::vector<int64_t> x_shape) {

From 2afe58aa7c126c6a60190d717f5b1a5b1a459f12 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 4 Feb 2021 10:01:40 +0000
Subject: [PATCH 56/83] adjust unit test code of custom op

---
 python/paddle/fluid/tests/custom_op/relu_op_simple.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
index 6c2661cecffc4..c3ab24d3deae6 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -61,7 +61,7 @@ std::vector<paddle::CustomTensor> relu_cpu_backward(const paddle::CustomTensor&
                                    grad_out.data<data_t>(),
                                    out.data<data_t>(),
                                    grad_x.mutable_data<data_t>(x.place()),
-                                   out.numel());
+                                   out.size());
                              }));
 
   return {grad_x};

From 569337528544ab2fd36bfaff1cbee64e32808e42 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Thu, 4 Feb 2021 10:13:16 +0000
Subject: [PATCH 57/83] adjust unit test code of custom op

---
 .../fluid/tests/custom_op/relu_op_simple.cu   | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
index c132228790916..b4764d706282c 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
@@ -35,11 +35,11 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
   }
 }
 
-std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
-  auto out = paddle::Tensor();
-  out.Resize(x.dims());
+std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor& x) {
+  auto out = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kGPU));
+  out.Reshape(x.shape());
 
-  int numel = x.numel();
+  int numel = x.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_TYPES(
@@ -51,13 +51,13 @@ std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   return {out};
 }
 
-std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& grad_out,
-                                               const paddle::Tensor& out,
-                                               const paddle::Tensor& x) {
-  auto grad_x = paddle::Tensor();
-  grad_x.Resize(x.dims());
+std::vector<paddle::CustomTensor> relu_cuda_backward(const paddle::CustomTensor& grad_out,
+                                               const paddle::CustomTensor& out,
+                                               const paddle::CustomTensor& x) {
+  auto grad_x = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kGPU));
+  grad_x.Reshape(x.shape());
 
-  int numel = out.numel();
+  int numel = out.size();
   int block = 512;
   int grid = (numel + block - 1) / block;
   PD_DISPATCH_FLOATING_TYPES(

From 0332e2918a54d528bb4f0df173b514a1654a584b Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 5 Feb 2021 05:57:12 +0000
Subject: [PATCH 58/83] replace PaddlePlace with PlaceType and adjust unit test code of custom op

---
 paddle/fluid/extension/include/device.h       | 19 ++++----
 paddle/fluid/extension/include/tensor.h       |  8 ++--
 paddle/fluid/extension/src/tensor.cc          | 44 +++++++++----------
 paddle/fluid/framework/custom_operator.cc     | 20 ++++-----
 .../fluid/tests/custom_op/relu_op_simple.cc   | 12 ++---
 .../fluid/tests/custom_op/relu_op_simple.cu   |  4 +-
 6 files changed, 54 insertions(+), 53 deletions(-)

diff --git a/paddle/fluid/extension/include/device.h b/paddle/fluid/extension/include/device.h
index 3acfa6348ab9d..cdecf8a518b39 100644
--- a/paddle/fluid/extension/include/device.h
+++ b/paddle/fluid/extension/include/device.h
@@ -18,14 +18,15 @@ namespace paddle {
 
 enum class PlaceType { kUNK = -1, kCPU, kGPU };
 
-class PaddlePlace {
-public:
-    PaddlePlace() : pc_(PlaceType::kUNK){};
-    explicit PaddlePlace(PlaceType pc) : pc_(pc){}
-    const PlaceType& GetPlace() const { return pc_; };
-
-protected:
-    PlaceType pc_;
-};
+//class PaddlePlace {
+//public:
+//    PaddlePlace() : pc_(PlaceType::kUNK){};
+//    explicit PaddlePlace(PlaceType pc) : pc_(pc){}
+//    const PlaceType& GetPlace() const { return pc_; };
+//    void SetPlace(const PlaceType& place) { pc_ = place; }
+//
+//protected:
+//    PlaceType pc_;
+//};
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 05a5b9bd523c1..0201b3704dfe9 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -24,7 +24,7 @@ class CustomTensor{
 public:
     /// \brief Construct a CustomTensor on None Place for CustomOp.
     /// Generally it's only used for user to create CustomTensor.
-    explicit CustomTensor(PaddlePlace place);
+    explicit CustomTensor(const PlaceType& place);
     explicit CustomTensor(void* raw_tensor);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
@@ -38,7 +38,7 @@ class CustomTensor{
     /// \param place The place of the tensor this will override the original place
     /// of current tensor.
     template <typename T>
-    T* mutable_data(const PaddlePlace& place);
+    T* mutable_data(const PlaceType& place);
 
     /// \brief Get the memory pointer in CPU or GPU with specific data type.
     /// Please Reshape the tensor first before call this.
@@ -99,11 +99,11 @@ class CustomTensor{
     /// \brief Get the place of current tensor.
     /// Use this method to get the place of tensor
     /// \return Place.
-    const PaddlePlace& place() const;
+    const PlaceType& place() const;
 
 private:
     mutable std::shared_ptr<void> tensor_;
-    mutable PaddlePlace place_;
+    mutable PlaceType place_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index c83d47cfc273b..94c7a4c6c7861 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -30,7 +30,7 @@ void CustomTensor::Reshape(const std::vector<int> &shape) {
     tensor->Resize(framework::make_ddim(shape));
 }
 
-CustomTensor::CustomTensor(PaddlePlace place):
+CustomTensor::CustomTensor(const PlaceType& place):
         tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
 
@@ -40,7 +40,7 @@ CustomTensor::CustomTensor(void* raw_tensor):
 
 
 template <typename T>
-T *CustomTensor::mutable_data(const PaddlePlace& place) {
+T *CustomTensor::mutable_data(const PlaceType& place) {
     place_ = place;
     return mutable_data<T>();
 }
@@ -54,7 +54,7 @@ T *CustomTensor::mutable_data() {
                     "You should call CustomTensor::Reshape(const std::vector<int> "
                     "&shape)"
                     "function before retrieving mutable_data from input tensor."));
-    switch (static_cast<int>(place_.GetPlace())) {
+    switch (static_cast<int>(place_)) {
         case static_cast<int>(PlaceType::kCPU): {
             return tensor->mutable_data<T>(platform::CPUPlace());
         }
@@ -66,7 +66,7 @@ T *CustomTensor::mutable_data() {
         }
         default:
             PADDLE_THROW(platform::errors::Unavailable("CustomOp unsupported place: %d",
-                                                   static_cast<int>(place_.GetPlace())));
+                                                   static_cast<int>(place_)));
     }
 }
     
@@ -104,7 +104,7 @@ void CustomTensor::copy_from_cpu(const T *data) {
                               "function before copying data from cpu."));
     size_t ele_size = tensor->numel() * sizeof(T);
 
-    if (place_.GetPlace() == PlaceType::kCPU) {
+    if (place_ == PlaceType::kCPU) {
         auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
         std::memcpy(static_cast<void *>(t_data), data, ele_size);
     } else {
@@ -116,8 +116,8 @@ void CustomTensor::copy_from_cpu(const T *data) {
     auto *dev_ctx =
         static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
 
-memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
-             data, ele_size, dev_ctx->stream());
+    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
+                 data, ele_size, dev_ctx->stream());
 #else
         PADDLE_THROW(platform::errors::Unavailable(
                 "Not compiled with CUDA, should not reach here."));
@@ -137,13 +137,13 @@ void CustomTensor::copy_to_cpu(T *data) {
     } else {
 #ifdef PADDLE_WITH_CUDA
         platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
-auto *dev_ctx =
-    static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
-             t_data, ele_num * sizeof(T), dev_ctx->stream());
+        auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
+        auto *dev_ctx =
+            static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+        memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
+                     t_data, ele_num * sizeof(T), dev_ctx->stream());
 
-cudaStreamSynchronize(dev_ctx->stream());
+        cudaStreamSynchronize(dev_ctx->stream());
 #else
         PADDLE_THROW(platform::errors::Unavailable(
                 "Not compile with CUDA, should not reach here."));
@@ -179,12 +179,12 @@ template  int32_t *CustomTensor::mutable_data<int32_t>();
 template  uint8_t *CustomTensor::mutable_data<uint8_t>();
 template  int8_t *CustomTensor::mutable_data<int8_t>();
 
-template  float *CustomTensor::mutable_data<float>(const PaddlePlace& place);
-template  double *CustomTensor::mutable_data<double>(const PaddlePlace& place);
-template  int64_t *CustomTensor::mutable_data<int64_t>(const PaddlePlace& place);
-template  int32_t *CustomTensor::mutable_data<int32_t>(const PaddlePlace& place);
-template  uint8_t *CustomTensor::mutable_data<uint8_t>(const PaddlePlace& place);
-template  int8_t *CustomTensor::mutable_data<int8_t>(const PaddlePlace& place);
+template  float *CustomTensor::mutable_data<float>(const PlaceType& place);
+template  double *CustomTensor::mutable_data<double>(const PlaceType& place);
+template  int64_t *CustomTensor::mutable_data<int64_t>(const PlaceType& place);
+template  int32_t *CustomTensor::mutable_data<int32_t>(const PlaceType& place);
+template  uint8_t *CustomTensor::mutable_data<uint8_t>(const PlaceType& place);
+template  int8_t *CustomTensor::mutable_data<int8_t>(const PlaceType& place);
 
 std::vector<int> CustomTensor::shape() const {
     GET_CASTED_TENSOR
@@ -209,12 +209,12 @@ std::vector<std::vector<size_t>> CustomTensor::lod() const {
     return res;
 }
 
-const PaddlePlace& CustomTensor::place() const {
+const PlaceType& CustomTensor::place() const {
     GET_CASTED_TENSOR;
     if(platform::is_cpu_place(tensor->place())){
-        place_ = PaddlePlace(PlaceType::kCPU);
+        place_ = PlaceType::kCPU;
     }else if(platform::is_gpu_place(tensor->place())){
-        place_ = PaddlePlace(PlaceType::kGPU);
+        place_ = PlaceType::kGPU;
     }else{
         PADDLE_THROW("Current CustomTensor hold unsupported Place Type, Please Init it"
                      "using CustomTensor::mutable_data<T>(PaddlePlace) which T is"
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 985001f419f30..3ccf9b0102dde 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -64,10 +64,10 @@ static T* DynLoad(void* handle, std::string name) {
 
 ////////////////// Kernel Define ////////////////////
 // convert PaddlePlace to platform::Place
-platform::Place PaddlePlaceToPlatformPlace(const PaddlePlace& pc){
-    if(pc.GetPlace() == PlaceType::kCPU){
+platform::Place PaddlePlaceToPlatformPlace(const PlaceType& pc){
+    if(pc == PlaceType::kCPU){
         return platform::Place(platform::CPUPlace());
-    }else if(pc.GetPlace() == PlaceType::kGPU){
+    }else if(pc == PlaceType::kGPU){
 #ifdef PADDLE_WITH_CUDA
         return platform::Place(
             platform::CUDAPlace(platform::GetCurrentDeviceId()));
@@ -78,17 +78,17 @@ platform::Place PaddlePlaceToPlatformPlace(const PaddlePlace& pc){
     return platform::Place();
 }
 
-PaddlePlace PlatformPlaceToPaddlePlace(const platform::Place& pc){
+PlaceType PlatformPlaceToPaddlePlace(const platform::Place& pc){
     if(platform::is_cpu_place(pc)){
-        return PaddlePlace(PlaceType::kCPU);
+        return PlaceType::kCPU;
     }else if(platform::is_gpu_place(pc)){
 #ifdef PADDLE_WITH_CUDA
-        return PaddlePlace(PlaceType::kGPU);
+        return PlaceType::kGPU;
 #endif
     }else{
         PADDLE_THROW("Place for CustomOp is undefined in Paddle");
     }
-    return PaddlePlace(PlaceType::kUNK);
+    return PlaceType::kUNK;
 }
 // custom op kernel call function define
 
@@ -319,7 +319,7 @@ void RegisterOperator(const std::string& name, size_t input_num,
 void RegisterOperatorKernelWithPlace(const std::string& name,
                                      const paddle::KernelFunc& kernel_func,
                                      const proto::VarType::Type type,
-                                     const PaddlePlace& place) {
+                                     const PlaceType& place) {
   OpKernelType key(type, PaddlePlaceToPlatformPlace(place));
   VLOG(1) << "Custom Operator: op kernel key: " << key;
   OperatorWithKernel::AllOpKernels()[name][key] =
@@ -340,9 +340,9 @@ void RegisterOperatorKernel(const std::string& name,
   // But this is not entirely correct, if user only give a cpu kernel,
   // but call api in gpu device, it will cause error.
   RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
-                                  PaddlePlace(PlaceType::kCPU));
+                                  PlaceType::kCPU);
   RegisterOperatorKernelWithPlace(name, kernel_func, proto::VarType::RAW,
-                                  PaddlePlace(PlaceType::kGPU));
+                                  PlaceType::kGPU);
 }
 
 void RegisterOperatorWithOpFunctionMap(
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
index c3ab24d3deae6..6e6186d02f63c 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -38,7 +38,7 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
 }
 
 std::vector<paddle::CustomTensor> relu_cpu_forward(const paddle::CustomTensor& x) {
-  auto out = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kCPU));
+  auto out = paddle::CustomTensor(paddle::PlaceType::kCPU);
   out.Reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
@@ -53,7 +53,7 @@ std::vector<paddle::CustomTensor> relu_cpu_forward(const paddle::CustomTensor& x
 std::vector<paddle::CustomTensor> relu_cpu_backward(const paddle::CustomTensor& grad_out,
                                               const paddle::CustomTensor& out,
                                               const paddle::CustomTensor& x) {
-  auto grad_x = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kCPU));
+  auto grad_x = paddle::CustomTensor(paddle::PlaceType::kCPU);
   grad_x.Reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
@@ -74,9 +74,9 @@ std::vector<paddle::CustomTensor> relu_cuda_backward(const paddle::CustomTensor&
 
 std::vector<paddle::CustomTensor> ReluForward(const paddle::CustomTensor& x) {
   // TODO(chenweihang): Check Input
-    if (x.place().GetPlace() == paddle::PlaceType::kCPU) {
+    if (x.place() == paddle::PlaceType::kCPU) {
         return relu_cpu_forward(x);
-    } else if (x.place().GetPlace() == paddle::PlaceType::kGPU) {
+    } else if (x.place() == paddle::PlaceType::kGPU) {
         return relu_cuda_forward(x);
     } else {
         throw std::runtime_error("Not implemented.");
@@ -87,9 +87,9 @@ std::vector<paddle::CustomTensor> ReluBackward(const paddle::CustomTensor& grad_
                                          const paddle::CustomTensor& out,
                                          const paddle::CustomTensor& x) {
   // TODO(chenweihang): Check Input
-    if (x.place().GetPlace() == paddle::PlaceType::kCPU) {
+    if (x.place() == paddle::PlaceType::kCPU) {
         return relu_cpu_backward(grad_out, out, x);
-    } else if (x.place().GetPlace() == paddle::PlaceType::kGPU) {
+    } else if (x.place() == paddle::PlaceType::kGPU) {
         return relu_cuda_backward(grad_out, out, x);
     } else {
         throw std::runtime_error("Not implemented.");
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
index b4764d706282c..5a5d146503996 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
@@ -36,7 +36,7 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
 }
 
 std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor& x) {
-  auto out = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kGPU));
+  auto out = paddle::CustomTensor(paddle::PlaceType::kGPU);
   out.Reshape(x.shape());
 
   int numel = x.size();
@@ -54,7 +54,7 @@ std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor&
 std::vector<paddle::CustomTensor> relu_cuda_backward(const paddle::CustomTensor& grad_out,
                                                const paddle::CustomTensor& out,
                                                const paddle::CustomTensor& x) {
-  auto grad_x = paddle::CustomTensor(paddle::PaddlePlace(paddle::PlaceType::kGPU));
+  auto grad_x = paddle::CustomTensor(paddle::PlaceType::kGPU);
   grad_x.Reshape(x.shape());
 
   int numel = out.size();

From a9a75500ece31dd6aba1c218991d16139ecd9e71 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 5 Feb 2021 06:05:08 +0000
Subject: [PATCH 59/83] remove CustomTensor(void*) constructor

---
 paddle/fluid/extension/include/tensor.h |  1 -
 paddle/fluid/extension/src/tensor.cc    | 13 ++++---------
 2 files changed, 4 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 0201b3704dfe9..5e989e91c2941 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -25,7 +25,6 @@ class CustomTensor{
     /// \brief Construct a CustomTensor on None Place for CustomOp.
     /// Generally it's only used for user to create CustomTensor.
     explicit CustomTensor(const PlaceType& place);
-    explicit CustomTensor(void* raw_tensor);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
     /// Reshape must be called before calling mutable_data() or copy_from_cpu()
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 94c7a4c6c7861..f069ccb2eda0a 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -19,10 +19,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 namespace paddle {
 
-#define GET_CASTED_TENSOR                                     \
-  if (!tensor_) {                                             \
-    tensor_ = std::make_shared<framework::LoDTensor>();          \
-  }                                                           \
+#define GET_CASTED_TENSOR                                             \
+  if (!tensor_) {                                                     \
+    tensor_ = std::make_shared<framework::LoDTensor>();               \
+  }                                                                   \
   auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
 
 void CustomTensor::Reshape(const std::vector<int> &shape) {
@@ -34,11 +34,6 @@ CustomTensor::CustomTensor(const PlaceType& place):
         tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
 
-CustomTensor::CustomTensor(void* raw_tensor):
-    tensor_(static_cast<framework::LoDTensor*>(raw_tensor)),
-    place_(PlaceType::kUNK){}
-
-
 template <typename T>
 T *CustomTensor::mutable_data(const PlaceType& place) {
     place_ = place;

From 9aa0d6967dbaae635f87801bf23ac3c1934ecd64 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 5 Feb 2021 09:40:16 +0000
Subject: [PATCH 60/83] hide ShareDataFrom and ShareDataTo

---
 paddle/fluid/extension/include/tensor.h      | 14 ++-------
 paddle/fluid/extension/src/tensor.cc         | 16 ++++++----
 paddle/fluid/framework/custom_operator.cc    |  5 +--
 paddle/fluid/framework/custom_tensor_utils.h | 33 ++++++++++++++++++++
 4 files changed, 48 insertions(+), 20 deletions(-)
 create mode 100644 paddle/fluid/framework/custom_tensor_utils.h

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 5e989e91c2941..eff863b202abc 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <memory>
 
 namespace paddle {
-
+class CustomTensorUtils;
 class CustomTensor{
 public:
     /// \brief Construct a CustomTensor on None Place for CustomOp.
@@ -79,17 +79,6 @@ class CustomTensor{
     /// \return The data type of the tensor.
     PaddleDType type() const;
 
-
-    /// \brief Share data TO another tensor.
-    /// Use this to pass tensor from op to op
-    /// \return void.
-    void ShareDataTo(void* other);
-
-    /// \brief Share data FROM another tensor.
-    /// Use this to pass tensor from op to op
-    /// \return void.
-    void ShareDataFrom(void* other);
-
     /// \brief Get the size of current tensor.
     /// Use this method to get the size of tensor
     /// \return int64_t.
@@ -101,6 +90,7 @@ class CustomTensor{
     const PlaceType& place() const;
 
 private:
+    friend class CustomTensorUtils;
     mutable std::shared_ptr<void> tensor_;
     mutable PlaceType place_;
 };
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index f069ccb2eda0a..ec860aa3b95d7 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
+#include "paddle/fluid/framework/custom_tensor_utils.h"
 namespace paddle {
 
 #define GET_CASTED_TENSOR                                             \
@@ -218,15 +219,18 @@ const PlaceType& CustomTensor::place() const {
     return place_;
 }
 
-void CustomTensor::ShareDataTo(void* other){
-    static_cast<framework::LoDTensor*>(other)
+void CustomTensorUtils::ShareDataTo(const CustomTensor& src, void* dst){
+    static_cast<framework::LoDTensor*>(dst)
     ->ShareDataWith(
-            *static_cast<framework::LoDTensor*>(tensor_.get()));
+            *static_cast<framework::LoDTensor*>(src.tensor_.get()));
 }
 
-void CustomTensor::ShareDataFrom(void* out_data){
-    GET_CASTED_TENSOR;
-    tensor->ShareDataWith(*static_cast<framework::LoDTensor*>(out_data));
+void CustomTensorUtils::ShareDataFrom(void* src, const CustomTensor& dst){
+    if (!dst.tensor_) {
+        dst.tensor_ = std::make_shared<framework::LoDTensor>();
+    }
+    auto *tensor = static_cast<framework::LoDTensor *>(dst.tensor_.get());
+    tensor->ShareDataWith(*static_cast<framework::LoDTensor*>(src));
 }
 
 int64_t CustomTensor::size() const{
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 3ccf9b0102dde..0d38ad737924b 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -33,6 +33,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
 #include "paddle/fluid/extension/include/tensor.h"
+#include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/platform/gpu_info.h"
 
 namespace paddle {
@@ -105,7 +106,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
     auto custom_in = CustomTensor(PlatformPlaceToPaddlePlace(x->place()));
-    custom_in.ShareDataFrom((void *)x);
+    CustomTensorUtils::ShareDataFrom((void *)x, custom_in);
     custom_ins.emplace_back(custom_in);
   }
 
@@ -123,7 +124,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
           "Custom operator can only hold 1 output as vector<Tensor>."));
   auto true_outs = ctx.MultiOutput<Tensor>(out_name[0]);
   for (size_t i = 0; i < true_outs.size(); ++i) {
-      outs.at(i).ShareDataTo((true_outs)[i]);
+      paddle::CustomTensorUtils::ShareDataTo(outs.at(i), (true_outs)[i]);
   }
 }
 
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
new file mode 100644
index 0000000000000..0f97dd41ea6fd
--- /dev/null
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@@ -0,0 +1,33 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "paddle/extension.h"
+#include <memory>
+
+namespace paddle {
+class CustomTensorUtils{
+public:
+    /// \brief Share data TO another tensor.
+    /// Use this to pass tensor from op to op
+    /// \return void.
+    static void ShareDataTo(const CustomTensor& src, void* dst);
+
+    /// \brief Share data FROM another tensor.
+    /// Use this to pass tensor from op to op
+    /// \return void.
+    static void ShareDataFrom(void* src, const CustomTensor& dst);
+};
+}  // namespace paddle
+

From 6bbea362dcf0bb6ca577a09bf6e0358753d7972b Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Fri, 5 Feb 2021 10:04:07 +0000
Subject: [PATCH 61/83] rename CustomTensor to Tensor

---
 paddle/fluid/extension/include/op_function.h  |   2 +-
 paddle/fluid/extension/include/tensor.h       |   8 +-
 paddle/fluid/extension/src/tensor.cc          | 108 +++++++++---------
 paddle/fluid/framework/custom_operator.cc     |   4 +-
 paddle/fluid/framework/custom_tensor_utils.h  |   4 +-
 .../fluid/tests/custom_op/relu_op_simple.cc   |  28 ++---
 .../fluid/tests/custom_op/relu_op_simple.cu   |  12 +-
 7 files changed, 83 insertions(+), 83 deletions(-)

diff --git a/paddle/fluid/extension/include/op_function.h b/paddle/fluid/extension/include/op_function.h
index 2861e4498c596..744309eb91bed 100644
--- a/paddle/fluid/extension/include/op_function.h
+++ b/paddle/fluid/extension/include/op_function.h
@@ -36,7 +36,7 @@ limitations under the License. */
 
 namespace paddle {
 
-using Tensor = CustomTensor;
+using Tensor = paddle::Tensor;
 
 #define DISABLE_COPY_AND_ASSIGN(classname)         \
  private:                                          \
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index eff863b202abc..54edcf29cf331 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -20,11 +20,11 @@ limitations under the License. */
 
 namespace paddle {
 class CustomTensorUtils;
-class CustomTensor{
+class Tensor{
 public:
-    /// \brief Construct a CustomTensor on None Place for CustomOp.
-    /// Generally it's only used for user to create CustomTensor.
-    explicit CustomTensor(const PlaceType& place);
+    /// \brief Construct a Tensor on None Place for CustomOp.
+    /// Generally it's only used for user to create Tensor.
+    explicit Tensor(const PlaceType& place);
     /// \brief Reset the shape of the tensor.
     /// Generally it's only used for the input tensor.
     /// Reshape must be called before calling mutable_data() or copy_from_cpu()
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index ec860aa3b95d7..41b84a7d1b61b 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -26,28 +26,28 @@ namespace paddle {
   }                                                                   \
   auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
 
-void CustomTensor::Reshape(const std::vector<int> &shape) {
+void Tensor::Reshape(const std::vector<int> &shape) {
     GET_CASTED_TENSOR
     tensor->Resize(framework::make_ddim(shape));
 }
 
-CustomTensor::CustomTensor(const PlaceType& place):
+Tensor::Tensor(const PlaceType& place):
         tensor_(std::make_shared<framework::LoDTensor>()),
         place_(place){};
 
 template <typename T>
-T *CustomTensor::mutable_data(const PlaceType& place) {
+T *Tensor::mutable_data(const PlaceType& place) {
     place_ = place;
     return mutable_data<T>();
 }
 
 template <typename T>
-T *CustomTensor::mutable_data() {
+T *Tensor::mutable_data() {
     GET_CASTED_TENSOR
     PADDLE_ENFORCE_GT(
             tensor->numel(), 0,
             platform::errors::PreconditionNotMet(
-                    "You should call CustomTensor::Reshape(const std::vector<int> "
+                    "You should call Tensor::Reshape(const std::vector<int> "
                     "&shape)"
                     "function before retrieving mutable_data from input tensor."));
     switch (static_cast<int>(place_)) {
@@ -67,13 +67,13 @@ T *CustomTensor::mutable_data() {
 }
     
 template <typename T>
-T *CustomTensor::data() const {
+T *Tensor::data() const {
     GET_CASTED_TENSOR;
     auto *res = tensor->data<T>();
     return res;
 }
 
-PaddleDType CustomTensor::type() const {
+PaddleDType Tensor::type() const {
     GET_CASTED_TENSOR;
     auto type = tensor->type();
     if (type == framework::proto::VarType::FP32) {
@@ -91,11 +91,11 @@ PaddleDType CustomTensor::type() const {
 }
 
 template <typename T>
-void CustomTensor::copy_from_cpu(const T *data) {
+void Tensor::copy_from_cpu(const T *data) {
     GET_CASTED_TENSOR;
     PADDLE_ENFORCE_GE(tensor->numel(), 0,
                       platform::errors::PreconditionNotMet(
-                              "You should call CustomTensor::Reshape(const "
+                              "You should call Tensor::Reshape(const "
                               "std::vector<int> &shape)"
                               "function before copying data from cpu."));
     size_t ele_size = tensor->numel() * sizeof(T);
@@ -122,7 +122,7 @@ void CustomTensor::copy_from_cpu(const T *data) {
 }
 
 template <typename T>
-void CustomTensor::copy_to_cpu(T *data) {
+void Tensor::copy_to_cpu(T *data) {
     GET_CASTED_TENSOR;
     auto ele_num = tensor->numel();
     auto *t_data = tensor->data<T>();
@@ -147,47 +147,47 @@ void CustomTensor::copy_to_cpu(T *data) {
     }
 }
 
-template  void CustomTensor::copy_from_cpu<float>(const float *data);
-template  void CustomTensor::copy_from_cpu<double>(const double *data);
-template  void CustomTensor::copy_from_cpu<int64_t>(const int64_t *data);
-template  void CustomTensor::copy_from_cpu<int32_t>(const int32_t *data);
-template  void CustomTensor::copy_from_cpu<uint8_t>(const uint8_t *data);
-template  void CustomTensor::copy_from_cpu<int8_t>(const int8_t *data);
-
-template  void CustomTensor::copy_to_cpu<float>(float *data);
-template  void CustomTensor::copy_to_cpu<double>(double *data);
-template  void CustomTensor::copy_to_cpu<int64_t>(int64_t *data);
-template  void CustomTensor::copy_to_cpu<int32_t>(int32_t *data);
-template  void CustomTensor::copy_to_cpu<uint8_t>(uint8_t *data);
-template  void CustomTensor::copy_to_cpu<int8_t>(int8_t *data);
-
-template  float *CustomTensor::data<float>() const;
-template  double *CustomTensor::data<double>() const;
-template  int64_t *CustomTensor::data<int64_t>() const;
-template  int32_t *CustomTensor::data<int32_t>() const;
-template  uint8_t *CustomTensor::data<uint8_t>() const;
-template  int8_t *CustomTensor::data<int8_t>() const;
-
-template  float *CustomTensor::mutable_data<float>();
-template  double *CustomTensor::mutable_data<double>();
-template  int64_t *CustomTensor::mutable_data<int64_t>();
-template  int32_t *CustomTensor::mutable_data<int32_t>();
-template  uint8_t *CustomTensor::mutable_data<uint8_t>();
-template  int8_t *CustomTensor::mutable_data<int8_t>();
-
-template  float *CustomTensor::mutable_data<float>(const PlaceType& place);
-template  double *CustomTensor::mutable_data<double>(const PlaceType& place);
-template  int64_t *CustomTensor::mutable_data<int64_t>(const PlaceType& place);
-template  int32_t *CustomTensor::mutable_data<int32_t>(const PlaceType& place);
-template  uint8_t *CustomTensor::mutable_data<uint8_t>(const PlaceType& place);
-template  int8_t *CustomTensor::mutable_data<int8_t>(const PlaceType& place);
-
-std::vector<int> CustomTensor::shape() const {
+template  void Tensor::copy_from_cpu<float>(const float *data);
+template  void Tensor::copy_from_cpu<double>(const double *data);
+template  void Tensor::copy_from_cpu<int64_t>(const int64_t *data);
+template  void Tensor::copy_from_cpu<int32_t>(const int32_t *data);
+template  void Tensor::copy_from_cpu<uint8_t>(const uint8_t *data);
+template  void Tensor::copy_from_cpu<int8_t>(const int8_t *data);
+
+template  void Tensor::copy_to_cpu<float>(float *data);
+template  void Tensor::copy_to_cpu<double>(double *data);
+template  void Tensor::copy_to_cpu<int64_t>(int64_t *data);
+template  void Tensor::copy_to_cpu<int32_t>(int32_t *data);
+template  void Tensor::copy_to_cpu<uint8_t>(uint8_t *data);
+template  void Tensor::copy_to_cpu<int8_t>(int8_t *data);
+
+template  float *Tensor::data<float>() const;
+template  double *Tensor::data<double>() const;
+template  int64_t *Tensor::data<int64_t>() const;
+template  int32_t *Tensor::data<int32_t>() const;
+template  uint8_t *Tensor::data<uint8_t>() const;
+template  int8_t *Tensor::data<int8_t>() const;
+
+template  float *Tensor::mutable_data<float>();
+template  double *Tensor::mutable_data<double>();
+template  int64_t *Tensor::mutable_data<int64_t>();
+template  int32_t *Tensor::mutable_data<int32_t>();
+template  uint8_t *Tensor::mutable_data<uint8_t>();
+template  int8_t *Tensor::mutable_data<int8_t>();
+
+template  float *Tensor::mutable_data<float>(const PlaceType& place);
+template  double *Tensor::mutable_data<double>(const PlaceType& place);
+template  int64_t *Tensor::mutable_data<int64_t>(const PlaceType& place);
+template  int32_t *Tensor::mutable_data<int32_t>(const PlaceType& place);
+template  uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType& place);
+template  int8_t *Tensor::mutable_data<int8_t>(const PlaceType& place);
+
+std::vector<int> Tensor::shape() const {
     GET_CASTED_TENSOR
     return framework::vectorize<int>(tensor->dims());
 }
 
-void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
+void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
     GET_CASTED_TENSOR;
     framework::LoD lod;
     for (auto &level : x) {
@@ -196,7 +196,7 @@ void CustomTensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
     tensor->set_lod(lod);
 }
 
-std::vector<std::vector<size_t>> CustomTensor::lod() const {
+std::vector<std::vector<size_t>> Tensor::lod() const {
     GET_CASTED_TENSOR;
     std::vector<std::vector<size_t>> res;
     for (auto &level : tensor->lod()) {
@@ -205,27 +205,27 @@ std::vector<std::vector<size_t>> CustomTensor::lod() const {
     return res;
 }
 
-const PlaceType& CustomTensor::place() const {
+const PlaceType& Tensor::place() const {
     GET_CASTED_TENSOR;
     if(platform::is_cpu_place(tensor->place())){
         place_ = PlaceType::kCPU;
     }else if(platform::is_gpu_place(tensor->place())){
         place_ = PlaceType::kGPU;
     }else{
-        PADDLE_THROW("Current CustomTensor hold unsupported Place Type, Please Init it"
-                     "using CustomTensor::mutable_data<T>(PaddlePlace) which T is"
+        PADDLE_THROW("Current Tensor hold unsupported Place Type, Please Init it"
+                     "using Tensor::mutable_data<T>(PaddlePlace) which T is"
                      "either Place::kCPU or Place::kGPU");
     }
     return place_;
 }
 
-void CustomTensorUtils::ShareDataTo(const CustomTensor& src, void* dst){
+void CustomTensorUtils::ShareDataTo(const Tensor& src, void* dst){
     static_cast<framework::LoDTensor*>(dst)
     ->ShareDataWith(
             *static_cast<framework::LoDTensor*>(src.tensor_.get()));
 }
 
-void CustomTensorUtils::ShareDataFrom(void* src, const CustomTensor& dst){
+void CustomTensorUtils::ShareDataFrom(void* src, const Tensor& dst){
     if (!dst.tensor_) {
         dst.tensor_ = std::make_shared<framework::LoDTensor>();
     }
@@ -233,7 +233,7 @@ void CustomTensorUtils::ShareDataFrom(void* src, const CustomTensor& dst){
     tensor->ShareDataWith(*static_cast<framework::LoDTensor*>(src));
 }
 
-int64_t CustomTensor::size() const{
+int64_t Tensor::size() const{
     GET_CASTED_TENSOR;
     return tensor->numel();
 }
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 0d38ad737924b..febb5575c11c0 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -96,7 +96,7 @@ PlaceType PlatformPlaceToPaddlePlace(const platform::Place& pc){
 static void RunKernelFunc(const framework::ExecutionContext& ctx,
                           paddle::KernelFunc func) {
   VLOG(1) << "Custom Operator: Start run KernelFunc.";
-  std::vector<CustomTensor> custom_ins;
+  std::vector<paddle::Tensor> custom_ins;
   for (auto name : ctx.InNameList()) {
     VLOG(1) << "Custom Operator: input name - " << name;
     auto* x = ctx.Input<Tensor>(name);
@@ -105,7 +105,7 @@ static void RunKernelFunc(const framework::ExecutionContext& ctx,
     PADDLE_ENFORCE_EQ(x->IsInitialized(), true,
                       platform::errors::InvalidArgument(
                           "Input tensor (%s) is not initialized."));
-    auto custom_in = CustomTensor(PlatformPlaceToPaddlePlace(x->place()));
+    auto custom_in = paddle::Tensor(PlatformPlaceToPaddlePlace(x->place()));
     CustomTensorUtils::ShareDataFrom((void *)x, custom_in);
     custom_ins.emplace_back(custom_in);
   }
diff --git a/paddle/fluid/framework/custom_tensor_utils.h b/paddle/fluid/framework/custom_tensor_utils.h
index 0f97dd41ea6fd..68a339b410b1b 100644
--- a/paddle/fluid/framework/custom_tensor_utils.h
+++ b/paddle/fluid/framework/custom_tensor_utils.h
@@ -22,12 +22,12 @@ class CustomTensorUtils{
     /// \brief Share data TO another tensor.
     /// Use this to pass tensor from op to op
     /// \return void.
-    static void ShareDataTo(const CustomTensor& src, void* dst);
+    static void ShareDataTo(const Tensor& src, void* dst);
 
     /// \brief Share data FROM another tensor.
     /// Use this to pass tensor from op to op
     /// \return void.
-    static void ShareDataFrom(void* src, const CustomTensor& dst);
+    static void ShareDataFrom(void* src, const Tensor& dst);
 };
 }  // namespace paddle
 
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
index 6e6186d02f63c..953fe2f623d3c 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -37,8 +37,8 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
   }
 }
 
-std::vector<paddle::CustomTensor> relu_cpu_forward(const paddle::CustomTensor& x) {
-  auto out = paddle::CustomTensor(paddle::PlaceType::kCPU);
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kCPU);
   out.Reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
@@ -50,10 +50,10 @@ std::vector<paddle::CustomTensor> relu_cpu_forward(const paddle::CustomTensor& x
   return {out};
 }
 
-std::vector<paddle::CustomTensor> relu_cpu_backward(const paddle::CustomTensor& grad_out,
-                                              const paddle::CustomTensor& out,
-                                              const paddle::CustomTensor& x) {
-  auto grad_x = paddle::CustomTensor(paddle::PlaceType::kCPU);
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& grad_out,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& x) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
   grad_x.Reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
@@ -67,12 +67,12 @@ std::vector<paddle::CustomTensor> relu_cpu_backward(const paddle::CustomTensor&
   return {grad_x};
 }
 
-std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor& x);
-std::vector<paddle::CustomTensor> relu_cuda_backward(const paddle::CustomTensor& grad_out,
-                                               const paddle::CustomTensor& out,
-                                               const paddle::CustomTensor& x);
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& grad_out,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& x);
 
-std::vector<paddle::CustomTensor> ReluForward(const paddle::CustomTensor& x) {
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
   // TODO(chenweihang): Check Input
     if (x.place() == paddle::PlaceType::kCPU) {
         return relu_cpu_forward(x);
@@ -83,9 +83,9 @@ std::vector<paddle::CustomTensor> ReluForward(const paddle::CustomTensor& x) {
     }
 }
 
-std::vector<paddle::CustomTensor> ReluBackward(const paddle::CustomTensor& grad_out,
-                                         const paddle::CustomTensor& out,
-                                         const paddle::CustomTensor& x) {
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& grad_out,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& x) {
   // TODO(chenweihang): Check Input
     if (x.place() == paddle::PlaceType::kCPU) {
         return relu_cpu_backward(grad_out, out, x);
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
index 5a5d146503996..ce796973fa32b 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
@@ -35,8 +35,8 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
   }
 }
 
-std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor& x) {
-  auto out = paddle::CustomTensor(paddle::PlaceType::kGPU);
+std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
+  auto out = paddle::Tensor(paddle::PlaceType::kGPU);
   out.Reshape(x.shape());
 
   int numel = x.size();
@@ -51,10 +51,10 @@ std::vector<paddle::CustomTensor> relu_cuda_forward(const paddle::CustomTensor&
   return {out};
 }
 
-std::vector<paddle::CustomTensor> relu_cuda_backward(const paddle::CustomTensor& grad_out,
-                                               const paddle::CustomTensor& out,
-                                               const paddle::CustomTensor& x) {
-  auto grad_x = paddle::CustomTensor(paddle::PlaceType::kGPU);
+std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& grad_out,
+                                               const paddle::Tensor& out,
+                                               const paddle::Tensor& x) {
+  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU);
   grad_x.Reshape(x.shape());
 
   int numel = out.size();

From 3fb3f0ad17558ed6741aaf98ef2f015b41de41ef Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 03:53:04 +0000
Subject: [PATCH 62/83] support multi dtype

---
 paddle/fluid/extension/include/dtype.h |  24 +-
 paddle/fluid/extension/src/tensor.cc   | 318 +++++++++++++------------
 2 files changed, 178 insertions(+), 164 deletions(-)

diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index 5cbecc0be52f7..b21ad8bcd1b3c 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -13,17 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/complex128.h"
+#include "paddle/fluid/platform/complex64.h"
+#include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
 
 enum PaddleDType {
-        FLOAT32,
-        FLOAT64,
-        INT64,
-        INT32,
-        UINT8,
-        INT8,
-        // TODO(Superjomn) support more data types if needed.
-    };
+  FLOAT32,
+  FLOAT64,
+  BFLOAT16,
+  COMPLEX128,
+  COMPLEX64,
+  FLOAT16,
+  INT64,
+  INT32,
+  UINT8,
+  INT8,
+  // TODO(Superjomn) support more data types if needed.
+};
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 41b84a7d1b61b..8de14ff8d9d4c 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -13,97 +13,104 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/extension/include/all.h"
+#include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/gpu_info.h"
-#include "paddle/fluid/framework/custom_tensor_utils.h"
 namespace paddle {
 
-#define GET_CASTED_TENSOR                                             \
-  if (!tensor_) {                                                     \
-    tensor_ = std::make_shared<framework::LoDTensor>();               \
-  }                                                                   \
+#define GET_CASTED_TENSOR                               \
+  if (!tensor_) {                                       \
+    tensor_ = std::make_shared<framework::LoDTensor>(); \
+  }                                                     \
   auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
 
 void Tensor::Reshape(const std::vector<int> &shape) {
-    GET_CASTED_TENSOR
-    tensor->Resize(framework::make_ddim(shape));
+  GET_CASTED_TENSOR
+  tensor->Resize(framework::make_ddim(shape));
 }
 
-Tensor::Tensor(const PlaceType& place):
-        tensor_(std::make_shared<framework::LoDTensor>()),
-        place_(place){};
+Tensor::Tensor(const PlaceType &place)
+    : tensor_(std::make_shared<framework::LoDTensor>()), place_(place) {}
 
 template <typename T>
-T *Tensor::mutable_data(const PlaceType& place) {
-    place_ = place;
-    return mutable_data<T>();
+T *Tensor::mutable_data(const PlaceType &place) {
+  place_ = place;
+  return mutable_data<T>();
 }
 
 template <typename T>
 T *Tensor::mutable_data() {
-    GET_CASTED_TENSOR
-    PADDLE_ENFORCE_GT(
-            tensor->numel(), 0,
-            platform::errors::PreconditionNotMet(
-                    "You should call Tensor::Reshape(const std::vector<int> "
-                    "&shape)"
-                    "function before retrieving mutable_data from input tensor."));
-    switch (static_cast<int>(place_)) {
-        case static_cast<int>(PlaceType::kCPU): {
-            return tensor->mutable_data<T>(platform::CPUPlace());
-        }
-        case static_cast<int>(PlaceType::kGPU): {
+  GET_CASTED_TENSOR
+  PADDLE_ENFORCE_GT(
+      tensor->numel(), 0,
+      platform::errors::PreconditionNotMet(
+          "You should call Tensor::Reshape(const std::vector<int> "
+          "&shape)"
+          "function before retrieving mutable_data from input tensor."));
+  switch (static_cast<int>(place_)) {
+    case static_cast<int>(PlaceType::kCPU): {
+      return tensor->mutable_data<T>(platform::CPUPlace());
+    }
+    case static_cast<int>(PlaceType::kGPU): {
 #ifdef PADDLE_WITH_CUDA
-            int device_num = platform::GetCurrentDeviceId();
-            return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
+      int device_num = platform::GetCurrentDeviceId();
+      return tensor->mutable_data<T>(platform::CUDAPlace(device_num));
 #endif
-        }
-        default:
-            PADDLE_THROW(platform::errors::Unavailable("CustomOp unsupported place: %d",
-                                                   static_cast<int>(place_)));
     }
+    default:
+      PADDLE_THROW(platform::errors::Unavailable(
+          "CustomOp unsupported place: %d", static_cast<int>(place_)));
+  }
 }
-    
+
 template <typename T>
 T *Tensor::data() const {
-    GET_CASTED_TENSOR;
-    auto *res = tensor->data<T>();
-    return res;
+  GET_CASTED_TENSOR;
+  auto *res = tensor->data<T>();
+  return res;
 }
 
 PaddleDType Tensor::type() const {
-    GET_CASTED_TENSOR;
-    auto type = tensor->type();
-    if (type == framework::proto::VarType::FP32) {
-        return PaddleDType::FLOAT32;
-    } else if (type == framework::proto::VarType::INT64) {
-        return PaddleDType::INT64;
-    } else if (type == framework::proto::VarType::INT32) {
-        return PaddleDType::INT32;
-    } else if (type == framework::proto::VarType::UINT8) {
-        return PaddleDType::UINT8;
-    } else if (type == framework::proto::VarType::FP64){
-        return PaddleDType::FLOAT64;
-    }
+  GET_CASTED_TENSOR;
+  auto type = tensor->type();
+  if (type == framework::proto::VarType::FP32) {
     return PaddleDType::FLOAT32;
+  } else if (type == framework::proto::VarType::INT64) {
+    return PaddleDType::INT64;
+  } else if (type == framework::proto::VarType::INT32) {
+    return PaddleDType::INT32;
+  } else if (type == framework::proto::VarType::UINT8) {
+    return PaddleDType::UINT8;
+  } else if (type == framework::proto::VarType::FP64) {
+    return PaddleDType::FLOAT64;
+  } else if (type == framework::proto::VarType::BF16) {
+    return PaddleDType::BFLOAT16;
+  } else if (type == framework::proto::VarType::FP16) {
+    return PaddleDType::FLOAT16;
+  } else if (type == framework::proto::VarType::COMPLEX64) {
+    return PaddleDType::COMPLEX64;
+  } else if (type == framework::proto::VarType::COMPLEX128) {
+    return PaddleDType::COMPLEX128;
+  }
+  return PaddleDType::FLOAT32;
 }
 
 template <typename T>
 void Tensor::copy_from_cpu(const T *data) {
-    GET_CASTED_TENSOR;
-    PADDLE_ENFORCE_GE(tensor->numel(), 0,
-                      platform::errors::PreconditionNotMet(
-                              "You should call Tensor::Reshape(const "
-                              "std::vector<int> &shape)"
-                              "function before copying data from cpu."));
-    size_t ele_size = tensor->numel() * sizeof(T);
-
-    if (place_ == PlaceType::kCPU) {
-        auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
-        std::memcpy(static_cast<void *>(t_data), data, ele_size);
-    } else {
+  GET_CASTED_TENSOR;
+  PADDLE_ENFORCE_GE(tensor->numel(), 0,
+                    platform::errors::PreconditionNotMet(
+                        "You should call Tensor::Reshape(const "
+                        "std::vector<int> &shape)"
+                        "function before copying data from cpu."));
+  size_t ele_size = tensor->numel() * sizeof(T);
+
+  if (place_ == PlaceType::kCPU) {
+    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
+    std::memcpy(static_cast<void *>(t_data), data, ele_size);
+  } else {
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     int device_num = platform::GetCurrentDeviceId();
@@ -115,127 +122,126 @@ void Tensor::copy_from_cpu(const T *data) {
     memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
                  data, ele_size, dev_ctx->stream());
 #else
-        PADDLE_THROW(platform::errors::Unavailable(
-                "Not compiled with CUDA, should not reach here."));
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compiled with CUDA, should not reach here."));
 #endif
-    }
+  }
 }
 
 template <typename T>
 void Tensor::copy_to_cpu(T *data) {
-    GET_CASTED_TENSOR;
-    auto ele_num = tensor->numel();
-    auto *t_data = tensor->data<T>();
-    auto t_place = tensor->place();
-
-    if (platform::is_cpu_place(t_place)) {
-        std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
-    } else {
+  GET_CASTED_TENSOR;
+  auto ele_num = tensor->numel();
+  auto *t_data = tensor->data<T>();
+  auto t_place = tensor->place();
+
+  if (platform::is_cpu_place(t_place)) {
+    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+  } else {
 #ifdef PADDLE_WITH_CUDA
-        platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-        auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
-        auto *dev_ctx =
-            static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-        memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
-                     t_data, ele_num * sizeof(T), dev_ctx->stream());
-
-        cudaStreamSynchronize(dev_ctx->stream());
+    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+    auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
+    auto *dev_ctx =
+        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
+                 t_data, ele_num * sizeof(T), dev_ctx->stream());
+
+    cudaStreamSynchronize(dev_ctx->stream());
 #else
-        PADDLE_THROW(platform::errors::Unavailable(
-                "Not compile with CUDA, should not reach here."));
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Not compile with CUDA, should not reach here."));
 #endif
-    }
+  }
 }
 
-template  void Tensor::copy_from_cpu<float>(const float *data);
-template  void Tensor::copy_from_cpu<double>(const double *data);
-template  void Tensor::copy_from_cpu<int64_t>(const int64_t *data);
-template  void Tensor::copy_from_cpu<int32_t>(const int32_t *data);
-template  void Tensor::copy_from_cpu<uint8_t>(const uint8_t *data);
-template  void Tensor::copy_from_cpu<int8_t>(const int8_t *data);
-
-template  void Tensor::copy_to_cpu<float>(float *data);
-template  void Tensor::copy_to_cpu<double>(double *data);
-template  void Tensor::copy_to_cpu<int64_t>(int64_t *data);
-template  void Tensor::copy_to_cpu<int32_t>(int32_t *data);
-template  void Tensor::copy_to_cpu<uint8_t>(uint8_t *data);
-template  void Tensor::copy_to_cpu<int8_t>(int8_t *data);
-
-template  float *Tensor::data<float>() const;
-template  double *Tensor::data<double>() const;
-template  int64_t *Tensor::data<int64_t>() const;
-template  int32_t *Tensor::data<int32_t>() const;
-template  uint8_t *Tensor::data<uint8_t>() const;
-template  int8_t *Tensor::data<int8_t>() const;
-
-template  float *Tensor::mutable_data<float>();
-template  double *Tensor::mutable_data<double>();
-template  int64_t *Tensor::mutable_data<int64_t>();
-template  int32_t *Tensor::mutable_data<int32_t>();
-template  uint8_t *Tensor::mutable_data<uint8_t>();
-template  int8_t *Tensor::mutable_data<int8_t>();
-
-template  float *Tensor::mutable_data<float>(const PlaceType& place);
-template  double *Tensor::mutable_data<double>(const PlaceType& place);
-template  int64_t *Tensor::mutable_data<int64_t>(const PlaceType& place);
-template  int32_t *Tensor::mutable_data<int32_t>(const PlaceType& place);
-template  uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType& place);
-template  int8_t *Tensor::mutable_data<int8_t>(const PlaceType& place);
+template void Tensor::copy_from_cpu<float>(const float *data);
+template void Tensor::copy_from_cpu<double>(const double *data);
+template void Tensor::copy_from_cpu<int64_t>(const int64_t *data);
+template void Tensor::copy_from_cpu<int32_t>(const int32_t *data);
+template void Tensor::copy_from_cpu<uint8_t>(const uint8_t *data);
+template void Tensor::copy_from_cpu<int8_t>(const int8_t *data);
+
+template void Tensor::copy_to_cpu<float>(float *data);
+template void Tensor::copy_to_cpu<double>(double *data);
+template void Tensor::copy_to_cpu<int64_t>(int64_t *data);
+template void Tensor::copy_to_cpu<int32_t>(int32_t *data);
+template void Tensor::copy_to_cpu<uint8_t>(uint8_t *data);
+template void Tensor::copy_to_cpu<int8_t>(int8_t *data);
+
+template float *Tensor::data<float>() const;
+template double *Tensor::data<double>() const;
+template int64_t *Tensor::data<int64_t>() const;
+template int32_t *Tensor::data<int32_t>() const;
+template uint8_t *Tensor::data<uint8_t>() const;
+template int8_t *Tensor::data<int8_t>() const;
+
+template float *Tensor::mutable_data<float>();
+template double *Tensor::mutable_data<double>();
+template int64_t *Tensor::mutable_data<int64_t>();
+template int32_t *Tensor::mutable_data<int32_t>();
+template uint8_t *Tensor::mutable_data<uint8_t>();
+template int8_t *Tensor::mutable_data<int8_t>();
+
+template float *Tensor::mutable_data<float>(const PlaceType &place);
+template double *Tensor::mutable_data<double>(const PlaceType &place);
+template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
+template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
+template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
+template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
 
 std::vector<int> Tensor::shape() const {
-    GET_CASTED_TENSOR
-    return framework::vectorize<int>(tensor->dims());
+  GET_CASTED_TENSOR
+  return framework::vectorize<int>(tensor->dims());
 }
 
 void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-    GET_CASTED_TENSOR;
-    framework::LoD lod;
-    for (auto &level : x) {
-        lod.emplace_back(level);
-    }
-    tensor->set_lod(lod);
+  GET_CASTED_TENSOR;
+  framework::LoD lod;
+  for (auto &level : x) {
+    lod.emplace_back(level);
+  }
+  tensor->set_lod(lod);
 }
 
 std::vector<std::vector<size_t>> Tensor::lod() const {
-    GET_CASTED_TENSOR;
-    std::vector<std::vector<size_t>> res;
-    for (auto &level : tensor->lod()) {
-        res.emplace_back(level);
-    }
-    return res;
+  GET_CASTED_TENSOR;
+  std::vector<std::vector<size_t>> res;
+  for (auto &level : tensor->lod()) {
+    res.emplace_back(level);
+  }
+  return res;
 }
 
-const PlaceType& Tensor::place() const {
-    GET_CASTED_TENSOR;
-    if(platform::is_cpu_place(tensor->place())){
-        place_ = PlaceType::kCPU;
-    }else if(platform::is_gpu_place(tensor->place())){
-        place_ = PlaceType::kGPU;
-    }else{
-        PADDLE_THROW("Current Tensor hold unsupported Place Type, Please Init it"
-                     "using Tensor::mutable_data<T>(PaddlePlace) which T is"
-                     "either Place::kCPU or Place::kGPU");
-    }
-    return place_;
+const PlaceType &Tensor::place() const {
+  GET_CASTED_TENSOR;
+  if (platform::is_cpu_place(tensor->place())) {
+    place_ = PlaceType::kCPU;
+  } else if (platform::is_gpu_place(tensor->place())) {
+    place_ = PlaceType::kGPU;
+  } else {
+    PADDLE_THROW(
+        "Current Tensor hold unsupported Place Type, Please Init it"
+        "using Tensor::mutable_data<T>(PaddlePlace) which T is"
+        "either Place::kCPU or Place::kGPU");
+  }
+  return place_;
 }
 
-void CustomTensorUtils::ShareDataTo(const Tensor& src, void* dst){
-    static_cast<framework::LoDTensor*>(dst)
-    ->ShareDataWith(
-            *static_cast<framework::LoDTensor*>(src.tensor_.get()));
+void CustomTensorUtils::ShareDataTo(const Tensor &src, void *dst) {
+  static_cast<framework::LoDTensor *>(dst)->ShareDataWith(
+      *static_cast<framework::LoDTensor *>(src.tensor_.get()));
 }
 
-void CustomTensorUtils::ShareDataFrom(void* src, const Tensor& dst){
-    if (!dst.tensor_) {
-        dst.tensor_ = std::make_shared<framework::LoDTensor>();
-    }
-    auto *tensor = static_cast<framework::LoDTensor *>(dst.tensor_.get());
-    tensor->ShareDataWith(*static_cast<framework::LoDTensor*>(src));
+void CustomTensorUtils::ShareDataFrom(void *src, const Tensor &dst) {
+  if (!dst.tensor_) {
+    dst.tensor_ = std::make_shared<framework::LoDTensor>();
+  }
+  auto *tensor = static_cast<framework::LoDTensor *>(dst.tensor_.get());
+  tensor->ShareDataWith(*static_cast<framework::LoDTensor *>(src));
 }
 
-int64_t Tensor::size() const{
-    GET_CASTED_TENSOR;
-    return tensor->numel();
+int64_t Tensor::size() const {
+  GET_CASTED_TENSOR;
+  return tensor->numel();
 }
 }  // namespace paddle
-

From dc18813a3dd2e88bf14394be2e52bd4f04daa4c0 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 06:57:19 +0000
Subject: [PATCH 63/83] remove lod, make reshape lowercase, add copy test and
 refactor copy api

---
 paddle/fluid/extension/include/tensor.h       | 141 +++++++++---------
 paddle/fluid/extension/src/tensor.cc          |  96 ++++++------
 paddle/fluid/framework/CMakeLists.txt         |   1 +
 paddle/fluid/framework/custom_tensor_test.cc  |  43 ++++++
 .../fluid/tests/custom_op/relu_op_simple.cc   |  32 ++--
 .../fluid/tests/custom_op/relu_op_simple.cu   |   4 +-
 6 files changed, 172 insertions(+), 145 deletions(-)
 create mode 100644 paddle/fluid/framework/custom_tensor_test.cc

diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 54edcf29cf331..81d016888bdc7 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -14,85 +14,78 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
+#include <vector>
 #include "paddle/fluid/extension/include/device.h"
 #include "paddle/fluid/extension/include/dtype.h"
-#include <memory>
 
 namespace paddle {
 class CustomTensorUtils;
-class Tensor{
-public:
-    /// \brief Construct a Tensor on None Place for CustomOp.
-    /// Generally it's only used for user to create Tensor.
-    explicit Tensor(const PlaceType& place);
-    /// \brief Reset the shape of the tensor.
-    /// Generally it's only used for the input tensor.
-    /// Reshape must be called before calling mutable_data() or copy_from_cpu()
-    /// \param shape The shape to set.
-    void Reshape(const std::vector<int>& shape);
-
-    /// \brief Get the memory pointer in CPU or GPU with specific data type.
-    /// Please Reshape the tensor first before call this.
-    /// It's usually used to get input data pointer.
-    /// \param place The place of the tensor this will override the original place
-    /// of current tensor.
-    template <typename T>
-    T* mutable_data(const PlaceType& place);
-
-    /// \brief Get the memory pointer in CPU or GPU with specific data type.
-    /// Please Reshape the tensor first before call this.
-    /// It's usually used to get input data pointer.
-    template <typename T>
-    T* mutable_data();
-
-    /// \brief Get the memory pointer directly.
-    /// It's usually used to get the output data pointer.
-    /// \return The tensor data buffer pointer.
-    template <typename T>
-    T* data() const;
-
-    /// \brief Copy the host memory to tensor data.
-    /// It's usually used to set the input tensor data.
-    /// \param data The pointer of the data, from which the tensor will copy.
-    template <typename T>
-    void copy_from_cpu(const T* data);
-
-    /// \brief Copy the tensor data to the host memory.
-    /// It's usually used to get the output tensor data.
-    /// \param[out] data The tensor will copy the data to the address.
-    template <typename T>
-    void copy_to_cpu(T* data);
-
-    /// \brief Return the shape of the Tensor.
-    std::vector<int> shape() const;
-
-    /// \brief Set lod info of the tensor.
-    /// More about LOD can be seen here:
-    ///  https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
-    /// \param x the lod info.
-    void SetLoD(const std::vector<std::vector<size_t>>& x);
-    /// \brief Return the lod info of the tensor.
-    std::vector<std::vector<size_t>> lod() const;
-
-    /// \brief Return the data type of the tensor.
-    /// It's usually used to get the output tensor data type.
-    /// \return The data type of the tensor.
-    PaddleDType type() const;
-
-    /// \brief Get the size of current tensor.
-    /// Use this method to get the size of tensor
-    /// \return int64_t.
-    int64_t size() const;
-
-    /// \brief Get the place of current tensor.
-    /// Use this method to get the place of tensor
-    /// \return Place.
-    const PlaceType& place() const;
-
-private:
-    friend class CustomTensorUtils;
-    mutable std::shared_ptr<void> tensor_;
-    mutable PlaceType place_;
+class Tensor {
+ public:
+  /// \brief Construct a Tensor on None Place for CustomOp.
+  /// Generally it's only used for user to create Tensor.
+  explicit Tensor(const PlaceType& place);
+  /// \brief Reset the shape of the tensor.
+  /// Generally it's only used for the input tensor.
+  /// reshape() must be called before calling mutable_data() or copy_to_gpu()
+  /// \param shape The shape to set.
+  void reshape(const std::vector<int>& shape);
+
+  /// \brief Get the memory pointer in CPU or GPU with specific data type.
+  /// Please reshape the tensor before calling this.
+  /// It's usually used to get input data pointer.
+  /// \param place The place of the tensor this will override the original place
+  /// of current tensor.
+  template <typename T>
+  T* mutable_data(const PlaceType& place);
+
+  /// \brief Get the memory pointer in CPU or GPU with specific data type.
+  /// Please reshape the tensor before calling this.
+  /// It's usually used to get input data pointer.
+  template <typename T>
+  T* mutable_data();
+
+  /// \brief Get the memory pointer directly.
+  /// It's usually used to get the output data pointer.
+  /// \return The tensor data buffer pointer.
+  template <typename T>
+  T* data() const;
+
+  /// \brief Copy the tensor data to a new tensor on the GPU.
+  /// It's usually used to move the tensor data onto the device.
+  /// \return A new Tensor on PlaceType::kGPU holding a copy of the data.
+  template <typename T>
+  Tensor copy_to_gpu();
+
+  /// \brief Copy the tensor data to a new tensor in host memory.
+  /// It's usually used to get the output tensor data.
+  /// \return A new Tensor on PlaceType::kCPU holding a copy of the data.
+  template <typename T>
+  Tensor copy_to_cpu();
+
+  /// \brief Return the shape of the Tensor.
+  std::vector<int> shape() const;
+
+  /// \brief Return the data type of the tensor.
+  /// It's usually used to get the output tensor data type.
+  /// \return The data type of the tensor.
+  PaddleDType type() const;
+
+  /// \brief Get the size of current tensor.
+  /// Use this method to get the size of tensor
+  /// \return int64_t.
+  int64_t size() const;
+
+  /// \brief Get the place of current tensor.
+  /// Use this method to get the place of tensor
+  /// \return Place.
+  const PlaceType& place() const;
+
+ private:
+  friend class CustomTensorUtils;
+  mutable std::shared_ptr<void> tensor_;
+  mutable PlaceType place_;
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 8de14ff8d9d4c..b0c02a3dab66e 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -26,7 +26,7 @@ namespace paddle {
   }                                                     \
   auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());
 
-void Tensor::Reshape(const std::vector<int> &shape) {
+void Tensor::reshape(const std::vector<int> &shape) {
   GET_CASTED_TENSOR
   tensor->Resize(framework::make_ddim(shape));
 }
@@ -98,7 +98,8 @@ PaddleDType Tensor::type() const {
 }
 
 template <typename T>
-void Tensor::copy_from_cpu(const T *data) {
+Tensor Tensor::copy_to_gpu() {
+#ifdef PADDLE_WITH_CUDA
   GET_CASTED_TENSOR;
   PADDLE_ENFORCE_GE(tensor->numel(), 0,
                     platform::errors::PreconditionNotMet(
@@ -106,45 +107,51 @@ void Tensor::copy_from_cpu(const T *data) {
                         "std::vector<int> &shape)"
                         "function before copying data from cpu."));
   size_t ele_size = tensor->numel() * sizeof(T);
-
-  if (place_ == PlaceType::kCPU) {
-    auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
-    std::memcpy(static_cast<void *>(t_data), data, ele_size);
+  Tensor target = Tensor(PlaceType::kGPU);
+  auto *p_target_data = target.template mutable_data<T>();
+  auto p_src_data = tensor->data<T>();
+
+  platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
+  int device_num = platform::GetCurrentDeviceId();
+  platform::CUDAPlace gpu_place(device_num);
+  auto *dev_ctx =
+      static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
+  if (platform::is_cpu_place(tensor->place())) {
+    memory::Copy(gpu_place, static_cast<void *>(p_target_data),
+                 platform::CPUPlace(), p_src_data, ele_size, dev_ctx->stream());
   } else {
-#ifdef PADDLE_WITH_CUDA
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    int device_num = platform::GetCurrentDeviceId();
-    platform::CUDAPlace gpu_place(device_num);
-    auto *t_data = tensor->mutable_data<T>(gpu_place);
-    auto *dev_ctx =
-        static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-
-    memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
-                 data, ele_size, dev_ctx->stream());
+    memory::Copy(gpu_place, static_cast<void *>(p_target_data), gpu_place,
+                 p_src_data, ele_size, dev_ctx->stream());
+  }
+  cudaStreamSynchronize(dev_ctx->stream());
+  return target;
 #else
-    PADDLE_THROW(platform::errors::Unavailable(
-        "Not compiled with CUDA, should not reach here."));
+  PADDLE_THROW(platform::errors::Unavailable(
+      "Not compiled with CUDA, should not reach here."));
 #endif
-  }
+  return Tensor(PlaceType::kGPU);
 }
 
 template <typename T>
-void Tensor::copy_to_cpu(T *data) {
+Tensor Tensor::copy_to_cpu() {
   GET_CASTED_TENSOR;
   auto ele_num = tensor->numel();
   auto *t_data = tensor->data<T>();
   auto t_place = tensor->place();
+  Tensor target = Tensor(PlaceType::kCPU);
+  auto *p_target_data = target.template mutable_data<T>();
 
   if (platform::is_cpu_place(t_place)) {
-    std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
+    std::memcpy(static_cast<void *>(p_target_data), t_data,
+                ele_num * sizeof(T));
   } else {
 #ifdef PADDLE_WITH_CUDA
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
     auto *dev_ctx =
         static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
-    memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
-                 t_data, ele_num * sizeof(T), dev_ctx->stream());
+    memory::Copy(platform::CPUPlace(), static_cast<void *>(p_target_data),
+                 gpu_place, t_data, ele_num * sizeof(T), dev_ctx->stream());
 
     cudaStreamSynchronize(dev_ctx->stream());
 #else
@@ -152,21 +159,22 @@ void Tensor::copy_to_cpu(T *data) {
         "Not compile with CUDA, should not reach here."));
 #endif
   }
+  return target;
 }
 
-template void Tensor::copy_from_cpu<float>(const float *data);
-template void Tensor::copy_from_cpu<double>(const double *data);
-template void Tensor::copy_from_cpu<int64_t>(const int64_t *data);
-template void Tensor::copy_from_cpu<int32_t>(const int32_t *data);
-template void Tensor::copy_from_cpu<uint8_t>(const uint8_t *data);
-template void Tensor::copy_from_cpu<int8_t>(const int8_t *data);
+template Tensor Tensor::copy_to_gpu<float>();
+template Tensor Tensor::copy_to_gpu<double>();
+template Tensor Tensor::copy_to_gpu<int64_t>();
+template Tensor Tensor::copy_to_gpu<int32_t>();
+template Tensor Tensor::copy_to_gpu<uint8_t>();
+template Tensor Tensor::copy_to_gpu<int8_t>();
 
-template void Tensor::copy_to_cpu<float>(float *data);
-template void Tensor::copy_to_cpu<double>(double *data);
-template void Tensor::copy_to_cpu<int64_t>(int64_t *data);
-template void Tensor::copy_to_cpu<int32_t>(int32_t *data);
-template void Tensor::copy_to_cpu<uint8_t>(uint8_t *data);
-template void Tensor::copy_to_cpu<int8_t>(int8_t *data);
+template Tensor Tensor::copy_to_cpu<float>();
+template Tensor Tensor::copy_to_cpu<double>();
+template Tensor Tensor::copy_to_cpu<int64_t>();
+template Tensor Tensor::copy_to_cpu<int32_t>();
+template Tensor Tensor::copy_to_cpu<uint8_t>();
+template Tensor Tensor::copy_to_cpu<int8_t>();
 
 template float *Tensor::data<float>() const;
 template double *Tensor::data<double>() const;
@@ -194,24 +202,6 @@ std::vector<int> Tensor::shape() const {
   return framework::vectorize<int>(tensor->dims());
 }
 
-void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
-  GET_CASTED_TENSOR;
-  framework::LoD lod;
-  for (auto &level : x) {
-    lod.emplace_back(level);
-  }
-  tensor->set_lod(lod);
-}
-
-std::vector<std::vector<size_t>> Tensor::lod() const {
-  GET_CASTED_TENSOR;
-  std::vector<std::vector<size_t>> res;
-  for (auto &level : tensor->lod()) {
-    res.emplace_back(level);
-  }
-  return res;
-}
-
 const PlaceType &Tensor::place() const {
   GET_CASTED_TENSOR;
   if (platform::is_cpu_place(tensor->place())) {
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 9e285d72ebe9c..38581bdf56cef 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -161,6 +161,7 @@ nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry)
 
 
 cc_library(custom_tensor SRCS ../extension/src/tensor.cc DEPS lod_tensor)
+cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor)
 cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context dynamic_loader custom_tensor)
 
 if(WITH_PYTHON)
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
new file mode 100644
index 0000000000000..b6735410e3533
--- /dev/null
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -0,0 +1,43 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include "paddle/extension.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+
+paddle::Tensor InitGPUTensorForTest() {
+  std::vector<int> tensor_shape = {5, 5};
+  auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
+  t1.mutable_data<float>(paddle::PlaceType::kGPU);
+  t1.reshape(tensor_shape);
+  for (size_t i = 0; i < t1.size(); i++) {
+    t1.data<float>()[i] = 5;
+  }
+  return t1;
+}
+template <typename T>
+void TestCopyToCpuOfGpuTensor() {
+  auto t1 = InitGPUTensorForTest();
+  auto t1_cpu_cp = t1.copy_to_cpu<T>();
+  CHECK_EQ(paddle::PlaceType::kCPU, t1_cpu_cp.place());
+  for (size_t i = 0; i < t1.size(); i++) {
+    CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
+  }
+}
+void GroupTestCopy() {
+  TestCopyToCpuOfGpuTensor<float>();
+  TestCopyToCpuOfGpuTensor<double>();
+}
+
+TEST(CustomTensor, copyTest) { GroupTestCopy(); }
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
index 953fe2f623d3c..99a1761286192 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -39,7 +39,7 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
 
 std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
   auto out = paddle::Tensor(paddle::PlaceType::kCPU);
-  out.Reshape(x.shape());
+  out.reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(
       x.type(), "relu_cpu_forward", ([&] {
@@ -54,7 +54,7 @@ std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& grad_out,
                                               const paddle::Tensor& out,
                                               const paddle::Tensor& x) {
   auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
-  grad_x.Reshape(x.shape());
+  grad_x.reshape(x.shape());
 
   PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
                                relu_cpu_backward_kernel<data_t>(
@@ -74,26 +74,26 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& grad_out,
 
 std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
   // TODO(chenweihang): Check Input
-    if (x.place() == paddle::PlaceType::kCPU) {
-        return relu_cpu_forward(x);
-    } else if (x.place() == paddle::PlaceType::kGPU) {
-        return relu_cuda_forward(x);
-    } else {
-        throw std::runtime_error("Not implemented.");
-    }
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_forward(x);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_forward(x);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
 }
 
 std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& grad_out,
                                          const paddle::Tensor& out,
                                          const paddle::Tensor& x) {
   // TODO(chenweihang): Check Input
-    if (x.place() == paddle::PlaceType::kCPU) {
-        return relu_cpu_backward(grad_out, out, x);
-    } else if (x.place() == paddle::PlaceType::kGPU) {
-        return relu_cuda_backward(grad_out, out, x);
-    } else {
-        throw std::runtime_error("Not implemented.");
-    }
+  if (x.place() == paddle::PlaceType::kCPU) {
+    return relu_cpu_backward(grad_out, out, x);
+  } else if (x.place() == paddle::PlaceType::kGPU) {
+    return relu_cuda_backward(grad_out, out, x);
+  } else {
+    throw std::runtime_error("Not implemented.");
+  }
 }
 
 std::vector<std::vector<int64_t>> ReluInferShape(std::vector<int64_t> x_shape) {
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
index ce796973fa32b..8faf4ac536947 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cu
@@ -37,7 +37,7 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
 
 std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
   auto out = paddle::Tensor(paddle::PlaceType::kGPU);
-  out.Reshape(x.shape());
+  out.reshape(x.shape());
 
   int numel = x.size();
   int block = 512;
@@ -55,7 +55,7 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& grad_out,
                                                const paddle::Tensor& out,
                                                const paddle::Tensor& x) {
   auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU);
-  grad_x.Reshape(x.shape());
+  grad_x.reshape(x.shape());
 
   int numel = out.size();
   int block = 512;

From a83c469a3a9624dabe5efcb88e2b3592b99c73f5 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 12:14:33 +0000
Subject: [PATCH 64/83] remove lod, make reshape lowercase, add copy test and
 refactor copy api

---
 paddle/fluid/framework/custom_tensor_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index b6735410e3533..3c626ffd21117 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -31,7 +31,7 @@ void TestCopyToCpuOfGpuTensor() {
   auto t1 = InitGPUTensorForTest();
   auto t1_cpu_cp = t1.copy_to_cpu<T>();
   CHECK_EQ(paddle::PlaceType::kCPU, t1_cpu_cp.place());
-  for (size_t i = 0; i < t1.size(); i++) {
+  for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
   }
 }

From df6ba59ed936f03285a7f0290a1755353624904a Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 12:14:50 +0000
Subject: [PATCH 65/83] remove lod, make reshape lowercase, add copy test and
 refactor copy api

---
 paddle/fluid/framework/custom_tensor_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 3c626ffd21117..16ad77699556a 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -21,7 +21,7 @@ paddle::Tensor InitGPUTensorForTest() {
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
   t1.mutable_data<float>(paddle::PlaceType::kGPU);
   t1.reshape(tensor_shape);
-  for (size_t i = 0; i < t1.size(); i++) {
+  for (int64_t i = 0; i < t1.size(); i++) {
     t1.data<float>()[i] = 5;
   }
   return t1;

From 5272c858ca178c468aabf23d7100236a16888020 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 12:17:01 +0000
Subject: [PATCH 66/83] remove lod, make reshape lowercase, add copy test and
 refactor copy api

---
 paddle/fluid/framework/custom_tensor_test.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 16ad77699556a..5a023fd46c957 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -30,7 +30,7 @@ template <typename T>
 void TestCopyToCpuOfGpuTensor() {
   auto t1 = InitGPUTensorForTest();
   auto t1_cpu_cp = t1.copy_to_cpu<T>();
-  CHECK_EQ(paddle::PlaceType::kCPU, t1_cpu_cp.place());
+  CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place()));
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
   }

From 19a8ff7059119bfdefcc1a95726e60c319eff827 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 13:16:16 +0000
Subject: [PATCH 67/83] fix copy to error

---
 paddle/fluid/extension/src/tensor.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 87f172fcc5fbf..7fb91532e21f5 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -112,6 +112,7 @@ Tensor Tensor::copy_to_gpu() {
                         "function before copying data from cpu."));
   size_t ele_size = tensor->numel() * sizeof(T);
   Tensor target = Tensor(PlaceType::kGPU);
+  target.reshape(shape());
   auto *p_target_data = target.template mutable_data<T>();
   auto p_src_data = tensor->data<T>();
 
@@ -143,8 +144,8 @@ Tensor Tensor::copy_to_cpu() {
   auto *t_data = tensor->data<T>();
   auto t_place = tensor->place();
   Tensor target = Tensor(PlaceType::kCPU);
+  target.reshape(shape());
   auto *p_target_data = target.template mutable_data<T>();
-
   if (platform::is_cpu_place(t_place)) {
     std::memcpy(static_cast<void *>(p_target_data), t_data,
                 ele_num * sizeof(T));

From 07d37959c875e175b17e4f9eb8e0285f7938f821 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 14:20:30 +0000
Subject: [PATCH 68/83] add more test

---
 paddle/fluid/framework/custom_operator.cc    |  2 -
 paddle/fluid/framework/custom_tensor_test.cc | 68 ++++++++++++++++++--
 2 files changed, 64 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 86e67d78dd797..153a3c92d32df 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -25,8 +25,6 @@ limitations under the License. */
 #include <utility>
 #include <vector>
 
-#include "paddle/fluid/extension/include/all.h"
-
 #include "paddle/fluid/extension/include/tensor.h"
 #include "paddle/fluid/framework/attribute.h"
 #include "paddle/fluid/framework/c/c_api.h"
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 5a023fd46c957..415726f4d9883 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -19,15 +19,26 @@
 paddle::Tensor InitGPUTensorForTest() {
   std::vector<int> tensor_shape = {5, 5};
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
+  t1.reshape(tensor_shape);
   t1.mutable_data<float>(paddle::PlaceType::kGPU);
+  for (int64_t i = 0; i < t1.size(); i++) {
+    t1.data<float>()[i] = 5;
+  }
+  return t1;
+}
+
+paddle::Tensor InitCPUTensorForTest() {
+  std::vector<int> tensor_shape = {5, 5};
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
   t1.reshape(tensor_shape);
+  t1.mutable_data<float>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t1.size(); i++) {
     t1.data<float>()[i] = 5;
   }
   return t1;
 }
 template <typename T>
-void TestCopyToCpuOfGpuTensor() {
+void TestCopyToCpuFromGpuTensor() {
   auto t1 = InitGPUTensorForTest();
   auto t1_cpu_cp = t1.copy_to_cpu<T>();
   CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place()));
@@ -35,9 +46,58 @@ void TestCopyToCpuOfGpuTensor() {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
   }
 }
+
+template <typename T>
+void TestCopyToGPUFromCpuTensor() {
+  auto t1 = InitGPUTensorForTest();
+  auto t1_gpu_cp = t1.copy_to_gpu<T>();
+  CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place()));
+  for (int64_t i = 0; i < t1.size(); i++) {
+    CHECK_EQ(t1_gpu_cp.template data<T>()[i], 5);
+  }
+}
+
+void TestAPIPlace() {
+  auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
+  auto t2 = paddle::Tensor(paddle::PlaceType::kCPU);
+  CHECK((paddle::PlaceType::kGPU == t1.place()));
+  CHECK((paddle::PlaceType::kCPU == t2.place()));
+}
+
+void TestAPISizeAndShape() {
+  std::vector<int> tensor_shape = {5, 5};
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
+  t1.reshape(tensor_shape);
+  CHECK_EQ(t1.size(), 25);
+  CHECK_EQ(t1.shape(), tensor_shape);
+}
+template <typename T>
+paddle::DataType TestDtype() {
+  std::vector<int> tensor_shape = {5, 5};
+  auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
+  t1.reshape(tensor_shape);
+  t1.template mutable_data<T>();
+  return t1.type();
+}
+
 void GroupTestCopy() {
-  TestCopyToCpuOfGpuTensor<float>();
-  TestCopyToCpuOfGpuTensor<double>();
+  TestCopyToCpuFromGpuTensor<float>();
+  TestCopyToCpuFromGpuTensor<double>();
+  TestCopyToGPUFromCpuTensor<float>();
+  TestCopyToGPUFromCpuTensor<double>();
+}
+void GroupTestDtype() {
+  CHECK(TestDtype<float>() == paddle::DataType::FLOAT32);
+  CHECK(TestDtype<double>() == paddle::DataType::FLOAT64);
+  CHECK(TestDtype<int>() == paddle::DataType::INT32);
+  CHECK(TestDtype<int64_t>() == paddle::DataType::INT64);
+  CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
+  CHECK(TestDtype<int8_t>() == paddle::DataType::INT8);
 }
 
-TEST(CustomTensor, copyTest) { GroupTestCopy(); }
+TEST(CustomTensor, copyTest) {
+  GroupTestCopy();
+  GroupTestDtype();
+  TestAPISizeAndShape();
+  TestAPIPlace();
+}

From 49ed21c91afdf4348ee56683ad63b0b9cacfef8f Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 14:25:54 +0000
Subject: [PATCH 69/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc          | 2 +-
 python/paddle/fluid/tests/custom_op/relu_op_simple.cc | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 415726f4d9883..2e62cfad59453 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -69,7 +69,7 @@ void TestAPISizeAndShape() {
   auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
   t1.reshape(tensor_shape);
   CHECK_EQ(t1.size(), 25);
-  CHECK_EQ(t1.shape(), tensor_shape);
+  CHECK(t1.shape() == tensor_shape);
 }
 template <typename T>
 paddle::DataType TestDtype() {
diff --git a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
index 6be0acf2b924c..684466a734147 100644
--- a/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
+++ b/python/paddle/fluid/tests/custom_op/relu_op_simple.cc
@@ -50,9 +50,9 @@ std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
   return {out};
 }
 
-std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& grad_out,
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
                                               const paddle::Tensor& out,
-                                              const paddle::Tensor& x) {
+                                              const paddle::Tensor& grad_out) {
   auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU);
   grad_x.reshape(x.shape());
 

From 9288fff7efab6b82f7a0e160b8984188299e87f7 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 14:38:49 +0000
Subject: [PATCH 70/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 2e62cfad59453..e7e6976883017 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -91,7 +91,8 @@ void GroupTestDtype() {
   CHECK(TestDtype<double>() == paddle::DataType::FLOAT64);
   CHECK(TestDtype<int>() == paddle::DataType::INT32);
   CHECK(TestDtype<int64_t>() == paddle::DataType::INT64);
-  CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
+  // TODO(JiabinYang): CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
+
   CHECK(TestDtype<int8_t>() == paddle::DataType::INT8);
 }
 

From c775ea7c39b16b53a341a58849a144db851611f3 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 14:58:21 +0000
Subject: [PATCH 71/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index e7e6976883017..e7a628d72f43f 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -21,9 +21,11 @@ paddle::Tensor InitGPUTensorForTest() {
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
   t1.reshape(tensor_shape);
   t1.mutable_data<float>(paddle::PlaceType::kGPU);
+  std::cout << "im 1" << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
     t1.data<float>()[i] = 5;
   }
+  std::cout << "im 2" << std::endl;
   return t1;
 }
 
@@ -40,11 +42,15 @@ paddle::Tensor InitCPUTensorForTest() {
 template <typename T>
 void TestCopyToCpuFromGpuTensor() {
   auto t1 = InitGPUTensorForTest();
+  std::cout << "im 3" << std::endl;
   auto t1_cpu_cp = t1.copy_to_cpu<T>();
+  std::cout << "im 4" << std::endl;
   CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place()));
+  std::cout << "t1 sizeL: " << t1_cpu_cp.size() << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
   }
+  std::cout << "im 5" << std::endl;
 }
 
 template <typename T>
@@ -52,6 +58,7 @@ void TestCopyToGPUFromCpuTensor() {
   auto t1 = InitGPUTensorForTest();
   auto t1_gpu_cp = t1.copy_to_gpu<T>();
   CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place()));
+  std::cout << "im 2" << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_gpu_cp.template data<T>()[i], 5);
   }

From db42afc98a8a6f5c5b204ff4f5c338e62fe06e64 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 15:02:04 +0000
Subject: [PATCH 72/83] add more test

---
 .../custom_op/test_simple_custom_op_setup.py  | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
index 4e2befd9a9da9..2e0301c8338dc 100644
--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
@@ -86,16 +86,16 @@ def setUp(self):
         self.dtypes = ['float32', 'float64']
         self.devices = ['cpu', 'gpu']
 
-    def test_static(self):
-        for device in self.devices:
-            for dtype in self.dtypes:
-                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-                out = relu2_static(self.custom_op, device, dtype, x)
-                pd_out = relu2_static(self.custom_op, device, dtype, x, False)
-                self.assertTrue(
-                    np.array_equal(out, pd_out),
-                    "custom op out: {},\n paddle api out: {}".format(out,
-                                                                     pd_out))
+    # def test_static(self):
+    #     for device in self.devices:
+    #         for dtype in self.dtypes:
+    #             x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+    #             out = relu2_static(self.custom_op, device, dtype, x)
+    #             pd_out = relu2_static(self.custom_op, device, dtype, x, False)
+    #             self.assertTrue(
+    #                 np.array_equal(out, pd_out),
+    #                 "custom op out: {},\n paddle api out: {}".format(out,
+    #                                                                  pd_out))
 
     def test_dynamic(self):
         for device in self.devices:

From 2243035efa87179e825eed2f53f06e81a079f100 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 15:12:10 +0000
Subject: [PATCH 73/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index e7a628d72f43f..11231484b67c7 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -20,10 +20,10 @@ paddle::Tensor InitGPUTensorForTest() {
   std::vector<int> tensor_shape = {5, 5};
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
   t1.reshape(tensor_shape);
-  t1.mutable_data<float>(paddle::PlaceType::kGPU);
+  auto* p_data_ptr = t1.mutable_data<float>(paddle::PlaceType::kGPU);
   std::cout << "im 1" << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
-    t1.data<float>()[i] = 5;
+    p_data_ptr[i] = 5;
   }
   std::cout << "im 2" << std::endl;
   return t1;
@@ -33,9 +33,9 @@ paddle::Tensor InitCPUTensorForTest() {
   std::vector<int> tensor_shape = {5, 5};
   auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
   t1.reshape(tensor_shape);
-  t1.mutable_data<float>(paddle::PlaceType::kCPU);
+  auto* p_data_ptr = t1.mutable_data<float>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t1.size(); i++) {
-    t1.data<float>()[i] = 5;
+    p_data_ptr[i] = 5;
   }
   return t1;
 }

From c634ab0a19bbbde1489cb8b7c1a0205b1e7a3eab Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 15:57:48 +0000
Subject: [PATCH 74/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc  |  1 +
 .../custom_op/test_simple_custom_op_setup.py  | 20 +++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 11231484b67c7..ce39d8f7bd12d 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -23,6 +23,7 @@ paddle::Tensor InitGPUTensorForTest() {
   auto* p_data_ptr = t1.mutable_data<float>(paddle::PlaceType::kGPU);
   std::cout << "im 1" << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
+    std::cout << "im 1.1" << std::endl;
     p_data_ptr[i] = 5;
   }
   std::cout << "im 2" << std::endl;
diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
index 2e0301c8338dc..4e2befd9a9da9 100644
--- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
+++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py
@@ -86,16 +86,16 @@ def setUp(self):
         self.dtypes = ['float32', 'float64']
         self.devices = ['cpu', 'gpu']
 
-    # def test_static(self):
-    #     for device in self.devices:
-    #         for dtype in self.dtypes:
-    #             x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
-    #             out = relu2_static(self.custom_op, device, dtype, x)
-    #             pd_out = relu2_static(self.custom_op, device, dtype, x, False)
-    #             self.assertTrue(
-    #                 np.array_equal(out, pd_out),
-    #                 "custom op out: {},\n paddle api out: {}".format(out,
-    #                                                                  pd_out))
+    def test_static(self):
+        for device in self.devices:
+            for dtype in self.dtypes:
+                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+                out = relu2_static(self.custom_op, device, dtype, x)
+                pd_out = relu2_static(self.custom_op, device, dtype, x, False)
+                self.assertTrue(
+                    np.array_equal(out, pd_out),
+                    "custom op out: {},\n paddle api out: {}".format(out,
+                                                                     pd_out))
 
     def test_dynamic(self):
         for device in self.devices:

From 46f875892c978fd3dc98576f8e5917d78100d874 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:09:05 +0000
Subject: [PATCH 75/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 22 +++++---------------
 1 file changed, 5 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index ce39d8f7bd12d..09352a7be9c8c 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -16,20 +16,6 @@
 #include "paddle/extension.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 
-paddle::Tensor InitGPUTensorForTest() {
-  std::vector<int> tensor_shape = {5, 5};
-  auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
-  t1.reshape(tensor_shape);
-  auto* p_data_ptr = t1.mutable_data<float>(paddle::PlaceType::kGPU);
-  std::cout << "im 1" << std::endl;
-  for (int64_t i = 0; i < t1.size(); i++) {
-    std::cout << "im 1.1" << std::endl;
-    p_data_ptr[i] = 5;
-  }
-  std::cout << "im 2" << std::endl;
-  return t1;
-}
-
 paddle::Tensor InitCPUTensorForTest() {
   std::vector<int> tensor_shape = {5, 5};
   auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
@@ -42,7 +28,7 @@ paddle::Tensor InitCPUTensorForTest() {
 }
 template <typename T>
 void TestCopyToCpuFromGpuTensor() {
-  auto t1 = InitGPUTensorForTest();
+  auto t1 = InitCPUTensorForTest();
   std::cout << "im 3" << std::endl;
   auto t1_cpu_cp = t1.copy_to_cpu<T>();
   std::cout << "im 4" << std::endl;
@@ -56,12 +42,14 @@ void TestCopyToCpuFromGpuTensor() {
 
 template <typename T>
 void TestCopyToGPUFromCpuTensor() {
-  auto t1 = InitGPUTensorForTest();
+  auto t1 = InitCPUTensorForTest();
   auto t1_gpu_cp = t1.copy_to_gpu<T>();
   CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place()));
   std::cout << "im 2" << std::endl;
+  auto rlt = t1_gpu_cp.template copy_to_cpu<T>();
+  std::cout << "im 2.3" << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
-    CHECK_EQ(t1_gpu_cp.template data<T>()[i], 5);
+    CHECK_EQ(rlt.template data<T>()[i], 5);
   }
 }
 

From d912a99069a6450c93bcf868c43ee7101d210093 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:13:50 +0000
Subject: [PATCH 76/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 09352a7be9c8c..b1b60972040ba 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -29,15 +29,11 @@ paddle::Tensor InitCPUTensorForTest() {
 template <typename T>
 void TestCopyToCpuFromGpuTensor() {
   auto t1 = InitCPUTensorForTest();
-  std::cout << "im 3" << std::endl;
   auto t1_cpu_cp = t1.copy_to_cpu<T>();
-  std::cout << "im 4" << std::endl;
   CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place()));
-  std::cout << "t1 sizeL: " << t1_cpu_cp.size() << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
   }
-  std::cout << "im 5" << std::endl;
 }
 
 template <typename T>
@@ -45,9 +41,7 @@ void TestCopyToGPUFromCpuTensor() {
   auto t1 = InitCPUTensorForTest();
   auto t1_gpu_cp = t1.copy_to_gpu<T>();
   CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place()));
-  std::cout << "im 2" << std::endl;
   auto rlt = t1_gpu_cp.template copy_to_cpu<T>();
-  std::cout << "im 2.3" << std::endl;
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(rlt.template data<T>()[i], 5);
   }
@@ -93,8 +87,12 @@ void GroupTestDtype() {
 }
 
 TEST(CustomTensor, copyTest) {
+  VLOG(0) << "TestCopy";
   GroupTestCopy();
+  VLOG(0) << "TestDtype";
   GroupTestDtype();
+  VLOG(0) << "TestShape";
   TestAPISizeAndShape();
+  VLOG(0) << "Test Place";
   TestAPIPlace();
 }

From 4735e8dafbb9429b56c056cf388b1ad9b9e76910 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:25:50 +0000
Subject: [PATCH 77/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 33 ++++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index b1b60972040ba..d2df359e548d6 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -16,34 +16,33 @@
 #include "paddle/extension.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 
+template <typename T>
 paddle::Tensor InitCPUTensorForTest() {
   std::vector<int> tensor_shape = {5, 5};
   auto t1 = paddle::Tensor(paddle::PlaceType::kCPU);
   t1.reshape(tensor_shape);
-  auto* p_data_ptr = t1.mutable_data<float>(paddle::PlaceType::kCPU);
+  auto* p_data_ptr = t1.mutable_data<T>(paddle::PlaceType::kCPU);
   for (int64_t i = 0; i < t1.size(); i++) {
     p_data_ptr[i] = 5;
   }
   return t1;
 }
 template <typename T>
-void TestCopyToCpuFromGpuTensor() {
-  auto t1 = InitCPUTensorForTest();
-  auto t1_cpu_cp = t1.copy_to_cpu<T>();
+void TestCopyTensor() {
+  auto t1 = InitCPUTensorForTest<T>();
+  auto t1_cpu_cp = t1.template copy_to_cpu<T>();
   CHECK((paddle::PlaceType::kCPU == t1_cpu_cp.place()));
   for (int64_t i = 0; i < t1.size(); i++) {
     CHECK_EQ(t1_cpu_cp.template data<T>()[i], 5);
   }
-}
-
-template <typename T>
-void TestCopyToGPUFromCpuTensor() {
-  auto t1 = InitCPUTensorForTest();
-  auto t1_gpu_cp = t1.copy_to_gpu<T>();
+  auto t1_gpu_cp = t1_cpu_cp.template copy_to_gpu<T>();
   CHECK((paddle::PlaceType::kGPU == t1_gpu_cp.place()));
-  auto rlt = t1_gpu_cp.template copy_to_cpu<T>();
+  auto t1_gpu_cp_cp = t1_gpu_cp.template copy_to_gpu<T>();
+  CHECK((paddle::PlaceType::kGPU == t1_gpu_cp_cp.place()));
+  auto t1_gpu_cp_cp_cpu = t1_gpu_cp.template copy_to_cpu<T>();
+  CHECK((paddle::PlaceType::kCPU == t1_gpu_cp_cp_cpu.place()));
   for (int64_t i = 0; i < t1.size(); i++) {
-    CHECK_EQ(rlt.template data<T>()[i], 5);
+    CHECK_EQ(t1_gpu_cp_cp_cpu.template data<T>()[i], 5);
   }
 }
 
@@ -71,10 +70,10 @@ paddle::DataType TestDtype() {
 }
 
 void GroupTestCopy() {
-  TestCopyToCpuFromGpuTensor<float>();
-  TestCopyToCpuFromGpuTensor<double>();
-  TestCopyToGPUFromCpuTensor<float>();
-  TestCopyToGPUFromCpuTensor<double>();
+  VLOG(0) << "Float cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<float>();
+  VLOG(0) << "Double cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<double>();
 }
 void GroupTestDtype() {
   CHECK(TestDtype<float>() == paddle::DataType::FLOAT32);
@@ -93,6 +92,6 @@ TEST(CustomTensor, copyTest) {
   GroupTestDtype();
   VLOG(0) << "TestShape";
   TestAPISizeAndShape();
-  VLOG(0) << "Test Place";
+  VLOG(0) << "TestPlace";
   TestAPIPlace();
 }

From 4d78356f62f333f4e4eafd6154dec5146e0789ad Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:37:16 +0000
Subject: [PATCH 78/83] add more test

---
 paddle/fluid/extension/src/tensor.cc         |  2 ++
 paddle/fluid/framework/custom_tensor_test.cc | 29 ++++++++++++++++++--
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 7fb91532e21f5..1a4bfd7b31c3e 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -85,6 +85,8 @@ DataType Tensor::type() const {
     return DataType::INT32;
   } else if (type == framework::proto::VarType::INT16) {
     return DataType::INT16;
+  } else if (type == framework::proto::VarType::INT8) {
+    return DataType::INT8;
   } else if (type == framework::proto::VarType::UINT8) {
     return DataType::UINT8;
   } else if (type == framework::proto::VarType::FP64) {
diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index d2df359e548d6..bab771297b883 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -60,6 +60,7 @@ void TestAPISizeAndShape() {
   CHECK_EQ(t1.size(), 25);
   CHECK(t1.shape() == tensor_shape);
 }
+
 template <typename T>
 paddle::DataType TestDtype() {
   std::vector<int> tensor_shape = {5, 5};
@@ -74,15 +75,39 @@ void GroupTestCopy() {
   TestCopyTensor<float>();
   VLOG(0) << "Double cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<double>();
+  VLOG(0) << "Fp16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::platform::float16>();
+  VLOG(0) << "BF16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::platform::bfloat16>();
+  VLOG(0) << "complex128 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::platform::complex128>();
+  VLOG(0) << "complex64 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<paddle::platform::complex64>();
+  VLOG(0) << "int cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<int>();
+  VLOG(0) << "int64 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<int64_t>();
+  VLOG(0) << "int16 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<int16_t>();
+  VLOG(0) << "int8 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<int8_t>();
+  VLOG(0) << "uint8 cpu-cpu-gpu-gpu-cpu";
+  TestCopyTensor<u_int8_t>();
 }
 void GroupTestDtype() {
   CHECK(TestDtype<float>() == paddle::DataType::FLOAT32);
   CHECK(TestDtype<double>() == paddle::DataType::FLOAT64);
+  CHECK(TestDtype<paddle::platform::float16>() == paddle::DataType::FLOAT16);
+  CHECK(TestDtype<paddle::platform::bfloat16>() == paddle::DataType::BFLOAT16);
+  CHECK(TestDtype<paddle::platform::complex128>() ==
+        paddle::DataType::COMPLEX128);
+  CHECK(TestDtype<paddle::platform::complex64>() ==
+        paddle::DataType::COMPLEX64);
   CHECK(TestDtype<int>() == paddle::DataType::INT32);
   CHECK(TestDtype<int64_t>() == paddle::DataType::INT64);
-  // TODO(JiabinYang): CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
-
+  CHECK(TestDtype<int16_t>() == paddle::DataType::INT16);
   CHECK(TestDtype<int8_t>() == paddle::DataType::INT8);
+  CHECK(TestDtype<u_int8_t>() == paddle::DataType::UINT8);
 }
 
 TEST(CustomTensor, copyTest) {

From d886e9badc57f69653e7225f799bb088681530a5 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:42:00 +0000
Subject: [PATCH 79/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index bab771297b883..699dc8868c33c 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -75,15 +75,16 @@ void GroupTestCopy() {
   TestCopyTensor<float>();
   VLOG(0) << "Double cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<double>();
-  VLOG(0) << "Fp16 cpu-cpu-gpu-gpu-cpu";
-  TestCopyTensor<paddle::platform::float16>();
-  VLOG(0) << "BF16 cpu-cpu-gpu-gpu-cpu";
-  TestCopyTensor<paddle::platform::bfloat16>();
-  VLOG(0) << "complex128 cpu-cpu-gpu-gpu-cpu";
-  TestCopyTensor<paddle::platform::complex128>();
-  VLOG(0) << "complex64 cpu-cpu-gpu-gpu-cpu";
-  TestCopyTensor<paddle::platform::complex64>();
-  VLOG(0) << "int cpu-cpu-gpu-gpu-cpu";
+  // TODO(JiabinYang): Support these tests later
+  //  VLOG(0) << "Fp16 cpu-cpu-gpu-gpu-cpu";
+  //  TestCopyTensor<paddle::platform::float16>();
+  //  VLOG(0) << "BF16 cpu-cpu-gpu-gpu-cpu";
+  //  TestCopyTensor<paddle::platform::bfloat16>();
+  //  VLOG(0) << "complex128 cpu-cpu-gpu-gpu-cpu";
+  //  TestCopyTensor<paddle::platform::complex128>();
+  //  VLOG(0) << "complex64 cpu-cpu-gpu-gpu-cpu";
+  //  TestCopyTensor<paddle::platform::complex64>();
+  //  VLOG(0) << "int cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<int>();
   VLOG(0) << "int64 cpu-cpu-gpu-gpu-cpu";
   TestCopyTensor<int64_t>();

From 43ed2a7200dbc34253543582295564603f1b087a Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:48:32 +0000
Subject: [PATCH 80/83] add more test

---
 paddle/fluid/extension/src/tensor.cc | 37 ++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)

diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index 1a4bfd7b31c3e..d291dfef1b156 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -169,19 +169,29 @@ Tensor Tensor::copy_to_cpu() {
   return target;
 }
 
+template Tensor Tensor::copy_to_gpu<paddle::platform::float16>();
+template Tensor Tensor::copy_to_gpu<paddle::platform::bfloat16>();
+template Tensor Tensor::copy_to_gpu<paddle::platform::complex64>();
+template Tensor Tensor::copy_to_gpu<paddle::platform::complex128>();
 template Tensor Tensor::copy_to_gpu<float>();
 template Tensor Tensor::copy_to_gpu<double>();
 template Tensor Tensor::copy_to_gpu<int64_t>();
 template Tensor Tensor::copy_to_gpu<int32_t>();
 template Tensor Tensor::copy_to_gpu<uint8_t>();
 template Tensor Tensor::copy_to_gpu<int8_t>();
+template Tensor Tensor::copy_to_gpu<int16_t>();
 
+template Tensor Tensor::copy_to_cpu<paddle::platform::float16>();
+template Tensor Tensor::copy_to_cpu<paddle::platform::bfloat16>();
+template Tensor Tensor::copy_to_cpu<paddle::platform::complex64>();
+template Tensor Tensor::copy_to_cpu<paddle::platform::complex128>();
 template Tensor Tensor::copy_to_cpu<float>();
 template Tensor Tensor::copy_to_cpu<double>();
 template Tensor Tensor::copy_to_cpu<int64_t>();
 template Tensor Tensor::copy_to_cpu<int32_t>();
 template Tensor Tensor::copy_to_cpu<uint8_t>();
 template Tensor Tensor::copy_to_cpu<int8_t>();
+template Tensor Tensor::copy_to_cpu<int16_t>();
 
 template float *Tensor::data<float>() const;
 template double *Tensor::data<double>() const;
@@ -189,6 +199,15 @@ template int64_t *Tensor::data<int64_t>() const;
 template int32_t *Tensor::data<int32_t>() const;
 template uint8_t *Tensor::data<uint8_t>() const;
 template int8_t *Tensor::data<int8_t>() const;
+template paddle::platform::float16 *Tensor::data<paddle::platform::float16>()
+    const;
+template paddle::platform::bfloat16 *Tensor::data<paddle::platform::bfloat16>()
+    const;
+template paddle::platform::complex128 *
+Tensor::data<paddle::platform::complex128>() const;
+template paddle::platform::complex64 *
+Tensor::data<paddle::platform::complex64>() const;
+template int16_t *Tensor::data<int16_t>() const;
 
 template float *Tensor::mutable_data<float>();
 template double *Tensor::mutable_data<double>();
@@ -196,6 +215,15 @@ template int64_t *Tensor::mutable_data<int64_t>();
 template int32_t *Tensor::mutable_data<int32_t>();
 template uint8_t *Tensor::mutable_data<uint8_t>();
 template int8_t *Tensor::mutable_data<int8_t>();
+template paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>();
+template paddle::platform::bfloat16 *
+Tensor::mutable_data<paddle::platform::bfloat16>();
+template paddle::platform::complex128 *
+Tensor::mutable_data<paddle::platform::complex128>();
+template paddle::platform::complex64 *
+Tensor::mutable_data<paddle::platform::complex64>();
+template int16_t *Tensor::mutable_data<int16_t>();
 
 template float *Tensor::mutable_data<float>(const PlaceType &place);
 template double *Tensor::mutable_data<double>(const PlaceType &place);
@@ -203,6 +231,15 @@ template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
 template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
 template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
 template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
+template paddle::platform::float16 *
+Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
+template paddle::platform::bfloat16 *
+Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
+template paddle::platform::complex128 *
+Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
+template paddle::platform::complex64 *
+Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
+template int16_t *Tensor::mutable_data<int16_t>(const PlaceType &place);
 
 std::vector<int> Tensor::shape() const {
   GET_CASTED_TENSOR

From 0e7f2861a4c8eaf2f402d05b2272511edb434b83 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:54:38 +0000
Subject: [PATCH 81/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 699dc8868c33c..5e054acd2fee0 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -48,7 +48,9 @@ void TestCopyTensor() {
 
 void TestAPIPlace() {
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
+  t1.mutable_data<float>();
   auto t2 = paddle::Tensor(paddle::PlaceType::kCPU);
+  t2.mutable_data<float>();
   CHECK((paddle::PlaceType::kGPU == t1.place()));
   CHECK((paddle::PlaceType::kCPU == t2.place()));
 }

From d12969c94666b07c1d3e990185463b22f0a5b1cd Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Sun, 7 Feb 2021 16:58:03 +0000
Subject: [PATCH 82/83] add more test

---
 paddle/fluid/framework/custom_tensor_test.cc | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/paddle/fluid/framework/custom_tensor_test.cc b/paddle/fluid/framework/custom_tensor_test.cc
index 5e054acd2fee0..6688bedee2685 100644
--- a/paddle/fluid/framework/custom_tensor_test.cc
+++ b/paddle/fluid/framework/custom_tensor_test.cc
@@ -47,9 +47,12 @@ void TestCopyTensor() {
 }
 
 void TestAPIPlace() {
+  std::vector<int> tensor_shape = {5, 5};
   auto t1 = paddle::Tensor(paddle::PlaceType::kGPU);
+  t1.reshape(tensor_shape);
   t1.mutable_data<float>();
   auto t2 = paddle::Tensor(paddle::PlaceType::kCPU);
+  t2.reshape(tensor_shape);
   t2.mutable_data<float>();
   CHECK((paddle::PlaceType::kGPU == t1.place()));
   CHECK((paddle::PlaceType::kCPU == t2.place()));

From 34af5ab348f1d215fe156d23a5d03d39140bcc31 Mon Sep 17 00:00:00 2001
From: JiabinYang <marsyang199376@gmail.com>
Date: Mon, 8 Feb 2021 03:56:04 +0000
Subject: [PATCH 83/83] add more test

---
 paddle/fluid/extension/include/dtype.h    |  2 +-
 paddle/fluid/extension/include/tensor.h   | 23 ++++++++++++++---------
 paddle/fluid/extension/src/tensor.cc      |  8 ++++----
 paddle/fluid/framework/custom_operator.cc |  3 ---
 4 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h
index 292071a479f59..e01e94a6a726d 100644
--- a/paddle/fluid/extension/include/dtype.h
+++ b/paddle/fluid/extension/include/dtype.h
@@ -32,7 +32,7 @@ enum DataType {
   INT16,
   UINT8,
   INT8,
-  // TODO(Superjomn) support more data types if needed.
+  // TODO(JiabinYang) support more data types if needed.
 };
 
 }  // namespace paddle
diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h
index 372e5b8453604..bf6cd63f24d15 100644
--- a/paddle/fluid/extension/include/tensor.h
+++ b/paddle/fluid/extension/include/tensor.h
@@ -28,21 +28,24 @@ class Tensor {
   explicit Tensor(const PlaceType& place);
   /// \brief Reset the shape of the tensor.
   /// Generally it's only used for the input tensor.
-  /// Reshape must be called before calling mutable_data() or copy_from_cpu()
+  /// Reshape must be called before calling
+  /// mutable_data() or copy_from_cpu()
   /// \param shape The shape to set.
   void reshape(const std::vector<int>& shape);
 
-  /// \brief Get the memory pointer in CPU or GPU with specific data type.
+  /// \brief Get the memory pointer in CPU or GPU with
+  /// specific data type.
   /// Please Reshape the tensor first before call this.
   /// It's usually used to get input data pointer.
-  /// \param place The place of the tensor this will override the original place
-  /// of current tensor.
+  /// \param place The place of the tensor; this will
+  /// override the original place of the current tensor.
   template <typename T>
   T* mutable_data(const PlaceType& place);
 
-  /// \brief Get the memory pointer in CPU or GPU with specific data type.
-  /// Please Reshape the tensor first before call this.
-  /// It's usually used to get input data pointer.
+  /// \brief Get the memory pointer in CPU or GPU with
+  /// specific data type. Please reshape the tensor
+  /// first before calling this. It's usually used to
+  /// get the input data pointer.
   template <typename T>
   T* mutable_data();
 
@@ -54,13 +57,15 @@ class Tensor {
 
   /// \brief Copy the host memory to tensor data.
   /// It's usually used to set the input tensor data.
-  /// \param data The pointer of the data, from which the tensor will copy.
+  /// \param data The pointer of the data, from which
+  /// the tensor will copy.
   template <typename T>
   Tensor copy_to_gpu();
 
   /// \brief Copy the tensor data to the host memory.
   /// It's usually used to get the output tensor data.
-  /// \param[out] data The tensor will copy the data to the address.
+  /// \param[out] data The tensor will copy the data to
+  /// the address.
   template <typename T>
   Tensor copy_to_cpu();
 
diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc
index d291dfef1b156..35fec36b04cea 100644
--- a/paddle/fluid/extension/src/tensor.cc
+++ b/paddle/fluid/extension/src/tensor.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/extension/include/tensor.h"
 #include <utility>
-#include "paddle/fluid/extension/include/all.h"
 #include "paddle/fluid/framework/custom_tensor_utils.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -133,8 +133,8 @@ Tensor Tensor::copy_to_gpu() {
   cudaStreamSynchronize(dev_ctx->stream());
   return target;
 #else
-  PADDLE_THROW(platform::errors::Unavailable(
-      "Not compiled with CUDA, should not reach here."));
+  PADDLE_THROW(
+      platform::errors::Unavailable("PaddlePaddle is not compiled with CUDA"));
 #endif
   return Tensor(PlaceType::kGPU);
 }
@@ -163,7 +163,7 @@ Tensor Tensor::copy_to_cpu() {
     cudaStreamSynchronize(dev_ctx->stream());
 #else
     PADDLE_THROW(platform::errors::Unavailable(
-        "Not compile with CUDA, should not reach here."));
+        "PaddlePaddle is not compiled with CUDA."));
 #endif
   }
   return target;
diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc
index 153a3c92d32df..2621f7ab4e269 100644
--- a/paddle/fluid/framework/custom_operator.cc
+++ b/paddle/fluid/framework/custom_operator.cc
@@ -126,7 +126,6 @@ proto::VarType::Type ConvertEnumDTypeToInnerDType(
       return proto::VarType::INT32;
     case paddle::DataType::INT64:
       return proto::VarType::INT64;
-    // TODO(chenweihang):
     default:
       PADDLE_THROW(platform::errors::Unimplemented("Unsupported data type."));
   }
@@ -157,8 +156,6 @@ paddle::DataType ConvertInnerDTypeToEnumDType(
       return paddle::DataType::UINT8;
     case proto::VarType::INT16:
       return paddle::DataType::INT16;
-
-    // TODO(chenweihang):
     default:
       PADDLE_THROW(platform::errors::Unimplemented("Unsupported data type."));
   }