
refactor copy to api && change Reshape to lowercase && support more dtype && add more test #3

Merged
89 commits merged on Feb 8, 2021
Commits:
136e4de
fix compile error
JiabinYang Jan 27, 2021
2e4fc0d
wrap framework tensor with LoDTensor
JiabinYang Jan 29, 2021
618d917
fix compile error
JiabinYang Jan 29, 2021
d49f476
fix compile error
JiabinYang Jan 29, 2021
d14cd51
fix compile error
JiabinYang Jan 29, 2021
4e719fc
fix compile error
JiabinYang Jan 29, 2021
87059c5
fix compile error
JiabinYang Jan 29, 2021
2ef89a5
add CustomTensor default constructor
JiabinYang Feb 1, 2021
f217ccb
add size() for CustomTensor
JiabinYang Feb 1, 2021
7f9f1cd
make size const for CustomTensor
JiabinYang Feb 1, 2021
8102863
refactor place related api to circle the concept
JiabinYang Feb 1, 2021
74bfc55
merge new op_functor
JiabinYang Feb 1, 2021
c5b3b5c
fix compile error
JiabinYang Feb 1, 2021
c67e36f
fix compile error
JiabinYang Feb 1, 2021
bb4c295
fix compile error
JiabinYang Feb 1, 2021
d416fdb
fix compile error
JiabinYang Feb 1, 2021
8cc60ec
fix compile error
JiabinYang Feb 1, 2021
1dccc2d
fix compile error
JiabinYang Feb 1, 2021
4b304f2
fix compile error
JiabinYang Feb 1, 2021
dcda6cd
fix compile error
JiabinYang Feb 1, 2021
bec954f
fix compile error
JiabinYang Feb 1, 2021
2c5edac
fix compile error
JiabinYang Feb 1, 2021
6990b99
fix compile error
JiabinYang Feb 1, 2021
55b6a13
fix compile error
JiabinYang Feb 1, 2021
abaa67e
fix compile error
JiabinYang Feb 2, 2021
f8b23d4
fix compile error
JiabinYang Feb 2, 2021
ce4ecd0
fix compile error
JiabinYang Feb 2, 2021
0bb004c
fix compile error
JiabinYang Feb 2, 2021
33ad438
fix compile error
JiabinYang Feb 2, 2021
2e433cc
fix compile error
JiabinYang Feb 2, 2021
4e26c71
merge final op_function
JiabinYang Feb 2, 2021
6c1752e
make place const
JiabinYang Feb 2, 2021
a4d190b
make Tensor copy
JiabinYang Feb 2, 2021
b9dde0a
debug CustomTensor core
JiabinYang Feb 3, 2021
219746a
debug CustomTensor core
JiabinYang Feb 3, 2021
bedd624
debug CustomTensor core
JiabinYang Feb 3, 2021
a148ea2
debug CustomTensor core
JiabinYang Feb 3, 2021
1757e3a
debug CustomTensor core
JiabinYang Feb 3, 2021
1815a0f
debug CustomTensor core
JiabinYang Feb 3, 2021
b1e94cd
debug CustomTensor core
JiabinYang Feb 3, 2021
dbd0e17
debug CustomTensor core
JiabinYang Feb 3, 2021
984d11f
debug CustomTensor core
JiabinYang Feb 3, 2021
1d2eae7
debug CustomTensor core
JiabinYang Feb 3, 2021
eda48e8
debug CustomTensor core
JiabinYang Feb 3, 2021
284125c
debug CustomTensor core
JiabinYang Feb 3, 2021
0851daa
debug CustomTensor core
JiabinYang Feb 3, 2021
ea98ccb
debug CustomTensor core
JiabinYang Feb 3, 2021
e04bd30
remove additional head of framework
JiabinYang Feb 3, 2021
1c0cd18
use back to shared ptr for custom tensor
JiabinYang Feb 3, 2021
aa09b08
use back to shared ptr for custom tensor
JiabinYang Feb 3, 2021
330b650
use back to shared ptr for custom tensor
JiabinYang Feb 3, 2021
743a91f
use back to shared ptr for custom tensor
JiabinYang Feb 3, 2021
9b8917b
use back to shared ptr for custom tensor
JiabinYang Feb 3, 2021
627fa2e
use back to shared ptr for custom tensor
JiabinYang Feb 3, 2021
7ecffc0
add gpu test
JiabinYang Feb 4, 2021
a52b8ee
merge latest cwh code in
JiabinYang Feb 4, 2021
687c9ef
merge latest cwh code in
JiabinYang Feb 4, 2021
a9cd76a
adjust ut code of custom op
JiabinYang Feb 4, 2021
2afe58a
adjust ut code of custom op
JiabinYang Feb 4, 2021
5693375
adjust ut code of custom op
JiabinYang Feb 4, 2021
0332e29
adjust ut code of custom op
JiabinYang Feb 5, 2021
a9a7550
adjust ut code of custom op
JiabinYang Feb 5, 2021
9aa0d69
hid share data from and to
JiabinYang Feb 5, 2021
6bbea36
rename CustomTensor to Tensor
JiabinYang Feb 5, 2021
0e66ee9
merge cwh code
JiabinYang Feb 5, 2021
3fb3f0a
support multi dtype
JiabinYang Feb 7, 2021
dc18813
remove lod, make reshape lowercase, add copy test and refactor copy api
JiabinYang Feb 7, 2021
a83c469
remove lod, make reshape lowercase, add copy test and refactor copy api
JiabinYang Feb 7, 2021
df6ba59
remove lod, make reshape lowercase, add copy test and refactor copy api
JiabinYang Feb 7, 2021
5272c85
remove lod, make reshape lowercase, add copy test and refactor copy api
JiabinYang Feb 7, 2021
cae22da
merge cwh code and add more dtype && change PaddleDtype to DataType
JiabinYang Feb 7, 2021
19a8ff7
fix copy to error
JiabinYang Feb 7, 2021
1b6ecf6
merge cwh code
JiabinYang Feb 7, 2021
07d3795
add more test
JiabinYang Feb 7, 2021
49ed21c
add more test
JiabinYang Feb 7, 2021
9288fff
add more test
JiabinYang Feb 7, 2021
c775ea7
add more test
JiabinYang Feb 7, 2021
db42afc
add more test
JiabinYang Feb 7, 2021
2243035
add more test
JiabinYang Feb 7, 2021
c634ab0
add more test
JiabinYang Feb 7, 2021
46f8758
add more test
JiabinYang Feb 7, 2021
d912a99
add more test
JiabinYang Feb 7, 2021
4735e8d
add more test
JiabinYang Feb 7, 2021
4d78356
add more test
JiabinYang Feb 7, 2021
d886e9b
add more test
JiabinYang Feb 7, 2021
43ed2a7
add more test
JiabinYang Feb 7, 2021
0e7f286
add more test
JiabinYang Feb 7, 2021
d12969c
add more test
JiabinYang Feb 7, 2021
34af5ab
add more test
JiabinYang Feb 8, 2021
11 changes: 10 additions & 1 deletion paddle/fluid/extension/include/dtype.h
@@ -13,17 +13,26 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"
#include "paddle/fluid/platform/float16.h"

namespace paddle {

enum DataType {
FLOAT32,
FLOAT64,
BFLOAT16,
COMPLEX128,
COMPLEX64,
FLOAT16,
INT64,
INT32,
INT16,
UINT8,
INT8,
// TODO(yangjiabin): Add other dtype support in next PR
// TODO(Superjomn) support more data types if needed.
};

} // namespace paddle
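The expanded enum is consumed by `Tensor::type()` in tensor.cc below, which maps the framework's `proto::VarType` values onto these `DataType` values, falling back to `FLOAT32`. A minimal self-contained sketch of that mapping (the enums here are simplified stand-ins, not Paddle's actual definitions):

```cpp
#include <cassert>

// Simplified stand-ins for framework::proto::VarType and paddle::DataType;
// the names mirror the PR, but these enums are illustrative, not Paddle's.
enum class VarType { FP32, FP64, BF16, FP16, INT64, INT32, INT16, INT8, UINT8 };
enum class DataType { FLOAT32, FLOAT64, BFLOAT16, FLOAT16, INT64, INT32, INT16, INT8, UINT8 };

// Mirrors the if-else chain in Tensor::type(): every known VarType gets a
// DataType, and anything unrecognized falls back to FLOAT32.
DataType ToDataType(VarType t) {
  switch (t) {
    case VarType::FP64:  return DataType::FLOAT64;
    case VarType::BF16:  return DataType::BFLOAT16;
    case VarType::FP16:  return DataType::FLOAT16;
    case VarType::INT64: return DataType::INT64;
    case VarType::INT32: return DataType::INT32;
    case VarType::INT16: return DataType::INT16;
    case VarType::INT8:  return DataType::INT8;
    case VarType::UINT8: return DataType::UINT8;
    default:             return DataType::FLOAT32;
  }
}
```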
15 changes: 3 additions & 12 deletions paddle/fluid/extension/include/tensor.h
@@ -16,7 +16,6 @@ limitations under the License. */

#include <memory>
#include <vector>

#include "paddle/fluid/extension/include/dtype.h"
#include "paddle/fluid/extension/include/place.h"

@@ -31,7 +30,7 @@ class Tensor {
/// Generally it's only used for the input tensor.
/// Reshape must be called before calling mutable_data() or copy_from_cpu()
/// \param shape The shape to set.
void Reshape(const std::vector<int>& shape);
void reshape(const std::vector<int>& shape);

/// \brief Get the memory pointer in CPU or GPU with specific data type.
/// Please Reshape the tensor first before call this.
@@ -57,25 +56,17 @@
/// It's usually used to set the input tensor data.
/// \param data The pointer of the data, from which the tensor will copy.
Owner: note also need to adjust
Collaborator (author): indent?
template <typename T>
void copy_from_cpu(const T* data);
Tensor copy_to_gpu();

/// \brief Copy the tensor data to the host memory.
/// It's usually used to get the output tensor data.
/// \param[out] data The tensor will copy the data to the address.
template <typename T>
void copy_to_cpu(T* data);
Tensor copy_to_cpu();

/// \brief Return the shape of the Tensor.
std::vector<int> shape() const;

/// \brief Set lod info of the tensor.
/// More about LOD can be seen here:
/// https://www.paddlepaddle.org.cn/documentation/docs/zh/beginners_guide/basic_concept/lod_tensor.html#lodtensor
/// \param x the lod info.
void SetLoD(const std::vector<std::vector<size_t>>& x);
/// \brief Return the lod info of the tensor.
std::vector<std::vector<size_t>> lod() const;

/// \brief Return the data type of the tensor.
/// It's usually used to get the output tensor data type.
/// \return The data type of the tensor.
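The net effect of the header change: `Reshape` becomes lowercase `reshape`, and the pointer-based `copy_from_cpu(const T*)` / `copy_to_cpu(T*)` pair is replaced by value-returning `copy_to_gpu()` / `copy_to_cpu()`. A self-contained, float-only mock of the new copy-by-value call shape (an illustration with no Paddle dependency, not Paddle's implementation):

```cpp
#include <cassert>
#include <cstring>
#include <vector>

// Minimal float-only mock: reshape() is lowercase, and copy_to_cpu() returns
// a fresh Tensor rather than filling a caller-supplied pointer.
class Tensor {
 public:
  void reshape(const std::vector<int>& shape) {
    size_t n = 1;
    for (int d : shape) n *= static_cast<size_t>(d);
    shape_ = shape;
    data_.assign(n, 0.0f);  // allocate and zero, like a fresh mutable_data()
  }
  float* mutable_data() { return data_.data(); }
  std::vector<int> shape() const { return shape_; }

  // New-style copy: allocate a target with the same shape, then copy,
  // mirroring the CPU branch of Tensor::copy_to_cpu() in tensor.cc.
  Tensor copy_to_cpu() const {
    Tensor target;
    target.reshape(shape_);
    std::memcpy(target.data_.data(), data_.data(),
                data_.size() * sizeof(float));
    return target;
  }

 private:
  std::vector<int> shape_;
  std::vector<float> data_;
};
```

The value-returning form means the caller no longer has to pre-allocate a correctly sized buffer, which was an easy way to corrupt memory with the old pointer-based API.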
154 changes: 96 additions & 58 deletions paddle/fluid/extension/src/tensor.cc
@@ -12,10 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/extension/include/tensor.h"

#include <utility>

#include "paddle/fluid/extension/include/all.h"
@chenwhql (Owner) commented, Feb 8, 2021.
Collaborator (author): done
#include "paddle/fluid/framework/custom_tensor_utils.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/memory/memcpy.h"
@@ -30,7 +28,7 @@ namespace paddle {
} \
auto *tensor = static_cast<framework::LoDTensor *>(tensor_.get());

void Tensor::Reshape(const std::vector<int> &shape) {
void Tensor::reshape(const std::vector<int> &shape) {
GET_CASTED_TENSOR
tensor->Resize(framework::make_ddim(shape));
}
@@ -85,129 +83,169 @@ DataType Tensor::type() const {
return DataType::INT64;
} else if (type == framework::proto::VarType::INT32) {
return DataType::INT32;
} else if (type == framework::proto::VarType::INT16) {
return DataType::INT16;
} else if (type == framework::proto::VarType::INT8) {
return DataType::INT8;
} else if (type == framework::proto::VarType::UINT8) {
return DataType::UINT8;
} else if (type == framework::proto::VarType::FP64) {
return DataType::FLOAT64;
} else if (type == framework::proto::VarType::BF16) {
return DataType::BFLOAT16;
} else if (type == framework::proto::VarType::FP16) {
return DataType::FLOAT16;
} else if (type == framework::proto::VarType::COMPLEX64) {
return DataType::COMPLEX64;
} else if (type == framework::proto::VarType::COMPLEX128) {
return DataType::COMPLEX128;
}
return DataType::FLOAT32;
}

template <typename T>
void Tensor::copy_from_cpu(const T *data) {
Tensor Tensor::copy_to_gpu() {
#ifdef PADDLE_WITH_CUDA
GET_CASTED_TENSOR;
PADDLE_ENFORCE_GE(tensor->numel(), 0,
platform::errors::PreconditionNotMet(
"You should call Tensor::Reshape(const "
"std::vector<int> &shape)"
"function before copying data from cpu."));
size_t ele_size = tensor->numel() * sizeof(T);

if (place_ == PlaceType::kCPU) {
auto *t_data = tensor->mutable_data<T>(platform::CPUPlace());
std::memcpy(static_cast<void *>(t_data), data, ele_size);
Tensor target = Tensor(PlaceType::kGPU);
target.reshape(shape());
auto *p_target_data = target.template mutable_data<T>();
auto p_src_data = tensor->data<T>();

platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
int device_num = platform::GetCurrentDeviceId();
platform::CUDAPlace gpu_place(device_num);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
if (platform::is_cpu_place(tensor->place())) {
memory::Copy(gpu_place, static_cast<void *>(p_target_data),
platform::CPUPlace(), p_src_data, ele_size, dev_ctx->stream());
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
int device_num = platform::GetCurrentDeviceId();
platform::CUDAPlace gpu_place(device_num);
auto *t_data = tensor->mutable_data<T>(gpu_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));

memory::Copy(gpu_place, static_cast<void *>(t_data), platform::CPUPlace(),
data, ele_size, dev_ctx->stream());
memory::Copy(gpu_place, static_cast<void *>(p_target_data), gpu_place,
p_src_data, ele_size, dev_ctx->stream());
}
cudaStreamSynchronize(dev_ctx->stream());
return target;
#else
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with CUDA, should not reach here."));
PADDLE_THROW(platform::errors::Unavailable(
"Not compiled with CUDA, should not reach here."));
Owner: polish error message. "Paddle is not compiled with CUDA." is ok; "should not" is not needed.
Collaborator (author): done
#endif
}
return Tensor(PlaceType::kGPU);
}

template <typename T>
void Tensor::copy_to_cpu(T *data) {
Tensor Tensor::copy_to_cpu() {
GET_CASTED_TENSOR;
auto ele_num = tensor->numel();
auto *t_data = tensor->data<T>();
auto t_place = tensor->place();

Tensor target = Tensor(PlaceType::kCPU);
target.reshape(shape());
auto *p_target_data = target.template mutable_data<T>();
if (platform::is_cpu_place(t_place)) {
std::memcpy(static_cast<void *>(data), t_data, ele_num * sizeof(T));
std::memcpy(static_cast<void *>(p_target_data), t_data,
ele_num * sizeof(T));
} else {
#ifdef PADDLE_WITH_CUDA
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, t_place);
auto *dev_ctx =
static_cast<const platform::CUDADeviceContext *>(pool.Get(gpu_place));
memory::Copy(platform::CPUPlace(), static_cast<void *>(data), gpu_place,
t_data, ele_num * sizeof(T), dev_ctx->stream());
memory::Copy(platform::CPUPlace(), static_cast<void *>(p_target_data),
gpu_place, t_data, ele_num * sizeof(T), dev_ctx->stream());

cudaStreamSynchronize(dev_ctx->stream());
#else
PADDLE_THROW(platform::errors::Unavailable(
Owner: same as above.
Collaborator (author): done
"Not compile with CUDA, should not reach here."));
#endif
}
return target;
}

template void Tensor::copy_from_cpu<float>(const float *data);
template void Tensor::copy_from_cpu<double>(const double *data);
template void Tensor::copy_from_cpu<int64_t>(const int64_t *data);
template void Tensor::copy_from_cpu<int32_t>(const int32_t *data);
template void Tensor::copy_from_cpu<uint8_t>(const uint8_t *data);
template void Tensor::copy_from_cpu<int8_t>(const int8_t *data);

template void Tensor::copy_to_cpu<float>(float *data);
template void Tensor::copy_to_cpu<double>(double *data);
template void Tensor::copy_to_cpu<int64_t>(int64_t *data);
template void Tensor::copy_to_cpu<int32_t>(int32_t *data);
template void Tensor::copy_to_cpu<uint8_t>(uint8_t *data);
template void Tensor::copy_to_cpu<int8_t>(int8_t *data);
template Tensor Tensor::copy_to_gpu<paddle::platform::float16>();
template Tensor Tensor::copy_to_gpu<paddle::platform::bfloat16>();
template Tensor Tensor::copy_to_gpu<paddle::platform::complex64>();
template Tensor Tensor::copy_to_gpu<paddle::platform::complex128>();
template Tensor Tensor::copy_to_gpu<float>();
template Tensor Tensor::copy_to_gpu<double>();
template Tensor Tensor::copy_to_gpu<int64_t>();
template Tensor Tensor::copy_to_gpu<int32_t>();
template Tensor Tensor::copy_to_gpu<uint8_t>();
template Tensor Tensor::copy_to_gpu<int8_t>();
template Tensor Tensor::copy_to_gpu<int16_t>();

template Tensor Tensor::copy_to_cpu<paddle::platform::float16>();
template Tensor Tensor::copy_to_cpu<paddle::platform::bfloat16>();
template Tensor Tensor::copy_to_cpu<paddle::platform::complex64>();
template Tensor Tensor::copy_to_cpu<paddle::platform::complex128>();
template Tensor Tensor::copy_to_cpu<float>();
template Tensor Tensor::copy_to_cpu<double>();
template Tensor Tensor::copy_to_cpu<int64_t>();
template Tensor Tensor::copy_to_cpu<int32_t>();
template Tensor Tensor::copy_to_cpu<uint8_t>();
template Tensor Tensor::copy_to_cpu<int8_t>();
template Tensor Tensor::copy_to_cpu<int16_t>();

template float *Tensor::data<float>() const;
template double *Tensor::data<double>() const;
template int64_t *Tensor::data<int64_t>() const;
template int32_t *Tensor::data<int32_t>() const;
template uint8_t *Tensor::data<uint8_t>() const;
template int8_t *Tensor::data<int8_t>() const;
template paddle::platform::float16 *Tensor::data<paddle::platform::float16>()
const;
template paddle::platform::bfloat16 *Tensor::data<paddle::platform::bfloat16>()
const;
template paddle::platform::complex128 *
Tensor::data<paddle::platform::complex128>() const;
template paddle::platform::complex64 *
Tensor::data<paddle::platform::complex64>() const;
template int16_t *Tensor::data<int16_t>() const;

template float *Tensor::mutable_data<float>();
template double *Tensor::mutable_data<double>();
template int64_t *Tensor::mutable_data<int64_t>();
template int32_t *Tensor::mutable_data<int32_t>();
template uint8_t *Tensor::mutable_data<uint8_t>();
template int8_t *Tensor::mutable_data<int8_t>();
template paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>();
template paddle::platform::bfloat16 *
Tensor::mutable_data<paddle::platform::bfloat16>();
template paddle::platform::complex128 *
Tensor::mutable_data<paddle::platform::complex128>();
template paddle::platform::complex64 *
Tensor::mutable_data<paddle::platform::complex64>();
template int16_t *Tensor::mutable_data<int16_t>();

template float *Tensor::mutable_data<float>(const PlaceType &place);
template double *Tensor::mutable_data<double>(const PlaceType &place);
template int64_t *Tensor::mutable_data<int64_t>(const PlaceType &place);
template int32_t *Tensor::mutable_data<int32_t>(const PlaceType &place);
template uint8_t *Tensor::mutable_data<uint8_t>(const PlaceType &place);
template int8_t *Tensor::mutable_data<int8_t>(const PlaceType &place);
template paddle::platform::float16 *
Tensor::mutable_data<paddle::platform::float16>(const PlaceType &place);
template paddle::platform::bfloat16 *
Tensor::mutable_data<paddle::platform::bfloat16>(const PlaceType &place);
template paddle::platform::complex128 *
Tensor::mutable_data<paddle::platform::complex128>(const PlaceType &place);
template paddle::platform::complex64 *
Tensor::mutable_data<paddle::platform::complex64>(const PlaceType &place);
template int16_t *Tensor::mutable_data<int16_t>(const PlaceType &place);
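The long `template Tensor Tensor::copy_to_cpu<...>();` lists above are explicit template instantiations: the template bodies live in tensor.cc rather than the header, so every supported element type must be instantiated there, or callers in other translation units would hit link errors. The pattern in miniature (`twice` is a hypothetical stand-in, not a Paddle function):

```cpp
#include <cassert>

// Explicit-instantiation pattern: in Paddle the template definitions sit in
// tensor.cc and each supported T (float, double, int64_t, float16, ...) is
// listed explicitly, one line per type.
template <typename T>
T twice(T v) { return v + v; }

// In a real split build these lines live in the .cc file; the header would
// only declare `template <typename T> T twice(T v);`.
template float twice<float>(float);
template int twice<int>(int);
```

The cost of this approach is exactly what this PR shows: every new dtype (bfloat16, complex64, complex128, int16_t) requires touching each instantiation list.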

std::vector<int> Tensor::shape() const {
GET_CASTED_TENSOR
return framework::vectorize<int>(tensor->dims());
}

void Tensor::SetLoD(const std::vector<std::vector<size_t>> &x) {
GET_CASTED_TENSOR;
framework::LoD lod;
for (auto &level : x) {
lod.emplace_back(level);
}
tensor->set_lod(lod);
}

std::vector<std::vector<size_t>> Tensor::lod() const {
GET_CASTED_TENSOR;
std::vector<std::vector<size_t>> res;
for (auto &level : tensor->lod()) {
res.emplace_back(level);
}
return res;
}

const PlaceType &Tensor::place() const {
GET_CASTED_TENSOR;
if (platform::is_cpu_place(tensor->place())) {
1 change: 1 addition & 0 deletions paddle/fluid/framework/CMakeLists.txt
@@ -323,6 +323,7 @@ configure_file(commit.h.in commit.h)
cc_library(custom_tensor SRCS ../extension/src/tensor.cc DEPS lod_tensor)
cc_library(op_meta_info SRCS ../extension/src/op_meta_info.cc DEPS custom_tensor)
cc_library(custom_operator SRCS custom_operator.cc DEPS operator op_registry device_context dynamic_loader custom_tensor op_meta_info)
cc_test(custom_tensor_test SRCS custom_tensor_test.cc DEPS custom_tensor)

set(FLUID_FRAMEWORK_MODULES proto_desc memory lod_tensor executor data_feed_proto layer dynamic_loader)
