From 70a4c7f71d11abb5aaf4aa5894ee5ee0659d11ae Mon Sep 17 00:00:00 2001
From: liuyuang
Date: Thu, 20 Jan 2022 10:04:18 +0800
Subject: [PATCH 01/10] add dist model tensor wrapper

---
 .../dist_model_tensor_wrapper.cc | 42 ++++++++++++++
 .../dist_model_tensor_wrapper.h  | 55 +++++++++++++++++++
 2 files changed, 97 insertions(+)
 create mode 100644 paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc
 create mode 100644 paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h

diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc
new file mode 100644
index 0000000000000..9c7a3b7a0e989
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc
@@ -0,0 +1,42 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace distributed {
+
+void DistModelDataBuf::Reset(void *data, size_t length) {
+  Free();
+  memory_owned_ = false;
+  data_ = data;
+  length_ = length;
+}
+
+void DistModelDataBuf::Free() {
+  if (memory_owned_ && data_) {
+    PADDLE_ENFORCE_GT(length_, 0UL,
+                      platform::errors::PreconditionNotMet(
+                          "Error occurred when deconstruct DistModelDataBuf: "
+                          "it contains no data!"));
+    // NOTE: if own the memory, it must be char* type
+    delete[] static_cast<char *>(data_);
+    data_ = nullptr;
+    length_ = 0;
+  }
+}
+
+}  // namespace distributed
+}  // namespace paddle
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h
new file mode 100644
index 0000000000000..8899c6885829c
--- /dev/null
+++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h
@@ -0,0 +1,55 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
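+//
+// This header declares the wrapper types used to pass inference inputs and
+// outputs across the DistModel interface:
+//   - DistModelDataType: the element types supported by the wrapper.
+//   - DistModelDataBuf: a small buffer holder that either owns its memory
+//     (allocated as char[] and released in Free()) or borrows an external
+//     pointer handed in through Reset().
+//   - DistModelTensor: name + shape + data buffer + element type + LoD info.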
+ +#pragma once +#include +#include +#include "paddle/fluid/platform/macros.h" + +namespace paddle { +namespace distributed { + +enum DistModelDataType { FLOAT16, FLOAT32, INT64, INT32, INT8 }; + +class DistModelDataBuf { + public: + explicit DistModelDataBuf(size_t length) + : data_(new char[length]), length_(length), memory_owned_(true) {} + DistModelDataBuf(void* data, size_t length) + : data_(data), length_(length), memory_owned_(false) {} + void Reset(void* data, size_t length); + size_t length() const { return length_; } + void* data() const { return data_; } + ~DistModelDataBuf() { Free(); } + DistModelDataBuf() = default; + + private: + // TODO(fleet exe dev): support copy and assign later + DISABLE_COPY_AND_ASSIGN(DistModelDataBuf); + void Free(); + void* data_{nullptr}; + size_t length_{0}; + bool memory_owned_{false}; +}; + +struct DistModelTensor { + std::string name; + std::vector shape; + DistModelDataBuf data; + DistModelDataType type; + std::vector> lod; +}; + +} // namespace distributed +} // namespace paddle From 1f69ec442abf2b183a5f16edeb11ada8fbcecc5c Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 10:52:38 +0800 Subject: [PATCH 02/10] add assgin and copy method --- .../dist_model_tensor_wrapper.cc | 62 ++++++++++++++++++- .../dist_model_tensor_wrapper.h | 8 ++- paddle/fluid/pybind/bind_fleet_executor.cc | 13 ++++ 3 files changed, 79 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc index 9c7a3b7a0e989..b440d39c73a70 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.cc @@ -18,7 +18,7 @@ namespace paddle { namespace distributed { -void DistModelDataBuf::Reset(void *data, size_t length) { +void DistModelDataBuf::Reset(void* data, size_t length) { Free(); memory_owned_ = false; data_ = data; @@ -32,11 +32,69 @@ void DistModelDataBuf::Free() { "Error occurred when deconstruct DistModelDataBuf: " "it contains no data!")); // NOTE: if own the memory, it must be char* type - delete[] static_cast(data_); + delete[] static_cast(data_); data_ = nullptr; length_ = 0; } } +void DistModelDataBuf::Resize(size_t length) { + if (length_ >= length) { + return; + } + if (memory_owned_) { + Free(); + data_ = new char[length]; + length_ = length; + memory_owned_ = true; + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "The memory is allocated externally, can not Resized")); + } +} + +DistModelDataBuf& DistModelDataBuf::operator=(const DistModelDataBuf& other) { + if (!other.memory_owned_) { + data_ = other.data_; + length_ = other.length_; + memory_owned_ = other.memory_owned_; + } else { + Resize(other.length_); + if (other.length() && other.data()) { + std::memcpy(data_, other.data(), other.length()); + } else if (other.length()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Invalid argument, null pointer data with length %u is passed", + other.length())); + } + length_ = other.length_; + memory_owned_ = true; + } + return *this; +} + +DistModelDataBuf& DistModelDataBuf::operator=(DistModelDataBuf&& other) { + data_ = other.data_; + memory_owned_ = other.memory_owned_; + length_ = other.length_; + other.data_ = nullptr; + other.length_ = 0; + other.memory_owned_ = false; + return *this; +} + +DistModelDataBuf::DistModelDataBuf(DistModelDataBuf&& other) + : data_(other.data_), + length_(other.length_), + 
memory_owned_(other.memory_owned_) { + other.memory_owned_ = false; + other.data_ = nullptr; + other.length_ = 0; +} + +DistModelDataBuf::DistModelDataBuf(const DistModelDataBuf& other) { + *this = other; +} + } // namespace distributed } // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index 8899c6885829c..a06677d581ddf 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -34,9 +34,13 @@ class DistModelDataBuf { ~DistModelDataBuf() { Free(); } DistModelDataBuf() = default; + DistModelDataBuf& operator=(const DistModelDataBuf& other); + DistModelDataBuf& operator=(DistModelDataBuf&& other); + DistModelDataBuf(DistModelDataBuf&& other); + DistModelDataBuf(const DistModelDataBuf& other); + private: - // TODO(fleet exe dev): support copy and assign later - DISABLE_COPY_AND_ASSIGN(DistModelDataBuf); + void Resize(size_t length); void Free(); void* data_{nullptr}; size_t length_{0}; diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 08f8aec52883f..3d3b83aa30d10 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -14,7 +14,10 @@ #include "paddle/fluid/pybind/bind_fleet_executor.h" #include +#include +#include #include "paddle/fluid/distributed/fleet_executor/dist_model.h" +#include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/operator.h" @@ -31,6 +34,8 @@ using paddle::distributed::FleetExecutor; using paddle::distributed::TaskNode; using paddle::distributed::DistModelConfig; using paddle::distributed::DistModel; +using paddle::distributed::DistModelDataBuf; +using paddle::distributed::DistModelTensor; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; @@ -78,6 +83,14 @@ void BindFleetExecutor(py::module* m) { .def(py::init()) .def("init", &DistModel::Init) .def("run", &DistModel::Run, py::call_guard()); + + py::class_(*m, "DistModelDataBuf") + .def(py::init()) + .def(py::init([](std::vector& data) { + auto buf = DistModelDataBuf(data.size() * sizeof(float)); + std::memcpy(buf.data(), static_cast(data.data()), buf.length()); + return buf; + })) } } // namespace pybind } // namespace paddle From a3889204f75a50006f58991847a2a86fa80d8bb6 Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 11:55:56 +0800 Subject: [PATCH 03/10] add pybind --- .../dist_model_tensor_wrapper.h | 22 +++- paddle/fluid/pybind/bind_fleet_executor.cc | 106 ++++++++++++++++++ 2 files changed, 126 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h index a06677d581ddf..4a04633388af2 100644 --- a/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h +++ b/paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h @@ -22,6 +22,24 @@ namespace distributed { enum DistModelDataType { FLOAT16, FLOAT32, INT64, INT32, INT8 }; +template +constexpr DistModelDataType DistModelGetDtype(); + +template <> +constexpr DistModelDataType DistModelGetDtype() { + return DistModelDataType::INT32; +} + +template <> +constexpr DistModelDataType 
DistModelGetDtype() { + return DistModelDataType::INT64; +} + +template <> +constexpr DistModelDataType DistModelGetDtype() { + return DistModelDataType::FLOAT32; +} + class DistModelDataBuf { public: explicit DistModelDataBuf(size_t length) @@ -33,6 +51,7 @@ class DistModelDataBuf { void* data() const { return data_; } ~DistModelDataBuf() { Free(); } DistModelDataBuf() = default; + void Resize(size_t length); DistModelDataBuf& operator=(const DistModelDataBuf& other); DistModelDataBuf& operator=(DistModelDataBuf&& other); @@ -40,7 +59,6 @@ class DistModelDataBuf { DistModelDataBuf(const DistModelDataBuf& other); private: - void Resize(size_t length); void Free(); void* data_{nullptr}; size_t length_{0}; @@ -51,7 +69,7 @@ struct DistModelTensor { std::string name; std::vector shape; DistModelDataBuf data; - DistModelDataType type; + DistModelDataType dtype; std::vector> lod; }; diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 3d3b83aa30d10..efc5137b67637 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -36,9 +36,56 @@ using paddle::distributed::DistModelConfig; using paddle::distributed::DistModel; using paddle::distributed::DistModelDataBuf; using paddle::distributed::DistModelTensor; +using paddle::distributed::DistModelDataType; using paddle::framework::OpDesc; using paddle::framework::ProgramDesc; +template +DistModelDataBuf DistModelDataBufCreate( + py::array_t data) { + // accept numpy array directly + DistModelDataBuf buf(data.sizet() * sizeof(T)); + std::copy_n(static_cast(data.data()), data.size(), + static_cast(buf.data())); + return buf; +} + +template +void DistModelDataBufReset( + DistModelDataBuf& buf, // NOLINT + py::array_t data) { // NOLINT + // reset the data with numpy array directly + buf.Resize(data.size() * sizeof(T)); + std::copy_n(static_cast(data.data()), data.sizer(), + static_cast(buf.data())); +} + +template +DistModelTensor DistModelTensorCreate( + py::array_t data, + const std::string name, const std::vector>& lod, + bool copy) { + DistModelTensor tensor; + + if (copy) { + DistModelDataBufBuf buf(data.size() * sizeof(T)); + std::copy_n(static_cast(data.data()), data.size(), + static_cast(buf.data())); + tensor.data = data; + } else { + tensor.data = + DistModelDataBuf(data.mutable_data(), data.size() * sizeof(T)); + } + + tensor.dtype = paddle::distributed::DistModelDataType(); + tensor.name = name; + tesnor.lod = lod; + tensor.shape.resize(data.ndim()); + std::copy_n(data.shape(), data.ndim(), tensor.shape.begin()); + + return tensor; +} + void BindFleetExecutor(py::module* m) { py::class_(*m, "FleetExecutor") .def(py::init()) @@ -91,6 +138,65 @@ void BindFleetExecutor(py::module* m) { std::memcpy(buf.data(), static_cast(data.data()), buf.length()); return buf; })) + .def(py::init(&DistModelDataBufCreate)) + .def(py::init(&DistModelDataBufCreate)) + .def(py::init(&DistModelDataBufCreate)) + .def("reset", + [](DistModelDataBuf& self, std::vector& data) { + self.Resize(data.size() * sizeof(float)); + std::memcpy(self.data(), data.data(), self.length()); + }) + .def("reset", &DistModelDataBufReset) + .def("reset", &DistModelDataBufReset) + .def("reset", &DistModelDataBufReset) + .def("length", &DistModelDataBuf::length) + .def("tolist", + [](DistModelDataBuf& self, const std::string& dtype) -> py::list { + py::list l; + if (dtype == "int32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int32_t); + l = 
py::cast(std::vector(data, data + size)); + } else if (dtype == "int64") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(int64_t); + l = py::cast(std::vector(data, data + size)); + } else if (dtype == "float32") { + auto* data = static_cast(self.data()); + auto size = self.length() / sizeof(float); + l = py::cast(std::vector(data, data + size)); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. Now only supports INT32, INT64 and " + "FLOAT32.")); + } + return l; + }); + + py::class_(*m, "DistModelTensor") + .def(py::init<>()) + .def(py::init(&DistModelTensorCreate), py::arg("data"), + py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) + .def(py::init(&DistModelTensorCreate), py::arg("data"), + py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) + .def(py::init(&DistModelTensorCreate), py::arg("data"), + py::arg("name") = "", + py::arg("lod") = std::vector>(), + py::arg("copy") = true) + .def_readwrite("name", &DistModelTensor::name) + .def_readwrite("shape", &DistModelTensor::shape) + .def_readwrite("data", &DistModelTensor::data) + .def_readwrite("dtype", &DistModelTensor::dtype) + .def_readwrite("lod", &DistModelTensor::lod); + + py::enum_(*m, "DistModelDataType") + .value("FLOAT32", DistModelDataType::FLOAT32) + .value("INT64", DistModelDataType::INT64) + .value("INT32", DistModelDataType::INT32); } } // namespace pybind } // namespace paddle From 519ceeaee78ecfa7d632555ef70aeff7470a4c73 Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 12:05:40 +0800 Subject: [PATCH 04/10] add ut for tensor --- .../tests/unittests/test_dist_model_tensor.py | 58 +++++++++++++++++++ 1 file changed, 58 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_dist_model_tensor.py diff --git a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py new file mode 100644 index 0000000000000..eb2de783605a2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py @@ -0,0 +1,58 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
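#
# The tests below exercise the DistModelTensor / DistModelDataBuf bindings:
# tensors are built from int32, int64 and float32 numpy arrays, and the
# reported dtype, name, raw buffer contents (data.tolist) and buffer length
# are checked, along with refreshing the buffer through data.reset().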
+ +import unittest +import paddle +paddle.enable_static() +import numpy as np +from paddle.fluid.core import DistModelTensor +from paddle.fluid.core import DistModelDataType + + +class TestInferenceApi(unittest.TestCase): + def test_inference_api(self): + tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32') + dist_tensor32 = DistModelTensor(tensor32, '32_tensor') + dtype32 = dist_tensor32.dtype + self.assertEqual(dtype32, DistModelDataType.INT32) + self.assertEqual( + dist_tensor32.data.tolist('int32'), tensor32.ravel().tolist()) + self.assertEqual(dist_tensor32.data.length(), 40) + self.assertEqual(dist_tensor32.name, '32_tensor') + dist_tensor32.data.reset(tensor32) + + tensor64 = np.random.randint(10, 20, size=[20, 2]).astype('int64') + dist_tensor64 = DistModelTensor(tensor64, '64_tensor') + dtype64 = dist_tensor64.dtype + self.assertEqual(dtype64, DistModelDataType.INT64) + self.assertEqual( + dist_tensor64.data.tolist('int64'), tensor64.ravel().tolist()) + self.assertEqual(dist_tensor64.data.length(), 40) + self.assertEqual(dist_tensor64.name, '64_tensor') + dist_tensor64.data.reset(tensor64) + + tensor_float = np.random.randn(20, 2).astype('float32') + dist_tensor_float = DistModelTensor(tensor_float, 'float_tensor') + dtype_float = dist_tensor_float.dtype + self.assertEqual(dtype_float, DistModelDataType.FLOAT32) + self.assertEqual( + dist_tensor_float.data.tolist('float32'), + tensor_float.ravel().tolist()) + self.assertEqual(dist_tensor_float.data.length(), 40) + self.assertEqual(dist_tensor_float.name, 'float_tensor') + dist_tensor_float.data.reset(tensor_float) + + +if __name__ == '__main__': + unittest.main() From e5e2c9111c02978fe5663044f17b1fa8a25af6ad Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 14:44:25 +0800 Subject: [PATCH 05/10] bug fix and add tondarray method --- paddle/fluid/pybind/bind_fleet_executor.cc | 46 ++++++++++++++++--- .../tests/unittests/test_dist_model_tensor.py | 6 +++ 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index efc5137b67637..e1f8ec4bd7cb6 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/fluid/pybind/bind_fleet_executor.h" +#include #include #include #include @@ -44,7 +45,7 @@ template DistModelDataBuf DistModelDataBufCreate( py::array_t data) { // accept numpy array directly - DistModelDataBuf buf(data.sizet() * sizeof(T)); + DistModelDataBuf buf(data.size() * sizeof(T)); std::copy_n(static_cast(data.data()), data.size(), static_cast(buf.data())); return buf; @@ -56,19 +57,19 @@ void DistModelDataBufReset( py::array_t data) { // NOLINT // reset the data with numpy array directly buf.Resize(data.size() * sizeof(T)); - std::copy_n(static_cast(data.data()), data.sizer(), + std::copy_n(static_cast(data.data()), data.size(), static_cast(buf.data())); } template DistModelTensor DistModelTensorCreate( py::array_t data, - const std::string name, const std::vector>& lod, + const std::string name, const std::vector>& lod, bool copy) { DistModelTensor tensor; if (copy) { - DistModelDataBufBuf buf(data.size() * sizeof(T)); + DistModelDataBuf buf(data.size() * sizeof(T)); std::copy_n(static_cast(data.data()), data.size(), static_cast(buf.data())); tensor.data = data; @@ -77,7 +78,7 @@ DistModelTensor DistModelTensorCreate( DistModelDataBuf(data.mutable_data(), data.size() * sizeof(T)); } - tensor.dtype = paddle::distributed::DistModelDataType(); + tensor.dtype = paddle::distributed::DistModelGetDtype(); tensor.name = name; tesnor.lod = lod; tensor.shape.resize(data.ndim()); @@ -86,6 +87,38 @@ DistModelTensor DistModelTensorCreate( return tensor; } +py::dtype DistModelTensorGetData(DistModelDataType dtype) { + py::dtype dt; + switch (dtype) { + case DistModelDataType::INT32: + dt = py::dtype::of(); + break; + case DistModelDataType::INT64: + dt = py::dtype::of(); + break; + case DistModelDataType::FLOAT32: + dt = py::dtype::of(); + break; + case DistModelDataType::INT8: + dt = py::dtype::of(); + break; + case DistModelDataType::FLOAT16: + dt = py::dtype::of(); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported data type. 
Now only supports INT32, INT64, INT8, " + "FLOAT16 and FLOAT32.")); + } + + return dt; +} + +py::array DistModelTensorGetData(DistModelTensor& tensor) { // NOLINT + py::dtype dt = DistModelTypeToNumpyDType(tensor.dtype); + return py::array(std::move(dt), {tensor.shape}, tensor.data.data()); +} + void BindFleetExecutor(py::module* m) { py::class_(*m, "FleetExecutor") .def(py::init()) @@ -196,7 +229,8 @@ void BindFleetExecutor(py::module* m) { py::enum_(*m, "DistModelDataType") .value("FLOAT32", DistModelDataType::FLOAT32) .value("INT64", DistModelDataType::INT64) - .value("INT32", DistModelDataType::INT32); + .value("INT32", DistModelDataType::INT32) + .def("as_ndarray", &DistModelTensorGetData); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py index eb2de783605a2..466639fa3cc21 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py @@ -31,6 +31,8 @@ def test_inference_api(self): self.assertEqual(dist_tensor32.data.length(), 40) self.assertEqual(dist_tensor32.name, '32_tensor') dist_tensor32.data.reset(tensor32) + self.assertEqual(dist_tensor32.as_ndarray().ravel().tolist(), + tensor32.ravel().tolist()) tensor64 = np.random.randint(10, 20, size=[20, 2]).astype('int64') dist_tensor64 = DistModelTensor(tensor64, '64_tensor') @@ -41,6 +43,8 @@ def test_inference_api(self): self.assertEqual(dist_tensor64.data.length(), 40) self.assertEqual(dist_tensor64.name, '64_tensor') dist_tensor64.data.reset(tensor64) + self.assertEqual(dist_tensor64.as_ndarray().ravel().tolist(), + tensor32.ravel().tolist()) tensor_float = np.random.randn(20, 2).astype('float32') dist_tensor_float = DistModelTensor(tensor_float, 'float_tensor') @@ -52,6 +56,8 @@ def test_inference_api(self): self.assertEqual(dist_tensor_float.data.length(), 40) self.assertEqual(dist_tensor_float.name, 'float_tensor') dist_tensor_float.data.reset(tensor_float) + self.assertEqual(dist_tensor_float.as_ndarray().ravel().tolist(), + tensor32.ravel().tolist()) if __name__ == '__main__': From 473ca0642ef5a0e03c6ce6f02305da666b6079d2 Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 14:53:22 +0800 Subject: [PATCH 06/10] bug fix --- paddle/fluid/pybind/bind_fleet_executor.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index e1f8ec4bd7cb6..5bb29fa8a5cb1 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -72,7 +72,7 @@ DistModelTensor DistModelTensorCreate( DistModelDataBuf buf(data.size() * sizeof(T)); std::copy_n(static_cast(data.data()), data.size(), static_cast(buf.data())); - tensor.data = data; + tensor.data = std::move(buf); } else { tensor.data = DistModelDataBuf(data.mutable_data(), data.size() * sizeof(T)); @@ -80,14 +80,14 @@ DistModelTensor DistModelTensorCreate( tensor.dtype = paddle::distributed::DistModelGetDtype(); tensor.name = name; - tesnor.lod = lod; + tensor.lod = lod; tensor.shape.resize(data.ndim()); std::copy_n(data.shape(), data.ndim(), tensor.shape.begin()); return tensor; } -py::dtype DistModelTensorGetData(DistModelDataType dtype) { +py::dtype DistModelTypeToNumpyDType(DistModelDataType dtype) { py::dtype dt; switch (dtype) { case DistModelDataType::INT32: From 1f722289bab738effc22a16b5943b54c5bcee67d Mon Sep 17 
00:00:00 2001
From: liuyuang
Date: Thu, 20 Jan 2022 14:56:47 +0800
Subject: [PATCH 07/10] update dist model

---
 paddle/fluid/distributed/fleet_executor/dist_model.cc | 4 ++--
 paddle/fluid/distributed/fleet_executor/dist_model.h  | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.cc b/paddle/fluid/distributed/fleet_executor/dist_model.cc
index 310c809de7172..6454a34950513 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.cc
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.cc
@@ -355,8 +355,8 @@ bool DistModel::PrepareFeedAndFetch() {
   return true;
 }
 
-void DistModel::Run(const std::vector<framework::Tensor> &input_data,
-                    std::vector<framework::Tensor> *output_data) {
+void DistModel::Run(const std::vector<DistModelTensor> &input_data,
+                    std::vector<DistModelTensor> *output_data) {
   /* TODO(fleet exe dev): implement this funct */
 }
 
diff --git a/paddle/fluid/distributed/fleet_executor/dist_model.h b/paddle/fluid/distributed/fleet_executor/dist_model.h
index d6dc554f158d3..96e9c018074b5 100644
--- a/paddle/fluid/distributed/fleet_executor/dist_model.h
+++ b/paddle/fluid/distributed/fleet_executor/dist_model.h
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/distributed/fleet_executor/dist_model_tensor_wrapper.h"
 #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/macros.h"
@@ -56,8 +57,8 @@ class DistModel {
  public:
   explicit DistModel(const DistModelConfig& config) : config_(config) {}
   bool Init();
-  void Run(const std::vector<framework::Tensor>& input_data,
-           std::vector<framework::Tensor>* output_data);
+  void Run(const std::vector<DistModelTensor>& input_data,
+           std::vector<DistModelTensor>* output_data);
   ~DistModel() = default;
 
  private:

From 843615a3b43dc362f6e2d2aabf6f88ae77b580fc Mon Sep 17 00:00:00 2001
From: liuyuang
Date: Thu, 20 Jan 2022 15:04:03 +0800
Subject: [PATCH 08/10] add cmake dependency

---
 paddle/fluid/distributed/fleet_executor/CMakeLists.txt     | 4 ++--
 .../paddle/fluid/tests/unittests/test_dist_model_tensor.py | 7 ++++---
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
index 1e31187367bd3..3e734b1b9ed24 100644
--- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
+++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -12,8 +12,8 @@ endif()
 cc_library(task_loop_thread_pool SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc DEPS enforce glog)
 
-cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc
-           interceptor.cc compute_interceptor.cc amplifier_interceptor.cc message_service.cc message_bus.cc
+cc_library(fleet_executor SRCS fleet_executor.cc carrier.cc task_node.cc runtime_graph.cc dist_model.cc interceptor.cc
+           compute_interceptor.cc amplifier_interceptor.cc message_service.cc message_bus.cc dist_model_tensor_wrapper.cc
            DEPS proto_desc fleet_executor_desc_proto interceptor_message_proto task_loop_thread_pool collective_helper op_registry executor_gc_helper gflags glog ${BRPC_DEPS})
 
diff --git a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py
index 466639fa3cc21..851e940552376 100644
--- a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py
@@ -14,14 +14,15 @@
 import unittest
 import paddle
-paddle.enable_static()
 import numpy as np
 from paddle.fluid.core import
DistModelTensor from paddle.fluid.core import DistModelDataType +paddle.enable_static() + -class TestInferenceApi(unittest.TestCase): - def test_inference_api(self): +class TestDistModelTensor(unittest.TestCase): + def test_dist_model_tensor(self): tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32') dist_tensor32 = DistModelTensor(tensor32, '32_tensor') dtype32 = dist_tensor32.dtype From a51274c18cb2557c54d2dd94980c2ecd3598e111 Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 15:19:23 +0800 Subject: [PATCH 09/10] bug fix --- paddle/fluid/pybind/bind_fleet_executor.cc | 6 +-- .../tests/unittests/test_dist_model_tensor.py | 38 +++++++++---------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index 5bb29fa8a5cb1..450939dd0ff8b 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -224,13 +224,13 @@ void BindFleetExecutor(py::module* m) { .def_readwrite("shape", &DistModelTensor::shape) .def_readwrite("data", &DistModelTensor::data) .def_readwrite("dtype", &DistModelTensor::dtype) - .def_readwrite("lod", &DistModelTensor::lod); + .def_readwrite("lod", &DistModelTensor::lod) + .def("as_ndarray", &DistModelTensorGetData); py::enum_(*m, "DistModelDataType") .value("FLOAT32", DistModelDataType::FLOAT32) .value("INT64", DistModelDataType::INT64) - .value("INT32", DistModelDataType::INT32) - .def("as_ndarray", &DistModelTensorGetData); + .value("INT32", DistModelDataType::INT32); } } // namespace pybind } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py index 851e940552376..da25550c4f47e 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_dist_model_tensor.py @@ -23,42 +23,40 @@ class TestDistModelTensor(unittest.TestCase): def test_dist_model_tensor(self): - tensor32 = np.random.randint(10, 20, size=[20, 2]).astype('int32') - dist_tensor32 = DistModelTensor(tensor32, '32_tensor') - dtype32 = dist_tensor32.dtype - self.assertEqual(dtype32, DistModelDataType.INT32) + tensor_32 = np.random.randint(10, 20, size=[20, 2]).astype('int32') + dist_tensor32 = DistModelTensor(tensor_32, '32_tensor') + self.assertEqual(dist_tensor32.dtype, DistModelDataType.INT32) self.assertEqual( - dist_tensor32.data.tolist('int32'), tensor32.ravel().tolist()) - self.assertEqual(dist_tensor32.data.length(), 40) + dist_tensor32.data.tolist('int32'), tensor_32.ravel().tolist()) + # the length is how many byte the data contains + self.assertEqual(dist_tensor32.data.length(), 40 * 4) self.assertEqual(dist_tensor32.name, '32_tensor') - dist_tensor32.data.reset(tensor32) + dist_tensor32.data.reset(tensor_32) self.assertEqual(dist_tensor32.as_ndarray().ravel().tolist(), - tensor32.ravel().tolist()) + tensor_32.ravel().tolist()) - tensor64 = np.random.randint(10, 20, size=[20, 2]).astype('int64') - dist_tensor64 = DistModelTensor(tensor64, '64_tensor') - dtype64 = dist_tensor64.dtype - self.assertEqual(dtype64, DistModelDataType.INT64) + tensor_64 = np.random.randint(10, 20, size=[20, 2]).astype('int64') + dist_tensor64 = DistModelTensor(tensor_64, '64_tensor') + self.assertEqual(dist_tensor64.dtype, DistModelDataType.INT64) self.assertEqual( - dist_tensor64.data.tolist('int64'), tensor64.ravel().tolist()) - self.assertEqual(dist_tensor64.data.length(), 40) + 
dist_tensor64.data.tolist('int64'), tensor_64.ravel().tolist()) + self.assertEqual(dist_tensor64.data.length(), 40 * 8) self.assertEqual(dist_tensor64.name, '64_tensor') - dist_tensor64.data.reset(tensor64) + dist_tensor64.data.reset(tensor_64) self.assertEqual(dist_tensor64.as_ndarray().ravel().tolist(), - tensor32.ravel().tolist()) + tensor_64.ravel().tolist()) tensor_float = np.random.randn(20, 2).astype('float32') dist_tensor_float = DistModelTensor(tensor_float, 'float_tensor') - dtype_float = dist_tensor_float.dtype - self.assertEqual(dtype_float, DistModelDataType.FLOAT32) + self.assertEqual(dist_tensor_float.dtype, DistModelDataType.FLOAT32) self.assertEqual( dist_tensor_float.data.tolist('float32'), tensor_float.ravel().tolist()) - self.assertEqual(dist_tensor_float.data.length(), 40) + self.assertEqual(dist_tensor_float.data.length(), 40 * 4) self.assertEqual(dist_tensor_float.name, 'float_tensor') dist_tensor_float.data.reset(tensor_float) self.assertEqual(dist_tensor_float.as_ndarray().ravel().tolist(), - tensor32.ravel().tolist()) + tensor_float.ravel().tolist()) if __name__ == '__main__': From c42fe4f4292be9a5884a37a48f5c49d65686360e Mon Sep 17 00:00:00 2001 From: liuyuang Date: Thu, 20 Jan 2022 16:15:30 +0800 Subject: [PATCH 10/10] remove the ut from win --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 915af18a5702d..5c57d1a21bce6 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -152,6 +152,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_origin_scheduler) LIST(REMOVE_ITEM TEST_OPS test_auto_parallel_mapper) LIST(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node) + LIST(REMOVE_ITEM TEST_OPS test_dist_model_tensor) endif() # Temporally disable test_deprecated_decorator
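
For reference, a minimal usage sketch of the Python API introduced by this series (the variable names below are illustrative, and DistModel.Run is still a TODO stub at this point, so only the tensor wrapper itself is exercised):

    import numpy as np
    from paddle.fluid.core import DistModelTensor, DistModelDataType

    feed = np.arange(8, dtype='float32').reshape([4, 2])
    tensor = DistModelTensor(feed, 'x')           # copy=True by default: bytes are copied into a DistModelDataBuf
    assert tensor.name == 'x'
    assert tensor.dtype == DistModelDataType.FLOAT32
    assert tensor.data.length() == feed.size * 4  # length() counts bytes
    assert tensor.data.tolist('float32') == feed.ravel().tolist()
    assert tensor.as_ndarray().shape == (4, 2)    # rebuilds a numpy array from shape + buffer
    tensor.data.reset(feed)                       # copies fresh contents into the buffer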