
Commit 33bb053
Merge branch 'develop' of github.com:YuanRisheng/Paddle into add_raw_kernel
YuanRisheng committed Jan 25, 2022
2 parents 365f60a + 529f142 commit 33bb053
Showing 90 changed files with 3,072 additions and 1,851 deletions.
5 changes: 3 additions & 2 deletions cmake/inference_lib.cmake
@@ -189,6 +189,7 @@ copy(inference_lib_dist
copy_part_of_thrid_party(inference_lib_dist ${PADDLE_INFERENCE_INSTALL_DIR})

set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")

if(WIN32)
if(WITH_STATIC_LIB)
set(paddle_inference_lib $<TARGET_FILE_DIR:paddle_inference>/libpaddle_inference.lib
@@ -304,7 +305,7 @@ copy(fluid_lib_dist
)

set(module "platform")
set(platform_lib_deps profiler_proto error_codes_proto)
set(platform_lib_deps profiler_proto errors)
if(WITH_GPU)
set(platform_lib_deps ${platform_lib_deps} external_error_proto)
endif(WITH_GPU)
@@ -317,7 +318,7 @@ copy(fluid_lib_dist

set(module "string")
copy(fluid_lib_dist
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/*.h ${PADDLE_SOURCE_DIR}/paddle/utils/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)

1 change: 1 addition & 0 deletions paddle/CMakeLists.txt
@@ -1,3 +1,4 @@
add_subdirectory(utils)
add_subdirectory(scripts)
add_subdirectory(testing)
set(PYTHON_TESTS_DIR ${PADDLE_BINARY_DIR}/python/paddle/fluid/tests CACHE INTERNAL "python tests directory")
1 change: 0 additions & 1 deletion paddle/fluid/CMakeLists.txt
@@ -4,7 +4,6 @@ add_subdirectory(distributed)
add_subdirectory(framework)
add_subdirectory(imperative)
add_subdirectory(operators)
add_subdirectory(string)
add_subdirectory(pybind)
add_subdirectory(eager)
# NOTE: please add subdirectory inference at last.
12 changes: 8 additions & 4 deletions paddle/fluid/framework/expect.h
@@ -19,14 +19,18 @@
#define _LINUX
#endif

#ifdef _LINUX
#ifndef likely
#define likely(x) __builtin_expect((x), 1)
#ifdef _LINUX
#define likely(expr) (__builtin_expect(!!(expr), 1))
#else
#define likely(expr) (expr)
#endif
#endif

#ifdef _LINUX
#ifndef unlikely
#define unlikely(x) __builtin_expect((x), 0)
#ifdef _LINUX
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
#define unlikely(expr) (expr)
#endif
#endif
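
The rewrite above makes the branch hints available on every platform: on Linux the macros expand to __builtin_expect, elsewhere they fall back to the bare expression, and the new !!(expr) normalizes any truthy operand to exactly 1 so the comparison inside __builtin_expect is correct for pointers and non-boolean integers. A minimal standalone sketch of the same pattern, keyed on compiler support rather than _LINUX purely for illustration:

#include <cstdio>

#if defined(__GNUC__) || defined(__clang__)
#define likely(expr) (__builtin_expect(!!(expr), 1))
#define unlikely(expr) (__builtin_expect(!!(expr), 0))
#else
#define likely(expr) (expr)
#define unlikely(expr) (expr)
#endif

// parse() is a hypothetical caller, not part of this commit.
int parse(const char* s) {
  if (unlikely(s == nullptr)) {  // hint: the error path is rare
    std::fprintf(stderr, "null input\n");
    return -1;
  }
  return 0;  // hot path stays on the fall-through branch
}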
8 changes: 4 additions & 4 deletions paddle/fluid/framework/ir/graph_helper.cc
@@ -603,9 +603,9 @@ static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies(
for (const auto *op_desc : block_ops) {
size_t op_idx = op_id_to_idx.size();
PADDLE_ENFORCE_EQ(
op_id_to_idx.emplace(op_desc->Id(), op_idx).second, true,
op_id_to_idx.emplace(op_desc->OriginalId(), op_idx).second, true,
platform::errors::InvalidArgument(
"There should not be duplicate op id: %d", op_desc->Id()));
"There should not be duplicate op id: %d", op_desc->OriginalId()));
}

std::vector<std::vector<ir::Node::Dep>> dep_matrix(op_num);
@@ -624,9 +624,9 @@ static std::vector<std::vector<ir::Node::Dep>> GetOpDependencies(

for (const auto &pair : all_preceding_ops) {
const auto *cur_op_node = pair.first;
size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->Id());
size_t op_idx_1 = get_op_idx_by_id(cur_op_node->Op()->OriginalId());
for (const auto *preceding_op_node : pair.second) {
size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->Id());
size_t op_idx_2 = get_op_idx_by_id(preceding_op_node->Op()->OriginalId());
dep_matrix[op_idx_1][op_idx_2] = ir::Node::Dep::kAfter;
dep_matrix[op_idx_2][op_idx_1] = ir::Node::Dep::kBefore;
}
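
The change above keys the op-index map by OriginalId() instead of Id(); the PADDLE_ENFORCE_EQ works because map::emplace returns an {iterator, bool} pair whose bool is false when the key is already present, so insertion and duplicate detection happen in one step. A minimal sketch of that emplace idiom with simplified stand-in types:

#include <cassert>
#include <cstddef>
#include <unordered_map>
#include <vector>

int main() {
  std::vector<long> original_ids = {101, 205, 333};  // hypothetical op ids
  std::unordered_map<long, size_t> op_id_to_idx;
  for (long id : original_ids) {
    size_t op_idx = op_id_to_idx.size();  // next dense index
    bool inserted = op_id_to_idx.emplace(id, op_idx).second;
    assert(inserted && "There should not be duplicate op id");
  }
  assert(op_id_to_idx.at(205) == 1);
  return 0;
}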
6 changes: 0 additions & 6 deletions paddle/fluid/framework/op_call_stack.h
@@ -19,12 +19,6 @@ limitations under the License. */
#include "paddle/fluid/framework/type_defs.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace platform {
struct EnforceNotMet;
} // namespace platform
} // namespace paddle

namespace paddle {
namespace framework {

6 changes: 0 additions & 6 deletions paddle/fluid/framework/op_proto_maker_test.cc
@@ -18,12 +18,6 @@ limitations under the License. */
#include "gtest/gtest-test-part.h"
#include "gtest/gtest.h"

namespace paddle {
namespace platform {
struct EnforceNotMet;
} // namespace platform
} // namespace paddle

class TestAttrProtoMaker : public paddle::framework::OpProtoAndCheckerMaker {
public:
void Make() {
10 changes: 1 addition & 9 deletions paddle/fluid/framework/type_defs.h
@@ -21,8 +21,8 @@ limitations under the License. */
#include <unordered_map>
#include <unordered_set>
#include <vector>
#include "paddle/fluid/imperative/type_defs.h"
#include "paddle/fluid/platform/variant.h"
#include "paddle/pten/core/type_defs.h"
#include "paddle/utils/small_vector.h"

namespace paddle {
@@ -39,14 +39,6 @@ class InferNoNeedBufferVarsFN;
using VariableNameMap = std::map<std::string, std::vector<std::string>>;
using VariableValueMap = std::map<std::string, std::vector<Variable*>>;

// The order should be as same as framework.proto
using Attribute = boost::variant<
boost::blank, int, float, std::string, std::vector<int>, std::vector<float>,
std::vector<std::string>, bool, std::vector<bool>, BlockDesc*, int64_t,
std::vector<BlockDesc*>, std::vector<int64_t>, std::vector<double>>;

using AttributeMap = std::unordered_map<std::string, Attribute>;

#ifdef PADDLE_WITH_ASCEND_CL
using NPUAttribute =
boost::variant<boost::blank, int, float, std::string, std::vector<int>,
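
With this hunk the Attribute variant and AttributeMap are removed from fluid's type_defs.h, which now includes paddle/pten/core/type_defs.h; where the definitions land is not shown here. For readers unfamiliar with the structure being moved, a sketch of the same idea using std::variant as a stand-in for boost::variant, with the alternative list abbreviated:

#include <cstdint>
#include <iostream>
#include <string>
#include <unordered_map>
#include <variant>
#include <vector>

// Abbreviated stand-in; the real list follows the order in framework.proto.
using Attribute =
    std::variant<int, float, std::string, std::vector<int>, bool, int64_t>;
using AttributeMap = std::unordered_map<std::string, Attribute>;

int main() {
  AttributeMap attrs;
  attrs["axis"] = 1;            // holds the int alternative
  attrs["use_mkldnn"] = false;  // holds the bool alternative
  std::cout << std::get<int>(attrs["axis"]) << "\n";  // prints 1
  return 0;
}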
4 changes: 4 additions & 0 deletions paddle/fluid/imperative/prepared_operator.cc
@@ -369,6 +369,10 @@ static void BuildDygraphPtenKernelContext(
size_t end_idx = start_idx + outs_vector.size();

for (size_t offset = 0; offset < outs_vector.size(); ++offset) {
if (outs_vector[offset] == nullptr) {
kernel_ctx->EmplaceBackOutputWithoutSetRange({nullptr});
continue;
}
auto* var = outs_vector[offset]->MutableVar();
framework::Tensor* tensor_out = nullptr;
if (var->template IsType<framework::LoDTensor>()) {
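
The added guard handles optional outputs that were not provided: rather than dereferencing a null entry, it pushes a {nullptr} placeholder so every output slot still occupies a position and the (start, end) index ranges recorded for the kernel keep lining up. A sketch of that slot-preserving idea; every name below is an invented stand-in, not the real KernelContext API:

#include <iostream>
#include <vector>

struct Tensor { float value = 0.f; };

struct KernelContext {
  std::vector<Tensor*> outputs;  // nullptr marks a skipped optional output
  void EmplaceBackOutput(Tensor* t) { outputs.push_back(t); }
};

int main() {
  Tensor out0, out2;
  std::vector<Tensor*> op_outputs = {&out0, nullptr, &out2};  // slot 1 absent

  KernelContext ctx;
  for (Tensor* t : op_outputs) {
    if (t == nullptr) {  // keep the slot so later offsets stay valid
      ctx.EmplaceBackOutput(nullptr);
      continue;
    }
    ctx.EmplaceBackOutput(t);
  }
  std::cout << (ctx.outputs[2] == &out2) << "\n";  // prints 1: indices align
  return 0;
}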
45 changes: 1 addition & 44 deletions paddle/fluid/imperative/type_defs.h
@@ -13,47 +13,4 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <map>
#include <memory>
#include <string>
#include <vector>

namespace paddle {
namespace imperative {

class VariableWrapper;
class SavedVariableWrapperList;
class VarBase;
class OpBase;
class GradOpNode;
class Tracer;

using WeakNameVarBaseMap =
std::map<std::string, std::vector<std::weak_ptr<VarBase>>>;

namespace details {
template <typename T>
struct NameVarMapTrait {};

template <>
struct NameVarMapTrait<VarBase> {
using Type = std::map<std::string, std::vector<std::shared_ptr<VarBase>>>;
};

template <>
struct NameVarMapTrait<VariableWrapper> {
using Type = std::map<std::string, SavedVariableWrapperList>;
};
} // namespace details

template <typename T>
using NameVarMap = typename details::NameVarMapTrait<T>::Type;

using NameVarBaseMap = NameVarMap<VarBase>;
using NameVariableWrapperMap = NameVarMap<VariableWrapper>;

using VariableWrapperList = std::vector<std::shared_ptr<VariableWrapper>>;

} // namespace imperative
} // namespace paddle
#include "paddle/pten/core/type_defs.h"
5 changes: 3 additions & 2 deletions paddle/fluid/inference/CMakeLists.txt
@@ -36,6 +36,7 @@ endif()
# fluid_modules exclude API-interface of inference/api and inference/capi_exp
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
get_property(pten_modules GLOBAL PROPERTY PTEN_MODULES)
set(utils_modules stringpiece pretty_log string_helper)

add_subdirectory(api)

@@ -46,9 +47,9 @@ set(STATIC_INFERENCE_API paddle_inference_api analysis_predictor
analysis_config paddle_pass_builder activation_functions ${mkldnn_quantizer_cfg})
#TODO(wilber, T8T9): Do we still need to support windows gpu static library?
if(WIN32 AND WITH_GPU)
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
cc_library(paddle_inference DEPS ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules})
else()
create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API})
create_static_lib(paddle_inference ${fluid_modules} ${pten_modules} ${STATIC_INFERENCE_API} ${utils_modules})
if(WITH_IPU)
target_link_libraries(paddle_inference -Wl,--allow-multiple-definition popart_canonicalization_utils)
endif()
28 changes: 0 additions & 28 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -33,34 +33,6 @@ class CPUDeviceContext;
namespace paddle {
namespace operators {

template <typename T>
struct SameDimsElemwiseAdd<
platform::CPUDeviceContext, T,
typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
blas.VADD(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
}
};

template <typename T>
struct SameDimsElemwiseAdd<
platform::CPUDeviceContext, T,
typename std::enable_if<!std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext &ctx,
const framework::Tensor *x, const framework::Tensor *y,
framework::Tensor *z) {
auto eigen_x = framework::EigenVector<T>::Flatten(*x);
auto eigen_y = framework::EigenVector<T>::Flatten(*y);
auto eigen_z = framework::EigenVector<T>::Flatten(*z);
auto &place = *ctx.template device_context<platform::CPUDeviceContext>()
.eigen_device();
eigen_z.device(place) = eigen_x + eigen_y;
}
};

class ElementwiseAddOpMaker : public ElementwiseOpMaker {
protected:
std::string GetName() const override { return "Add"; }
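
The two functors deleted above selected their implementation with SFINAE: the std::enable_if second template parameter enables the BLAS VADD specialization only for floating-point T, and the Eigen specialization for everything else. A compact sketch of that dispatch pattern, with the kernel bodies reduced to prints:

#include <iostream>
#include <type_traits>

template <typename T, typename Enable = void>
struct SameDimsAdd;

template <typename T>
struct SameDimsAdd<
    T, typename std::enable_if<std::is_floating_point<T>::value>::type> {
  void operator()() const { std::cout << "BLAS VADD path\n"; }
};

template <typename T>
struct SameDimsAdd<
    T, typename std::enable_if<!std::is_floating_point<T>::value>::type> {
  void operator()() const { std::cout << "Eigen elementwise path\n"; }
};

int main() {
  SameDimsAdd<float>()();  // floating point: BLAS VADD path
  SameDimsAdd<int>()();    // everything else: Eigen elementwise path
  return 0;
}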
130 changes: 2 additions & 128 deletions paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -13,139 +13,13 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/pten/kernels/gpu/elementwise.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

namespace paddle {
namespace operators {

template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(
const T* __restrict__ dout, int size, int vec_size, T* dx, T* dy) {
int tid = blockIdx.x * blockDim.x + threadIdx.x;
int stride = gridDim.x * blockDim.x;
int loop = size / vec_size;
int remainder = size % vec_size;
const float4* dout_vec = reinterpret_cast<const float4*>(dout);
float4* dx_vec = reinterpret_cast<float4*>(dx);
float4* dy_vec = reinterpret_cast<float4*>(dy);
float4 tmp_loop;

for (int i = tid; i < loop; i += stride) {
tmp_loop = dout_vec[i];
dx_vec[i] = tmp_loop;
dy_vec[i] = tmp_loop;
}

if (tid == loop && remainder != 0) {
T tmp_rem;
while (remainder) {
int idx = size - remainder;
remainder--;
tmp_rem = dout[idx];
dx[idx] = tmp_rem;
dy[idx] = tmp_rem;
}
}
}

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
default_elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x,
const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout,
framework::Tensor* dx, framework::Tensor* dy) {
int axis = ctx.Attr<int>("axis");
auto* dout_data = dout->data<T>();

// dx
if (dx != nullptr) {
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout->dims()) {
if (dx_data != dout_data) {
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
// the result of dy wrong.
if (dx->IsSharedBufferWith(*dout)) {
dx->clear();
dx->mutable_data<T>(x->dims(), ctx.GetPlace());
}
std::vector<int> reduce_dims = GetReduceDim(x->dims(), out->dims(), axis);
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*dout, dx, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
// dy
if (dy != nullptr) {
auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout->dims()) {
if (dy_data != dout_data) {
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dy);
}
} else {
std::vector<int> reduce_dims = GetReduceDim(y->dims(), out->dims(), axis);
gpuStream_t stream = ctx.cuda_device_context().stream();
TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
*dout, dy, kps::IdentityFunctor<T>(), reduce_dims, stream);
}
}
}

template <typename DeviceContext, typename T>
typename std::enable_if<
std::is_same<DeviceContext, plat::CUDADeviceContext>::value>::type
elementwise_add_grad(const framework::ExecutionContext& ctx,
const framework::Tensor* x, const framework::Tensor* y,
const framework::Tensor* out,
const framework::Tensor* dout, framework::Tensor* dx,
framework::Tensor* dy) {
auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
auto* dout_data = dout->data<T>();
if (dx_data == dout_data && dy_data != dout_data) {
VLOG(4) << "Special case when dx_data is the same as dout_data, "
"only need copy dout to dy";
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dy);
} else if (dx_data != dout_data && dy_data == dout_data) {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"only need copy dout to dx";
framework::TensorCopy(
*dout, ctx.GetPlace(),
ctx.template device_context<platform::DeviceContext>(), dx);
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x->numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
dim3 block_size = dim3(PREDEFINED_BLOCK_SIZE, 1);
dim3 grid_size =
dim3(((size + vec_size - 1) / vec_size + PREDEFINED_BLOCK_SIZE - 1) /
PREDEFINED_BLOCK_SIZE,
1);
SimpleElemwiseAddGradCUDAKernel<
T><<<grid_size, block_size, 0,
ctx.template device_context<plat::CUDADeviceContext>().stream()>>>(
dout->data<T>(), size, vec_size, dx->mutable_data<T>(ctx.GetPlace()),
dy->mutable_data<T>(ctx.GetPlace()));
} else {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"and dx_data is the same as dout_data, do not need "
"any operator";
}
}

} // namespace operators
namespace operators {} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
elementwise_add, ops::ElementwiseAddKernel<plat::CUDADeviceContext, float>,