merge develop

PaddlePaddle · Aug 28, 2020 · c2ebb07 · c2ebb07
2 parents 1bc093a + edf5f31
commit c2ebb07
Show file tree

Hide file tree

Showing 73 changed files with 4,182 additions and 472 deletions.
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h
@@ -34,7 +34,8 @@ struct OpUpdateRecord {
     kModifyAttr,
     kNewAttr,
     kNewInput,
-    kNewOutput
+    kNewOutput,
+    kBugfixWithBehaviorChanged,
   };
   Type type_;
   std::string remark_;
@@ -82,6 +83,11 @@ struct NewOutput : OpUpdateRecord {
   std::string name_;
 };
 
+struct BugfixWithBehaviorChanged : OpUpdateRecord {
+  explicit BugfixWithBehaviorChanged(const std::string& remark)
+      : OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {}
+};
+
 class OpVersionDesc {
  public:
   OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
@@ -110,6 +116,12 @@ class OpVersionDesc {
     return *this;
   }
 
+  OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) {
+    infos_.push_back(std::shared_ptr<OpUpdateRecord>(
+        new compatible::BugfixWithBehaviorChanged(remark)));
+    return *this;
+  }
+
  private:
   std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
 };

diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc
@@ -23,6 +23,10 @@ namespace compatible {
 
 TEST(test_operator_version, test_operator_version) {
   REGISTER_OP_VERSION(test__)
+      .AddCheckpoint(
+          R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC",
+          framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged(
+              "Support the case of axis < 0"))
       .AddCheckpoint(
           R"ROC(
         Upgrade reshape, modified one attribute [axis] and add a new attribute [size].

diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
@@ -913,10 +913,20 @@ std::ostream& print_tensor(std::ostream& os, const framework::Tensor& tensor) {
   auto element_num = tensor.numel();
 
   os << "  - data: [";
-  if (element_num > 0) {
-    os << inspect[0];
-    for (int j = 1; j < element_num; ++j) {
-      os << " " << inspect[j];
+  // Note: int8_t/uint8_t are char typedefs; ostream prints them as characters
+  if (typeid(int8_t) == typeid(T) || typeid(uint8_t) == typeid(T)) {
+    if (element_num > 0) {
+      os << signed(inspect[0]);
+      for (int j = 1; j < element_num; ++j) {
+        os << " " << signed(inspect[j]);
+      }
+    }
+  } else {
+    if (element_num > 0) {
+      os << inspect[0];
+      for (int j = 1; j < element_num; ++j) {
+        os << " " << inspect[j];
+      }
     }
   }
   os << "]";

diff --git a/paddle/fluid/imperative/backward_strategy.h b/paddle/fluid/imperative/backward_strategy.h
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
@@ -30,12 +30,13 @@
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/profiler.h"
 
+DECLARE_bool(sort_sum_gradient);
+
 namespace paddle {
 namespace imperative {
 
-void BasicEngine::Init(VarBase* var, const detail::BackwardStrategy& strategy,
-                       bool retain_graph) {
-  backward_strategy_ = strategy;
+void BasicEngine::Init(VarBase* var, bool retain_graph) {
+  sorted_sum_gradient_ = FLAGS_sort_sum_gradient;
   retain_graph_ = retain_graph;
   init_node_ = var->GradVarBase()->GradNode();
   var->GradVarBase()->ClearGradNode();
@@ -105,7 +106,7 @@ void BasicEngine::PrepareGradAccumulators(const OpBase& op) {
 
       auto& accumulator = accumulators_[var.get()];
       if (!accumulator) {
-        if (backward_strategy_.sorted_sum_gradient_) {
+        if (sorted_sum_gradient_) {
           accumulator.reset(new SortedGradientAccumulator(var.get()));
         } else {
           accumulator.reset(new EagerGradientAccumulator(var.get()));

diff --git a/paddle/fluid/imperative/basic_engine.h b/paddle/fluid/imperative/basic_engine.h
@@ -18,7 +18,6 @@
 #include <unordered_map>
 #include <utility>
 #include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 
@@ -30,8 +29,7 @@ class OpBase;
 
 class BasicEngine : public Engine {
  public:
-  void Init(VarBase* var, const detail::BackwardStrategy& strategy,
-            bool retain_graph = false);
+  void Init(VarBase* var, bool retain_graph = false);
 
   void Execute() override;
 
@@ -46,7 +44,7 @@ class BasicEngine : public Engine {
 
  private:
   std::shared_ptr<GradOpNode> init_node_;
-  detail::BackwardStrategy backward_strategy_;
+  bool sorted_sum_gradient_;
   std::unordered_map<GradOpNode*, size_t> node_deps_;
   std::unordered_map<VariableWrapper*, std::unique_ptr<GradientAccumulator>>
       accumulators_;

diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -33,6 +33,8 @@
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
 
+DECLARE_bool(sort_sum_gradient);
+
 namespace paddle {
 namespace imperative {
 
@@ -529,8 +531,7 @@ class PartialGradTask {
                   const std::vector<std::shared_ptr<VarBase>> &output_targets,
                   const std::vector<std::shared_ptr<VarBase>> &output_grads,
                   const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                  const platform::Place &place,
-                  const detail::BackwardStrategy &strategy, bool create_graph,
+                  const platform::Place &place, bool create_graph,
                   bool retain_graph, bool allow_unused, bool only_inputs);
 
   std::vector<std::shared_ptr<VarBase>> Run();
@@ -577,23 +578,22 @@ class PartialGradTask {
   bool retain_graph_;
   bool allow_unused_;
   bool only_inputs_;
-  detail::BackwardStrategy strategy_;
+  bool sorted_sum_gradient_{FLAGS_sort_sum_gradient};
 };
 
 PartialGradTask::PartialGradTask(
     const std::vector<std::shared_ptr<VarBase>> &input_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_grads,
     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs) {
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs) {
   input_targets_ = input_targets;
   place_ = place;
   create_graph_ = create_graph;
   retain_graph_ = retain_graph;
   allow_unused_ = allow_unused;
   only_inputs_ = only_inputs;
-  strategy_ = strategy;
 
   PADDLE_ENFORCE_EQ(only_inputs_, true,
                     platform::errors::Unimplemented(
@@ -981,7 +981,7 @@ void PartialGradTask::PrepareInitialGradientAccumulators(const OpBase *op) {
 
       if (!accumulator) {
         accumulator.reset(new GradientAccumulationInfo(
-            var, strategy_.sorted_sum_gradient_, create_graph_));
+            var, sorted_sum_gradient_, create_graph_));
       }
 
       accumulator->IncreaseTotalRefCnt();
@@ -1033,11 +1033,11 @@ PartialGradEngine::PartialGradEngine(
     const std::vector<std::shared_ptr<VarBase>> &output_targets,
     const std::vector<std::shared_ptr<VarBase>> &output_grads,
     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-    const platform::Place &place, const detail::BackwardStrategy &strategy,
-    bool create_graph, bool retain_graph, bool allow_unused, bool only_inputs)
+    const platform::Place &place, bool create_graph, bool retain_graph,
+    bool allow_unused, bool only_inputs)
     : task_(new PartialGradTask(input_targets, output_targets, output_grads,
-                                no_grad_vars, place, strategy, create_graph,
-                                retain_graph, allow_unused, only_inputs)) {}
+                                no_grad_vars, place, create_graph, retain_graph,
+                                allow_unused, only_inputs)) {}
 
 PartialGradEngine::~PartialGradEngine() { Clear(); }
 

diff --git a/paddle/fluid/imperative/partial_grad_engine.h b/paddle/fluid/imperative/partial_grad_engine.h
@@ -16,7 +16,6 @@
 
 #include <memory>
 #include <vector>
-#include "paddle/fluid/imperative/backward_strategy.h"
 #include "paddle/fluid/imperative/engine.h"
 #include "paddle/fluid/platform/place.h"
 
@@ -33,8 +32,7 @@ class PartialGradEngine : public Engine {
                     const std::vector<std::shared_ptr<VarBase>> &output_targets,
                     const std::vector<std::shared_ptr<VarBase>> &output_grads,
                     const std::vector<std::shared_ptr<VarBase>> &no_grad_vars,
-                    const platform::Place &place,
-                    const detail::BackwardStrategy &strategy, bool create_graph,
+                    const platform::Place &place, bool create_graph,
                     bool retain_graph, bool allow_unused, bool only_inputs);
 
   ~PartialGradEngine();

diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -240,9 +240,8 @@ TEST(test_tracer, test_trace_op_with_multi_device_inputs) {
   framework::AttributeMap reduce_attr_map;
   tracer.TraceOp("reduce_sum", reduce_in, reduce_out, reduce_attr_map,
                  gpu_place, true);
-  detail::BackwardStrategy back_st;
   imperative::BasicEngine engine;
-  engine.Init(reduce_sum_out.get(), back_st);
+  engine.Init(reduce_sum_out.get());
   engine.Execute();
 
   framework::LoDTensor rlt;
@@ -356,9 +355,8 @@ TEST(test_tracer, test_var_without_grad_var) {
   ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL);
   ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL);
 
-  detail::BackwardStrategy back_st;
   imperative::BasicEngine engine;
-  engine.Init(vout.get(), back_st);
+  engine.Init(vout.get());
   engine.Execute();
 
   // check the grad

diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.h b/paddle/fluid/operators/elementwise/elementwise_pow_op.h
@@ -22,15 +22,20 @@ namespace operators {
 template <typename T>
 struct PowFunctor {
   inline HOSTDEVICE T operator()(T a, T b) const {
-#ifdef __CUDA_ARCH__
-    // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
-    // it will return a float number like 2.99... , which floor to 2
-    // when cast to int by default and it is wrong.
-    // Use llrint to cast it to the nearest integer, which is 3.
+    // TODO(wujionghao): A potential speed improvement is supporting different
+    // types in C++.
+    // #ifdef __CUDA_ARCH__
+    //     // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
+    //     // it will return a float number like 2.99... , which floor to 2
+    //     // when cast to int by default and it is wrong.
+    //     // Use llrint to cast it to the nearest integer, which is 3.
+    //     if (std::is_integral<T>::value) {
+    //       return std::llrint(std::pow(a, b));
+    //     }
+    // #endif
     if (std::is_integral<T>::value) {
       return std::llrint(std::pow(a, b));
     }
-#endif
     return std::pow(a, b);
   }
 };

diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc
@@ -508,3 +508,16 @@ DEFINE_int32(
     "summary will be shown."
     "If FLAGS_call_stack_level == 2, the python stack, c++ stack, and "
     "error message summary will be shown.");
+
+/**
+ * Debug related FLAG
+ * Name: sort_sum_gradient
+ * Since Version: 2.0.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: If True, gradients are summed in the reverse order of
+ * the forward execution sequence.
+ */
+DEFINE_bool(sort_sum_gradient, false,
+            "Sum gradients by the reverse order of "
+            "the forward execution sequence.");
diff --git a/paddle/fluid/pybind/global_value_getter_setter.cc b/paddle/fluid/pybind/global_value_getter_setter.cc
@@ -38,6 +38,7 @@ DECLARE_bool(enable_rpc_profiler);
 DECLARE_int32(multiple_of_cupti_buffer_size);
 DECLARE_bool(reader_queue_speed_test_mode);
 DECLARE_int32(call_stack_level);
+DECLARE_bool(sort_sum_gradient);
 // device management
 DECLARE_int32(paddle_num_threads);
 // executor
@@ -340,7 +341,7 @@ static void RegisterGlobalVarGetterSetter() {
   REGISTER_PUBLIC_GLOBAL_VAR(
       FLAGS_eager_delete_tensor_gb, FLAGS_enable_parallel_graph,
       FLAGS_allocator_strategy, FLAGS_use_system_allocator, FLAGS_check_nan_inf,
-      FLAGS_call_stack_level, FLAGS_cpu_deterministic,
+      FLAGS_call_stack_level, FLAGS_sort_sum_gradient, FLAGS_cpu_deterministic,
       FLAGS_enable_rpc_profiler, FLAGS_multiple_of_cupti_buffer_size,
       FLAGS_reader_queue_speed_test_mode, FLAGS_pe_profile_fname,
       FLAGS_print_sub_graph_dir, FLAGS_fraction_of_cpu_memory_to_use,