From e5281b3c2d14fdd0cc515268307e29521eb40305 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Mon, 14 May 2018 13:23:07 +0800
Subject: [PATCH 01/12] Clean code & add execution strategy

---
 .../framework/details/execution_strategy.h    | 29 ++++++++++
 .../details/threaded_ssa_graph_executor.cc    | 17 +++---
 .../details/threaded_ssa_graph_executor.h     | 11 ++--
 paddle/fluid/framework/parallel_executor.cc   |  9 ++--
 paddle/fluid/framework/parallel_executor.h    | 36 +++++++------
 paddle/fluid/pybind/pybind.cc                 | 43 +++++++++------
 python/paddle/fluid/__init__.py               | 54 ++++++++++---------
 python/paddle/fluid/parallel_executor.py      | 51 ++++++++++--------
 .../tests/unittests/test_parallel_executor.py |  8 +--
 9 files changed, 154 insertions(+), 104 deletions(-)
 create mode 100644 paddle/fluid/framework/details/execution_strategy.h
diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h
new file mode 100644
index 0000000000000..e8d510ec95560
--- /dev/null
+++ b/paddle/fluid/framework/details/execution_strategy.h
@@ -0,0 +1,29 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct ExecutionStrategy {
+  size_t num_threads_{0};
+  bool use_event_{true};
+  bool allow_op_delay_{false};
+};
+
+}  //  namespace details
+}  //  namespace framework
+}  //  namespace paddle
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
index e90523ebe8dc7..ef263d82c5ec9 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -18,18 +18,17 @@ namespace paddle {
 namespace framework {
 namespace details {
 ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
-    size_t num_threads, bool use_event,
-    const std::vector<Scope *> &local_scopes,
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
     const std::vector<platform::Place> &places,
-    std::unique_ptr<SSAGraph> &&graph, bool allow_op_delay)
+    std::unique_ptr<SSAGraph> &&graph)
     : SSAGraphExecutor(std::move(graph)),
-      pool_(num_threads >= 2 ? new ::ThreadPool(num_threads) : nullptr),
+      pool_(strategy.num_threads_ >= 2 ? new ::ThreadPool(strategy.num_threads_)
+                                       : nullptr),
       local_scopes_(local_scopes),
       places_(places),
       fetch_ctxs_(places),
-      use_event_(use_event),
       running_ops_(0),
-      allow_op_delay_(allow_op_delay) {}
+      strategy_(strategy) {}
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
@@ -86,7 +85,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     //
     // NOTE: DelayedOps have a lower priority. It will be scheduled after all
     // ready_ops have been performed.
-    if (ready_ops.empty() && allow_op_delay_ && running_ops_ == 0) {
+    if (ready_ops.empty() && strategy_.allow_op_delay_ && running_ops_ == 0) {
       run_all_ops(delayed_ops);
     } else {
       run_all_ops(ready_ops);
@@ -113,7 +112,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
         auto &deps = pending_ops[op];
         --deps;
         if (deps == 0) {
-          if (op->IsMultiDeviceTransfer() && allow_op_delay_) {
+          if (op->IsMultiDeviceTransfer() && strategy_.allow_op_delay_) {
             delayed_ops.insert(op);
           } else {
             ready_ops.insert(op);
@@ -191,7 +190,7 @@ void ThreadedSSAGraphExecutor::RunOp(
   auto op_run = [ready_var_q, op, this] {
     try {
       VLOG(10) << op << " " << op->Name() << " : " << op->DebugString();
-      op->Run(use_event_);
+      op->Run(strategy_.use_event_);
       VLOG(10) << op << " " << op->Name() << " Done ";
       running_ops_--;
       ready_var_q->Extend(op->Outputs());
diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
index f18a88526b323..1f7f88d75218e 100644
--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.h
@@ -23,6 +23,7 @@
 #include <functional>
 #include "ThreadPool.h"  // ThreadPool in thrird party
 #include "paddle/fluid/framework/blocking_queue.h"
+#include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/details/fetch_op_handle.h"
 #include "paddle/fluid/framework/details/ssa_graph_executor.h"
 
@@ -34,11 +35,10 @@ namespace details {
 
 class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
  public:
-  ThreadedSSAGraphExecutor(size_t num_threads, bool use_event,
+  ThreadedSSAGraphExecutor(const ExecutionStrategy &strategy,
                            const std::vector<Scope *> &local_scopes,
                            const std::vector<platform::Place> &places,
-                           std::unique_ptr<SSAGraph> &&graph,
-                           bool allow_op_delay);
+                           std::unique_ptr<SSAGraph> &&graph);
 
   // Run a SSAGraph by a thread pool
   // Use topological sort algorithm
@@ -55,10 +55,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
   std::vector<Scope *> local_scopes_;
   std::vector<platform::Place> places_;
   platform::DeviceContextPool fetch_ctxs_;
-  const bool use_event_;
   std::unique_ptr<platform::EnforceNotMet> exception_;
   std::atomic<int> running_ops_;
-  bool allow_op_delay_;
 
   void InsertPendingOp(std::unordered_map<OpHandleBase *, size_t> *pending_ops,
                        OpHandleBase *op_instance) const;
@@ -74,6 +72,9 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {
       std::unordered_map<OpHandleBase *, size_t> *pending_ops,
       std::unordered_set<VarHandleBase *> *pending_vars,
       BlockingQueue<VarHandleBase *> *ready_vars, FeedFetchList *fetch_data);
+
+ private:
+  ExecutionStrategy strategy_;
 };
 
 }  // namespace details
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index 20ef7e09f6301..cdfd0a8c07fc7 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -52,13 +52,13 @@ std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
 }
 
 ParallelExecutor::ParallelExecutor(
-    size_t num_threads, bool use_event,
     const std::vector<platform::Place> &places,
     const std::unordered_set<std::string> &params,
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
-    Scope *scope, const std::vector<Scope *> &local_scopes, bool allow_op_delay,
-    bool use_default_grad_scale, bool balance_parameter_opt_between_cards)
+    Scope *scope, const std::vector<Scope *> &local_scopes,
+    bool use_default_grad_scale, bool balance_parameter_opt_between_cards,
+    const ExecutionStrategy &exec_strategy)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
@@ -103,8 +103,7 @@ ParallelExecutor::ParallelExecutor(
   auto graph = builder.Build(main_program);
 
   member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
-      num_threads, use_event, member_->local_scopes_, places, std::move(graph),
-      allow_op_delay));
+      exec_strategy, member_->local_scopes_, places, std::move(graph)));
 
   // Step 3. Create vars in each scope;
   for (auto *var : main_program.Block(0).AllVars()) {
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index b251fc91417a1..ab50509124751 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -17,53 +17,55 @@ limitations under the License. */
 #include <string>
 #include <unordered_set>
 #include <vector>
+#include "paddle/fluid/framework/details/execution_strategy.h"
 #include "paddle/fluid/framework/executor.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
-
 namespace paddle {
 namespace framework {
 
 class ParallelExecutorPrivate;
 
+using details::ExecutionStrategy;
+
 class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
 
  public:
-  explicit ParallelExecutor(size_t num_threads, bool use_event,
-                            const std::vector<platform::Place>& places,
-                            const std::unordered_set<std::string>& params,
-                            const std::unordered_set<std::string>& bcast_vars,
-                            const ProgramDesc& main_program,
-                            const std::string& loss_var_name, Scope* scope,
-                            const std::vector<Scope*>& local_scopes,
-                            bool allow_op_delay, bool use_default_grad_scale,
-                            bool balance_parameter_opt_between_cards);
+  explicit ParallelExecutor(const std::vector<platform::Place> &places,
+                            const std::unordered_set<std::string> &params,
+                            const std::unordered_set<std::string> &bcast_vars,
+                            const ProgramDesc &main_program,
+                            const std::string &loss_var_name, Scope *scope,
+                            const std::vector<Scope *> &local_scopes,
+                            bool use_default_grad_scale,
+                            bool balance_parameter_opt_between_cards,
+                            const ExecutionStrategy &exec_strategy);
 
   ~ParallelExecutor();
 
-  std::vector<Scope*>& GetLocalScopes();
+  std::vector<Scope *> &GetLocalScopes();
 
   /**
    * Feed tensors to local scopes. The size of tensors should be equal to the
    * size of local scopes.
    */
   void FeedTensorsIntoLocalScopes(
-      const std::vector<std::unordered_map<std::string, LoDTensor>>& tensors);
+      const std::vector<std::unordered_map<std::string, LoDTensor>> &tensors);
 
   void FeedAndSplitTensorIntoLocalScopes(
-      const std::unordered_map<std::string, LoDTensor>& tensors);
+      const std::unordered_map<std::string, LoDTensor> &tensors);
 
-  void Run(const std::vector<std::string>& fetch_tensors,
-           const std::string& fetched_var_name);
+  void Run(const std::vector<std::string> &fetch_tensors,
+           const std::string &fetched_var_name);
 
-  void BCastParamsToGPUs(const std::unordered_set<std::string>& vars) const;
+  void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
 
  private:
-  ParallelExecutorPrivate* member_;
+  ParallelExecutorPrivate *member_;
 };
 
 }  // namespace framework
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3e2eed31b446b..c456bc1a71deb 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -494,22 +494,33 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("reset_profiler", platform::ResetProfiler);
 
-  py::class_<ParallelExecutor>(m, "ParallelExecutor")
-      .def("__init__",
-           [](ParallelExecutor &self, size_t num_threads, bool use_event,
-              const std::vector<platform::Place> &places,
-              const std::unordered_set<std::string> &params,
-              const std::unordered_set<std::string> &bcast_vars,
-              const ProgramDesc &main_program, const std::string &loss_var_name,
-              Scope *scope, std::vector<Scope *> &local_scopes,
-              bool allow_op_delay, bool use_default_grad_scale,
-              bool balance_parameter_opt_between_cards) {
-             new (&self) ParallelExecutor(
-                 num_threads, use_event, places, params, bcast_vars,
-                 main_program, loss_var_name, scope, local_scopes,
-                 allow_op_delay, use_default_grad_scale,
-                 balance_parameter_opt_between_cards);
-           })
+  py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
+  py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
+      .def(py::init())
+      .def_property(
+          "num_threads",
+          [](const ExecutionStrategy &self) { return self.num_threads_; },
+          [](ExecutionStrategy &self, size_t num_threads) {
+            self.num_threads_ = num_threads;
+          })
+      .def_property(
+          "use_event",
+          [](const ExecutionStrategy &self) { return self.use_event_; },
+          [](ExecutionStrategy &self, bool use_event) {
+            self.use_event_ = use_event;
+          })
+      .def_property(
+          "allow_op_delay",
+          [](const ExecutionStrategy &self) { return self.allow_op_delay_; },
+          [](ExecutionStrategy &self, bool allow_op_delay) {
+            self.allow_op_delay_ = allow_op_delay;
+          });
+
+  pe.def(py::init<const std::vector<platform::Place> &,
+                  const std::unordered_set<std::string> &,
+                  const std::unordered_set<std::string> &, const ProgramDesc &,
+                  const std::string &, Scope *, std::vector<Scope *> &, bool,
+                  bool, const ExecutionStrategy &>())
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index c8a435748dc5b..ef7a5864759d2 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -44,42 +44,44 @@
 from param_attr import ParamAttr, WeightNormParamAttr
 from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, CUDAPlace, CUDAPinnedPlace
-from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, InferenceTranspiler, memory_optimize, release_memory
+from transpiler import DistributeTranspiler, SimpleDistributeTranspiler, \
+    InferenceTranspiler, memory_optimize, release_memory
 from concurrency import (Go, make_channel, channel_send, channel_recv,
                          channel_close, Select)
 import clip
 import profiler
 import unique_name
 import recordio_writer
-from parallel_executor import ParallelExecutor
+from parallel_executor import ParallelExecutor, ExecutionStrategy
 
 Tensor = LoDTensor
 
-__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ +\
+__all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
           trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
-    'io',
-    'initializer',
-    'layers',
-    'transpiler'
-    'nets',
-    'optimizer',
-    'learning_rate_decay',
-    'backward',
-    'regularizer',
-    'LoDTensor',
-    'CPUPlace',
-    'CUDAPlace',
-    'CUDAPinnedPlace',
-    'Tensor',
-    'ParamAttr',
-    'WeightNormParamAttr',
-    'DataFeeder',
-    'clip',
-    'profiler',
-    'unique_name',
-    'recordio_writer',
-    'ParallelExecutor',
-]
+              'io',
+              'initializer',
+              'layers',
+              'transpiler'
+              'nets',
+              'optimizer',
+              'learning_rate_decay',
+              'backward',
+              'regularizer',
+              'LoDTensor',
+              'CPUPlace',
+              'CUDAPlace',
+              'CUDAPinnedPlace',
+              'Tensor',
+              'ParamAttr',
+              'WeightNormParamAttr',
+              'DataFeeder',
+              'clip',
+              'profiler',
+              'unique_name',
+              'recordio_writer',
+              'ParallelExecutor',
+              'ExecutionStrategy',
+          ]
 
 
 def __bootstrap__():
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 5b43f860e7075..69ea9ee335efe 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -19,7 +19,9 @@
 import warnings
 import sys
 
-__all__ = ['ParallelExecutor']
+__all__ = ['ParallelExecutor', 'ExecutionStrategy']
+
+ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 
 
 class ParallelExecutor(object):
@@ -27,11 +29,11 @@ def __init__(self,
                  use_cuda,
                  loss_name=None,
                  main_program=None,
-                 num_threads=None,
-                 allow_op_delay=False,
                  share_vars_from=None,
                  use_default_grad_scale=True,
-                 balance_parameter_opt_between_cards=False):
+                 balance_parameter_opt_between_cards=False,
+                 exec_strategy=None,
+                 **kwargs):
         """
         ParallelExecutor can run program in parallel.
 
@@ -40,11 +42,6 @@ def __init__(self,
             loss_name(str, default None): The loss name must set in training.
             main_program(Program, default None): The program that need to run,
                 if not provided, then default_main_program will be used.
-            num_threads(int, default None): How many threads are used for
-                training.
-            allow_op_delay(bool, default False): Whether to delay and buffer
-                some operators together for scheduling or not, which may
-                improve performance in some cases, default False.
             share_vars_from(ParallelExecutor, default None): If provied,
                 it will share variables from the specified ParallelExecutor.
             use_default_grad_scale(bool, default True): If set True, a default
@@ -76,6 +73,16 @@ def __init__(self,
               train_loss, = train_exe.run([loss.name], feed=feed_dict)
               test_loss, = test_exe.run([loss.name], feed=feed_dict)
         """
+        if len(kwargs) != 0:
+            err_msg = ""
+            for key in kwargs:
+                if key in dir(ExecutionStrategy):
+                    err_msg += \
+                        "Setting {0} by constructor is deprecated. Use " \
+                        "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
+                        "pe=ParallelExecutor(exec_strategy=strategy) " \
+                        "instead.\n "
+            raise ValueError(err_msg)
 
         self._places = []
         self._act_places = []
@@ -93,13 +100,20 @@ def __init__(self,
                 self._places.append(p)
         assert self._places, "no place for execution"
 
-        if num_threads is None:
+        if exec_strategy is None:
+            exec_strategy = ExecutionStrategy()
+            if use_cuda:
+                exec_strategy.use_event = True
+            else:
+                exec_strategy.use_event = False
+
+        if exec_strategy.num_threads == 0:
             if use_cuda:
                 # Experiments on se-resnext shows that too many threads hurt
                 # performance. Worth tunning for other models in the future.
-                num_threads = len(self._places) * 2
+                exec_strategy.num_threads = len(self._places) * 2
             else:
-                num_threads = min(
+                exec_strategy.num_threads = min(
                     len(self._places) * 2, multiprocessing.cpu_count())
 
         main = main_program
@@ -120,21 +134,14 @@ def __init__(self,
         ]
 
         self.executor = core.ParallelExecutor(
-            num_threads,
-            True if use_cuda else False,  # use_event
             self._places,
             set([
                 p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            set(self.persistable_vars),
-            main.desc,
-            loss_name if loss_name else '',
-            scope,
-            local_scopes,
-            allow_op_delay,
-            use_default_grad_scale,
-            balance_parameter_opt_between_cards)
+            set(self.persistable_vars), main.desc, loss_name
+            if loss_name else '', scope, local_scopes, use_default_grad_scale,
+            balance_parameter_opt_between_cards, exec_strategy)
 
         self.scope = scope
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index a3be1a8db68c0..4173ad1925dc2 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -232,14 +232,14 @@ def run_executor(exe, feed, fetch_list, program=None):
             place = fluid.CUDAPlace(0)
             startup_exe = fluid.Executor(place)
             startup_exe.run(startup)
-
+            exec_strategy = fluid.ExecutionStrategy()
+            exec_strategy.allow_op_delay = allow_op_delay
             if use_parallel_executor:
                 exe = fluid.ParallelExecutor(
                     True,
                     loss_name=loss.name,
-                    allow_op_delay=allow_op_delay,
-                    balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-                )
+                    balance_parameter_opt_between_cards=balance_parameter_opt_between_cards,
+                    exec_strategy=exec_strategy)
             else:
                 exe = fluid.Executor(place=place)
 

From 08295f9877606cdae2df0a0983514eac67a45dca Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Mon, 14 May 2018 16:40:13 +0800
Subject: [PATCH 02/12] Add build strategy

---
 .../fluid/framework/details/build_strategy.h  | 36 +++++++++++
 .../details/multi_devices_graph_builder.cc    | 47 ++++++++-------
 .../details/multi_devices_graph_builder.h     | 12 ++--
 paddle/fluid/framework/parallel_executor.cc   | 12 ++--
 paddle/fluid/framework/parallel_executor.h    |  7 ++-
 paddle/fluid/platform/nccl_helper.h           |  2 +-
 paddle/fluid/pybind/pybind.cc                 | 31 +++++++++-
 python/paddle/fluid/__init__.py               |  8 +--
 python/paddle/fluid/parallel_executor.py      | 25 +++++---
 .../tests/unittests/test_parallel_executor.py | 59 +++++++++++--------
 10 files changed, 162 insertions(+), 77 deletions(-)
 create mode 100644 paddle/fluid/framework/details/build_strategy.h

diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
new file mode 100644
index 0000000000000..d6f9c547d8ab0
--- /dev/null
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+struct BuildStrategy {
+  enum class ReduceStrategy { kAllReduce = 0, kReduce = 1 };
+
+  enum class GradientScaleStrategy {
+    kCoeffNumDevice = 0,
+    kOne = 1,
+    kCustomized = 2,
+  };
+
+  ReduceStrategy reduce_{ReduceStrategy::kReduce};
+  GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.cc b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
index 4755559f8d0c5..45bad58145a11 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.cc
@@ -37,31 +37,26 @@ MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
     const std::vector<Scope *> &local_scopes,
-    platform::NCCLContextMap *nccl_ctxs, bool use_default_grad_scale,
-    bool balance_parameter_opt_between_cards)
+    platform::NCCLContextMap *nccl_ctxs, const BuildStrategy &strategy)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes),
       nccl_ctxs_(nccl_ctxs),
-      balance_parameter_opt_between_cards_(
-          balance_parameter_opt_between_cards) {
+      strategy_(strategy) {
 #else
 MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
     const std::vector<platform::Place> &places,
     const std::string &loss_var_name,
     const std::unordered_set<std::string> &params,
-    const std::vector<Scope *> &local_scopes, bool use_default_grad_scale,
-    bool balance_parameter_opt_between_cards)
+    const std::vector<Scope *> &local_scopes, const BuildStrategy &strategy)
     : loss_var_name_(loss_var_name),
       places_(places),
       local_scopes_(local_scopes),
-      balance_parameter_opt_between_cards_(
-          balance_parameter_opt_between_cards) {
+      strategy_(strategy) {
 #endif
   for (auto &p : params) {
     grad_names_.insert(GradVarName(p));
   }
-  use_default_grad_scale_ = use_default_grad_scale;
 }
 
 void MultiDevSSAGraphBuilder::CreateOpHandleIOs(SSAGraph *result,
@@ -146,7 +141,8 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
       CreateComputationalOps(&result, *op, 1);
     } else if (IsScaleLossOp(*op)) {
       // user can customize loss@grad if not use_default_grad_scale_
-      if (use_default_grad_scale_) {
+      if (strategy_.gradient_scale_ !=
+          BuildStrategy::GradientScaleStrategy::kCustomized) {
         CreateScaleLossGradOp(&result);
       }
       is_forwarding = false;
@@ -165,19 +161,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
         // broadcast, and each gradient is only broadcast once.
         for (auto &og : op->OutputArgumentNames()) {
           if (IsParameterGradientOnce(og, &og_has_been_broadcast)) {
-            if (balance_parameter_opt_between_cards_) {
-              CreateReduceOp(&result, og, cur_device_id);
-              var_name_on_devices[cur_device_id].emplace(og);
-              bcast_var_name_set[cur_device_id].emplace(
-                  og.substr(0, og.size() - strlen(kGradVarSuffix)));
-              cur_device_id = (cur_device_id + 1) % places_.size();
-            } else {
-              if (IsSparseGradient(var_types, og)) {
-                CreateReduceOp(&result, og, 0);
-                CreateBroadcastOp(&result, og, 0);
-              } else {
-                InsertNCCLAllReduceOp(&result, og);
-              }
+            switch (strategy_.reduce_) {
+              case BuildStrategy::ReduceStrategy::kReduce:
+                CreateReduceOp(&result, og, cur_device_id);
+                var_name_on_devices[cur_device_id].emplace(og);
+                bcast_var_name_set[cur_device_id].emplace(
+                    og.substr(0, og.size() - strlen(kGradVarSuffix)));
+                cur_device_id = (cur_device_id + 1) % places_.size();
+                break;
+              case BuildStrategy::ReduceStrategy::kAllReduce:
+                if (IsSparseGradient(var_types, og)) {
+                  CreateReduceOp(&result, og, 0);
+                  CreateBroadcastOp(&result, og, 0);
+                } else {
+                  InsertNCCLAllReduceOp(&result, og);
+                }
+                break;
             }
           }
         }
@@ -303,7 +302,7 @@ bool MultiDevSSAGraphBuilder::IsParameterGradientOnce(
 int MultiDevSSAGraphBuilder::GetOpDeviceID(
     const std::vector<std::unordered_set<std::string>> &var_name_on_devices,
     const OpDesc &op) const {
-  if (!balance_parameter_opt_between_cards_) {
+  if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
 
diff --git a/paddle/fluid/framework/details/multi_devices_graph_builder.h b/paddle/fluid/framework/details/multi_devices_graph_builder.h
index 3a3e9e3b8538f..4f70852188424 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_builder.h
+++ b/paddle/fluid/framework/details/multi_devices_graph_builder.h
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>
 
+#include "paddle/fluid/framework/details/build_strategy.h"
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
 
 namespace paddle {
@@ -36,15 +37,13 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
                           platform::NCCLContextMap *nccl_ctxs,
-                          bool use_default_grad_scale,
-                          bool balance_parameter_opt_between_cards);
+                          const BuildStrategy &strategy);
 #else
   MultiDevSSAGraphBuilder(const std::vector<platform::Place> &places,
                           const std::string &loss_var_name,
                           const std::unordered_set<std::string> &params,
                           const std::vector<Scope *> &local_scopes,
-                          bool use_default_grad_scale,
-                          bool balance_parameter_opt_between_cards);
+                          const BuildStrategy &strategy);
 #endif
 
   std::unique_ptr<SSAGraph> Build(const ProgramDesc &program) const override;
@@ -62,8 +61,6 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
 #ifdef PADDLE_WITH_CUDA
   platform::NCCLContextMap *nccl_ctxs_;
 #endif
-  bool balance_parameter_opt_between_cards_;
-  bool use_default_grad_scale_;
 
   bool IsScaleLossOp(const OpDesc &op) const;
 
@@ -105,6 +102,9 @@ class MultiDevSSAGraphBuilder : public SSAGraphBuilder {
   bool IsSparseGradient(
       const std::unordered_map<std::string, proto::VarType::Type> &var_types,
       const std::string &og) const;
+
+ private:
+  BuildStrategy strategy_;
 };
 }  // namespace details
 }  // namespace framework
diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc
index cdfd0a8c07fc7..392b13d3dc75b 100644
--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -57,8 +57,7 @@ ParallelExecutor::ParallelExecutor(
     const std::unordered_set<std::string> &bcast_vars,
     const ProgramDesc &main_program, const std::string &loss_var_name,
     Scope *scope, const std::vector<Scope *> &local_scopes,
-    bool use_default_grad_scale, bool balance_parameter_opt_between_cards,
-    const ExecutionStrategy &exec_strategy)
+    const ExecutionStrategy &exec_strategy, const BuildStrategy &build_strategy)
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
 
@@ -93,12 +92,11 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
   details::MultiDevSSAGraphBuilder builder(
       member_->places_, loss_var_name, params, member_->local_scopes_,
-      member_->nccl_ctxs_.get(), use_default_grad_scale,
-      balance_parameter_opt_between_cards);
+      member_->nccl_ctxs_.get(), build_strategy);
 #else
-  details::MultiDevSSAGraphBuilder builder(
-      member_->places_, loss_var_name, params, member_->local_scopes_,
-      use_default_grad_scale, balance_parameter_opt_between_cards);
+  details::MultiDevSSAGraphBuilder builder(member_->places_, loss_var_name,
+                                           params, member_->local_scopes_,
+                                           build_strategy);
 #endif
   auto graph = builder.Build(main_program);
 
diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h
index ab50509124751..121e74293c587 100644
--- a/paddle/fluid/framework/parallel_executor.h
+++ b/paddle/fluid/framework/parallel_executor.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <paddle/fluid/framework/details/build_strategy.h>
 #include <string>
 #include <unordered_set>
 #include <vector>
@@ -29,6 +30,7 @@ namespace framework {
 
 class ParallelExecutorPrivate;
 
+using details::BuildStrategy;
 using details::ExecutionStrategy;
 
 class ParallelExecutor {
@@ -41,9 +43,8 @@ class ParallelExecutor {
                             const ProgramDesc &main_program,
                             const std::string &loss_var_name, Scope *scope,
                             const std::vector<Scope *> &local_scopes,
-                            bool use_default_grad_scale,
-                            bool balance_parameter_opt_between_cards,
-                            const ExecutionStrategy &exec_strategy);
+                            const ExecutionStrategy &exec_strategy,
+                            const BuildStrategy &build_strategy);
 
   ~ParallelExecutor();
 
diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h
index 0013597fd516d..ec1682a44e2d9 100644
--- a/paddle/fluid/platform/nccl_helper.h
+++ b/paddle/fluid/platform/nccl_helper.h
@@ -50,7 +50,7 @@ class NCCLGroupGuard {
   }
 
   inline ~NCCLGroupGuard() {
-    PADDLE_ENFORCE(dynload::ncclGroupEnd());
+    CHECK_EQ(dynload::ncclGroupEnd(), ncclSuccess);
     NCCLMutex().unlock();
   }
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index c456bc1a71deb..ee2c5b904499a 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -494,6 +494,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("disable_profiler", platform::DisableProfiler);
   m.def("reset_profiler", platform::ResetProfiler);
 
+  // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
   py::class_<ExecutionStrategy>(pe, "ExecutionStrategy")
       .def(py::init())
@@ -515,12 +516,38 @@ All parameter, weight, gradient are variables in Paddle.
           [](ExecutionStrategy &self, bool allow_op_delay) {
             self.allow_op_delay_ = allow_op_delay;
           });
+  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
+
+  py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
+      .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
+      .value("AllReduce", BuildStrategy::ReduceStrategy::kAllReduce);
+  py::enum_<BuildStrategy::GradientScaleStrategy>(build_strategy,
+                                                  "GradientScaleStrategy")
+      .value("CoeffNumDevice",
+             BuildStrategy::GradientScaleStrategy::kCoeffNumDevice)
+      .value("One", BuildStrategy::GradientScaleStrategy::kOne)
+      .value("Customized", BuildStrategy::GradientScaleStrategy::kCustomized);
+
+  build_strategy.def(py::init())
+      .def_property(
+          "reduce_strategy",
+          [](const BuildStrategy &self) { return self.reduce_; },
+          [](BuildStrategy &self, BuildStrategy::ReduceStrategy strategy) {
+            self.reduce_ = strategy;
+          })
+      .def_property(
+          "gradient_scale_strategy",
+          [](const BuildStrategy &self) { return self.gradient_scale_; },
+          [](BuildStrategy &self,
+             BuildStrategy::GradientScaleStrategy strategy) {
+            self.gradient_scale_ = strategy;
+          });
 
   pe.def(py::init<const std::vector<platform::Place> &,
                   const std::unordered_set<std::string> &,
                   const std::unordered_set<std::string> &, const ProgramDesc &,
-                  const std::string &, Scope *, std::vector<Scope *> &, bool,
-                  bool, const ExecutionStrategy &>())
+                  const std::string &, Scope *, std::vector<Scope *> &,
+                  const ExecutionStrategy &, const BuildStrategy &>())
       .def("bcast_params", &ParallelExecutor::BCastParamsToGPUs)
       // NOTE: even we return a vec<Scope*>* to Python use reference policy.
       // We still cannot get local_scope from this vector, since the element
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index ef7a5864759d2..67aa5ec9979db 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -52,12 +52,14 @@
 import profiler
 import unique_name
 import recordio_writer
-from parallel_executor import ParallelExecutor, ExecutionStrategy
+import parallel_executor
+from parallel_executor import *
 
 Tensor = LoDTensor
 
 __all__ = framework.__all__ + executor.__all__ + concurrency.__all__ + \
-          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + [
+          trainer.__all__ + inferencer.__all__ + transpiler.__all__ + \
+          parallel_executor.__all__ + [
               'io',
               'initializer',
               'layers',
@@ -79,8 +81,6 @@
               'profiler',
               'unique_name',
               'recordio_writer',
-              'ParallelExecutor',
-              'ExecutionStrategy',
           ]
 
 
diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py
index 69ea9ee335efe..deab761f72a3f 100644
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
@@ -19,9 +19,10 @@
 import warnings
 import sys
 
-__all__ = ['ParallelExecutor', 'ExecutionStrategy']
+__all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy']
 
 ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
+BuildStrategy = core.ParallelExecutor.BuildStrategy
 
 
 class ParallelExecutor(object):
@@ -30,9 +31,8 @@ def __init__(self,
                  loss_name=None,
                  main_program=None,
                  share_vars_from=None,
-                 use_default_grad_scale=True,
-                 balance_parameter_opt_between_cards=False,
                  exec_strategy=None,
+                 build_strategy=None,
                  **kwargs):
         """
         ParallelExecutor can run program in parallel.
@@ -81,7 +81,16 @@ def __init__(self,
                         "Setting {0} by constructor is deprecated. Use " \
                         "strategy=ExecutionStrategy(); strategy.{0}=xxx; " \
                         "pe=ParallelExecutor(exec_strategy=strategy) " \
-                        "instead.\n "
+                        "instead.\n ".format(key)
+                elif key in dir(BuildStrategy):
+                    err_msg += \
+                        "Setting {0} by constructor is deprecated. Use " \
+                        "strategy=BuildStrategy(); See help(" \
+                        "paddle.fluid.ParallelExecutor.BuildStrategy) \n".format(
+                            key)
+                else:
+                    err_msg += "Setting {0} by constructor is deprecated. Use strategy.\n".format(
+                        key)
             raise ValueError(err_msg)
 
         self._places = []
@@ -116,6 +125,9 @@ def __init__(self,
                 exec_strategy.num_threads = min(
                     len(self._places) * 2, multiprocessing.cpu_count())
 
+        if build_strategy is None:
+            build_strategy = BuildStrategy()
+
         main = main_program
         main = main if main else framework.default_main_program()
         scope = executor.global_scope()
@@ -139,9 +151,8 @@ def __init__(self,
                 p.name for p in main.global_block().iter_parameters()
                 if not p.stop_gradient
             ]),
-            set(self.persistable_vars), main.desc, loss_name
-            if loss_name else '', scope, local_scopes, use_default_grad_scale,
-            balance_parameter_opt_between_cards, exec_strategy)
+            set(self.persistable_vars), main.desc, loss_name if loss_name else
+            '', scope, local_scopes, exec_strategy, build_strategy)
 
         self.scope = scope
 
diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index 4173ad1925dc2..cd95ee47fdee1 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -234,12 +234,16 @@ def run_executor(exe, feed, fetch_list, program=None):
             startup_exe.run(startup)
             exec_strategy = fluid.ExecutionStrategy()
             exec_strategy.allow_op_delay = allow_op_delay
+
+            build_strategy = fluid.BuildStrategy()
+            build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce if balance_parameter_opt_between_cards else fluid.BuildStrategy.ReduceStrategy.AllReduce
+
             if use_parallel_executor:
                 exe = fluid.ParallelExecutor(
                     True,
                     loss_name=loss.name,
-                    balance_parameter_opt_between_cards=balance_parameter_opt_between_cards,
-                    exec_strategy=exec_strategy)
+                    exec_strategy=exec_strategy,
+                    build_strategy=build_strategy)
             else:
                 exe = fluid.Executor(place=place)
 
@@ -548,7 +552,7 @@ def test_main(self):
 
 
 class ParallelExecutorTestingDuringTraining(unittest.TestCase):
-    def check_network_convergence(self, balance_parameter_opt_between_cards):
+    def check_network_convergence(self, build_strategy=None):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -571,15 +575,13 @@ def check_network_convergence(self, balance_parameter_opt_between_cards):
                 use_cuda=True,
                 loss_name=loss.name,
                 main_program=main,
-                balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-            )
+                build_strategy=build_strategy)
 
             test_exe = fluid.ParallelExecutor(
                 use_cuda=True,
                 main_program=test_program,
                 share_vars_from=train_exe,
-                balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-            )
+                build_strategy=build_strategy)
 
             for i in xrange(5):
                 test_loss, = test_exe.run([loss.name], feed=feed_dict)
@@ -594,10 +596,14 @@ def check_network_convergence(self, balance_parameter_opt_between_cards):
                     str(test_loss))
 
     def test_parallel_testing(self):
-        self.check_network_convergence(False)
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(build_strategy)
 
     def test_parallel_testing_with_new_strategy(self):
-        self.check_network_convergence(True)
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
+        self.check_network_convergence(build_strategy)
 
 
 import paddle.dataset.conll05 as conll05
@@ -617,7 +623,7 @@ def test_parallel_testing_with_new_strategy(self):
 
 
 def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
-            is_sparse, balance_parameter_opt_between_cards, **ignored):
+            is_sparse, **ignored):
     # 8 features
     predicate_embedding = fluid.layers.embedding(
         input=predicate,
@@ -686,9 +692,7 @@ def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
 
 
 class TestCRFModel(unittest.TestCase):
-    def check_network_convergence(self,
-                                  is_sparse,
-                                  balance_parameter_opt_between_cards=False):
+    def check_network_convergence(self, is_sparse, build_strategy=None):
         main = fluid.Program()
         startup = fluid.Program()
         with fluid.program_guard(main, startup):
@@ -739,8 +743,7 @@ def check_network_convergence(self,
             pe = fluid.ParallelExecutor(
                 use_cuda=True,
                 loss_name=avg_cost.name,
-                balance_parameter_opt_between_cards=balance_parameter_opt_between_cards
-            )
+                build_strategy=build_strategy)
 
             feeder = fluid.DataFeeder(
                 feed_list=[
@@ -756,19 +759,29 @@ def check_network_convergence(self,
                           pe.run(feed=feeder.feed(cur_batch),
                                  fetch_list=[avg_cost.name]))[0]
 
-    def test_update_sparse_parameter(self):
-        self.check_network_convergence(is_sparse=True)
+    def test_update_sparse_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=True, build_strategy=build_strategy)
 
-    def test_update_dense_parameter(self):
-        self.check_network_convergence(is_sparse=False)
+    def test_update_dense_parameter_all_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce
+        self.check_network_convergence(
+            is_sparse=False, build_strategy=build_strategy)
 
-    def test_update_sparse_parameter_with_new_strategy(self):
+    def test_update_sparse_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
         self.check_network_convergence(
-            is_sparse=False, balance_parameter_opt_between_cards=True)
+            is_sparse=False, build_strategy=build_strategy)
 
-    def test_update_dense_parameter_with_new_strategy(self):
+    def test_update_dense_parameter_reduce(self):
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
         self.check_network_convergence(
-            is_sparse=False, balance_parameter_opt_between_cards=True)
+            is_sparse=False, build_strategy=build_strategy)
 
 
 # test fetch all the variables of global_block

From af2cd942bb7baa487d0289d10fdfce7e35a6ba55 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 15 May 2018 09:50:48 +0800
Subject: [PATCH 03/12] copy boost

---
 cmake/inference_lib.cmake | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index cc758019827b9..78857cf2a0f9e 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -70,6 +70,12 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )
 
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/boost")
+copy(boost_lib
+  SRCS ${BOOST_INCLUDE_DIR}
+  DSTS ${dst_dir}
+)
+
 if(NOT PROTOBUF_FOUND)
     set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
     copy(protobuf_lib

From efd425cb5edf52b7dbe5f6f19226ec5e29cf375d Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 15 May 2018 17:07:13 +0800
Subject: [PATCH 04/12] only copy header file of boost

---
 cmake/inference_lib.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 78857cf2a0f9e..54014055b4c7d 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -70,9 +70,9 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/boost")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/include")
 copy(boost_lib
-  SRCS ${BOOST_INCLUDE_DIR}
+  SRCS ${BOOST_INCLUDE_DIR}/boost
   DSTS ${dst_dir}
 )
 

From 2ddca7196dcec9cff8632ea8732a1a03215e48e9 Mon Sep 17 00:00:00 2001
From: qiaolongfei <qiaolongfei@baidu.com>
Date: Tue, 15 May 2018 17:45:02 +0800
Subject: [PATCH 05/12] update boost dst dir

---
 cmake/inference_lib.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake
index 54014055b4c7d..06a7ae56827d5 100644
--- a/cmake/inference_lib.cmake
+++ b/cmake/inference_lib.cmake
@@ -70,7 +70,7 @@ copy(glog_lib
   DSTS ${dst_dir} ${dst_dir}/lib
 )
 
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/include")
+set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/")
 copy(boost_lib
   SRCS ${BOOST_INCLUDE_DIR}/boost
   DSTS ${dst_dir}

From a77d1bc65e4eed9f8076df5d68513e0857e8acd2 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 15 May 2018 19:54:30 +0800
Subject: [PATCH 06/12] Add debug code

---
 paddle/fluid/framework/details/fetch_op_handle.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index b1c9dd0d15223..4a8f201108f78 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -53,6 +53,7 @@ void FetchOpHandle::RunImpl() {
   platform::CPUPlace cpu;
   auto &scopes = *local_scopes_;
 
+  PADDLE_ENFORCE_EQ(inputs_.size(), scopes.size());
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
     auto *var =

From 5895989a4f331ead9be667d0d7108be49d830920 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 15 May 2018 20:14:51 +0800
Subject: [PATCH 07/12] Add ref

---
 paddle/fluid/framework/details/fetch_op_handle.cc | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index 4a8f201108f78..c581149a27263 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -44,6 +44,12 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
   data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
 }
 
+template <typename T, typename... ARGS>
+inline T &Ref(T *ptr, ARGS &&... args) {
+  PADDLE_ENFORCE(ptr != nullptr, args...);
+  return *ptr;
+}
+
 void FetchOpHandle::RunImpl() {
   WaitInputVarGenerated(platform::CPUPlace());
 
@@ -56,8 +62,11 @@ void FetchOpHandle::RunImpl() {
   PADDLE_ENFORCE_EQ(inputs_.size(), scopes.size());
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto *var =
-        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
+    auto *var = Ref(Ref(scope->FindVar(kLocalExecScopeName), "Cannot find %s",
+                        kLocalExecScopeName)
+                        .Get<Scope *>(),
+                    "Cannot get scope")
+                    .FindVar(var_name);
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
                             var_name);
     auto &t = var->Get<framework::LoDTensor>();

From c8f3ed23002e5f40c26ec6a685884a950a8e83b0 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 15 May 2018 20:27:39 +0800
Subject: [PATCH 08/12] Skip buggy test

---
 python/paddle/fluid/tests/unittests/test_parallel_executor.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor.py b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
index cd95ee47fdee1..6dc016487fd81 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor.py
@@ -849,7 +849,8 @@ def parallel_exe(self, train_inputs, seed):
                     assert not math.isnan(np.sum(ret[i])) and \
                            not math.isinf(np.sum(ret[i]))
 
-    def test_update_sparse_parameter(self):
+    @unittest.skip("this test is buggy")
+    def test_feed(self):
         tst_reader = paddle.batch(flowers.test(use_xmap=False), batch_size=16)
         tst_reader_iter = tst_reader()
 

From 999d0fdbef0c18024c89c9a5eee309177dc4e160 Mon Sep 17 00:00:00 2001
From: yuyang18 <reyoung@126.com>
Date: Tue, 15 May 2018 20:31:48 +0800
Subject: [PATCH 09/12] By default is all reduce

---
 paddle/fluid/framework/details/build_strategy.h   |  2 +-
 paddle/fluid/framework/details/fetch_op_handle.cc | 14 ++------------
 2 files changed, 3 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h
index d6f9c547d8ab0..91bdfe6134ffb 100644
--- a/paddle/fluid/framework/details/build_strategy.h
+++ b/paddle/fluid/framework/details/build_strategy.h
@@ -27,7 +27,7 @@ struct BuildStrategy {
     kCustomized = 2,
   };
 
-  ReduceStrategy reduce_{ReduceStrategy::kReduce};
+  ReduceStrategy reduce_{ReduceStrategy::kAllReduce};
   GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice};
 };
 
diff --git a/paddle/fluid/framework/details/fetch_op_handle.cc b/paddle/fluid/framework/details/fetch_op_handle.cc
index c581149a27263..b1c9dd0d15223 100644
--- a/paddle/fluid/framework/details/fetch_op_handle.cc
+++ b/paddle/fluid/framework/details/fetch_op_handle.cc
@@ -44,12 +44,6 @@ void FetchOpHandle::WaitAndMergeCPUTensors() const {
   data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
 }
 
-template <typename T, typename... ARGS>
-inline T &Ref(T *ptr, ARGS &&... args) {
-  PADDLE_ENFORCE(ptr != nullptr, args...);
-  return *ptr;
-}
-
 void FetchOpHandle::RunImpl() {
   WaitInputVarGenerated(platform::CPUPlace());
 
@@ -59,14 +53,10 @@ void FetchOpHandle::RunImpl() {
   platform::CPUPlace cpu;
   auto &scopes = *local_scopes_;
 
-  PADDLE_ENFORCE_EQ(inputs_.size(), scopes.size());
   for (size_t i = 0; i < scopes.size(); ++i) {
     auto &scope = scopes[i];
-    auto *var = Ref(Ref(scope->FindVar(kLocalExecScopeName), "Cannot find %s",
-                        kLocalExecScopeName)
-                        .Get<Scope *>(),
-                    "Cannot get scope")
-                    .FindVar(var_name);
+    auto *var =
+        scope->FindVar(kLocalExecScopeName)->Get<Scope *>()->FindVar(var_name);
     PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable %s in execution scope",
                             var_name);
     auto &t = var->Get<framework::LoDTensor>();

From 6af0593c6a0602ee8b277bdcab98a6f8d6499467 Mon Sep 17 00:00:00 2001
From: Siddharth Goyal <vi.siddharth78@gmail.com>
Date: Tue, 15 May 2018 15:31:36 -0700
Subject: [PATCH 10/12] Add FP16 option to load_combine op (#10601)

---
 paddle/fluid/operators/load_combine_op.cc     | 36 +++++---
 .../operators/save_load_combine_op_test.cc    | 90 ++++++++++++++++++-
 2 files changed, 113 insertions(+), 13 deletions(-)

diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc
index b5522dd246f25..0522a94195786 100644
--- a/paddle/fluid/operators/load_combine_op.cc
+++ b/paddle/fluid/operators/load_combine_op.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
-
+#include "paddle/fluid/framework/data_type_transform.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/device_context.h"
 
@@ -31,6 +31,7 @@ class LoadCombineOp : public framework::OperatorBase {
   void RunImpl(const framework::Scope &scope,
                const platform::Place &place) const override {
     auto filename = Attr<std::string>("file_path");
+    auto load_as_fp16 = Attr<bool>("load_as_fp16");
 
     std::ifstream fin(filename);
     PADDLE_ENFORCE(static_cast<bool>(fin),
@@ -59,17 +60,25 @@ class LoadCombineOp : public framework::OperatorBase {
       // Get data from fin to tensor
       DeserializeFromStream(fin, tensor, dev_ctx);
 
-      if (platform::is_gpu_place(place)) {
-        // copy CPU to GPU
-        framework::LoDTensor cpu_tensor;
-        cpu_tensor.ShareDataWith(*tensor);
-        cpu_tensor.set_lod(tensor->lod());
-
-        // reset tensor
+      auto in_dtype = framework::ToDataType(tensor->type());
+      auto out_dtype =
+          load_as_fp16 ? framework::proto::VarType::FP16 : in_dtype;
+
+      if (in_dtype != out_dtype) {
+        // convert to float16 tensor
+        auto in_kernel_type = framework::OpKernelType(in_dtype, place);
+        auto out_kernel_type = framework::OpKernelType(out_dtype, place);
+        framework::LoDTensor fp16_tensor;
+        // copy LoD info to the new tensor
+        fp16_tensor.set_lod(tensor->lod());
+        framework::TransDataType(in_kernel_type, out_kernel_type, *tensor,
+                                 &fp16_tensor);
+
+        // reset output tensor
         out_var->Clear();
         tensor = out_var->GetMutable<framework::LoDTensor>();
-        tensor->set_lod(cpu_tensor.lod());
-        TensorCopy(cpu_tensor, place, dev_ctx, tensor);
+        tensor->set_lod(fp16_tensor.lod());
+        tensor->ShareDataWith(fp16_tensor);
       }
     }
   }
@@ -82,6 +91,13 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker {
         "Out",
         "(vector) The output LoDTensors that will be read from the input file.")
         .AsDuplicable();
+    AddAttr<bool>(
+        "load_as_fp16",
+        "(boolean, default false)"
+        "If true, the tensor will be first loaded and then "
+        "converted to float16 data type. Otherwise, the tensor will be "
+        "directly loaded without data type conversion.")
+        .SetDefault(false);
     AddAttr<std::string>("file_path",
                          "(string) "
                          "LoDTensors will be loaded from \"file_path\".")
diff --git a/paddle/fluid/operators/save_load_combine_op_test.cc b/paddle/fluid/operators/save_load_combine_op_test.cc
index 47618c51d98eb..4743e0d9499b1 100644
--- a/paddle/fluid/operators/save_load_combine_op_test.cc
+++ b/paddle/fluid/operators/save_load_combine_op_test.cc
@@ -139,8 +139,9 @@ TEST(SaveLoadCombineOp, CPU) {
   CheckValues<int, int>(expect4, actual4, expect_lod4, actual_lod4, numel4);
 }
 
-// FP16 version of SaveLoadCombineOp Test
-TEST(SaveLoadCombineFP16Op, CPU) {
+// FP16 version of SaveLoadCombineOp Test, only altering the saving aspect
+// to save as FP16.
+TEST(SaveCombineFP16Op, CPU) {
   paddle::framework::Scope scope;
   paddle::platform::CPUPlace place;
 
@@ -169,7 +170,7 @@ TEST(SaveLoadCombineFP16Op, CPU) {
       20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
 
   // Set attributes
-  std::string filename = "check_tensor_fp16.ls";
+  std::string filename = "check_tensor_fp16_save.ls";
   paddle::framework::AttributeMap attrs;
   attrs.insert({"file_path", std::string(filename)});
   attrs.insert({"save_as_fp16", true});
@@ -216,6 +217,89 @@ TEST(SaveLoadCombineFP16Op, CPU) {
                                                 actual_lod4, numel4);
 }
 
+// FP16 version of SaveLoadCombineOp Test, only altering the loading aspect
+// to load tensors with FP16 precision.
+TEST(LoadCombineFP16Op, CPU) {
+  paddle::framework::Scope scope;
+  paddle::platform::CPUPlace place;
+
+  std::vector<int> lod1 = {0, 1, 2, 3, 10};
+  int numel1 = 100;
+  paddle::framework::LoD expect_lod1;
+  float* expect1 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 10, lod1, "test_var1", place, &scope, &expect_lod1);
+
+  std::vector<int> lod2 = {0, 2, 5, 10};
+  int numel2 = 200;
+  paddle::framework::LoD expect_lod2;
+  float* expect2 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      10, 20, lod2, "test_var2", place, &scope, &expect_lod2);
+
+  std::vector<int> lod3 = {0, 20};
+  int numel3 = 4000;
+  paddle::framework::LoD expect_lod3;
+  float* expect3 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 200, lod3, "test_var3", place, &scope, &expect_lod3);
+
+  std::vector<int> lod4 = {0, 1, 20};
+  int numel4 = 1000;
+  paddle::framework::LoD expect_lod4;
+  float* expect4 = CreateForSaveCombineOp<float, paddle::platform::float16>(
+      20, 50, lod4, "test_var4", place, &scope, &expect_lod4);
+
+  // Set attributes
+  std::string filename = "check_tensor_fp16_load.ls";
+  paddle::framework::AttributeMap attrs;
+  attrs.insert({"file_path", std::string(filename)});
+
+  // Run the save_combine_op
+  auto save_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "save_combine",
+      {{"X", {"test_var1", "test_var2", "test_var3", "test_var4"}}}, {}, attrs);
+  save_combine_op->Run(scope, place);
+
+  // Set up output vars
+  auto load_var1 = scope.Var("out_var1");
+  auto load_var2 = scope.Var("out_var2");
+  auto load_var3 = scope.Var("out_var3");
+  auto load_var4 = scope.Var("out_var4");
+
+  attrs.insert({"load_as_fp16", true});
+  // Run the load_combine_op
+  auto load_combine_op = paddle::framework::OpRegistry::CreateOp(
+      "load_combine", {},
+      {{"Out", {"out_var1", "out_var2", "out_var3", "out_var4"}}}, attrs);
+  load_combine_op->Run(scope, place);
+
+  auto* target1 = load_var1->GetMutable<paddle::framework::LoDTensor>();
+  auto* target2 = load_var2->GetMutable<paddle::framework::LoDTensor>();
+  auto* target3 = load_var3->GetMutable<paddle::framework::LoDTensor>();
+  auto* target4 = load_var4->GetMutable<paddle::framework::LoDTensor>();
+
+  paddle::framework::LoD actual_lod1, actual_lod2, actual_lod3, actual_lod4;
+  paddle::platform::float16* actual1 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target1, scope,
+                                                             &actual_lod1);
+  paddle::platform::float16* actual2 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target2, scope,
+                                                             &actual_lod2);
+  paddle::platform::float16* actual3 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target3, scope,
+                                                             &actual_lod3);
+  paddle::platform::float16* actual4 =
+      GetValuesAfterLoadCombineOp<paddle::platform::float16>(target4, scope,
+                                                             &actual_lod4);
+
+  CheckValues<float, paddle::platform::float16>(expect1, actual1, expect_lod1,
+                                                actual_lod1, numel1);
+  CheckValues<float, paddle::platform::float16>(expect2, actual2, expect_lod2,
+                                                actual_lod2, numel2);
+  CheckValues<float, paddle::platform::float16>(expect3, actual3, expect_lod3,
+                                                actual_lod3, numel3);
+  CheckValues<float, paddle::platform::float16>(expect4, actual4, expect_lod4,
+                                                actual_lod4, numel4);
+}
+
 // Test with original SaveLoadTest
 TEST(SaveLoadTestWithCombineOp, CPU) {
   paddle::framework::Scope scope;

From 74ca73b80d29870a2931d853cc26c6465102808d Mon Sep 17 00:00:00 2001
From: daminglu <daming-lu@users.noreply.github.com>
Date: Tue, 15 May 2018 17:18:40 -0700
Subject: [PATCH 11/12] Update trainer api (#10674)

---
 python/paddle/fluid/inferencer.py             | 20 +++--
 .../fit_a_line/test_fit_a_line.py             | 20 ++---
 .../test_recognize_digits_conv.py             | 73 +++++++++----------
 .../test_recognize_digits_mlp.py              | 72 +++++++++---------
 .../word2vec/no_test_word2vec_new_api.py      | 20 +++--
 python/paddle/fluid/trainer.py                | 17 +----
 6 files changed, 103 insertions(+), 119 deletions(-)

diff --git a/python/paddle/fluid/inferencer.py b/python/paddle/fluid/inferencer.py
index 1b8b9c07622dc..56c008d1af70f 100644
--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -13,29 +13,35 @@
 # limitations under the License.
 
 import core
-import framework
+
 import executor
+import framework
 import io
+import unique_name
 from trainer import check_and_get_place
 
 __all__ = ['Inferencer', ]
 
 
 class Inferencer(object):
-    def __init__(self, param_path, place=None):
+    def __init__(self, infer_func, param_path, place=None):
         """
-        :param param_path: the path where the inference model is saved by fluid.io.save_inference_model
+        :param infer_func: a function that will return predict Variable
+        :param param_path: the path where the inference model is saved by fluid.io.save_params
         :param place: place to do the inference
         """
         self.param_path = param_path
         self.scope = core.Scope()
 
+        self.inference_program = framework.Program()
+        with framework.program_guard(self.inference_program):
+            with unique_name.guard():
+                self.predict_var = infer_func()
+
         self.exe = executor.Executor(check_and_get_place(place))
         with executor.scope_guard(self.scope):
             # load params from param_path into scope
-            [self.inference_program, _,
-             self.fetch_targets] = io.load_inference_model(
-                 executor=self.exe, dirname=param_path)
+            io.load_params(self.exe, param_path, self.inference_program)
 
     def infer(self, inputs, return_numpy=True):
         """
@@ -51,7 +57,7 @@ def infer(self, inputs, return_numpy=True):
         with executor.scope_guard(self.scope):
             results = self.exe.run(self.inference_program,
                                    feed=inputs,
-                                   fetch_list=self.fetch_targets,
+                                   fetch_list=[self.predict_var],
                                    return_numpy=return_numpy)
 
         return results
diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
index 8c9bbb52d7692..fbcf2a282f642 100644
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
@@ -48,12 +48,11 @@ def linear():
     return avg_loss
 
 
-def train(use_cuda, save_dirname):
+def train(use_cuda, train_program, save_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
     trainer = fluid.Trainer(
-        train_func=linear,
-        infer_func=inference_program,
+        train_func=train_program,
         place=place,
         optimizer=fluid.optimizer.SGD(learning_rate=0.001))
 
@@ -72,11 +71,7 @@ def event_handler(event):
             '''
             if float(test_metrics[0]) < 20.0:
                 if save_dirname is not None:
-                    # NOT clear yet
-                    # fluid.io.save_inference_model(save_dirname, ['x'], [y_predict])
-                    # trainer.save_params(save_dirname)
-                    # https://github.com/PaddlePaddle/Paddle/pull/10445
-                    trainer.save_inference_model(save_dirname)
+                    trainer.save_params(save_dirname)
             return
 
     trainer.train(
@@ -87,12 +82,13 @@ def event_handler(event):
 
 
 # infer
-def infer(use_cuda, save_dirname=None):
+def infer(use_cuda, inference_program, save_dirname=None):
     if save_dirname is None:
         return
 
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(param_path=save_dirname, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=save_dirname, place=place)
 
     batch_size = 10
     tensor_x = numpy.random.uniform(0, 10, [batch_size, 13]).astype("float32")
@@ -108,8 +104,8 @@ def main(use_cuda):
     # Directory for saving the trained model
     save_dirname = "fit_a_line.inference.model"
 
-    train(use_cuda, save_dirname)
-    infer(use_cuda, save_dirname)
+    train(use_cuda, linear, save_dirname)
+    infer(use_cuda, inference_program, save_dirname)
 
 
 class TestFitALine(unittest.TestCase):
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
index 1f91f471f22f7..420e6e6e42adc 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
@@ -53,48 +53,40 @@ def train_program():
     predict = inference_program()
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(cost)
-    # acc = fluid.layers.accuracy(input=predict, label=label)
-    # return avg_cost, acc
-    return avg_cost
+    acc = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, acc]
 
 
-def train(use_cuda, save_dirname):
+def train(use_cuda, train_program, save_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        infer_func=inference_program,
-        place=place,
-        optimizer=optimizer)
+        train_func=train_program, place=place, optimizer=optimizer)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
-            # if (event.epoch + 1) % 10 == 0:
-            # trainer.save_params(save_dirname)
-            trainer.save_inference_model(save_dirname)
-
-            # TODO: Uncomment this part once we are sure that .train is working
-            # test_reader = paddle.batch(
-            #     paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-            # test_metrics = trainer.test(reader=test_reader)
-            # avg_cost_set = test_metrics[0]
-            # acc_set = test_metrics[1]
-            #
-            # # get test acc and loss
-            # acc = numpy.array(acc_set).mean()
-            # avg_cost = numpy.array(avg_cost_set).mean()
-            #
-            # print("avg_cost: %s" % avg_cost)
-            # print("acc     : %s" % acc)
-            #
-            # if float(acc) > 0.2:  # Smaller value to increase CI speed
-            #     trainer.save_params(save_dirname)
-            # else:
-            #     print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-            #         event.epoch + 1, float(avg_cost), float(acc)))
-            #     if math.isnan(float(avg_cost)):
-            #         sys.exit("got NaN loss, training failed.")
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+            test_metrics = trainer.test(
+                reader=test_reader, feed_order=['img', 'label'])
+            avg_cost_set = test_metrics[0]
+            acc_set = test_metrics[1]
+
+            # get test acc and loss
+            acc = numpy.array(acc_set).mean()
+            avg_cost = numpy.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if float(acc) > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(save_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, float(avg_cost), float(acc)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
@@ -108,10 +100,11 @@ def event_handler(event):
         feed_order=['img', 'label'])
 
 
-def infer(use_cuda, save_dirname=None):
+def infer(use_cuda, inference_program, save_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
-    inferencer = fluid.Inferencer(param_path=save_dirname, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=save_dirname, place=place)
 
     batch_size = 1
     tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -126,8 +119,14 @@ def main(use_cuda):
     save_dirname = "recognize_digits_conv.inference.model"
 
     # call train() with is_local argument to run distributed train
-    train(use_cuda=use_cuda, save_dirname=save_dirname)
-    infer(use_cuda=use_cuda, save_dirname=save_dirname)
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        save_dirname=save_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        save_dirname=save_dirname)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
index f072d70abdba5..9427a772f54fb 100644
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
@@ -40,47 +40,40 @@ def train_program():
     predict = inference_program()
     cost = fluid.layers.cross_entropy(input=predict, label=label)
     avg_cost = fluid.layers.mean(cost)
-    # acc = fluid.layers.accuracy(input=predict, label=label)
-    # return avg_cost, acc
-    return avg_cost
+    acc = fluid.layers.accuracy(input=predict, label=label)
+    return [avg_cost, acc]
 
 
-def train(use_cuda, save_dirname):
+def train(use_cuda, train_program, save_dirname):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
     optimizer = fluid.optimizer.Adam(learning_rate=0.001)
 
     trainer = fluid.Trainer(
-        train_func=train_program,
-        infer_func=inference_program,
-        place=place,
-        optimizer=optimizer)
+        train_func=train_program, place=place, optimizer=optimizer)
 
     def event_handler(event):
         if isinstance(event, fluid.EndEpochEvent):
-            # if (event.epoch + 1) % 10 == 0:
-            trainer.save_inference_model(save_dirname)
-
-            # TODO: Uncomment this part once we are sure that .train is working
-            # test_reader = paddle.batch(
-            #     paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
-            # test_metrics = trainer.test(reader=test_reader)
-            # avg_cost_set = test_metrics[0]
-            # acc_set = test_metrics[1]
-            #
-            # # get test acc and loss
-            # acc = numpy.array(acc_set).mean()
-            # avg_cost = numpy.array(avg_cost_set).mean()
-            #
-            # print("avg_cost: %s" % avg_cost)
-            # print("acc     : %s" % acc)
-            #
-            # if float(acc) > 0.2:  # Smaller value to increase CI speed
-            #     trainer.save_params(save_dirname)
-            # else:
-            #     print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
-            #         event.epoch + 1, float(avg_cost), float(acc)))
-            #     if math.isnan(float(avg_cost)):
-            #         sys.exit("got NaN loss, training failed.")
+            test_reader = paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=BATCH_SIZE)
+            test_metrics = trainer.test(
+                reader=test_reader, feed_order=['img', 'label'])
+            avg_cost_set = test_metrics[0]
+            acc_set = test_metrics[1]
+
+            # get test acc and loss
+            acc = numpy.array(acc_set).mean()
+            avg_cost = numpy.array(avg_cost_set).mean()
+
+            print("avg_cost: %s" % avg_cost)
+            print("acc     : %s" % acc)
+
+            if float(acc) > 0.2:  # Smaller value to increase CI speed
+                trainer.save_params(save_dirname)
+            else:
+                print('BatchID {0}, Test Loss {1:0.2}, Acc {2:0.2}'.format(
+                    event.epoch + 1, float(avg_cost), float(acc)))
+                if math.isnan(float(avg_cost)):
+                    sys.exit("got NaN loss, training failed.")
 
     train_reader = paddle.batch(
         paddle.reader.shuffle(
@@ -94,10 +87,11 @@ def event_handler(event):
         feed_order=['img', 'label'])
 
 
-def infer(use_cuda, save_dirname=None):
+def infer(use_cuda, inference_program, save_dirname=None):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
 
-    inferencer = fluid.Inferencer(param_path=save_dirname, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=save_dirname, place=place)
 
     batch_size = 1
     tensor_img = numpy.random.uniform(-1.0, 1.0,
@@ -112,8 +106,14 @@ def main(use_cuda):
     save_dirname = "recognize_digits_mlp.inference.model"
 
     # call train() with is_local argument to run distributed train
-    train(use_cuda=use_cuda, save_dirname=save_dirname)
-    infer(use_cuda=use_cuda, save_dirname=save_dirname)
+    train(
+        use_cuda=use_cuda,
+        train_program=train_program,
+        save_dirname=save_dirname)
+    infer(
+        use_cuda=use_cuda,
+        inference_program=inference_program,
+        save_dirname=save_dirname)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py
index 00ba4acf88b1b..4f861e5aaeca7 100644
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/no_test_word2vec_new_api.py
@@ -90,7 +90,7 @@ def train_program(is_sparse):
     return avg_cost
 
 
-def train(use_cuda, is_sparse, save_path):
+def train(use_cuda, train_program, save_path):
     train_reader = paddle.batch(
         paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)
     test_reader = paddle.batch(
@@ -105,23 +105,21 @@ def event_handler(event):
             print("loss= ", avg_cost)
 
             if avg_cost < 5.0:
-                trainer.save_inference_model(save_path)
+                trainer.save_params(save_path)
                 return
             if math.isnan(avg_cost):
                 sys.exit("got NaN loss, training failed.")
 
     trainer = fluid.Trainer(
-        partial(train_program, is_sparse),
-        partial(inference_program, is_sparse),
-        fluid.optimizer.SGD(learning_rate=0.001),
-        place=place)
+        train_program, fluid.optimizer.SGD(learning_rate=0.001), place=place)
     trainer.train(
         reader=train_reader, num_epochs=1, event_handler=event_handler)
 
 
-def infer(use_cuda, is_sparse, save_path):
+def infer(use_cuda, inference_program, save_path):
     place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
-    inferencer = fluid.Inferencer(param_path=save_path, place=place)
+    inferencer = fluid.Inferencer(
+        infer_func=inference_program, param_path=save_path, place=place)
 
     lod = [0, 1]
     first_word = create_random_lodtensor(lod, place, low=0, high=dict_size - 1)
@@ -144,9 +142,9 @@ def main(use_cuda, is_sparse):
     if use_cuda and not fluid.core.is_compiled_with_cuda():
         return
 
-    save_path = "word2vec.inference.model"
-    train(use_cuda, is_sparse, save_path)
-    infer(use_cuda, is_sparse, save_path)
+    save_path = "word2vec.params"
+    train(use_cuda, partial(train_program, is_sparse), save_path)
+    infer(use_cuda, partial(inference_program, is_sparse), save_path)
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 67d8be82d5fa8..2f1e70724fbdb 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -92,19 +92,13 @@ class Trainer(object):
         place: The device place of this trainer.
     """
 
-    def __init__(self,
-                 train_func,
-                 infer_func,
-                 optimizer,
-                 param_path=None,
-                 place=None):
+    def __init__(self, train_func, optimizer, param_path=None, place=None):
         # 1. we need to generate a framework.Program by calling
         # program_func. Reference: fluid.program_guard in
         # test_word2vec.py
         if not isinstance(optimizer, opt_module.Optimizer):
             raise TypeError("The optimizer should be an instance of Optimizer")
 
-        self.infer_func = infer_func
         self.scope = core.Scope()
 
         self.startup_program = framework.Program()
@@ -226,15 +220,6 @@ def save_params(self, param_path):
             exe = executor.Executor(self.place)
             io.save_persistables(exe, dirname=param_path)
 
-    def save_inference_model(self, model_path):
-        inference_program = framework.Program()
-        with framework.program_guard(inference_program):
-            with unique_name.guard():
-                predict_var = self.infer_func()
-        predict_var = self.train_program.block(0).var(predict_var.name)
-        exe = executor.Executor(self.place)
-        io.save_inference_model(model_path, [], [predict_var], exe)
-
     @contextlib.contextmanager
     def _prog_and_scope_guard(self):
         with framework.program_guard(

From 1c4bb5c83d872ab878e21e27c627f718c6f779cd Mon Sep 17 00:00:00 2001
From: Qiao Longfei <qiaolongfei@baidu.com>
Date: Wed, 16 May 2018 11:15:45 +0800
Subject: [PATCH 12/12] user need to set feed order for Trainer.train and
 Trainer.test (#10679)

---
 python/paddle/fluid/trainer.py | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/python/paddle/fluid/trainer.py b/python/paddle/fluid/trainer.py
index 2f1e70724fbdb..c24662ac2114c 100644
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
@@ -172,9 +172,9 @@ def _dist_transpile_if_necessary(self, optimize_ops, params_grads):
     def train(self,
               num_epochs,
               event_handler,
-              reader=None,
-              parallel=False,
-              feed_order=None):
+              reader,
+              feed_order,
+              parallel=False):
         """
         Train the model.
 
@@ -202,7 +202,7 @@ def train(self,
 
         self._train_by_executor(num_epochs, event_handler, reader, feed_order)
 
-    def test(self, reader, feed_order=None):
+    def test(self, reader, feed_order):
         """
         Test the model on given test data
 
@@ -276,12 +276,7 @@ def build_feed_var_list(program, feed_order):
     if not isinstance(program, framework.Program):
         raise TypeError("The 'program' should be an object of Program")
 
-    if feed_order is None:
-        feed_var_list = [
-            var for var in program.global_block().vars.itervalues()
-            if var.is_data
-        ]
-    elif isinstance(feed_order, list):
+    if isinstance(feed_order, list):
         feed_var_list = [
             program.global_block().var(var_name) for var_name in feed_order
         ]