Merge NVIDIA's NCCL multi-GPU, switch it to python

BVLC · Aug 8, 2016 · a155592 · a155592
1 parent 375003a
commit a155592
Show file tree

Hide file tree

Showing 29 changed files with 854 additions and 40 deletions.
diff --git a/.gitignore b/.gitignore
@@ -46,6 +46,7 @@
 
 # PyCharm files
 .idea
+*.iml
 
 # OSX dir files
 .DS_Store

diff --git a/Makefile b/Makefile
@@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1)
 	COMMON_FLAGS += -DUSE_CUDNN
 endif
 
+# NCCL acceleration configuration
+ifeq ($(USE_NCCL), 1)
+	LIBRARIES += nccl
+	COMMON_FLAGS += -DUSE_NCCL
+endif
+
 # configure IO libraries
 ifeq ($(USE_OPENCV), 1)
 	COMMON_FLAGS += -DUSE_OPENCV

diff --git a/Makefile.config.example b/Makefile.config.example
@@ -4,6 +4,10 @@
 # cuDNN acceleration switch (uncomment to build with cuDNN).
 # USE_CUDNN := 1
 
+# NCCL acceleration switch (uncomment to build with NCCL)
+# See https://github.com/NVIDIA/nccl
+# USE_NCCL := 1
+
 # CPU-only switch (uncomment to build without GPU support).
 # CPU_ONLY := 1
 

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
@@ -220,6 +220,7 @@ class Blob {
   void set_cpu_data(Dtype* data);
   const int* gpu_shape() const;
   const Dtype* gpu_data() const;
+  void set_gpu_data(Dtype* data);
   const Dtype* cpu_diff() const;
   const Dtype* gpu_diff() const;
   Dtype* mutable_cpu_data();

diff --git a/include/caffe/common.hpp b/include/caffe/common.hpp
@@ -161,6 +161,8 @@ class Caffe {
   // Parallel training info
   inline static int solver_count() { return Get().solver_count_; }
   inline static void set_solver_count(int val) { Get().solver_count_ = val; }
+  inline static int solver_rank() { return Get().solver_rank_; }
+  inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
   inline static bool root_solver() { return Get().root_solver_; }
   inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
 
@@ -173,6 +175,7 @@ class Caffe {
 
   Brew mode_;
   int solver_count_;
+  int solver_rank_;
   bool root_solver_;
 
  private:

diff --git a/include/caffe/data_transformer.hpp b/include/caffe/data_transformer.hpp
@@ -23,7 +23,7 @@ class DataTransformer {
    * @brief Initialize the Random number generations if needed by the
    *    transformation.
    */
-  void InitRand();
+  void InitRand(unsigned int seed);
 
   /**
    * @brief Applies the transformation defined in the data layer's

diff --git a/include/caffe/internal_thread.hpp b/include/caffe/internal_thread.hpp
@@ -42,8 +42,8 @@ class InternalThread {
   bool must_stop();
 
  private:
-  void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count,
-      bool root_solver);
+  void entry(int device, Caffe::Brew mode, int rand_seed,
+      int solver_count, int solver_rank, bool root_solver);
 
   shared_ptr<boost::thread> thread_;
 };

diff --git a/include/caffe/layers/python_layer.hpp b/include/caffe/layers/python_layer.hpp
@@ -20,10 +20,10 @@ class PythonLayer : public Layer<Dtype> {
       const vector<Blob<Dtype>*>& top) {
     // Disallow PythonLayer in MultiGPU training stage, due to GIL issues
     // Details: https://github.com/BVLC/caffe/issues/2936
-    if (this->phase_ == TRAIN && Caffe::solver_count() > 1
-        && !ShareInParallel()) {
-      LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training";
-    }
+//    if (this->phase_ == TRAIN && Caffe::solver_count() > 1
+//        && !ShareInParallel()) {
+//      LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training";
+//    }
     self_.attr("param_str") = bp::str(
         this->layer_param_.python_param().param_str());
     self_.attr("phase") = static_cast<int>(this->phase_);

diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
@@ -228,6 +228,31 @@ class Net {
   static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
       const string& layer_name);
 
+  // Hook invoked at specific points during an iteration (see add_* below).
+  class Callback {
+   protected:
+    virtual void run(int layer) = 0;
+
+    template <typename T>
+    friend class Net;  // lets any Net<T> call the protected run()
+  };
+  const vector<Callback*>& before_forward() const { return before_forward_; }
+  void add_before_forward(Callback* value) {
+    before_forward_.push_back(value);  // appended; raw pointer stored as-is
+  }
+  const vector<Callback*>& after_forward() const { return after_forward_; }
+  void add_after_forward(Callback* value) {
+    after_forward_.push_back(value);
+  }
+  const vector<Callback*>& before_backward() const { return before_backward_; }
+  void add_before_backward(Callback* value) {
+    before_backward_.push_back(value);
+  }
+  const vector<Callback*>& after_backward() const { return after_backward_; }
+  void add_after_backward(Callback* value) {
+    after_backward_.push_back(value);
+  }
+
+
  protected:
   // Helpers for Init.
   /// @brief Append a new top blob to the net.
@@ -308,7 +333,13 @@ class Net {
   bool debug_info_;
   /// The root net that actually holds the shared layers in data parallelism
   const Net* const root_net_;
-  DISABLE_COPY_AND_ASSIGN(Net);
+  // Callbacks
+  vector<Callback*> before_forward_;
+  vector<Callback*> after_forward_;
+  vector<Callback*> before_backward_;
+  vector<Callback*> after_backward_;
+
+DISABLE_COPY_AND_ASSIGN(Net);
 };
 
 

diff --git a/include/caffe/parallel.hpp b/include/caffe/parallel.hpp
@@ -3,6 +3,7 @@
 
 #include <boost/date_time/posix_time/posix_time.hpp>
 
+#include <string>
 #include <vector>
 
 #include "caffe/blob.hpp"
@@ -14,6 +15,10 @@
 #include "caffe/syncedmem.hpp"
 #include "caffe/util/blocking_queue.hpp"
 
+#ifdef USE_NCCL
+#include "caffe/util/nccl.hpp"
+#endif
+
 namespace caffe {
 
 // Represents a net parameters. Once a net is created, its parameter buffers can
@@ -80,6 +85,50 @@ class DevicePair {
   int device_;
 };
 
+template<typename Dtype>
+class NCCL : public GPUParams<Dtype>,
+             public Solver<Dtype>::Callback,
+             public Net<Dtype>::Callback {
+ public:
+  /**
+   * In multi-process settings, first create an NCCL id (new_uid), then
+   * pass it to each process to create connected instances.
+   */
+  NCCL(shared_ptr<Solver<Dtype> > solver, const string& uid = "");
+  ~NCCL();
+
+  /**
+   * In single-process settings, create instances without uids, then
+   * pass them all to this call.
+   */
+  static void init_single_process(vector<NCCL<Dtype>*>* nccls);
+
+  static string new_uid();  // id to hand to each process (see constructor)
+
+  /**
+   * Broadcast weights from rank 0 to other solvers.
+   */
+  void bcast();
+
+ protected:
+  void on_start() {}  // intentionally empty: nothing to do here
+  void on_gradients_ready();
+  void run(int layer);  // implements Net<Dtype>::Callback::run
+
+#ifdef USE_NCCL  // these members are compiled only in USE_NCCL builds
+  ncclComm_t comm_;
+  cudaStream_t stream_;
+  vector<cudaEvent_t> layer_events_;
+  cudaEvent_t solver_event_;
+#endif
+
+  shared_ptr<Solver<Dtype> > solver_;
+  bool layer_wise_;
+  using Params<Dtype>::size_;  // bring dependent-base members into scope
+  using Params<Dtype>::data_;
+  using Params<Dtype>::diff_;
+};
+
 // Synchronous data parallelism using map-reduce between local GPUs.
 template<typename Dtype>
 class P2PSync : public GPUParams<Dtype>, public Solver<Dtype>::Callback,

diff --git a/include/caffe/util/math_functions.hpp b/include/caffe/util/math_functions.hpp
@@ -185,6 +185,11 @@ void caffe_gpu_add_scalar(const int N, const Dtype alpha, Dtype *X);
 template <typename Dtype>
 void caffe_gpu_scal(const int N, const Dtype alpha, Dtype *X);
 
+#ifndef CPU_ONLY
+template <typename Dtype>
+void caffe_gpu_scal(const int N, const Dtype alpha, Dtype* X, cudaStream_t str);
+#endif
+
 template <typename Dtype>
 void caffe_gpu_add(const int N, const Dtype* a, const Dtype* b, Dtype* y);
 

diff --git a/include/caffe/util/nccl.hpp b/include/caffe/util/nccl.hpp
@@ -0,0 +1,37 @@
+#ifndef CAFFE_UTIL_NCCL_H_
+#define CAFFE_UTIL_NCCL_H_
+#ifdef USE_NCCL  // header contributes nothing unless built with USE_NCCL
+
+#include <nccl.h>
+
+#include "caffe/common.hpp"
+
+#define NCCL_CHECK(condition) \
+{ \
+  ncclResult_t result = condition; \
+  CHECK_EQ(result, ncclSuccess) << " " \
+    << ncclGetErrorString(result); \
+}
+
+namespace caffe {
+
+namespace nccl {
+
+template <typename Dtype> class dataType;  // only specializations defined
+
+template<> class dataType<float> {  // float -> ncclFloat
+ public:
+  static const ncclDataType_t type = ncclFloat;
+};
+template<> class dataType<double> {  // double -> ncclDouble
+ public:
+  static const ncclDataType_t type = ncclDouble;
+};
+
+}  // namespace nccl
+
+}  // namespace caffe
+
+#endif  // USE_NCCL
+
+#endif  // CAFFE_UTIL_NCCL_H_
diff --git a/python/caffe/__init__.py b/python/caffe/__init__.py
@@ -1,8 +1,9 @@
-from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver
-from ._caffe import set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed
+from .pycaffe import Net, SGDSolver, NesterovSolver, AdaGradSolver, RMSPropSolver, AdaDeltaSolver, AdamSolver, DataTransformer, Blob, NCCL, Timer
+from ._caffe import init_log, log, set_mode_cpu, set_mode_gpu, set_device, Layer, get_solver, layer_type_list, set_random_seed, get_random, solver_count, set_solver_count, solver_rank, set_solver_rank, Layer, get_solver, layer_type_list
 from ._caffe import __version__
 from .proto.caffe_pb2 import TRAIN, TEST
 from .classifier import Classifier
 from .detector import Detector
 from . import io
 from .net_spec import layers, params, NetSpec, to_proto
+from .train import train