Skip to content

Commit

Permalink
Port multi-GPU to NCCL, add python support
Browse files Browse the repository at this point in the history
  • Loading branch information
cypof committed Sep 9, 2016
1 parent b2982c7 commit be16a01
Show file tree
Hide file tree
Showing 42 changed files with 766 additions and 798 deletions.
22 changes: 20 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -328,6 +328,12 @@ ifeq ($(USE_CUDNN), 1)
COMMON_FLAGS += -DUSE_CUDNN
endif

# NCCL acceleration configuration
# When USE_NCCL := 1 is set in Makefile.config, link against the NCCL
# library and expose the USE_NCCL preprocessor define to the C++ sources
# so multi-GPU code paths are compiled in.
ifeq ($(USE_NCCL), 1)
LIBRARIES += nccl
COMMON_FLAGS += -DUSE_NCCL
endif

# configure IO libraries
ifeq ($(USE_OPENCV), 1)
COMMON_FLAGS += -DUSE_OPENCV
Expand Down Expand Up @@ -446,9 +452,20 @@ endif
py mat py$(PROJECT) mat$(PROJECT) proto runtest \
superclean supercleanlist supercleanfiles warn everything

# Build-configuration sanity checks.
# NCCL and cuDNN are GPU-only features, so they are incompatible with a
# CPU_ONLY build; the $(error ...) in the rule line aborts make with a
# clear message when such a combination is configured.
ifeq ($(CPU_ONLY), 1)
ifeq ($(USE_NCCL), 1)
checks: $(error Cannot define USE_NCCL with CPU_ONLY)
endif
ifeq ($(USE_CUDNN), 1)
checks: $(error Cannot define USE_CUDNN with CPU_ONLY)
endif
endif
# No-op `checks` target for valid configurations (prerequisite of lib/py).
checks:

all: lib tools examples

lib: $(STATIC_NAME) $(DYNAMIC_NAME)
lib: checks \
$(STATIC_NAME) $(DYNAMIC_NAME)

everything: $(EVERYTHING_TARGETS)

Expand Down Expand Up @@ -495,7 +512,8 @@ examples: $(EXAMPLE_BINS)

py$(PROJECT): py

py: $(PY$(PROJECT)_SO) $(PROTO_GEN_PY)
py: checks \
$(PY$(PROJECT)_SO) $(PROTO_GEN_PY)

$(PY$(PROJECT)_SO): $(PY$(PROJECT)_SRC) $(PY$(PROJECT)_HXX) | $(DYNAMIC_NAME)
@ echo CXX/LD -o $@ $<
Expand Down
10 changes: 10 additions & 0 deletions Makefile.config.example
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,16 @@ LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib
# INCLUDE_DIRS += $(shell brew --prefix)/include
# LIBRARY_DIRS += $(shell brew --prefix)/lib

# NCCL acceleration switch (uncomment to build with NCCL)
# E.g. setup:
# cd
# git clone https://github.com/NVIDIA/nccl
# cd nccl
# make -j
# USE_NCCL := 1
# INCLUDE_DIRS += $(HOME)/nccl/src
# LIBRARY_DIRS += $(HOME)/nccl/build/lib

# Uncomment to use `pkg-config` to specify OpenCV library paths.
# (Usually not necessary -- OpenCV libraries are normally installed in one of the above $LIBRARY_DIRS.)
# USE_PKG_CONFIG := 1
Expand Down
1 change: 1 addition & 0 deletions include/caffe/blob.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,7 @@ class Blob {
void set_cpu_data(Dtype* data);
const int* gpu_shape() const;
const Dtype* gpu_data() const;
void set_gpu_data(Dtype* data);
const Dtype* cpu_diff() const;
const Dtype* gpu_diff() const;
Dtype* mutable_cpu_data();
Expand Down
14 changes: 10 additions & 4 deletions include/caffe/common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,11 +158,14 @@ class Caffe {
// Search from start_id to the highest possible device ordinal,
// return the ordinal of the first available device.
static int FindDevice(const int start_id = 0);
// Parallel training info
// Parallel training
inline static int solver_count() { return Get().solver_count_; }
inline static void set_solver_count(int val) { Get().solver_count_ = val; }
inline static bool root_solver() { return Get().root_solver_; }
inline static void set_root_solver(bool val) { Get().root_solver_ = val; }
// Rank of this solver among all parallel solvers; rank 0 is the root.
inline static int solver_rank() { return Get().solver_rank_; }
inline static void set_solver_rank(int val) { Get().solver_rank_ = val; }
// NOTE(review): presumably true when each solver runs in its own OS
// process (python launcher) rather than a thread — confirm against callers.
inline static bool multi_process() { return Get().multi_process_; }
inline static void set_multi_process(bool val) { Get().multi_process_ = val; }
// The root solver is defined as the solver with rank 0.
inline static bool root_solver() { return Get().solver_rank_ == 0; }

protected:
#ifndef CPU_ONLY
Expand All @@ -172,8 +175,11 @@ class Caffe {
shared_ptr<RNG> random_generator_;

Brew mode_;

// Parallel training
int solver_count_;
bool root_solver_;
int solver_rank_;
bool multi_process_;

private:
// The private constructor to avoid duplicate instantiation.
Expand Down
82 changes: 0 additions & 82 deletions include/caffe/data_reader.hpp

This file was deleted.

4 changes: 2 additions & 2 deletions include/caffe/internal_thread.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,8 @@ class InternalThread {
bool must_stop();

private:
void entry(int device, Caffe::Brew mode, int rand_seed, int solver_count,
bool root_solver);
void entry(int device, Caffe::Brew mode, int rand_seed,
int solver_count, int solver_rank, bool multi_process);

shared_ptr<boost::thread> thread_;
};
Expand Down
43 changes: 1 addition & 42 deletions include/caffe/layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class Layer {
* layer.
*/
explicit Layer(const LayerParameter& param)
: layer_param_(param), is_shared_(false) {
: layer_param_(param) {
// Set phase and copy blobs (if there are any).
phase_ = param.phase();
if (layer_param_.blobs_size() > 0) {
Expand Down Expand Up @@ -66,7 +66,6 @@ class Layer {
*/
void SetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
InitMutex();
CheckBlobCounts(bottom, top);
LayerSetUp(bottom, top);
Reshape(bottom, top);
Expand All @@ -92,30 +91,6 @@ class Layer {
virtual void LayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {}

/**
* @brief Whether a layer should be shared by multiple nets during data
* parallelism. By default, all layers except for data layers should
* not be shared. data layers should be shared to ensure each worker
* solver access data sequentially during data parallelism.
*/
virtual inline bool ShareInParallel() const { return false; }

/** @brief Return whether this layer is actually shared by other nets.
* If ShareInParallel() is true and using more than one GPU and the
* net has TRAIN phase, then this function is expected return true.
*/
inline bool IsShared() const { return is_shared_; }

/** @brief Set whether this layer is actually shared by other nets
* If ShareInParallel() is true and using more than one GPU and the
* net has TRAIN phase, then is_shared should be set true.
*/
inline void SetShared(bool is_shared) {
CHECK(ShareInParallel() || !is_shared)
<< type() << "Layer does not support sharing.";
is_shared_ = is_shared;
}

/**
* @brief Adjust the shapes of top blobs and internal buffers to accommodate
* the shapes of the bottom blobs.
Expand Down Expand Up @@ -428,19 +403,6 @@ class Layer {
}

private:
/** Whether this layer is actually shared by other nets*/
bool is_shared_;

/** The mutex for sequential forward if this layer is shared */
shared_ptr<boost::mutex> forward_mutex_;

/** Initialize forward_mutex_ */
void InitMutex();
/** Lock forward_mutex_ if this layer is shared */
void Lock();
/** Unlock forward_mutex_ if this layer is shared */
void Unlock();

DISABLE_COPY_AND_ASSIGN(Layer);
}; // class Layer

Expand All @@ -450,8 +412,6 @@ class Layer {
template <typename Dtype>
inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {
// Lock during forward to ensure sequential forward
Lock();
Dtype loss = 0;
Reshape(bottom, top);
switch (Caffe::mode()) {
Expand Down Expand Up @@ -482,7 +442,6 @@ inline Dtype Layer<Dtype>::Forward(const vector<Blob<Dtype>*>& bottom,
default:
LOG(FATAL) << "Unknown caffe mode.";
}
Unlock();
return loss;
}

Expand Down
5 changes: 3 additions & 2 deletions include/caffe/layers/base_data_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,15 +68,16 @@ class BasePrefetchingDataLayer :
const vector<Blob<Dtype>*>& top);

// Prefetches batches (asynchronously if to GPU memory)
static const int PREFETCH_COUNT = 3;
static const int PREFETCH_COUNT = 4; // same as proto

protected:
virtual void InternalThreadEntry();
virtual void load_batch(Batch<Dtype>* batch) = 0;

Batch<Dtype> prefetch_[PREFETCH_COUNT];
vector<shared_ptr<Batch<Dtype> > > prefetch_;
BlockingQueue<Batch<Dtype>*> prefetch_free_;
BlockingQueue<Batch<Dtype>*> prefetch_full_;
Batch<Dtype>* prefetch_current_;

Blob<Dtype> transformed_data_;
};
Expand Down
7 changes: 5 additions & 2 deletions include/caffe/layers/data_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <vector>

#include "caffe/blob.hpp"
#include "caffe/data_reader.hpp"
#include "caffe/data_transformer.hpp"
#include "caffe/internal_thread.hpp"
#include "caffe/layer.hpp"
Expand All @@ -29,9 +28,13 @@ class DataLayer : public BasePrefetchingDataLayer<Dtype> {
virtual inline int MaxTopBlobs() const { return 2; }

protected:
void Next();
bool Skip();
virtual void load_batch(Batch<Dtype>* batch);

DataReader reader_;
shared_ptr<db::DB> db_;
shared_ptr<db::Cursor> cursor_;
uint64_t skip_counter_;
};

} // namespace caffe
Expand Down
4 changes: 2 additions & 2 deletions include/caffe/layers/python_layer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ class PythonLayer : public Layer<Dtype> {
// Disallow PythonLayer in MultiGPU training stage, due to GIL issues
// Details: https://github.com/BVLC/caffe/issues/2936
if (this->phase_ == TRAIN && Caffe::solver_count() > 1
&& !ShareInParallel()) {
LOG(FATAL) << "PythonLayer is not implemented in Multi-GPU training";
&& !Caffe::root_solver() && !Caffe::multi_process()) {
LOG(FATAL) << "PythonLayer does not support CLI Multi-GPU, use train.py";
}
self_.attr("param_str") = bp::str(
this->layer_param_.python_param().param_str());
Expand Down
40 changes: 34 additions & 6 deletions include/caffe/net.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,9 @@ namespace caffe {
template <typename Dtype>
class Net {
public:
explicit Net(const NetParameter& param, const Net* root_net = NULL);
explicit Net(const NetParameter& param);
explicit Net(const string& param_file, Phase phase,
const int level = 0, const vector<string>* stages = NULL,
const Net* root_net = NULL);
const int level = 0, const vector<string>* stages = NULL);
virtual ~Net() {}

/// @brief Initialize a network with a NetParameter.
Expand Down Expand Up @@ -228,6 +227,31 @@ class Net {
static bool StateMeetsRule(const NetState& state, const NetStateRule& rule,
const string& layer_name);

// Invoked at specific points during an iteration. Subclass this and
// register instances via the add_before_/add_after_ methods below to be
// notified around forward/backward passes.
class Callback {
protected:
// Hook invoked with the index of the layer being processed.
virtual void run(int layer) = 0;

// run() is protected; only Net may invoke it.
template <typename T>
friend class Net;
};
// Accessor and registration for the before-forward callback list.
// Note: registered callbacks are raw pointers; the caller retains
// ownership and must keep them alive for the lifetime of the Net.
const vector<Callback*>& before_forward() const { return before_forward_; }
void add_before_forward(Callback* value) {
before_forward_.push_back(value);
}
// Accessor and registration for the after-forward callback list.
const vector<Callback*>& after_forward() const { return after_forward_; }
void add_after_forward(Callback* value) {
after_forward_.push_back(value);
}
// Accessor and registration for the before-backward callback list.
const vector<Callback*>& before_backward() const { return before_backward_; }
void add_before_backward(Callback* value) {
before_backward_.push_back(value);
}
// Accessor and registration for the after-backward callback list.
const vector<Callback*>& after_backward() const { return after_backward_; }
void add_after_backward(Callback* value) {
after_backward_.push_back(value);
}

protected:
// Helpers for Init.
/// @brief Append a new top blob to the net.
Expand Down Expand Up @@ -306,9 +330,13 @@ class Net {
size_t memory_used_;
/// Whether to compute and display debug info for the net.
bool debug_info_;
/// The root net that actually holds the shared layers in data parallelism
const Net* const root_net_;
DISABLE_COPY_AND_ASSIGN(Net);
// Callbacks
vector<Callback*> before_forward_;
vector<Callback*> after_forward_;
vector<Callback*> before_backward_;
vector<Callback*> after_backward_;

DISABLE_COPY_AND_ASSIGN(Net);
};


Expand Down
Loading

0 comments on commit be16a01

Please sign in to comment.