
Commit

Merge pull request BVLC#132 from yjxiong/mem
Update Dec 2016
yjxiong authored Dec 26, 2016
2 parents 36c6aa4 + 03f54c5 commit b065585
Showing 42 changed files with 26,823 additions and 217 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
## Dec 25, 2016

- Added support for dilated convolution (see the output-size sketch below).
- Introduced memory optimization, which reduces memory usage during both training and testing. [Wiki](https://github.com/yjxiong/caffe/wiki/Memory-Optimization)
- `BatchReductionLayer` now supports reduction along an arbitrary axis, with a CUDA implementation.
- Other small fixes.

## Apr 27, 2016

Features:
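For reference, dilated convolution enlarges the effective kernel without adding parameters. The sketch below shows only the standard output-size arithmetic; it is an illustration, not code from this commit, and the function name is made up.

```cpp
#include <cstdio>

// Standard output-size arithmetic for a dilated convolution (illustrative).
int ConvOutputSize(int input, int kernel, int pad, int stride, int dilation) {
  // A kernel of size k with dilation d covers d * (k - 1) + 1 input positions.
  const int effective_kernel = dilation * (kernel - 1) + 1;
  return (input + 2 * pad - effective_kernel) / stride + 1;
}

int main() {
  // A 3x3 kernel with dilation 2 behaves like a 5x5 kernel: with pad 2 and
  // stride 1 the spatial size is preserved.
  std::printf("%d\n", ConvOutputSize(224, 3, 2, 1, 2));  // prints 224
  return 0;
}
```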
9 changes: 8 additions & 1 deletion README.md
@@ -14,6 +14,8 @@ This branch hosts the code for the technical report ["Towards Good Practices for


### Updates
- Dec 2016
* Major updates of the codebase. New features include memory optimization and dilated convolution.
- Aug 23, 2016
 * [Temporal Segment Networks](https://github.com/yjxiong/temporal-segment-networks), a new state-of-the-art action recognition framework, is open-sourced.
- Aug 1, 2016
@@ -36,7 +38,8 @@ This branch hosts the code for the technical report ["Towards Good Practices for
- Training on optical flow data.
- Data augmentation with fixed corner cropping and multi-scale cropping.
- Parallel training with multiple GPUs.
- cuDNNv5 integration.
- Integration with the latest cuDNN.
- Slim memory footprint in both training and testing.

### Usage

@@ -69,6 +72,10 @@ make && make install
mpirun -np 4 ./install/bin/caffe train --solver=<Your Solver File> [--weights=<Pretrained caffemodel>]
```
**Note**: the actual batch size will be `num_device` times the `batch_size` specified in the network's prototxt (e.g., `batch_size: 32` with `-np 4` gives an effective batch size of 128).
- Runtime memory optimization
- Memory optimization drastically reduces memory usage (roughly half for training and almost all for testing) by safely sharing the underlying storage of a series of blobs.
- For usage and the mechanism behind the scenes, see the [Wiki Page](https://github.com/yjxiong/caffe/wiki/Memory-Optimization).

### Working Examples
- Temporal Segment Networks: Towards Good Practices for Deep Action Recognition
2 changes: 1 addition & 1 deletion cmake/Dependencies.cmake
@@ -26,7 +26,7 @@ include(cmake/ProtoBuf.cmake)
# ---[ HDF5
find_package(HDF5 COMPONENTS HL REQUIRED)
include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})

# ---[ LMDB
find_package(LMDB REQUIRED)
@@ -0,0 +1,16 @@
% Copy the VOC segmentation masks into a separate label folder. For the
% palette-indexed PNGs, imread returns the palette indices (i.e. the class
% labels), which imwrite then stores as single-channel label PNGs.
clear all; close all;

source_folder = '../../../data/VOC_arg/SegmentationClass';
target_folder = '../../../data/VOC_arg/SegmentationClass_label';

imgs_dir = dir(fullfile(source_folder, '*.png'));

if ~exist(target_folder, 'dir')
    mkdir(target_folder);
end

for i = 1 : length(imgs_dir)
    fprintf('processing %d/%d\n', i, length(imgs_dir));
    img = imread(fullfile(source_folder, imgs_dir(i).name));
    imwrite(img, fullfile(target_folder, imgs_dir(i).name));
end
10,582 changes: 10,582 additions & 0 deletions examples/semantic_segmentation/dataset_file_examples/train.txt

Large diffs are not rendered by default.

12,031 changes: 12,031 additions & 0 deletions examples/semantic_segmentation/dataset_file_examples/train_val.txt

Large diffs are not rendered by default.

1,449 changes: 1,449 additions & 0 deletions examples/semantic_segmentation/dataset_file_examples/val.txt

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions examples/semantic_segmentation/train.sh
@@ -0,0 +1,7 @@
#!/usr/bin/env sh

GOOGLE_LOG_DIR=models/semantic_segmentation/log \
mpirun -np 8 \
cmake_build/install/bin/caffe train \
--solver=models/semantic_segmentation/fcn_vgg_16_solver.prototxt \
--weights=VGG_ILSVRC_16_layers_conv.caffemodel
4 changes: 4 additions & 0 deletions include/caffe/blob.hpp
@@ -262,6 +262,10 @@ class Blob {
* shared_ptr calls its destructor when reset with the "=" operator.
*/
void ShareDiff(const Blob& other);
bool IsSharingDiff(const Blob* other);

void SetDiffStorage(shared_ptr<SyncedMemory>& storage);
void SetDataStorage(shared_ptr<SyncedMemory>& storage);

bool ShapeEquals(const BlobProto& other);

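A minimal sketch of how the new storage hooks could be used: two blobs of equal size are pointed at one `SyncedMemory` buffer, which is the basic operation the memory optimizer builds on. This is hypothetical usage, not code from this commit.

```cpp
#include "caffe/blob.hpp"
#include "caffe/syncedmem.hpp"

using namespace caffe;

void ShareOneBuffer() {
  Blob<float> a(1, 3, 8, 8);
  Blob<float> b(1, 3, 8, 8);

  // One storage region large enough for either blob's data.
  shared_ptr<SyncedMemory> storage(
      new SyncedMemory(a.count() * sizeof(float)));

  // Back both blobs with the same underlying memory; writes through one
  // become visible through the other, so only one copy of the data is kept.
  a.SetDataStorage(storage);
  b.SetDataStorage(storage);
}
```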
6 changes: 6 additions & 0 deletions include/caffe/common.hpp
@@ -189,6 +189,12 @@ class Caffe {
inline static int device_id(){return Get().device_id_;}
inline static int remaining_sub_iter(){return Get().remaining_sub_iter_;}
inline static void set_remaining_sub_iter(int n){Get().remaining_sub_iter_ = n;}

// Helper for splitting MPI_Comm to speed up distributed training.
inline static void MPI_split_comm(const int color, const int key) {
MPI_Comm intra_comm;
MPI_Comm_split(MPI_COMM_WORLD, color, key, &intra_comm);
}
#endif

#ifdef WITH_PYTHON_LAYER
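For context, `MPI_Comm_split` groups ranks by a color value and orders them within each group by a key; the helper above wraps exactly this call. A standalone sketch follows (the group size of 4 is an arbitrary assumption):

```cpp
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const int ranks_per_group = 4;              // assumed group size
  const int color = rank / ranks_per_group;   // equal color -> same communicator
  const int key = rank % ranks_per_group;     // ordering inside that communicator

  MPI_Comm intra_comm;
  MPI_Comm_split(MPI_COMM_WORLD, color, key, &intra_comm);

  // ... use intra_comm for intra-group collectives ...

  MPI_Comm_free(&intra_comm);
  MPI_Finalize();
  return 0;
}
```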
56 changes: 55 additions & 1 deletion include/caffe/common_layers.hpp
@@ -180,6 +180,7 @@ class EltwiseLayer : public Layer<Dtype> {
EltwiseParameter_EltwiseOp op_;
vector<Dtype> coeffs_;
Blob<int> max_idx_;
Blob<Dtype> rng_buffer_;

bool stable_prod_grad_;
};
@@ -269,6 +270,13 @@ class FlattenLayer : public Layer<Dtype> {
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int ExactNumTopBlobs() const { return 1; }

virtual inline bool is_sharing_data(int top_id, int bottom_id){
return top_id == bottom_id;
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){
return top_id == bottom_id;
}

protected:
/**
* @param bottom input Blob vector (length 2+)
@@ -292,6 +300,7 @@ class FlattenLayer : public Layer<Dtype> {
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

};

/**
@@ -385,6 +394,13 @@ class ReshapeLayer : public Layer<Dtype> {
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int ExactNumTopBlobs() const { return 1; }

virtual inline bool is_sharing_data(int top_id, int bottom_id) {
return top_id == bottom_id;
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id) {
return top_id == bottom_id;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {}
@@ -560,6 +576,10 @@ class SplitLayer : public Layer<Dtype> {
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int MinTopBlobs() const { return 1; }

virtual inline bool is_sharing_data(int top_id, int bottom_id) {
return true;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -633,6 +653,21 @@ class SliceLayer : public Layer<Dtype> {
virtual inline bool EqualNumBottomTopBlobs() const { return true; }
virtual inline bool is_gathering() {return true;}

virtual inline bool is_sharing_data(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -667,6 +702,21 @@ class SliceLayer : public Layer<Dtype> {

virtual inline bool EqualNumBottomTopBlobs() const { return true; }

virtual inline bool is_sharing_data(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -734,7 +784,7 @@ class BNLayer : public Layer<Dtype> {
};


#if defined(USE_CUDNN)
#if defined(USE_CUDNN)
#if CUDNN_VERSION_MIN(5, 0, 0)
/**
* @brief cuDNN implementation of BNLayer.
@@ -937,10 +987,14 @@ class BatchReductionLayer : public Layer<Dtype> {
int num_;
/// @brief the step of reduction
int step_;
/// @brief whether to perform position-sensitive learning
bool pos_;
/// @brief a helper Blob used for transferring ticks to GPU
Blob<Dtype> ticks_blob_;
vector<int> levels_;
vector<int> ticks_;

Blob<Dtype> argsort_idx_;
};

} // namespace caffe
42 changes: 42 additions & 0 deletions include/caffe/data_layers.hpp
@@ -355,6 +355,48 @@ class VideoDataLayer : public BasePrefetchingDataLayer<Dtype> {
};


/**
 * @brief Provides image/label pairs to the Net for semantic segmentation.
*
* TODO(dox): thorough documentation for Forward and proto params.
*/
template <typename Dtype>
class SegDataLayer : public BasePrefetchingDataLayer<Dtype> {
public:
explicit SegDataLayer(const LayerParameter& param)
: BasePrefetchingDataLayer<Dtype>(param) {}
virtual ~SegDataLayer();
virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "SegData"; }
virtual inline int ExactNumBottomBlobs() const { return 0; }
virtual inline int ExactNumTopBlobs() const { return 2; }

protected:
shared_ptr<Caffe::RNG> prefetch_rng_;
virtual void ShuffleImages();
virtual void InternalThreadEntry();

#ifdef USE_MPI
inline virtual void advance_cursor(){
lines_id_++;
if (lines_id_ >= lines_.size()) {
// We have reached the end. Restart from the first.
DLOG(INFO) << "Restarting data prefetching from start.";
lines_id_ = 0;
if (this->layer_param_.seg_data_param().shuffle()) {
ShuffleImages();
}
}
}
#endif

vector<std::pair<std::string, std::string> > lines_;
int lines_id_;
string name_pattern_;
};

/**
* @brief Provides data to the Net from memory.
*
8 changes: 8 additions & 0 deletions include/caffe/data_transformer.hpp
@@ -37,6 +37,11 @@ class DataTransformer {
*/
void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);



/**
 * @brief Applies the transformation to a paired data Datum and label Datum
 * (e.g. an image and its per-pixel label map).
 */
void Transform(const Datum& datum_data, const Datum& datum_label,
Blob<Dtype>* transformed_data, Blob<Dtype>* transformed_label);

/**
* @brief Applies the transformation defined in the data layer's
* transform_param block to a vector of Datum.
Expand Down Expand Up @@ -123,6 +128,7 @@ class DataTransformer {
*/
vector<int> InferBlobShape(const cv::Mat& cv_img);


protected:
/**
* @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
@@ -146,6 +152,8 @@

vector<float> custom_scale_ratios_;
int max_distort_;

bool org_size_proc_;
};

} // namespace caffe
6 changes: 6 additions & 0 deletions include/caffe/layer.hpp
@@ -308,6 +308,12 @@ class Layer {
inline void set_need_sync(bool val){need_sync_ = val;}
#endif

/**
 * @brief Express whether this layer shares the data/diff storage between the bottom and top blobs at the given indices.
*/
virtual inline bool is_sharing_data(int top_id, int bottom_id){return false;}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){return false;}


protected:
/** The protobuf that stores the layer parameters */
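To illustrate the new hooks, here is a hypothetical pass-through layer (not part of this commit) that declares, in the same way as `FlattenLayer` and `ReshapeLayer` above, that each top blob may alias the data and diff of the bottom blob with the same index:

```cpp
#include "caffe/layer.hpp"

namespace caffe {

// Hypothetical example layer; only the memory-sharing hooks matter here.
template <typename Dtype>
class MyPassThroughLayer : public Layer<Dtype> {
 public:
  explicit MyPassThroughLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual inline const char* type() const { return "MyPassThrough"; }

  // Tell the memory optimizer that top[i] can reuse bottom[i]'s buffers.
  virtual inline bool is_sharing_data(int top_id, int bottom_id) {
    return top_id == bottom_id;
  }
  virtual inline bool is_sharing_diff(int top_id, int bottom_id) {
    return top_id == bottom_id;
  }

 protected:
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    top[0]->ReshapeLike(*bottom[0]);
  }
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {}
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) {}
};

}  // namespace caffe
```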
9 changes: 9 additions & 0 deletions include/caffe/net.hpp
@@ -220,6 +220,10 @@ class Net {
/// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
void GetLearningRateAndWeightDecay();

/// @brief Do a dry run to determine blob dependencies for memory sharing.
void MemoryOptimize();
void MemoryOptimize_v2();

/// @brief The network name
string name_;
/// @brief The phase: TRAIN or TEST
Expand Down Expand Up @@ -268,6 +272,11 @@ class Net {
/// Whether to compute and display debug info for the net.
bool debug_info_;

/// Memory optimization related members.
bool optimize_memory_;
vector< shared_ptr<SyncedMemory> > shared_storage_;
std::set<string> excluded_blob_names_;

DISABLE_COPY_AND_ASSIGN(Net);
};

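`MemoryOptimize()` is described only as a dry run that works out blob dependencies. For intuition, below is a generic reference-counting buffer-reuse scheme of the kind such a dry run typically implements; it is a simplified illustration with made-up types (`LayerIO`, `AssignStorageSlots`), not the implementation added in this commit, which also has to respect the `is_sharing_*` hooks and `excluded_blob_names_`.

```cpp
#include <map>
#include <string>
#include <vector>

// One entry per layer, in topological (forward) order.
struct LayerIO {
  std::vector<std::string> bottoms;  // blobs the layer reads
  std::vector<std::string> tops;     // blobs the layer writes
};

// Assign every blob a storage slot; a slot is recycled once the last
// consumer of its blob has run, so later blobs can reuse the same memory.
std::map<std::string, int> AssignStorageSlots(const std::vector<LayerIO>& net) {
  std::map<std::string, int> readers;  // how many layers still read each blob
  for (const LayerIO& l : net)
    for (const std::string& b : l.bottoms) ++readers[b];

  std::map<std::string, int> slot_of;  // blob name -> slot id
  std::vector<int> free_slots;
  int next_slot = 0;

  for (const LayerIO& l : net) {
    // 1. Give each top blob a slot, reusing a freed one when available.
    for (const std::string& t : l.tops) {
      if (slot_of.count(t)) continue;  // already placed (in-place blob)
      if (!free_slots.empty()) {
        slot_of[t] = free_slots.back();
        free_slots.pop_back();
      } else {
        slot_of[t] = next_slot++;
      }
    }
    // 2. Release the slots of bottom blobs that no later layer reads.
    for (const std::string& b : l.bottoms) {
      if (--readers[b] == 0 && slot_of.count(b))
        free_slots.push_back(slot_of[b]);
    }
  }
  return slot_of;  // blobs mapped to the same slot share one buffer
}
```

Blobs that end up in the same slot would then be backed by one shared buffer, presumably what the `shared_storage_` vector of `SyncedMemory` above holds.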
1 change: 1 addition & 0 deletions include/caffe/syncedmem.hpp
@@ -56,6 +56,7 @@ class SyncedMemory {
SyncedHead head() { return head_; }
size_t size() { return size_; }

void Resize(size_t new_size);
private:
void to_cpu();
void to_gpu();