
Commit

Merge pull request BVLC#132 from yjxiong/mem
Update Dec 2016
yjxiong authored Dec 26, 2016
2 parents 36c6aa4 + 03f54c5 commit b065585
Showing 42 changed files with 26,823 additions and 217 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
## Dec 25, 2016

- Added support for dilated convolution (see the output-size sketch below).
- Introduced memory optimization, which reduces memory usage during both training and testing. [Wiki](https://github.com/yjxiong/caffe/wiki/Memory-Optimization)
- `BatchReductionLayer` now supports reduction along an arbitrary axis, with a CUDA implementation.
- Other small fixes.

## Apr 27, 2016

Features:
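For reference, dilated convolution enlarges the effective kernel without adding parameters. The sketch below shows only the standard output-size arithmetic; it is an illustration, not code from this commit, and the function name is made up.

```cpp
#include <cstdio>

// Standard output-size arithmetic for a dilated convolution (illustrative).
int ConvOutputSize(int input, int kernel, int pad, int stride, int dilation) {
  // A kernel of size k with dilation d covers d * (k - 1) + 1 input positions.
  const int effective_kernel = dilation * (kernel - 1) + 1;
  return (input + 2 * pad - effective_kernel) / stride + 1;
}

int main() {
  // A 3x3 kernel with dilation 2 behaves like a 5x5 kernel: with pad 2 and
  // stride 1 the spatial size is preserved.
  std::printf("%d\n", ConvOutputSize(224, 3, 2, 1, 2));  // prints 224
  return 0;
}
```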
9 changes: 8 additions & 1 deletion README.md
@@ -14,6 +14,8 @@ This branch hosts the code for the technical report ["Towards Good Practices for


### Updates
- Dec 2016
* Major updates of the codebase. New features include memory optimization and dilated convolution.
- Aug 23, 2016
 * [Temporal Segment Networks](https://github.com/yjxiong/temporal-segment-networks), a new state-of-the-art action recognition framework, is open-sourced.
- Aug 1, 2016
@@ -36,7 +38,8 @@ This branch hosts the code for the technical report ["Towards Good Practices for
- Training on optical flow data.
- Data augmentation with fixed corner cropping and multi-scale cropping.
- Parallel training with multiple GPUs.
- cuDNNv5 integration.
- Integration with the latest cuDNN.
- Slim memory footprint in both training and testing.

### Usage

@@ -69,6 +72,10 @@ make && make install
mpirun -np 4 ./install/bin/caffe train --solver=<Your Solver File> [--weights=<Pretrained caffemodel>]
```
**Note**: the actual batch size will be `num_device` times the `batch_size` specified in the network's prototxt (e.g., `batch_size: 32` with `-np 4` gives an effective batch size of 128).
- Runtime memory optimization
- Memory optimization drastically reduces memory usage (roughly half for training and almost all for testing) by safely sharing the underlying storage of a series of blobs.
- For usage and the mechanism behind the scenes, see the [Wiki Page](https://github.com/yjxiong/caffe/wiki/Memory-Optimization).

### Working Examples
- Temporal Segment Networks: Towards Good Practices for Deep Action Recognition
2 changes: 1 addition & 1 deletion cmake/Dependencies.cmake
@@ -26,7 +26,7 @@ include(cmake/ProtoBuf.cmake)
# ---[ HDF5
find_package(HDF5 COMPONENTS HL REQUIRED)
include_directories(SYSTEM ${HDF5_INCLUDE_DIRS} ${HDF5_HL_INCLUDE_DIR})
list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES})
list(APPEND Caffe_LINKER_LIBS ${HDF5_LIBRARIES} ${HDF5_HL_LIBRARIES})

# ---[ LMDB
find_package(LMDB REQUIRED)
@@ -0,0 +1,16 @@
% Copy the VOC segmentation masks into a separate label folder. For the
% palette-indexed PNGs, imread returns the palette indices (i.e. the class
% labels), which imwrite then stores as single-channel label PNGs.
clear all; close all;

source_folder = '../../../data/VOC_arg/SegmentationClass';
target_folder = '../../../data/VOC_arg/SegmentationClass_label';

imgs_dir = dir(fullfile(source_folder, '*.png'));

if ~exist(target_folder, 'dir')
    mkdir(target_folder);
end

for i = 1 : length(imgs_dir)
    fprintf('processing %d/%d\n', i, length(imgs_dir));
    img = imread(fullfile(source_folder, imgs_dir(i).name));
    imwrite(img, fullfile(target_folder, imgs_dir(i).name));
end
10,582 changes: 10,582 additions & 0 deletions examples/semantic_segmentation/dataset_file_examples/train.txt

Large diffs are not rendered by default.

12,031 changes: 12,031 additions & 0 deletions examples/semantic_segmentation/dataset_file_examples/train_val.txt

Large diffs are not rendered by default.

1,449 changes: 1,449 additions & 0 deletions examples/semantic_segmentation/dataset_file_examples/val.txt

Large diffs are not rendered by default.

7 changes: 7 additions & 0 deletions examples/semantic_segmentation/train.sh
@@ -0,0 +1,7 @@
#!/usr/bin/env sh

GOOGLE_LOG_DIR=models/semantic_segmentation/log \
mpirun -np 8 \
cmake_build/install/bin/caffe train \
--solver=models/semantic_segmentation/fcn_vgg_16_solver.prototxt \
--weights=VGG_ILSVRC_16_layers_conv.caffemodel
4 changes: 4 additions & 0 deletions include/caffe/blob.hpp
@@ -262,6 +262,10 @@ class Blob {
* shared_ptr calls its destructor when reset with the "=" operator.
*/
void ShareDiff(const Blob& other);
bool IsSharingDiff(const Blob* other);

void SetDiffStorage(shared_ptr<SyncedMemory>& storage);
void SetDataStorage(shared_ptr<SyncedMemory>& storage);

bool ShapeEquals(const BlobProto& other);

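A minimal sketch of how the new storage hooks could be used: two blobs of equal size are pointed at one `SyncedMemory` buffer, which is the basic operation the memory optimizer builds on. This is hypothetical usage, not code from this commit.

```cpp
#include "caffe/blob.hpp"
#include "caffe/syncedmem.hpp"

using namespace caffe;

void ShareOneBuffer() {
  Blob<float> a(1, 3, 8, 8);
  Blob<float> b(1, 3, 8, 8);

  // One storage region large enough for either blob's data.
  shared_ptr<SyncedMemory> storage(
      new SyncedMemory(a.count() * sizeof(float)));

  // Back both blobs with the same underlying memory; writes through one
  // become visible through the other, so only one copy of the data is kept.
  a.SetDataStorage(storage);
  b.SetDataStorage(storage);
}
```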
6 changes: 6 additions & 0 deletions include/caffe/common.hpp
@@ -189,6 +189,12 @@ class Caffe {
inline static int device_id(){return Get().device_id_;}
inline static int remaining_sub_iter(){return Get().remaining_sub_iter_;}
inline static void set_remaining_sub_iter(int n){Get().remaining_sub_iter_ = n;}

// Helper for splitting MPI_Comm to speed up distributed training.
inline static void MPI_split_comm(const int color, const int key) {
MPI_Comm intra_comm;
MPI_Comm_split(MPI_COMM_WORLD, color, key, &intra_comm);
}
#endif

#ifdef WITH_PYTHON_LAYER
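For context, `MPI_Comm_split` groups ranks by a color value and orders them within each group by a key; the helper above wraps exactly this call. A standalone sketch follows (the group size of 4 is an arbitrary assumption):

```cpp
#include <mpi.h>

int main(int argc, char** argv) {
  MPI_Init(&argc, &argv);
  int rank = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  const int ranks_per_group = 4;              // assumed group size
  const int color = rank / ranks_per_group;   // equal color -> same communicator
  const int key = rank % ranks_per_group;     // ordering inside that communicator

  MPI_Comm intra_comm;
  MPI_Comm_split(MPI_COMM_WORLD, color, key, &intra_comm);

  // ... use intra_comm for intra-group collectives ...

  MPI_Comm_free(&intra_comm);
  MPI_Finalize();
  return 0;
}
```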
56 changes: 55 additions & 1 deletion include/caffe/common_layers.hpp
@@ -180,6 +180,7 @@ class EltwiseLayer : public Layer<Dtype> {
EltwiseParameter_EltwiseOp op_;
vector<Dtype> coeffs_;
Blob<int> max_idx_;
Blob<Dtype> rng_buffer_;

bool stable_prod_grad_;
};
@@ -269,6 +270,13 @@ class FlattenLayer : public Layer<Dtype> {
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int ExactNumTopBlobs() const { return 1; }

virtual inline bool is_sharing_data(int top_id, int bottom_id){
return top_id == bottom_id;
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){
return top_id == bottom_id;
}

protected:
/**
* @param bottom input Blob vector (length 2+)
@@ -292,6 +300,7 @@ class FlattenLayer : public Layer<Dtype> {
*/
virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom);

};

/**
@@ -385,6 +394,13 @@ class ReshapeLayer : public Layer<Dtype> {
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int ExactNumTopBlobs() const { return 1; }

virtual inline bool is_sharing_data(int top_id, int bottom_id) {
return top_id == bottom_id;
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id) {
return top_id == bottom_id;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top) {}
@@ -560,6 +576,10 @@ class SplitLayer : public Layer<Dtype> {
virtual inline int ExactNumBottomBlobs() const { return 1; }
virtual inline int MinTopBlobs() const { return 1; }

virtual inline bool is_sharing_data(int top_id, int bottom_id) {
return true;
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -633,6 +653,21 @@ class SliceLayer : public Layer<Dtype> {
virtual inline bool EqualNumBottomTopBlobs() const { return true; }
virtual inline bool is_gathering() {return true;}

virtual inline bool is_sharing_data(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -667,6 +702,21 @@ class SliceLayer : public Layer<Dtype> {

virtual inline bool EqualNumBottomTopBlobs() const { return true; }

virtual inline bool is_sharing_data(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){
#ifndef USE_MPI
return top_id == bottom_id;
#else
return (top_id == bottom_id) && (Caffe::parallel_mode()!=Caffe::MPI);
#endif
}

protected:
virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);
@@ -734,7 +784,7 @@ class BNLayer : public Layer<Dtype> {
};


#if defined(USE_CUDNN)
#if defined(USE_CUDNN)
#if CUDNN_VERSION_MIN(5, 0, 0)
/**
* @brief cuDNN implementation of BNLayer.
@@ -937,10 +987,14 @@ class BatchReductionLayer : public Layer<Dtype> {
int num_;
/// @brief the step of reduction
int step_;
/// @brief whether to perform position-sensitive learning
bool pos_;
/// @brief a helper Blob used for transferring ticks to GPU
Blob<Dtype> ticks_blob_;
vector<int> levels_;
vector<int> ticks_;

Blob<Dtype> argsort_idx_;
};

} // namespace caffe
42 changes: 42 additions & 0 deletions include/caffe/data_layers.hpp
@@ -355,6 +355,48 @@ class VideoDataLayer : public BasePrefetchingDataLayer<Dtype> {
};


/**
 * @brief Provides image/label pairs to the Net for semantic segmentation.
*
* TODO(dox): thorough documentation for Forward and proto params.
*/
template <typename Dtype>
class SegDataLayer : public BasePrefetchingDataLayer<Dtype> {
public:
explicit SegDataLayer(const LayerParameter& param)
: BasePrefetchingDataLayer<Dtype>(param) {}
virtual ~SegDataLayer();
virtual void DataLayerSetUp(const vector<Blob<Dtype>*>& bottom,
const vector<Blob<Dtype>*>& top);

virtual inline const char* type() const { return "SegData"; }
virtual inline int ExactNumBottomBlobs() const { return 0; }
virtual inline int ExactNumTopBlobs() const { return 2; }

protected:
shared_ptr<Caffe::RNG> prefetch_rng_;
virtual void ShuffleImages();
virtual void InternalThreadEntry();

#ifdef USE_MPI
inline virtual void advance_cursor(){
lines_id_++;
if (lines_id_ >= lines_.size()) {
// We have reached the end. Restart from the first.
DLOG(INFO) << "Restarting data prefetching from start.";
lines_id_ = 0;
if (this->layer_param_.seg_data_param().shuffle()) {
ShuffleImages();
}
}
}
#endif

vector<std::pair<std::string, std::string> > lines_;
int lines_id_;
string name_pattern_;
};

/**
* @brief Provides data to the Net from memory.
*
8 changes: 8 additions & 0 deletions include/caffe/data_transformer.hpp
@@ -37,6 +37,11 @@ class DataTransformer {
*/
void Transform(const Datum& datum, Blob<Dtype>* transformed_blob);



/**
 * @brief Applies the transformation to a paired data Datum and label Datum
 * (e.g. an image and its per-pixel label map).
 */
void Transform(const Datum& datum_data, const Datum& datum_label,
Blob<Dtype>* transformed_data, Blob<Dtype>* transformed_label);

/**
* @brief Applies the transformation defined in the data layer's
* transform_param block to a vector of Datum.
Expand Down Expand Up @@ -123,6 +128,7 @@ class DataTransformer {
*/
vector<int> InferBlobShape(const cv::Mat& cv_img);


protected:
/**
* @brief Generates a random integer from Uniform({0, 1, ..., n-1}).
@@ -146,6 +152,8 @@

vector<float> custom_scale_ratios_;
int max_distort_;

bool org_size_proc_;
};

} // namespace caffe
6 changes: 6 additions & 0 deletions include/caffe/layer.hpp
@@ -308,6 +308,12 @@ class Layer {
inline void set_need_sync(bool val){need_sync_ = val;}
#endif

/**
 * @brief Express whether this layer shares the data/diff storage between the bottom and top blobs at the given indices.
*/
virtual inline bool is_sharing_data(int top_id, int bottom_id){return false;}
virtual inline bool is_sharing_diff(int top_id, int bottom_id){return false;}


protected:
/** The protobuf that stores the layer parameters */
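To illustrate the new hooks, here is a hypothetical pass-through layer (not part of this commit) that declares, in the same way as `FlattenLayer` and `ReshapeLayer` above, that each top blob may alias the data and diff of the bottom blob with the same index:

```cpp
#include "caffe/layer.hpp"

namespace caffe {

// Hypothetical example layer; only the memory-sharing hooks matter here.
template <typename Dtype>
class MyPassThroughLayer : public Layer<Dtype> {
 public:
  explicit MyPassThroughLayer(const LayerParameter& param)
      : Layer<Dtype>(param) {}
  virtual inline const char* type() const { return "MyPassThrough"; }

  // Tell the memory optimizer that top[i] can reuse bottom[i]'s buffers.
  virtual inline bool is_sharing_data(int top_id, int bottom_id) {
    return top_id == bottom_id;
  }
  virtual inline bool is_sharing_diff(int top_id, int bottom_id) {
    return top_id == bottom_id;
  }

 protected:
  virtual void Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
    top[0]->ReshapeLike(*bottom[0]);
  }
  virtual void Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {}
  virtual void Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down,
      const vector<Blob<Dtype>*>& bottom) {}
};

}  // namespace caffe
```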
9 changes: 9 additions & 0 deletions include/caffe/net.hpp
@@ -220,6 +220,10 @@ class Net {
/// @brief Get misc parameters, e.g. the LR multiplier and weight decay.
void GetLearningRateAndWeightDecay();

/// @brief Do a dry run to determine blob dependencies for memory sharing.
void MemoryOptimize();
void MemoryOptimize_v2();

/// @brief The network name
string name_;
/// @brief The phase: TRAIN or TEST
Expand Down Expand Up @@ -268,6 +272,11 @@ class Net {
/// Whether to compute and display debug info for the net.
bool debug_info_;

/// Memory optimization related members.
bool optimize_memory_;
vector< shared_ptr<SyncedMemory> > shared_storage_;
std::set<string> excluded_blob_names_;

DISABLE_COPY_AND_ASSIGN(Net);
};

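`MemoryOptimize()` is described only as a dry run that works out blob dependencies. For intuition, below is a generic reference-counting buffer-reuse scheme of the kind such a dry run typically implements; it is a simplified illustration with made-up types (`LayerIO`, `AssignStorageSlots`), not the implementation added in this commit, which also has to respect the `is_sharing_*` hooks and `excluded_blob_names_`.

```cpp
#include <map>
#include <string>
#include <vector>

// One entry per layer, in topological (forward) order.
struct LayerIO {
  std::vector<std::string> bottoms;  // blobs the layer reads
  std::vector<std::string> tops;     // blobs the layer writes
};

// Assign every blob a storage slot; a slot is recycled once the last
// consumer of its blob has run, so later blobs can reuse the same memory.
std::map<std::string, int> AssignStorageSlots(const std::vector<LayerIO>& net) {
  std::map<std::string, int> readers;  // how many layers still read each blob
  for (const LayerIO& l : net)
    for (const std::string& b : l.bottoms) ++readers[b];

  std::map<std::string, int> slot_of;  // blob name -> slot id
  std::vector<int> free_slots;
  int next_slot = 0;

  for (const LayerIO& l : net) {
    // 1. Give each top blob a slot, reusing a freed one when available.
    for (const std::string& t : l.tops) {
      if (slot_of.count(t)) continue;  // already placed (in-place blob)
      if (!free_slots.empty()) {
        slot_of[t] = free_slots.back();
        free_slots.pop_back();
      } else {
        slot_of[t] = next_slot++;
      }
    }
    // 2. Release the slots of bottom blobs that no later layer reads.
    for (const std::string& b : l.bottoms) {
      if (--readers[b] == 0 && slot_of.count(b))
        free_slots.push_back(slot_of[b]);
    }
  }
  return slot_of;  // blobs mapped to the same slot share one buffer
}
```

Blobs that end up in the same slot would then be backed by one shared buffer, presumably what the `shared_storage_` vector of `SyncedMemory` above holds.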
1 change: 1 addition & 0 deletions include/caffe/syncedmem.hpp
@@ -56,6 +56,7 @@ class SyncedMemory {
SyncedHead head() { return head_; }
size_t size() { return size_; }

void Resize(size_t new_size);
private:
void to_cpu();
void to_gpu();