From 729765ac122dbb1b0f4c48f51cbe33ba6274494c Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Mon, 11 Aug 2014 21:38:59 -0700
Subject: [PATCH 1/5] zero-init param diffs and accumulate gradients

(With layers whose backward accumulates gradients), this effectively
decouples the computational batch from the SGD minibatch. Each iteration
accumulates gradients over iter_size batches, then parameters are updated.
---
 src/caffe/proto/caffe.proto |  4 +++-
 src/caffe/solver.cpp        | 33 ++++++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 84b475ce3cd..13a77ae9665 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -75,7 +75,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: iter_size)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -128,6 +128,8 @@ message SolverParameter {
   // Display the loss averaged over the last average_loss iterations
   optional int32 average_loss = 33 [default = 1];
   optional int32 max_iter = 7; // the maximum number of iterations
+  // accumulate gradients over `iter_size` x `batch_size` instances
+  optional int32 iter_size = 36 [default = 1];
   optional string lr_policy = 8; // The learning rate decay policy.
   optional float gamma = 9; // The parameter to compute the learning rate.
   optional float power = 10; // The parameter to compute the learning rate.
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 8ed8aec2fc8..edc24cdf85c 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -168,6 +168,25 @@ void Solver<Dtype>::Step(int iters) {
   Dtype smoothed_loss = 0;
 
   for (; iter_ < stop_iter; ++iter_) {
+    // zero-init the params
+    for (int i = 0; i < net_->params().size(); ++i) {
+      shared_ptr<Blob<Dtype> > blob = net_->params()[i];
+      switch(Caffe::mode()) {
+      case Caffe::CPU:
+        caffe_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_cpu_diff());
+        break;
+      case Caffe::GPU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
+#else
+        NO_GPU;
+#endif
+        break;
+      }
+    }
+
     if (param_.test_interval() && iter_ % param_.test_interval() == 0
         && (iter_ > 0 || param_.test_initialization())) {
       TestAll();
@@ -175,7 +194,13 @@ void Solver<Dtype>::Step(int iters) {
 
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    // accumulate the loss and gradient
+    Dtype loss = 0;
+    for (int i = 0; i < param_.iter_size(); ++i) {
+      loss += net_->ForwardBackward(bottom_vec);
+    }
+    loss /= param_.iter_size();
+    // average the loss across iterations for smoothed reporting
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
@@ -477,7 +502,8 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_rate = rate * net_params_lr[param_id]
+          / this->param_.iter_size();
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
 
       if (local_decay) {
@@ -513,7 +539,8 @@
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_rate = rate * net_params_lr[param_id]
+          / this->param_.iter_size();
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
 
       if (local_decay) {

From fbc9dd2661d3c70dcce433e542db1be88b69711d Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Tue, 30 Dec 2014 22:52:07 -0800
Subject: [PATCH 2/5] zero-init param diffs in gradient checker

---
 include/caffe/test/test_gradient_check_util.hpp | 7 +++++--
 src/caffe/solver.cpp                            | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index 22937711b58..cc5dcbad0ee 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -80,11 +80,14 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
       CHECK_EQ(top_count, bottom[blob_id]->count());
     }
   }
-  // First, figure out what blobs we need to check against.
+  // First, figure out what blobs we need to check against, and zero init
+  // parameter blobs.
   vector<Blob<Dtype>*> blobs_to_check;
   vector<bool> propagate_down(bottom.size(), check_bottom < 0);
   for (int i = 0; i < layer->blobs().size(); ++i) {
-    blobs_to_check.push_back(layer->blobs()[i].get());
+    Blob<Dtype>* blob = layer->blobs()[i].get();
+    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
+    blobs_to_check.push_back(blob);
   }
   if (check_bottom < 0) {
     for (int i = 0; i < bottom.size(); ++i) {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index edc24cdf85c..80c11f32e2f 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -171,7 +171,7 @@ void Solver<Dtype>::Step(int iters) {
     // zero-init the params
     for (int i = 0; i < net_->params().size(); ++i) {
       shared_ptr<Blob<Dtype> > blob = net_->params()[i];
-      switch(Caffe::mode()) {
+      switch (Caffe::mode()) {
       case Caffe::CPU:
         caffe_set(blob->count(), static_cast<Dtype>(0),
             blob->mutable_cpu_diff());

From b0a2151111683664996c08422f2bd2c9d5408c8d Mon Sep 17 00:00:00 2001
From: Sergio
Date: Fri, 26 Sep 2014 23:03:26 -0700
Subject: [PATCH 3/5] accumulate gradients in inner product layer

---
 src/caffe/layers/inner_product_layer.cpp | 4 ++--
 src/caffe/layers/inner_product_layer.cu  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index b1ec6cb25c0..1a5bc46444a 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -81,13 +81,13 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->cpu_data();
     // Gradient with respect to weight
     caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff());
+        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->cpu_diff();
     // Gradient with respect to bias
     caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.cpu_data(), (Dtype)0.,
+        bias_multiplier_.cpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_cpu_diff());
   }
   if (propagate_down[0]) {
diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu
index a9e1784a205..dd90cac12a8 100644
--- a/src/caffe/layers/inner_product_layer.cu
+++ b/src/caffe/layers/inner_product_layer.cu
@@ -33,13 +33,13 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->gpu_data();
     // Gradient with respect to weight
     caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff());
+        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bias
     caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.gpu_data(), (Dtype)0.,
+        bias_multiplier_.gpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_gpu_diff());
   }
   if (propagate_down[0]) {

From dc1a37dd3ded2aceba9a792142df541dff0daa0e Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Tue, 30 Dec 2014 22:29:35 -0800
Subject: [PATCH 4/5] accumulate gradients in (de)conv layers

---
 src/caffe/layers/conv_layer.cpp   | 7 -------
 src/caffe/layers/conv_layer.cu    | 7 -------
 src/caffe/layers/deconv_layer.cpp | 7 -------
 src/caffe/layers/deconv_layer.cu  | 7 -------
 4 files changed, 28 deletions(-)

diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index c0c9f6f3371..928ef5ee468 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -39,13 +39,6 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->cpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_cpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->cpu_diff();
     const Dtype* bottom_data = bottom[i]->cpu_data();
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index 3902fdf3930..b8a98ff7cc9 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -31,13 +31,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_gpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
     // Bias gradient, if necessary.
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index e6d65ab526b..a4612963b6b 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -39,13 +39,6 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->cpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_cpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->cpu_diff();
     const Dtype* bottom_data = bottom[i]->cpu_data();
diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu
index 9198dd64c72..39bc4de8c66 100644
--- a/src/caffe/layers/deconv_layer.cu
+++ b/src/caffe/layers/deconv_layer.cu
@@ -31,13 +31,6 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_gpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
     const Dtype* bottom_data = bottom[i]->gpu_data();

From 05d2bc405cbb781f29d4bfd0c748143bbe328bfb Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Sat, 13 Sep 2014 17:41:59 -0700
Subject: [PATCH 5/5] accumulate gradients in cudnn conv layer

---
 src/caffe/layers/cudnn_conv_layer.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 071014e1b48..b5bfdb098e0 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -54,12 +54,10 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     weight = this->blobs_[0]->gpu_data();
     weight_diff = this->blobs_[0]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
   }
   Dtype* bias_diff = NULL;
   if (this->bias_term_ && this->param_propagate_down_[1]) {
     bias_diff = this->blobs_[1]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
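
Note: taken together, the patches above make each solver iteration zero the
parameter diffs once, run iter_size forward/backward passes whose gradients sum
into those diffs, and then apply a single update with the learning rate divided
by iter_size, so the summed gradient behaves like an average over an effective
minibatch of iter_size x batch_size. Below is a minimal, self-contained C++
sketch of that control flow only; the stub Net type and its ZeroParamDiffs,
ForwardBackward, and ApplyUpdate methods are illustrative stand-ins, not the
actual Caffe API.

// Illustrative sketch only: the effective training loop after these patches.
// The stub Net stands in for caffe::Net; it is not real Caffe code.
#include <cstdio>

struct Net {
  void ZeroParamDiffs() { /* clear all parameter diffs, as Step() now does */ }
  float ForwardBackward() { return 1.0f; /* backward adds into param diffs */ }
  void ApplyUpdate(float rate) { (void)rate; /* SGD step on accumulated diffs */ }
};

int main() {
  Net net;
  const int max_iter = 3;
  const int iter_size = 4;       // mirrors SolverParameter::iter_size
  const float base_lr = 0.01f;
  for (int iter = 0; iter < max_iter; ++iter) {
    net.ZeroParamDiffs();        // diffs are zeroed once per solver iteration
    float loss = 0;
    for (int i = 0; i < iter_size; ++i) {
      loss += net.ForwardBackward();  // gradients accumulate across passes
    }
    loss /= iter_size;           // the reported loss is averaged over iter_size
    // Dividing the rate by iter_size turns the summed gradient into an average,
    // matching one pass over iter_size * batch_size instances.
    net.ApplyUpdate(base_lr / iter_size);
    std::printf("iter %d, loss %.3f\n", iter, loss);
  }
  return 0;
}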