From 729765ac122dbb1b0f4c48f51cbe33ba6274494c Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Mon, 11 Aug 2014 21:38:59 -0700
Subject: [PATCH 1/5] zero-init param diffs and accumulate gradients

(With layers whose backward accumulates gradients), this effectively
decouples the computational batch from the SGD minibatch. Each iteration
accumulates gradients over iter_size batches, then parameters are updated.
---
 src/caffe/proto/caffe.proto |  4 +++-
 src/caffe/solver.cpp        | 33 ++++++++++++++++++++++++++++++---
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 84b475ce3cd..13a77ae9665 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -75,7 +75,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: iter_size)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -128,6 +128,8 @@ message SolverParameter {
   // Display the loss averaged over the last average_loss iterations
   optional int32 average_loss = 33 [default = 1];
   optional int32 max_iter = 7; // the maximum number of iterations
+  // accumulate gradients over `iter_size` x `batch_size` instances
+  optional int32 iter_size = 36 [default = 1];
   optional string lr_policy = 8; // The learning rate decay policy.
   optional float gamma = 9; // The parameter to compute the learning rate.
   optional float power = 10; // The parameter to compute the learning rate.
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 8ed8aec2fc8..edc24cdf85c 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -168,6 +168,25 @@ void Solver<Dtype>::Step(int iters) {
   Dtype smoothed_loss = 0;
 
   for (; iter_ < stop_iter; ++iter_) {
+    // zero-init the params
+    for (int i = 0; i < net_->params().size(); ++i) {
+      shared_ptr<Blob<Dtype> > blob = net_->params()[i];
+      switch(Caffe::mode()) {
+      case Caffe::CPU:
+        caffe_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_cpu_diff());
+        break;
+      case Caffe::GPU:
+#ifndef CPU_ONLY
+        caffe_gpu_set(blob->count(), static_cast<Dtype>(0),
+            blob->mutable_gpu_diff());
+#else
+        NO_GPU;
+#endif
+        break;
+      }
+    }
+
     if (param_.test_interval() && iter_ % param_.test_interval() == 0
         && (iter_ > 0 || param_.test_initialization())) {
       TestAll();
@@ -175,7 +194,13 @@ void Solver<Dtype>::Step(int iters) {
 
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    // accumulate the loss and gradient
+    Dtype loss = 0;
+    for (int i = 0; i < param_.iter_size(); ++i) {
+      loss += net_->ForwardBackward(bottom_vec);
+    }
+    loss /= param_.iter_size();
+    // average the loss across iterations for smoothed reporting
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
@@ -477,7 +502,8 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_rate = rate * net_params_lr[param_id]
+          / this->param_.iter_size();
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
 
       if (local_decay) {
@@ -513,7 +539,8 @@
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
-      Dtype local_rate = rate * net_params_lr[param_id];
+      Dtype local_rate = rate * net_params_lr[param_id]
+          / this->param_.iter_size();
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
 
       if (local_decay) {

From fbc9dd2661d3c70dcce433e542db1be88b69711d Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Tue, 30 Dec 2014 22:52:07 -0800
Subject: [PATCH 2/5] zero-init param diffs in gradient checker

---
 include/caffe/test/test_gradient_check_util.hpp | 7 +++++--
 src/caffe/solver.cpp                            | 2 +-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/include/caffe/test/test_gradient_check_util.hpp b/include/caffe/test/test_gradient_check_util.hpp
index 22937711b58..cc5dcbad0ee 100644
--- a/include/caffe/test/test_gradient_check_util.hpp
+++ b/include/caffe/test/test_gradient_check_util.hpp
@@ -80,11 +80,14 @@ void GradientChecker<Dtype>::CheckGradientSingle(Layer<Dtype>* layer,
       CHECK_EQ(top_count, bottom[blob_id]->count());
     }
   }
-  // First, figure out what blobs we need to check against.
+  // First, figure out what blobs we need to check against, and zero init
+  // parameter blobs.
   vector<Blob<Dtype>*> blobs_to_check;
   vector<bool> propagate_down(bottom.size(), check_bottom < 0);
   for (int i = 0; i < layer->blobs().size(); ++i) {
-    blobs_to_check.push_back(layer->blobs()[i].get());
+    Blob<Dtype>* blob = layer->blobs()[i].get();
+    caffe_set(blob->count(), static_cast<Dtype>(0), blob->mutable_cpu_diff());
+    blobs_to_check.push_back(blob);
   }
   if (check_bottom < 0) {
     for (int i = 0; i < bottom.size(); ++i) {
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index edc24cdf85c..80c11f32e2f 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -171,7 +171,7 @@ void Solver<Dtype>::Step(int iters) {
     // zero-init the params
     for (int i = 0; i < net_->params().size(); ++i) {
       shared_ptr<Blob<Dtype> > blob = net_->params()[i];
-      switch(Caffe::mode()) {
+      switch (Caffe::mode()) {
       case Caffe::CPU:
         caffe_set(blob->count(), static_cast<Dtype>(0),
             blob->mutable_cpu_diff());

From b0a2151111683664996c08422f2bd2c9d5408c8d Mon Sep 17 00:00:00 2001
From: Sergio
Date: Fri, 26 Sep 2014 23:03:26 -0700
Subject: [PATCH 3/5] accumulate gradients in inner product layer

---
 src/caffe/layers/inner_product_layer.cpp | 4 ++--
 src/caffe/layers/inner_product_layer.cu  | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp
index b1ec6cb25c0..1a5bc46444a 100644
--- a/src/caffe/layers/inner_product_layer.cpp
+++ b/src/caffe/layers/inner_product_layer.cpp
@@ -81,13 +81,13 @@ void InnerProductLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->cpu_data();
     // Gradient with respect to weight
     caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_cpu_diff());
+        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff());
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->cpu_diff();
     // Gradient with respect to bias
     caffe_cpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.cpu_data(), (Dtype)0.,
+        bias_multiplier_.cpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_cpu_diff());
   }
   if (propagate_down[0]) {
diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu
index a9e1784a205..dd90cac12a8 100644
--- a/src/caffe/layers/inner_product_layer.cu
+++ b/src/caffe/layers/inner_product_layer.cu
@@ -33,13 +33,13 @@ void InnerProductLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
     const Dtype* bottom_data = bottom[0]->gpu_data();
     // Gradient with respect to weight
     caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1.,
-        top_diff, bottom_data, (Dtype)0., this->blobs_[0]->mutable_gpu_diff());
+        top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff());
   }
   if (bias_term_ && this->param_propagate_down_[1]) {
     const Dtype* top_diff = top[0]->gpu_diff();
     // Gradient with respect to bias
     caffe_gpu_gemv<Dtype>(CblasTrans, M_, N_, (Dtype)1., top_diff,
-        bias_multiplier_.gpu_data(), (Dtype)0.,
+        bias_multiplier_.gpu_data(), (Dtype)1.,
         this->blobs_[1]->mutable_gpu_diff());
   }
   if (propagate_down[0]) {

From dc1a37dd3ded2aceba9a792142df541dff0daa0e Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Tue, 30 Dec 2014 22:29:35 -0800
Subject: [PATCH 4/5] accumulate gradients in (de)conv layers

---
 src/caffe/layers/conv_layer.cpp   | 7 -------
 src/caffe/layers/conv_layer.cu    | 7 -------
 src/caffe/layers/deconv_layer.cpp | 7 -------
 src/caffe/layers/deconv_layer.cu  | 7 -------
 4 files changed, 28 deletions(-)

diff --git a/src/caffe/layers/conv_layer.cpp b/src/caffe/layers/conv_layer.cpp
index c0c9f6f3371..928ef5ee468 100644
--- a/src/caffe/layers/conv_layer.cpp
+++ b/src/caffe/layers/conv_layer.cpp
@@ -39,13 +39,6 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->cpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_cpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->cpu_diff();
     const Dtype* bottom_data = bottom[i]->cpu_data();
diff --git a/src/caffe/layers/conv_layer.cu b/src/caffe/layers/conv_layer.cu
index 3902fdf3930..b8a98ff7cc9 100644
--- a/src/caffe/layers/conv_layer.cu
+++ b/src/caffe/layers/conv_layer.cu
@@ -31,13 +31,6 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_gpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
     // Bias gradient, if necessary.
diff --git a/src/caffe/layers/deconv_layer.cpp b/src/caffe/layers/deconv_layer.cpp
index e6d65ab526b..a4612963b6b 100644
--- a/src/caffe/layers/deconv_layer.cpp
+++ b/src/caffe/layers/deconv_layer.cpp
@@ -39,13 +39,6 @@ void DeconvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->cpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_cpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_cpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->cpu_diff();
     const Dtype* bottom_data = bottom[i]->cpu_data();
diff --git a/src/caffe/layers/deconv_layer.cu b/src/caffe/layers/deconv_layer.cu
index 9198dd64c72..39bc4de8c66 100644
--- a/src/caffe/layers/deconv_layer.cu
+++ b/src/caffe/layers/deconv_layer.cu
@@ -31,13 +31,6 @@ void DeconvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
       const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
   const Dtype* weight = this->blobs_[0]->gpu_data();
   Dtype* weight_diff = this->blobs_[0]->mutable_gpu_diff();
-  if (this->param_propagate_down_[0]) {
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
-  }
-  if (this->bias_term_ && this->param_propagate_down_[1]) {
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0),
-        this->blobs_[1]->mutable_gpu_diff());
-  }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
     const Dtype* bottom_data = bottom[i]->gpu_data();

From 05d2bc405cbb781f29d4bfd0c748143bbe328bfb Mon Sep 17 00:00:00 2001
From: Jonathan L Long
Date: Sat, 13 Sep 2014 17:41:59 -0700
Subject: [PATCH 5/5] accumulate gradients in cudnn conv layer

---
 src/caffe/layers/cudnn_conv_layer.cu | 2 --
 1 file changed, 2 deletions(-)

diff --git a/src/caffe/layers/cudnn_conv_layer.cu b/src/caffe/layers/cudnn_conv_layer.cu
index 071014e1b48..b5bfdb098e0 100644
--- a/src/caffe/layers/cudnn_conv_layer.cu
+++ b/src/caffe/layers/cudnn_conv_layer.cu
@@ -54,12 +54,10 @@ void CuDNNConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
   if (this->param_propagate_down_[0]) {
     weight = this->blobs_[0]->gpu_data();
     weight_diff = this->blobs_[0]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[0]->count(), Dtype(0), weight_diff);
   }
   Dtype* bias_diff = NULL;
   if (this->bias_term_ && this->param_propagate_down_[1]) {
     bias_diff = this->blobs_[1]->mutable_gpu_diff();
-    caffe_gpu_set(this->blobs_[1]->count(), Dtype(0), bias_diff);
   }
   for (int i = 0; i < top.size(); ++i) {
     const Dtype* top_diff = top[i]->gpu_diff();
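
Note: taken together, the patches above make each solver iteration zero the
parameter diffs once, run iter_size forward/backward passes whose gradients sum
into those diffs, and then apply a single update with the learning rate divided
by iter_size, so the summed gradient behaves like an average over an effective
minibatch of iter_size x batch_size. Below is a minimal, self-contained C++
sketch of that control flow only; the stub Net type and its ZeroParamDiffs,
ForwardBackward, and ApplyUpdate methods are illustrative stand-ins, not the
actual Caffe API.

// Illustrative sketch only: the effective training loop after these patches.
// The stub Net stands in for caffe::Net; it is not real Caffe code.
#include <cstdio>

struct Net {
  void ZeroParamDiffs() { /* clear all parameter diffs, as Step() now does */ }
  float ForwardBackward() { return 1.0f; /* backward adds into param diffs */ }
  void ApplyUpdate(float rate) { (void)rate; /* SGD step on accumulated diffs */ }
};

int main() {
  Net net;
  const int max_iter = 3;
  const int iter_size = 4;       // mirrors SolverParameter::iter_size
  const float base_lr = 0.01f;
  for (int iter = 0; iter < max_iter; ++iter) {
    net.ZeroParamDiffs();        // diffs are zeroed once per solver iteration
    float loss = 0;
    for (int i = 0; i < iter_size; ++i) {
      loss += net.ForwardBackward();  // gradients accumulate across passes
    }
    loss /= iter_size;           // the reported loss is averaged over iter_size
    // Dividing the rate by iter_size turns the summed gradient into an average,
    // matching one pass over iter_size * batch_size instances.
    net.ApplyUpdate(base_lr / iter_size);
    std::printf("iter %d, loss %.3f\n", iter, loss);
  }
  return 0;
}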