diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 2510de748de..af1c90a53f0 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -53,6 +53,9 @@ class Solver {
   virtual void RestoreSolverState(const SolverState& state) = 0;
   void DisplayOutputBlobs(const int net_id);
 
+  virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
+  virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }
+
   SolverParameter param_;
   int iter_;
   int current_step_;
@@ -88,7 +91,12 @@ class SGDSolver : public Solver<Dtype> {
  // update maintains update related data and is not needed in snapshots.
  // temp maintains other information that might be needed in computation
  //   of gradients/updates and is not needed in snapshots
-  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
+  // accum is used to accumulate gradients over multiple forward-backward
+  //   passes and is not needed in snapshots
+  vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_, accum_;
+
+  virtual void AccumulateGradients();
+  virtual void ResetAccumulateGradients();
 
   DISABLE_COPY_AND_ASSIGN(SGDSolver);
 };
@@ -104,6 +112,9 @@ class NesterovSolver : public SGDSolver<Dtype> {
  protected:
   virtual void ComputeUpdateValue();
 
+  virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
+  virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }
+
   DISABLE_COPY_AND_ASSIGN(NesterovSolver);
 };
 
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 84b475ce3cd..8ad232058df 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -75,7 +75,7 @@ message NetParameter {
 // NOTE
 // Update the next available ID when you add a new SolverParameter field.
 //
-// SolverParameter next available ID: 36 (last added: clip_gradients)
+// SolverParameter next available ID: 37 (last added: accum_grad)
 message SolverParameter {
   //////////////////////////////////////////////////////////////////////////////
   // Specifying the train and test networks
@@ -145,6 +145,9 @@ message SolverParameter {
   // whenever their actual L2 norm is larger.
   optional float clip_gradients = 35 [default = -1];
 
+  // Accumulate gradients. This only works with SGDSolver and AdaGradSolver.
+  optional int32 accum_grad = 36 [default = 1];
+
   optional int32 snapshot = 14 [default = 0]; // The snapshot interval
   optional string snapshot_prefix = 15; // The prefix for the snapshot.
   // whether to snapshot diff in the results or not. Snapshotting diff will help
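
For context, here is a sketch of how the new field might appear in a solver definition. The file names and values below are hypothetical, not part of this patch: with a net whose data layer uses batch_size: 32, accum_grad: 4 averages gradients over four forward-backward passes, so each parameter update sees an effective batch of 32 * 4 = 128 examples.

    # solver.prototxt (hypothetical example)
    net: "train_val.prototxt"
    base_lr: 0.01
    momentum: 0.9
    lr_policy: "fixed"
    max_iter: 10000
    # Average gradients over 4 passes per update;
    # effective batch size = 32 * 4 = 128.
    accum_grad: 4
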
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 8ed8aec2fc8..cceda0e033f 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -33,6 +33,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
       << param.DebugString();
   param_ = param;
   CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
+  CHECK_GE(param_.accum_grad(), 1) << "accum_grad should be at least 1.";
   if (param_.random_seed() >= 0) {
     Caffe::set_random_seed(param_.random_seed());
   }
@@ -164,6 +165,7 @@ void Solver<Dtype>::Step(int iters) {
   const int start_iter = iter_;
   const int stop_iter = iter_ + iters;
   int average_loss = this->param_.average_loss();
+  const int accum_grad = this->param_.accum_grad();
   vector<Dtype> losses;
   Dtype smoothed_loss = 0;
 
@@ -175,7 +177,17 @@ void Solver<Dtype>::Step(int iters) {
     const bool display = param_.display() && iter_ % param_.display() == 0;
     net_->set_debug_info(display && param_.debug_info());
-    Dtype loss = net_->ForwardBackward(bottom_vec);
+    Dtype loss = 0;
+    if (accum_grad > 1) {
+      ResetAccumulateGradients();
+      for (int i = 0; i < accum_grad; ++i) {
+        loss += net_->ForwardBackward(bottom_vec);
+        AccumulateGradients();
+      }
+      loss /= accum_grad;
+    } else {
+      loss = net_->ForwardBackward(bottom_vec);
+    }
     if (losses.size() < average_loss) {
       losses.push_back(loss);
       int size = losses.size();
@@ -430,6 +442,9 @@ void SGDSolver<Dtype>::PreSolve() {
     temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
         net_param->num(), net_param->channels(), net_param->height(),
         net_param->width())));
+    accum_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
+        net_param->num(), net_param->channels(), net_param->height(),
+        net_param->width())));
   }
 }
 
@@ -458,12 +473,54 @@ void SGDSolver<Dtype>::ClipGradients() {
   }
 }
 
+template <typename Dtype>
+void SGDSolver<Dtype>::AccumulateGradients() {
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const int accum_grad = this->param_.accum_grad();
+  if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      caffe_gpu_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
+          net_params[param_id]->gpu_diff(),
+          accum_[param_id]->mutable_gpu_data());
+    }
+#else
+    NO_GPU;
+#endif
+  } else {
+    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      caffe_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
+          net_params[param_id]->cpu_diff(),
+          accum_[param_id]->mutable_cpu_data());
+    }
+  }
+}
+
+template <typename Dtype>
+void SGDSolver<Dtype>::ResetAccumulateGradients() {
+  if (Caffe::mode() == Caffe::GPU) {
+#ifndef CPU_ONLY
+    for (int param_id = 0; param_id < accum_.size(); ++param_id) {
+      caffe_gpu_set(accum_[param_id]->count(), Dtype(0),
+          accum_[param_id]->mutable_gpu_data());
+    }
+#else
+    NO_GPU;
+#endif
+  } else {
+    for (int param_id = 0; param_id < accum_.size(); ++param_id) {
+      caffe_set(accum_[param_id]->count(), Dtype(0),
+          accum_[param_id]->mutable_cpu_data());
+    }
+  }
+}
+
 template <typename Dtype>
 void SGDSolver<Dtype>::ComputeUpdateValue() {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_lr = this->net_->params_lr();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
+  const int accum_grad = this->param_.accum_grad();
   // get the learning rate
   Dtype rate = GetLearningRate();
   if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
@@ -477,6 +534,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
+      if (accum_grad > 1) {
+        caffe_copy(accum_[param_id]->count(), accum_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
@@ -513,6 +574,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
       // Compute the value to history, and then copy them to the blob's diff.
+      if (accum_grad > 1) {
+        caffe_copy(accum_[param_id]->count(), accum_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
@@ -696,6 +761,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   const vector<float>& net_params_lr = this->net_->params_lr();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
+  const int accum_grad = this->param_.accum_grad();
   // get the learning rate
   Dtype rate = this->GetLearningRate();
   Dtype delta = this->param_.delta();
@@ -708,6 +774,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   switch (Caffe::mode()) {
   case Caffe::CPU:
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      if (accum_grad > 1) {
+        caffe_copy(this->accum_[param_id]->count(),
+            this->accum_[param_id]->cpu_data(),
+            net_params[param_id]->mutable_cpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
@@ -764,6 +835,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
   case Caffe::GPU:
 #ifndef CPU_ONLY
     for (int param_id = 0; param_id < net_params.size(); ++param_id) {
+      if (accum_grad > 1) {
+        caffe_copy(this->accum_[param_id]->count(),
+            this->accum_[param_id]->gpu_data(),
+            net_params[param_id]->mutable_gpu_diff());
+      }
       Dtype local_rate = rate * net_params_lr[param_id];
       Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
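
To make the arithmetic above concrete: AccumulateGradients() is a scaled vector add, accum += (1/accum_grad) * diff, so after accum_grad passes accum_ holds the mean of the per-pass gradients, which ComputeUpdateValue() then copies back into the parameter diffs before applying momentum and weight decay. A standalone sketch of that accumulation, with made-up gradient values and no Caffe dependencies (not part of the patch):

    // Plain C++ illustration of the accumulation loop; gradient values are
    // invented for the example.
    #include <cstdio>

    int main() {
      const int k = 4;              // accum_grad
      const int n = 3;              // parameter count
      double accum[3] = {0, 0, 0};  // ResetAccumulateGradients() zeroes this
      const double diffs[4][3] = {  // per-pass parameter gradients
          {0.4, -0.2, 1.0}, {0.0, 0.6, 1.0}, {0.8, -0.4, 1.0}, {0.4, 0.0, 1.0}};
      for (int pass = 0; pass < k; ++pass) {
        for (int i = 0; i < n; ++i) {
          accum[i] += diffs[pass][i] / k;  // caffe_axpy(n, 1/k, diff, accum)
        }
      }
      for (int i = 0; i < n; ++i) {
        printf("accum[%d] = %g\n", i, accum[i]);  // 0.4, 0, 1: the means
      }
      return 0;
    }
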
diff --git a/src/caffe/test/test_solver.cpp b/src/caffe/test/test_solver.cpp
index 1c2c9bbb740..1ac5f84de21 100644
--- a/src/caffe/test/test_solver.cpp
+++ b/src/caffe/test/test_solver.cpp
@@ -1,3 +1,5 @@
+#include <boost/format.hpp>
+
 #include <string>
 #include <utility>
 #include <vector>
@@ -6,6 +8,8 @@
 #include "gtest/gtest.h"
 
 #include "caffe/common.hpp"
+#include "caffe/data_layers.hpp"
+#include "caffe/filler.hpp"
 #include "caffe/proto/caffe.pb.h"
 #include "caffe/solver.hpp"
@@ -36,6 +40,49 @@ class SolverTest : public MultiDeviceTest<TypeParam> {
     }
     solver_.reset(new SGDSolver<Dtype>(param));
   }
+
+  virtual void InitSolverAccumGrad(int batch_size, int accum_grad) {
+    boost::format fmt(
+        "net_param { "
+        "  name: 'TestAccumGrad' "
+        "  layer { "
+        "    name: 'data' "
+        "    type: 'MemoryData' "
+        "    top: 'data' "
+        "    top: 'label' "
+        "    memory_data_param { "
+        "      batch_size: %1% "
+        "      channels: 3 "
+        "      height: 2 "
+        "      width: 2 "
+        "    } "
+        "  } "
+        "  layer { "
+        "    name: 'ip' "
+        "    type: 'InnerProduct' "
+        "    bottom: 'data' "
+        "    top: 'ip' "
+        "    inner_product_param { "
+        "      num_output: 1 "
+        "    } "
+        "  } "
+        "  layer { "
+        "    name: 'loss' "
+        "    type: 'EuclideanLoss' "
+        "    bottom: 'ip' "
+        "    bottom: 'label' "
+        "    top: 'loss' "
+        "  } "
+        "} "
+        "base_lr: 0.01 "
+        "momentum: 0.9 "
+        "weight_decay: 0.004 "
+        "lr_policy: 'fixed' "
+        "display: 100 "
+        "max_iter: 100 "
+        "accum_grad: %2%");
+    fmt % batch_size % accum_grad;
+    this->InitSolverFromProtoString(fmt.str());
+  }
 
   shared_ptr<SGDSolver<Dtype> > solver_;
 };
@@ -104,4 +151,47 @@ TYPED_TEST(SolverTest, TestInitTrainTestNets) {
   EXPECT_TRUE(this->solver_->test_nets()[1]->has_layer("accuracy"));
 }
 
+TYPED_TEST(SolverTest, TestSolverGradientAccumulation) {
+  typedef typename TypeParam::Dtype Dtype;
+  // Data preparation
+  const int batch_size = 8;
+  const int step = 8;
+  Blob<Dtype> data(batch_size * step, 3, 2, 2);
+  Blob<Dtype> label(batch_size * step, 1, 1, 1);
+  FillerParameter data_filler_param;
+  data_filler_param.set_std(1);
+  GaussianFiller<Dtype> data_filler(data_filler_param);
+  data_filler.Fill(&data);
+  data_filler.Fill(&label);
+
+  // Run with batch_size=8, accum_grad=1
+  this->InitSolverAccumGrad(batch_size, 1);
+  boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
+      this->solver_->net()->layers()[0])->Reset(
+      data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
+  this->solver_->Step(step);
+  shared_ptr<Blob<Dtype> > weight1 = this->solver_->net()->params()[0];
+  shared_ptr<Blob<Dtype> > bias1 = this->solver_->net()->params()[1];
+
+  // Run with batch_size=4, accum_grad=2
+  this->InitSolverAccumGrad(batch_size / 2, 2);
+  boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
+      this->solver_->net()->layers()[0])->Reset(
+      data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
+  this->solver_->Step(step);
+  shared_ptr<Blob<Dtype> > weight2 = this->solver_->net()->params()[0];
+  shared_ptr<Blob<Dtype> > bias2 = this->solver_->net()->params()[1];
+
+  // Check if the numbers are the same for both settings.
+  for (int i = 0; i < weight1->count(); ++i) {
+    Dtype value1 = weight1->cpu_data()[i];
+    Dtype value2 = weight2->cpu_data()[i];
+    EXPECT_NEAR(value1, value2, 1e-7);
+  }
+  for (int i = 0; i < bias1->count(); ++i) {
+    Dtype value1 = bias1->cpu_data()[i];
+    Dtype value2 = bias2->cpu_data()[i];
+    EXPECT_NEAR(value1, value2, 1e-7);
+  }
+}
+
 }  // namespace caffe
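
The test mirrors the intended equivalence: running 8 iterations at batch size 8 with accum_grad=1 should reach the same parameters as batch size 4 with accum_grad=2 on the same data. For completeness, a minimal usage sketch from the C++ side; the prototxt path is a placeholder, and set_accum_grad() is the setter protoc generates for the new SolverParameter field:

    #include "caffe/caffe.hpp"

    int main() {
      caffe::SolverParameter param;
      // "solver.prototxt" is a placeholder path.
      caffe::ReadProtoFromTextFileOrDie("solver.prototxt", &param);
      param.set_accum_grad(4);  // accumulate over 4 passes per update
      caffe::SGDSolver<float> solver(param);
      solver.Solve();
      return 0;
    }
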