Accumulate gradients in SGDSolver and AdaGrad
A test is included. I have not tested with AdaGrad, but it should work.

Conflicts:
	src/caffe/proto/caffe.proto
	src/caffe/solver.cpp
tnarihi committed Feb 26, 2015
1 parent a677076 commit d016dbd
Showing 4 changed files with 183 additions and 3 deletions.
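
In brief, with accum_grad set to k, each solver iteration runs k forward-backward passes, averages their gradients, and applies a single parameter update, so the update behaves like one computed on a k-times larger batch. A short sketch of why (notation is mine, not part of the commit): if \ell_i(\theta) is the mean loss of the i-th pass, the solver accumulates

    g = \sum_{i=1}^{k} \frac{1}{k} \nabla_\theta \ell_i(\theta) = \nabla_\theta \Big( \frac{1}{k} \sum_{i=1}^{k} \ell_i(\theta) \Big),

the gradient of the loss averaged over all k mini-batches, which is what a single pass over the combined batch would produce. The reported loss is averaged the same way, \bar{\ell} = \frac{1}{k} \sum_{i=1}^{k} \ell_i(\theta).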
13 changes: 12 additions & 1 deletion include/caffe/solver.hpp
@@ -53,6 +53,9 @@ class Solver {
virtual void RestoreSolverState(const SolverState& state) = 0;
void DisplayOutputBlobs(const int net_id);

virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }

SolverParameter param_;
int iter_;
int current_step_;
@@ -88,7 +91,12 @@ class SGDSolver : public Solver<Dtype> {
// update maintains update related data and is not needed in snapshots.
// temp maintains other information that might be needed in computation
// of gradients/updates and is not needed in snapshots
vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_;
// accum is used to accumulate gradients over multiple forward-backward
// passes and is not needed in snapshots
vector<shared_ptr<Blob<Dtype> > > history_, update_, temp_, accum_;

virtual void AccumulateGradients();
virtual void ResetAccumulateGradients();

DISABLE_COPY_AND_ASSIGN(SGDSolver);
};
@@ -104,6 +112,9 @@ class NesterovSolver : public SGDSolver<Dtype> {
protected:
virtual void ComputeUpdateValue();

virtual inline void AccumulateGradients() { NOT_IMPLEMENTED; }
virtual inline void ResetAccumulateGradients() { NOT_IMPLEMENTED; }

DISABLE_COPY_AND_ASSIGN(NesterovSolver);
};

5 changes: 4 additions & 1 deletion src/caffe/proto/caffe.proto
@@ -75,7 +75,7 @@ message NetParameter {
// NOTE
// Update the next available ID when you add a new SolverParameter field.
//
// SolverParameter next available ID: 36 (last added: clip_gradients)
// SolverParameter next available ID: 37 (last added: accum_grad)
message SolverParameter {
//////////////////////////////////////////////////////////////////////////////
// Specifying the train and test networks
@@ -145,6 +145,9 @@ message SolverParameter {
// whenever their actual L2 norm is larger.
optional float clip_gradients = 35 [default = -1];

// Accumulate gradients. This only works with SGDSolver.
optional int32 accum_grad = 36 [default = 1];

optional int32 snapshot = 14 [default = 0]; // The snapshot interval
optional string snapshot_prefix = 15; // The prefix for the snapshot.
// whether to snapshot diff in the results or not. Snapshotting diff will help
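
A minimal usage sketch (my own example, not part of the commit): the new field can be set in a solver prototxt as accum_grad: 4, or programmatically through the generated protobuf setter as below. The solver file path is a placeholder, and plain SGDSolver in CPU mode is assumed.

#include "caffe/proto/caffe.pb.h"
#include "caffe/solver.hpp"
#include "caffe/util/io.hpp"  // ReadProtoFromTextFileOrDie

int main() {
  caffe::SolverParameter param;
  // "solver.prototxt" is a placeholder for an existing solver definition.
  caffe::ReadProtoFromTextFileOrDie("solver.prototxt", &param);
  // Run 4 forward-backward passes per update; their averaged gradients
  // emulate a batch 4x larger than the net's batch_size.
  param.set_accum_grad(4);
  caffe::SGDSolver<float> solver(param);
  solver.Solve();
  return 0;
}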
78 changes: 77 additions & 1 deletion src/caffe/solver.cpp
@@ -33,6 +33,7 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
<< param.DebugString();
param_ = param;
CHECK_GE(param_.average_loss(), 1) << "average_loss should be non-negative.";
CHECK_GE(param_.accum_grad(), 1) << "accum_grad should be at least 1.";
if (param_.random_seed() >= 0) {
Caffe::set_random_seed(param_.random_seed());
}
@@ -164,6 +165,7 @@ void Solver<Dtype>::Step(int iters) {
const int start_iter = iter_;
const int stop_iter = iter_ + iters;
int average_loss = this->param_.average_loss();
const int accum_grad = this->param_.accum_grad();
vector<Dtype> losses;
Dtype smoothed_loss = 0;

@@ -175,7 +177,17 @@

const bool display = param_.display() && iter_ % param_.display() == 0;
net_->set_debug_info(display && param_.debug_info());
Dtype loss = net_->ForwardBackward(bottom_vec);
Dtype loss = 0;
if (accum_grad > 1) {
ResetAccumulateGradients();
for (int i = 0; i < accum_grad; ++i) {
loss += net_->ForwardBackward(bottom_vec);
AccumulateGradients();
}
loss /= accum_grad;
} else {
loss = net_->ForwardBackward(bottom_vec);
}
if (losses.size() < average_loss) {
losses.push_back(loss);
int size = losses.size();
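
To make the control flow of the accumulation loop above concrete, here is a small self-contained toy (not Caffe code; values and names are illustrative) that accumulates scaled gradients over accum_grad micro-batches of a least-squares problem and applies one update per accumulation, mirroring the Step() / AccumulateGradients() structure:

#include <cstdio>
#include <vector>

int main() {
  const int accum_grad = 4;      // forward-backward passes per update
  const int micro_batch = 2;     // samples per pass
  const double lr = 0.01;
  // Tiny dataset following y = 3 * x, split into 4 micro-batches of 2.
  std::vector<double> x = {1, 2, 3, 4, 5, 6, 7, 8};
  std::vector<double> y = {3, 6, 9, 12, 15, 18, 21, 24};
  double w = 0.0;                // single learnable parameter

  for (int iter = 0; iter < 200; ++iter) {
    double accum = 0.0;          // plays the role of accum_
    for (int i = 0; i < accum_grad; ++i) {
      // "Forward-backward" over one micro-batch: gradient of the mean of
      // 0.5 * (w * x - y)^2 with respect to w.
      double grad = 0.0;
      for (int j = 0; j < micro_batch; ++j) {
        const int k = i * micro_batch + j;
        grad += (w * x[k] - y[k]) * x[k] / micro_batch;
      }
      accum += grad / accum_grad;  // like caffe_axpy(count, 1/accum_grad, diff, accum)
    }
    w -= lr * accum;               // one update per accum_grad passes
  }
  std::printf("learned w = %f (expected 3)\n", w);
  return 0;
}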
@@ -430,6 +442,9 @@ void SGDSolver<Dtype>::PreSolve() {
temp_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
net_param->num(), net_param->channels(), net_param->height(),
net_param->width())));
accum_.push_back(shared_ptr<Blob<Dtype> >(new Blob<Dtype>(
net_param->num(), net_param->channels(), net_param->height(),
net_param->width())));
}
}

@@ -458,12 +473,54 @@ void SGDSolver<Dtype>::ClipGradients() {
}
}

template <typename Dtype>
void SGDSolver<Dtype>::AccumulateGradients() {
const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
const int accum_grad = this->param_.accum_grad();
if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
caffe_gpu_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
net_params[param_id]->gpu_diff(),
accum_[param_id]->mutable_gpu_data());
}
#else
NO_GPU;
#endif
} else {
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
caffe_axpy(net_params[param_id]->count(), Dtype(1. / accum_grad),
net_params[param_id]->cpu_diff(),
accum_[param_id]->mutable_cpu_data());
}
}
}
template <typename Dtype>
void SGDSolver<Dtype>::ResetAccumulateGradients() {
if (Caffe::mode() == Caffe::GPU) {
#ifndef CPU_ONLY
for (int param_id = 0; param_id < accum_.size(); ++param_id) {
caffe_gpu_set(accum_[param_id]->count(), Dtype(0),
accum_[param_id]->mutable_gpu_data());
}
#else
NO_GPU;
#endif
} else {
for (int param_id = 0; param_id < accum_.size(); ++param_id) {
caffe_set(accum_[param_id]->count(), Dtype(0),
accum_[param_id]->mutable_cpu_data());
}
}
}

template <typename Dtype>
void SGDSolver<Dtype>::ComputeUpdateValue() {
const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
const vector<float>& net_params_lr = this->net_->params_lr();
const vector<float>& net_params_weight_decay =
this->net_->params_weight_decay();
const int accum_grad = this->param_.accum_grad();
// get the learning rate
Dtype rate = GetLearningRate();
if (this->param_.display() && this->iter_ % this->param_.display() == 0) {
@@ -477,6 +534,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
case Caffe::CPU:
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
// Compute the value to history, and then copy them to the blob's diff.
if (accum_grad > 1) {
caffe_copy(accum_[param_id]->count(), accum_[param_id]->cpu_data(),
net_params[param_id]->mutable_cpu_diff());
}
Dtype local_rate = rate * net_params_lr[param_id];
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -513,6 +574,10 @@ void SGDSolver<Dtype>::ComputeUpdateValue() {
#ifndef CPU_ONLY
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
// Compute the value to history, and then copy them to the blob's diff.
if (accum_grad > 1) {
caffe_copy(accum_[param_id]->count(), accum_[param_id]->gpu_data(),
net_params[param_id]->mutable_gpu_diff());
}
Dtype local_rate = rate * net_params_lr[param_id];
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -696,6 +761,7 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
const vector<float>& net_params_lr = this->net_->params_lr();
const vector<float>& net_params_weight_decay =
this->net_->params_weight_decay();
const int accum_grad = this->param_.accum_grad();
// get the learning rate
Dtype rate = this->GetLearningRate();
Dtype delta = this->param_.delta();
@@ -708,6 +774,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
switch (Caffe::mode()) {
case Caffe::CPU:
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
if (accum_grad > 1) {
caffe_copy(this->accum_[param_id]->count(),
this->accum_[param_id]->cpu_data(),
net_params[param_id]->mutable_cpu_diff());
}
Dtype local_rate = rate * net_params_lr[param_id];
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

@@ -764,6 +835,11 @@ void AdaGradSolver<Dtype>::ComputeUpdateValue() {
case Caffe::GPU:
#ifndef CPU_ONLY
for (int param_id = 0; param_id < net_params.size(); ++param_id) {
if (accum_grad > 1) {
caffe_copy(this->accum_[param_id]->count(),
this->accum_[param_id]->gpu_data(),
net_params[param_id]->mutable_gpu_diff());
}
Dtype local_rate = rate * net_params_lr[param_id];
Dtype local_decay = weight_decay * net_params_weight_decay[param_id];

90 changes: 90 additions & 0 deletions src/caffe/test/test_solver.cpp
@@ -1,3 +1,5 @@
#include <boost/format.hpp>

#include <string>
#include <utility>
#include <vector>
@@ -6,6 +8,8 @@
#include "gtest/gtest.h"

#include "caffe/common.hpp"
#include "caffe/data_layers.hpp"
#include "caffe/filler.hpp"
#include "caffe/proto/caffe.pb.h"
#include "caffe/solver.hpp"

@@ -36,6 +40,49 @@ class SolverTest : public MultiDeviceTest<TypeParam> {
}
solver_.reset(new SGDSolver<Dtype>(param));
}
virtual void InitSolverAccumGrad(int batch_size, int accum_grad) {
boost::format fmt(
"net_param { "
" name: 'TestAccumGrad' "
" layer { "
" name: 'data' "
" type: 'MemoryData' "
" top: 'data' "
" top: 'label' "
" memory_data_param { "
" batch_size: %1% "
" channels: 3 "
" height: 2 "
" width: 2 "
" } "
" } "
" layer { "
" name: 'ip' "
" type: 'InnerProduct' "
" bottom: 'data' "
" top: 'ip' "
" inner_product_param { "
" num_output: 1 "
" } "
" } "
" layer { "
" name: 'loss' "
" type: 'EuclideanLoss' "
" bottom: 'ip' "
" bottom: 'label' "
" top: 'loss' "
" } "
"} "
"base_lr: 0.01 "
"momentum: 0.9 "
"weight_decay: 0.004 "
"lr_policy: 'fixed' "
"display: 100 "
"max_iter: 100 "
"accum_grad: %2%");
fmt % batch_size % accum_grad;
this->InitSolverFromProtoString(fmt.str());
}

shared_ptr<Solver<Dtype> > solver_;
};
@@ -104,4 +151,47 @@ TYPED_TEST(SolverTest, TestInitTrainTestNets) {
EXPECT_TRUE(this->solver_->test_nets()[1]->has_layer("accuracy"));
}

TYPED_TEST(SolverTest, TestSolverGradientAccumulation) {
typedef typename TypeParam::Dtype Dtype;
// Data preparation
const int batch_size = 8;
const int step = 8;
Blob<Dtype> data(batch_size * step, 3, 2, 2);
Blob<Dtype> label(batch_size * step, 1, 1, 1);
FillerParameter data_filler_param;
data_filler_param.set_std(1);
GaussianFiller<Dtype> data_filler(data_filler_param);
data_filler.Fill(&data);
data_filler.Fill(&label);

// Run with batch_size=8, accum_grad=1
this->InitSolverAccumGrad(batch_size, 1);
boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
this->solver_->net()->layers()[0])->Reset(
data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
this->solver_->Step(step);
shared_ptr<Blob<Dtype> > weight1 = this->solver_->net()->params()[0];
shared_ptr<Blob<Dtype> > bias1 = this->solver_->net()->params()[1];

// Run with batch_size=4, accum_grad=2
this->InitSolverAccumGrad(batch_size / 2, 2);
boost::static_pointer_cast<MemoryDataLayer<Dtype> >(
this->solver_->net()->layers()[0])->Reset(
data.mutable_cpu_data(), label.mutable_cpu_data(), batch_size * step);
this->solver_->Step(step);
shared_ptr<Blob<Dtype> > weight2 = this->solver_->net()->params()[0];
shared_ptr<Blob<Dtype> > bias2 = this->solver_->net()->params()[1];

// Check if the numbers are the same for both settings.
for (int i = 0; i < weight1->count(); ++i) {
Dtype value1 = weight1->cpu_data()[i];
Dtype value2 = weight2->cpu_data()[i];
EXPECT_NEAR(value1, value2, 1e-7);
}
for (int i = 0; i < bias1->count(); ++i) {
Dtype value1 = bias1->cpu_data()[i];
Dtype value2 = bias2->cpu_data()[i];
EXPECT_NEAR(value1, value2, 1e-7);
}
}
} // namespace caffe
