From 55585f5bfab61328a61125b3d49627a69022d817 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer
Date: Thu, 21 May 2015 17:06:42 -0700
Subject: [PATCH] adjust local learning rate and decay according to gradient
 accumulation

Divide local rate by `iter_size` to normalize the gradient according to
the full minibatch size and not only the computational batch size.

Multiply the local decay by `iter_size` to counter the division of the
local learning rate since the decay is multiplied by the rate in the
update equation.
---
 src/caffe/solver.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index d104522002b..4c8fa25c955 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -488,7 +488,7 @@ void SGDSolver<Dtype>::ApplyUpdate() {
   ClipGradients();
   for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
     Regularize(param_id);
-    ComputeUpdateValue(param_id, rate);
+    ComputeUpdateValue(param_id, rate / this->param_.iter_size());
   }
   this->net_->Update();
 }
@@ -500,7 +500,8 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
       this->net_->params_weight_decay();
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
-  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id]
+      * this->param_.iter_size();
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {
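
A minimal standalone sketch of why the two scalings cancel, assuming a single
weight and ignoring momentum and history (which the real ComputeUpdateValue
folds in); the values for iter_size, rate, decay, and the per-pass gradients
are illustrative only. It shows that accumulating raw gradients over
iter_size passes and then applying the patched scalings gives the same step
as plain SGD on the mean gradient with the unscaled decay.

#include <cstdio>

// Sketch only -- not Caffe code. Demonstrates that rate / iter_size combined
// with decay * iter_size equals one plain SGD step on the mean gradient.
int main() {
  const int iter_size = 4;                          // accumulation steps per update
  const float rate = 0.01f;                         // base learning rate
  const float decay = 0.0005f;                      // base weight decay
  const float w = 1.0f;                             // a single weight (illustrative)
  const float grads[] = {0.2f, -0.1f, 0.3f, 0.0f};  // per-pass gradients

  // Caffe accumulates raw (unnormalized) gradients across iter_size passes.
  float diff = 0.0f;
  for (int i = 0; i < iter_size; ++i) diff += grads[i];
  const float mean_grad = diff / iter_size;

  // Patched behavior: Regularize adds (decay * iter_size) * w to the diff,
  // then ComputeUpdateValue scales the whole diff by rate / iter_size.
  const float patched_step = (rate / iter_size) * (diff + decay * iter_size * w);

  // Reference: one plain SGD step on the full-minibatch mean gradient.
  const float reference_step = rate * (mean_grad + decay * w);

  printf("patched = %g  reference = %g\n", patched_step, reference_step);
  return 0;
}

Both steps print 0.001005 here: iter_size cancels out of the decay term,
while the gradient term ends up averaged over the full minibatch.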