From 55585f5bfab61328a61125b3d49627a69022d817 Mon Sep 17 00:00:00 2001
From: Evan Shelhamer
Date: Thu, 21 May 2015 17:06:42 -0700
Subject: [PATCH] adjust local learning rate and decay according to gradient
 accumulation

Divide local rate by `iter_size` to normalize the gradient according to
the full minibatch size and not only the computational batch size.

Multiply the local decay by `iter_size` to counter the division of the
local learning rate since the decay is multiplied by the rate in the
update equation.
---
 src/caffe/solver.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index d104522002b..4c8fa25c955 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -488,7 +488,7 @@ void SGDSolver<Dtype>::ApplyUpdate() {
   ClipGradients();
   for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
     Regularize(param_id);
-    ComputeUpdateValue(param_id, rate);
+    ComputeUpdateValue(param_id, rate / this->param_.iter_size());
   }
   this->net_->Update();
 }
@@ -500,7 +500,8 @@ void SGDSolver<Dtype>::Regularize(int param_id) {
       this->net_->params_weight_decay();
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
-  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id]
+      * this->param_.iter_size();
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {
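
A minimal standalone sketch of why the two scalings cancel, assuming a single
weight and ignoring momentum and history (which the real ComputeUpdateValue
folds in); the values for iter_size, rate, decay, and the per-pass gradients
are illustrative only. It shows that accumulating raw gradients over
iter_size passes and then applying the patched scalings gives the same step
as plain SGD on the mean gradient with the unscaled decay.

#include <cstdio>

// Sketch only -- not Caffe code. Demonstrates that rate / iter_size combined
// with decay * iter_size equals one plain SGD step on the mean gradient.
int main() {
  const int iter_size = 4;                          // accumulation steps per update
  const float rate = 0.01f;                         // base learning rate
  const float decay = 0.0005f;                      // base weight decay
  const float w = 1.0f;                             // a single weight (illustrative)
  const float grads[] = {0.2f, -0.1f, 0.3f, 0.0f};  // per-pass gradients

  // Caffe accumulates raw (unnormalized) gradients across iter_size passes.
  float diff = 0.0f;
  for (int i = 0; i < iter_size; ++i) diff += grads[i];
  const float mean_grad = diff / iter_size;

  // Patched behavior: Regularize adds (decay * iter_size) * w to the diff,
  // then ComputeUpdateValue scales the whole diff by rate / iter_size.
  const float patched_step = (rate / iter_size) * (diff + decay * iter_size * w);

  // Reference: one plain SGD step on the full-minibatch mean gradient.
  const float reference_step = rate * (mean_grad + decay * w);

  printf("patched = %g  reference = %g\n", patched_step, reference_step);
  return 0;
}

Both steps print 0.001005 here: iter_size cancels out of the decay term,
while the gradient term ends up averaged over the full minibatch.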