directly normalize accumulated gradients

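`SGDSolver::Normalize()` normalizes accumulated gradients by scaling them by `1 / iter_size`, the inverse of the accumulation count. This fixes accumulation for AdaGrad and is more obvious than compensating through the learning rate and weight decay (dividing the rate and multiplying the decay by `iter_size`), as was done in 55585f5.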
BVLC · May 28, 2015 · 0e7a078 · 0e7a078
1 parent 92ab737
commit 0e7a078
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 3 deletions.
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
@@ -81,6 +81,7 @@ class SGDSolver : public Solver<Dtype> {
   void PreSolve();
   Dtype GetLearningRate();
   virtual void ApplyUpdate();
+  virtual void Normalize(int param_id);
   virtual void Regularize(int param_id);
   virtual void ComputeUpdateValue(int param_id, Dtype rate);
   virtual void ClipGradients();

diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
@@ -487,21 +487,47 @@ void SGDSolver<Dtype>::ApplyUpdate() {
   }
   ClipGradients();
   for (int param_id = 0; param_id < this->net_->params().size(); ++param_id) {
+    Normalize(param_id);
     Regularize(param_id);
-    ComputeUpdateValue(param_id, rate / this->param_.iter_size());
+    ComputeUpdateValue(param_id, rate);
   }
   this->net_->Update();
 }
 
+template <typename Dtype>
+void SGDSolver<Dtype>::Normalize(int param_id) {
+  if (this->param_.iter_size() == 1) { return; }
+  // Scale gradient to counterbalance accumulation.
+  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const Dtype accum_normalization = Dtype(1.) / this->param_.iter_size();
+  switch (Caffe::mode()) {
+  case Caffe::CPU: {
+    caffe_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_cpu_diff());
+    break;
+  }
+  case Caffe::GPU: {
+#ifndef CPU_ONLY
+    caffe_gpu_scal(net_params[param_id]->count(), accum_normalization,
+        net_params[param_id]->mutable_gpu_diff());
+#else
+    NO_GPU;
+#endif
+    break;
+  }
+  default:
+    LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
+  }
+}
+
 template <typename Dtype>
 void SGDSolver<Dtype>::Regularize(int param_id) {
   const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
   const vector<float>& net_params_weight_decay =
       this->net_->params_weight_decay();
   Dtype weight_decay = this->param_.weight_decay();
   string regularization_type = this->param_.regularization_type();
-  Dtype local_decay = weight_decay * net_params_weight_decay[param_id]
-                                   * this->param_.iter_size();
+  Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
   switch (Caffe::mode()) {
   case Caffe::CPU: {
     if (local_decay) {