From 149a176dc2f35f2218cd7a639157e90c59b12e72 Mon Sep 17 00:00:00 2001
From: Jeff Donahue
Date: Fri, 25 Jul 2014 20:40:07 -0700
Subject: [PATCH 1/2] Print blob L1 norms during forward/backward passes and
 updates if new "debug_info" field in SolverParameter is set.

---
 include/caffe/blob.hpp      |  4 ++
 include/caffe/net.hpp       | 18 +++++--
 src/caffe/blob.cpp          | 70 +++++++++++++++++++++++++++
 src/caffe/net.cpp           | 95 ++++++++++++++++++++++++++++++++-----
 src/caffe/proto/caffe.proto |  4 ++
 src/caffe/solver.cpp        | 10 +++-
 6 files changed, 186 insertions(+), 15 deletions(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index bbea86aea69..ab7a0f60f80 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -76,6 +76,10 @@ class Blob {
   void FromProto(const BlobProto& proto);
   void ToProto(BlobProto* proto, bool write_diff = false) const;
 
+  // Compute the sum of absolute values (L1 norm) of the data or diff.
+  Dtype asum_data() const;
+  Dtype asum_diff() const;
+
   // Set the data_/diff_ shared_ptr to point to the SyncedMemory holding the
   // data_/diff_ of Blob other -- useful in layers which simply perform a copy
   // in their forward or backward pass.
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 481a64979f1..7548011d973 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -95,8 +95,9 @@ class Net {
   // returns the parameters
   inline vector<shared_ptr<Blob<Dtype> > >& params() { return params_; }
   // returns the parameter learning rate multipliers
-  inline vector<float>& params_lr() {return params_lr_; }
+  inline vector<float>& params_lr() { return params_lr_; }
   inline vector<float>& params_weight_decay() { return params_weight_decay_; }
+  const map<string, int>& param_names_index() { return param_names_index_; }
   // Input and output blob numbers
   inline int num_inputs() { return net_input_blobs_.size(); }
   inline int num_outputs() { return net_output_blobs_.size(); }
@@ -111,7 +112,8 @@ class Net {
   const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name);
   bool has_layer(const string& layer_name);
   const shared_ptr<Layer<Dtype> > layer_by_name(const string& layer_name);
-  const map<string, int>& param_names_index() { return param_names_index_; }
+
+  void set_debug_info(const bool value) { debug_info_ = value; }
 
  protected:
   // Helpers for Init.
@@ -125,6 +127,12 @@ class Net {
                  map<string, int>* blob_name_to_idx);
   void AppendParam(const NetParameter& param, const int layer_id,
                    const int param_id);
+
+  // Helpers for displaying debug info.
+  void ForwardDebugInfo(const int layer_id);
+  void BackwardDebugInfo(const int layer_id);
+  void UpdateDebugInfo(const int param_id);
+
   // Function to get misc parameters, e.g. the learning rate multiplier and
   // weight decay.
   void GetLearningRateAndWeightDecay();
@@ -150,7 +158,8 @@ class Net {
   vector<vector<Blob<Dtype>*> > top_vecs_;
   vector<vector<int> > top_id_vecs_;
   vector<int> param_owners_;
-  vector<pair<int, int> > layer_param_indices_;
+  vector<string> param_display_names_;
+  vector<pair<int, int> > param_layer_indices_;
   map<string, int> param_names_index_;
   // blob indices for the input and the output of the net
   vector<int> net_input_blob_indices_;
@@ -166,6 +175,9 @@ class Net {
   vector<float> params_weight_decay_;
   // The bytes of memory used by this net
   size_t memory_used_;
+  // Whether to compute and display debug info for the net.
+  bool debug_info_;
+
   DISABLE_COPY_AND_ASSIGN(Net);
 };
 
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 1051eaa1c2c..738b549e10c 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -137,6 +137,76 @@ void Blob<Dtype>::Update() {
   }
 }
 
+template <> unsigned int Blob<unsigned int>::asum_data() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <> int Blob<int>::asum_data() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <typename Dtype>
+Dtype Blob<Dtype>::asum_data() const {
+  if (!data_) { return 0; }
+  switch (data_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    return caffe_cpu_asum(count_, cpu_data());
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+  {
+    Dtype asum;
+    caffe_gpu_asum(count_, gpu_data(), &asum);
+    return asum;
+  }
+#else
+    NO_GPU;
+#endif
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+  }
+  return 0;
+}
+
+template <> unsigned int Blob<unsigned int>::asum_diff() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <> int Blob<int>::asum_diff() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <typename Dtype>
+Dtype Blob<Dtype>::asum_diff() const {
+  if (!diff_) { return 0; }
+  switch (diff_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    return caffe_cpu_asum(count_, cpu_diff());
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+  {
+    Dtype asum;
+    caffe_gpu_asum(count_, gpu_diff(), &asum);
+    return asum;
+  }
+#else
+    NO_GPU;
+#endif
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
+  }
+  return 0;
+}
+
 template <typename Dtype>
 void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
   if (num_ != source.num() || channels_ != source.channels() ||
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 7673608a5ed..612cbb1b60d 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -163,6 +163,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   GetLearningRateAndWeightDecay();
   LOG(INFO) << "Network initialization done.";
   LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  // Don't display debug info by default.
+  debug_info_ = false;
 }
 
 // Helper for Net::Init: add a new input or top blob to the net. (Inputs have
@@ -242,13 +244,17 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
                              const int param_id) {
   const LayerParameter& layer_param = layers_[layer_id]->layer_param();
   const int param_size = layer_param.param_size();
-  string param_name;
-  if (param_size) {
-    param_name = layer_param.param(param_id);
+  string param_name = param_size ? layer_param.param(param_id) : "";
layer_param.param(param_id) : ""; + if (param_name.size()) { + param_display_names_.push_back(param_name); + } else { + ostringstream param_display_name; + param_display_name << param_id; + param_display_names_.push_back(param_display_name.str()); } const int net_param_id = params_.size(); params_.push_back(layers_[layer_id]->blobs()[param_id]); - layer_param_indices_.push_back(make_pair(layer_id, param_id)); + param_layer_indices_.push_back(make_pair(layer_id, param_id)); if (!param_size || !param_name.size() || (param_name.size() && param_names_index_.find(param_name) == param_names_index_.end())) { // This layer "owns" this parameter blob -- it is either anonymous @@ -263,7 +269,7 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, const int owner_net_param_id = param_names_index_[param_name]; param_owners_.push_back(owner_net_param_id); const pair& owner_index = - layer_param_indices_[owner_net_param_id]; + param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " @@ -339,6 +345,7 @@ Dtype Net::ForwardFromTo(int start, int end) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], &top_vecs_[i]); loss += layer_loss; + if (debug_info_) { ForwardDebugInfo(i); } } return loss; } @@ -402,10 +409,75 @@ void Net::BackwardFromTo(int start, int end) { if (layer_need_backward_[i]) { layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], &bottom_vecs_[i]); + if (debug_info_) { BackwardDebugInfo(i); } } } } +template +void Net::ForwardDebugInfo(const int layer_id) { + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const Blob& blob = *top_vecs_[layer_id][top_id]; + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + const Dtype asum = blob.asum_data(); + const Dtype asum_mean = asum / blob.count(); + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name + << " data: " << asum << " (" << asum_mean << ")"; + } +} + +template +void Net::BackwardDebugInfo(const int layer_id) { + const vector*>& bottom_vec = bottom_vecs_[layer_id]; + for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { + if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } + const Blob& blob = *bottom_vec[bottom_id]; + const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + const Dtype asum = blob.asum_diff(); + const Dtype asum_mean = asum / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << asum << " (" << asum_mean << ")"; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const Dtype asum = blob.asum_diff(); + const Dtype asum_mean = asum / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << asum << " (" << asum_mean << ")"; + } +} + +template +void Net::UpdateDebugInfo(const int param_id) { + const Blob& blob = *params_[param_id]; + const int param_owner = param_owners_[param_id]; + const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; + const string& param_display_name = param_display_names_[param_id]; + const 
+  const Dtype asum_diff_mean = asum_diff / blob.count();
+  if (param_owner < 0) {
+    const Dtype asum_data = blob.asum_data();
+    const Dtype asum_data_mean = asum_data / blob.count();
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param " << param_display_name
+        << " data: " << asum_data << " (" << asum_data_mean << ");"
+        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+  } else {
+    const string& owner_layer_name =
+        layer_names_[param_layer_indices_[param_owner].first];
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param blob " << param_display_name
+        << " (owned by layer " << owner_layer_name << ", "
+        << "param " << param_display_names_[param_owners_[param_id]] << ")"
+        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+  }
+}
+
 template <typename Dtype>
 void Net<Dtype>::ShareTrainedLayersWith(Net* other) {
   int num_source_layers = other->layers().size();
@@ -516,9 +588,8 @@ void Net<Dtype>::Update() {
   // diff. (Assumes that the learning rate, weight decay, etc. have already been
   // accounted for in the current diff.)
   for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) {
-      continue;
-    }
+    if (param_owners_[i] < 0) { continue; }
+    if (debug_info_) { UpdateDebugInfo(i); }
     const int count = params_[i]->count();
     const Dtype* this_diff;
     Dtype* owner_diff;
@@ -534,6 +605,8 @@ void Net<Dtype>::Update() {
       owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
       caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
       break;
+#else
+      NO_GPU;
 #endif
     default:
       LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
@@ -541,9 +614,9 @@ void Net<Dtype>::Update() {
   }
   // Now, update the owned parameters.
   for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) {
-      params_[i]->Update();
-    }
+    if (param_owners_[i] >= 0) { continue; }
+    if (debug_info_) { UpdateDebugInfo(i); }
+    params_[i]->Update();
   }
 }
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 34cf8443f56..5e437dff327 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -105,6 +105,10 @@ message SolverParameter {
   // random number generator -- useful for reproducible results. Otherwise,
   // (and by default) initialize using a seed derived from the system clock.
   optional int64 random_seed = 20 [default = -1];
+
+  // If true, print information about the state of the net that may help with
+  // debugging learning problems.
+  optional bool debug_info = 23 [default = false];
 }
 
 // A message that stores the solver snapshots
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index a33149721bf..0dfc7f0a52d 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -53,6 +53,8 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
     LOG(INFO) << "Creating training net from file: " << param_.train_net();
     net_.reset(new Net<Dtype>(param_.train_net()));
   }
+  CHECK(net_) << "Training net uninitialized.";
+  net_->set_debug_info(param_.debug_info());
   const int num_test_net_params = param_.test_net_param_size();
   const int num_test_net_files = param_.test_net_size();
   const int num_test_nets = num_test_net_params + num_test_net_files;
@@ -100,11 +102,17 @@ void Solver<Dtype>::Solve(const char* resume_file) {
   // should be given, and we will just provide dummy vecs.
   vector<Blob<Dtype>*> bottom_vec;
   while (iter_++ < param_.max_iter()) {
+    const bool display = param_.display() && iter_ % param_.display() == 0;
+    if (display) {
+      net_->set_debug_info(param_.debug_info());
+    } else {
+      net_->set_debug_info(false);
+    }
     Dtype loss = net_->ForwardBackward(bottom_vec);
     ComputeUpdateValue();
     net_->Update();
 
-    if (param_.display() && iter_ % param_.display() == 0) {
+    if (display) {
      LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
     }
     if (param_.test_interval() && iter_ % param_.test_interval() == 0) {

From 69997119747d0c61424a8338ae70c0a805b74f0d Mon Sep 17 00:00:00 2001
From: Jeff Donahue
Date: Sat, 26 Jul 2014 15:18:41 -0700
Subject: [PATCH 2/2] Print just the mean absolute value (no sum/l1norm)

---
 src/caffe/net.cpp | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 612cbb1b60d..8e99d307f32 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -419,11 +419,10 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
   for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
     const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
     const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-    const Dtype asum = blob.asum_data();
-    const Dtype asum_mean = asum / blob.count();
+    const Dtype asum_mean = blob.asum_data() / blob.count();
     LOG(INFO) << "    [Forward] "
         << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-        << " data: " << asum << " (" << asum_mean << ")";
+        << " data: " << asum_mean;
   }
 }
 
@@ -434,21 +433,19 @@ void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
   const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
   for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
     if (!bottom_need_backward_[layer_id][bottom_id]) { continue; }
     const Blob<Dtype>& blob = *bottom_vec[bottom_id];
     const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-    const Dtype asum = blob.asum_diff();
-    const Dtype asum_mean = asum / blob.count();
+    const Dtype asum_mean = blob.asum_diff() / blob.count();
     LOG(INFO) << "    [Backward] "
         << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-        << " diff: " << asum << " (" << asum_mean << ")";
+        << " diff: " << asum_mean;
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
     if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
     const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-    const Dtype asum = blob.asum_diff();
-    const Dtype asum_mean = asum / blob.count();
+    const Dtype asum_mean = blob.asum_diff() / blob.count();
     LOG(INFO) << "    [Backward] "
         << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-        << " diff: " << asum << " (" << asum_mean << ")";
+        << " diff: " << asum_mean;
   }
 }
@@ -458,15 +455,12 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
   const Blob<Dtype>& blob = *params_[param_id];
   const int param_owner = param_owners_[param_id];
   const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
   const string& param_display_name = param_display_names_[param_id];
-  const Dtype asum_diff = blob.asum_diff();
-  const Dtype asum_diff_mean = asum_diff / blob.count();
+  const Dtype asum_diff_mean = blob.asum_diff() / blob.count();
   if (param_owner < 0) {
-    const Dtype asum_data = blob.asum_data();
-    const Dtype asum_data_mean = asum_data / blob.count();
+    const Dtype asum_data_mean = blob.asum_data() / blob.count();
     LOG(INFO) << "    [Update] Layer " << layer_name
         << ", param " << param_display_name
-        << " data: " << asum_data << " (" << asum_data_mean << ");"
-        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+        << " data: " << asum_data_mean << "; diff: " << asum_diff_mean;
   } else {
     const string& owner_layer_name =
         layer_names_[param_layer_indices_[param_owner].first];
@@ -474,7 +468,7 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
     LOG(INFO) << "    [Update] Layer " << layer_name
         << ", param blob " << param_display_name
         << " (owned by layer " << owner_layer_name << ", "
         << "param " << param_display_names_[param_owners_[param_id]] << ")"
-        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+        << " diff: " << asum_diff_mean;
   }
 }
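
Usage sketch (not part of the patches above): with both patches applied, the per-blob debug output is controlled entirely from the solver configuration, and patch 1 only turns it on for display iterations. A minimal SolverParameter fragment might look like the following; the net path and hyperparameter values are placeholders shown for context, and only the debug_info field comes from these patches:

    # solver.prototxt sketch (hypothetical values)
    train_net: "examples/mnist/lenet_train.prototxt"  # placeholder path
    base_lr: 0.01
    max_iter: 10000
    display: 100        # debug info is printed only on display iterations
    debug_info: true    # new field (id 23, default false) added by patch 1

When enabled, each forward pass logs the mean absolute value of every top blob's data, each backward pass logs the same for bottom-blob and parameter diffs, and Net::Update() logs parameter data/diff magnitudes; patch 2 reduces these logs from "sum (mean)" to the mean alone.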