From 149a176dc2f35f2218cd7a639157e90c59b12e72 Mon Sep 17 00:00:00 2001
From: Jeff Donahue
Date: Fri, 25 Jul 2014 20:40:07 -0700
Subject: [PATCH 1/2] Print blob L1 norms during forward/backward passes and
 updates if new "debug_info" field in SolverParameter is set.

---
 include/caffe/blob.hpp      |  4 ++
 include/caffe/net.hpp       | 18 +++++--
 src/caffe/blob.cpp          | 70 +++++++++++++++++++++++++++
 src/caffe/net.cpp           | 95 ++++++++++++++++++++++++++++++++-----
 src/caffe/proto/caffe.proto |  4 ++
 src/caffe/solver.cpp        | 10 +++-
 6 files changed, 186 insertions(+), 15 deletions(-)

diff --git a/include/caffe/blob.hpp b/include/caffe/blob.hpp
index bbea86aea69..ab7a0f60f80 100644
--- a/include/caffe/blob.hpp
+++ b/include/caffe/blob.hpp
@@ -76,6 +76,10 @@ class Blob {
   void FromProto(const BlobProto& proto);
   void ToProto(BlobProto* proto, bool write_diff = false) const;
 
+  // Compute the sum of absolute values (L1 norm) of the data or diff.
+  Dtype asum_data() const;
+  Dtype asum_diff() const;
+
   // Set the data_/diff_ shared_ptr to point to the SyncedMemory holding the
   // data_/diff_ of Blob other -- useful in layers which simply perform a copy
   // in their forward or backward pass.
diff --git a/include/caffe/net.hpp b/include/caffe/net.hpp
index 481a64979f1..7548011d973 100644
--- a/include/caffe/net.hpp
+++ b/include/caffe/net.hpp
@@ -95,8 +95,9 @@ class Net {
   // returns the parameters
   inline vector<shared_ptr<Blob<Dtype> > >& params() { return params_; }
   // returns the parameter learning rate multipliers
-  inline vector<float>& params_lr() {return params_lr_; }
+  inline vector<float>& params_lr() { return params_lr_; }
   inline vector<float>& params_weight_decay() { return params_weight_decay_; }
+  const map<string, int>& param_names_index() { return param_names_index_; }
   // Input and output blob numbers
   inline int num_inputs() { return net_input_blobs_.size(); }
   inline int num_outputs() { return net_output_blobs_.size(); }
@@ -111,7 +112,8 @@ class Net {
   const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name);
   bool has_layer(const string& layer_name);
   const shared_ptr<Layer<Dtype> > layer_by_name(const string& layer_name);
-  const map<string, int>& param_names_index() { return param_names_index_; }
+
+  void set_debug_info(const bool value) { debug_info_ = value; }
 
  protected:
   // Helpers for Init.
@@ -125,6 +127,12 @@ class Net {
                  map<string, int>* blob_name_to_idx);
   void AppendParam(const NetParameter& param, const int layer_id,
                    const int param_id);
+
+  // Helpers for displaying debug info.
+  void ForwardDebugInfo(const int layer_id);
+  void BackwardDebugInfo(const int layer_id);
+  void UpdateDebugInfo(const int param_id);
+
   // Function to get misc parameters, e.g. the learning rate multiplier and
   // weight decay.
   void GetLearningRateAndWeightDecay();
@@ -150,7 +158,8 @@ class Net {
   vector<vector<Blob<Dtype>*> > top_vecs_;
   vector<vector<int> > top_id_vecs_;
   vector<int> param_owners_;
-  vector<pair<int, int> > layer_param_indices_;
+  vector<string> param_display_names_;
+  vector<pair<int, int> > param_layer_indices_;
   map<string, int> param_names_index_;
   // blob indices for the input and the output of the net
   vector<int> net_input_blob_indices_;
@@ -166,6 +175,9 @@ class Net {
   vector<float> params_weight_decay_;
   // The bytes of memory used by this net
   size_t memory_used_;
+  // Whether to compute and display debug info for the net.
+  bool debug_info_;
+
   DISABLE_COPY_AND_ASSIGN(Net);
 };
 
diff --git a/src/caffe/blob.cpp b/src/caffe/blob.cpp
index 1051eaa1c2c..738b549e10c 100644
--- a/src/caffe/blob.cpp
+++ b/src/caffe/blob.cpp
@@ -137,6 +137,76 @@ void Blob<Dtype>::Update() {
   }
 }
 
+template <> unsigned int Blob<unsigned int>::asum_data() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <> int Blob<int>::asum_data() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <typename Dtype>
+Dtype Blob<Dtype>::asum_data() const {
+  if (!data_) { return 0; }
+  switch (data_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    return caffe_cpu_asum(count_, cpu_data());
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+  {
+    Dtype asum;
+    caffe_gpu_asum(count_, gpu_data(), &asum);
+    return asum;
+  }
+#else
+    NO_GPU;
+#endif
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
+  }
+  return 0;
+}
+
+template <> unsigned int Blob<unsigned int>::asum_diff() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <> int Blob<int>::asum_diff() const {
+  NOT_IMPLEMENTED;
+  return 0;
+}
+
+template <typename Dtype>
+Dtype Blob<Dtype>::asum_diff() const {
+  if (!diff_) { return 0; }
+  switch (diff_->head()) {
+  case SyncedMemory::HEAD_AT_CPU:
+    return caffe_cpu_asum(count_, cpu_diff());
+  case SyncedMemory::HEAD_AT_GPU:
+  case SyncedMemory::SYNCED:
+#ifndef CPU_ONLY
+  {
+    Dtype asum;
+    caffe_gpu_asum(count_, gpu_diff(), &asum);
+    return asum;
+  }
+#else
+    NO_GPU;
+#endif
+  case SyncedMemory::UNINITIALIZED:
+    return 0;
+  default:
+    LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
+  }
+  return 0;
+}
+
 template <typename Dtype>
 void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
   if (num_ != source.num() || channels_ != source.channels() ||
diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 7673608a5ed..612cbb1b60d 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -163,6 +163,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
   GetLearningRateAndWeightDecay();
   LOG(INFO) << "Network initialization done.";
   LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
+  // Don't display debug info by default.
+  debug_info_ = false;
 }
 
 // Helper for Net::Init: add a new input or top blob to the net. (Inputs have
@@ -242,13 +244,17 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
                              const int param_id) {
   const LayerParameter& layer_param = layers_[layer_id]->layer_param();
   const int param_size = layer_param.param_size();
-  string param_name;
-  if (param_size) {
-    param_name = layer_param.param(param_id);
+  string param_name = param_size ? layer_param.param(param_id) : "";
layer_param.param(param_id) : ""; + if (param_name.size()) { + param_display_names_.push_back(param_name); + } else { + ostringstream param_display_name; + param_display_name << param_id; + param_display_names_.push_back(param_display_name.str()); } const int net_param_id = params_.size(); params_.push_back(layers_[layer_id]->blobs()[param_id]); - layer_param_indices_.push_back(make_pair(layer_id, param_id)); + param_layer_indices_.push_back(make_pair(layer_id, param_id)); if (!param_size || !param_name.size() || (param_name.size() && param_names_index_.find(param_name) == param_names_index_.end())) { // This layer "owns" this parameter blob -- it is either anonymous @@ -263,7 +269,7 @@ void Net::AppendParam(const NetParameter& param, const int layer_id, const int owner_net_param_id = param_names_index_[param_name]; param_owners_.push_back(owner_net_param_id); const pair& owner_index = - layer_param_indices_[owner_net_param_id]; + param_layer_indices_[owner_net_param_id]; const int owner_layer_id = owner_index.first; const int owner_param_id = owner_index.second; LOG(INFO) << "Sharing parameters '" << param_name << "' owned by " @@ -339,6 +345,7 @@ Dtype Net::ForwardFromTo(int start, int end) { // LOG(ERROR) << "Forwarding " << layer_names_[i]; Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], &top_vecs_[i]); loss += layer_loss; + if (debug_info_) { ForwardDebugInfo(i); } } return loss; } @@ -402,10 +409,75 @@ void Net::BackwardFromTo(int start, int end) { if (layer_need_backward_[i]) { layers_[i]->Backward( top_vecs_[i], bottom_need_backward_[i], &bottom_vecs_[i]); + if (debug_info_) { BackwardDebugInfo(i); } } } } +template +void Net::ForwardDebugInfo(const int layer_id) { + for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) { + const Blob& blob = *top_vecs_[layer_id][top_id]; + const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]]; + const Dtype asum = blob.asum_data(); + const Dtype asum_mean = asum / blob.count(); + LOG(INFO) << " [Forward] " + << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name + << " data: " << asum << " (" << asum_mean << ")"; + } +} + +template +void Net::BackwardDebugInfo(const int layer_id) { + const vector*>& bottom_vec = bottom_vecs_[layer_id]; + for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) { + if (!bottom_need_backward_[layer_id][bottom_id]) { continue; } + const Blob& blob = *bottom_vec[bottom_id]; + const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]]; + const Dtype asum = blob.asum_diff(); + const Dtype asum_mean = asum / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name + << " diff: " << asum << " (" << asum_mean << ")"; + } + for (int param_id = 0; param_id < layers_[layer_id]->blobs().size(); + ++param_id) { + if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; } + const Blob& blob = *layers_[layer_id]->blobs()[param_id]; + const Dtype asum = blob.asum_diff(); + const Dtype asum_mean = asum / blob.count(); + LOG(INFO) << " [Backward] " + << "Layer " << layer_names_[layer_id] << ", param blob " << param_id + << " diff: " << asum << " (" << asum_mean << ")"; + } +} + +template +void Net::UpdateDebugInfo(const int param_id) { + const Blob& blob = *params_[param_id]; + const int param_owner = param_owners_[param_id]; + const string& layer_name = layer_names_[param_layer_indices_[param_id].first]; + const string& param_display_name = param_display_names_[param_id]; + const 
+  const Dtype asum_diff_mean = asum_diff / blob.count();
+  if (param_owner < 0) {
+    const Dtype asum_data = blob.asum_data();
+    const Dtype asum_data_mean = asum_data / blob.count();
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param " << param_display_name
+        << " data: " << asum_data << " (" << asum_data_mean << ");"
+        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+  } else {
+    const string& owner_layer_name =
+        layer_names_[param_layer_indices_[param_owner].first];
+    LOG(INFO) << "    [Update] Layer " << layer_name
+        << ", param blob " << param_display_name
+        << " (owned by layer " << owner_layer_name << ", "
+        << "param " << param_display_names_[param_owners_[param_id]] << ")"
+        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+  }
+}
+
 template <typename Dtype>
 void Net<Dtype>::ShareTrainedLayersWith(Net* other) {
   int num_source_layers = other->layers().size();
@@ -516,9 +588,8 @@ void Net<Dtype>::Update() {
   // diff. (Assumes that the learning rate, weight decay, etc. have already been
   // accounted for in the current diff.)
   for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) {
-      continue;
-    }
+    if (param_owners_[i] < 0) { continue; }
+    if (debug_info_) { UpdateDebugInfo(i); }
     const int count = params_[i]->count();
     const Dtype* this_diff;
     Dtype* owner_diff;
@@ -534,6 +605,8 @@ void Net<Dtype>::Update() {
       owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
       caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
       break;
+#else
+      NO_GPU;
 #endif
     default:
       LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
@@ -541,9 +614,9 @@ void Net<Dtype>::Update() {
   }
   // Now, update the owned parameters.
   for (int i = 0; i < params_.size(); ++i) {
-    if (param_owners_[i] < 0) {
-      params_[i]->Update();
-    }
+    if (param_owners_[i] >= 0) { continue; }
+    if (debug_info_) { UpdateDebugInfo(i); }
+    params_[i]->Update();
   }
 }
diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto
index 34cf8443f56..5e437dff327 100644
--- a/src/caffe/proto/caffe.proto
+++ b/src/caffe/proto/caffe.proto
@@ -105,6 +105,10 @@ message SolverParameter {
   // random number generator -- useful for reproducible results. Otherwise,
   // (and by default) initialize using a seed derived from the system clock.
   optional int64 random_seed = 20 [default = -1];
+
+  // If true, print information about the state of the net that may help with
+  // debugging learning problems.
+  optional bool debug_info = 23 [default = false];
 }
 
 // A message that stores the solver snapshots
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index a33149721bf..0dfc7f0a52d 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -53,6 +53,8 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
     LOG(INFO) << "Creating training net from file: " << param_.train_net();
     net_.reset(new Net<Dtype>(param_.train_net()));
   }
+  CHECK(net_) << "Training net uninitialized.";
+  net_->set_debug_info(param_.debug_info());
   const int num_test_net_params = param_.test_net_param_size();
   const int num_test_net_files = param_.test_net_size();
   const int num_test_nets = num_test_net_params + num_test_net_files;
@@ -100,11 +102,17 @@ void Solver<Dtype>::Solve(const char* resume_file) {
   // should be given, and we will just provide dummy vecs.
   vector<Blob<Dtype>*> bottom_vec;
   while (iter_++ < param_.max_iter()) {
+    const bool display = param_.display() && iter_ % param_.display() == 0;
+    if (display) {
+      net_->set_debug_info(param_.debug_info());
+    } else {
+      net_->set_debug_info(false);
+    }
     Dtype loss = net_->ForwardBackward(bottom_vec);
     ComputeUpdateValue();
     net_->Update();
 
-    if (param_.display() && iter_ % param_.display() == 0) {
+    if (display) {
      LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
     }
     if (param_.test_interval() && iter_ % param_.test_interval() == 0) {

From 69997119747d0c61424a8338ae70c0a805b74f0d Mon Sep 17 00:00:00 2001
From: Jeff Donahue
Date: Sat, 26 Jul 2014 15:18:41 -0700
Subject: [PATCH 2/2] Print just the mean absolute value (no sum/l1norm)

---
 src/caffe/net.cpp | 26 ++++++++++----------------
 1 file changed, 10 insertions(+), 16 deletions(-)

diff --git a/src/caffe/net.cpp b/src/caffe/net.cpp
index 612cbb1b60d..8e99d307f32 100644
--- a/src/caffe/net.cpp
+++ b/src/caffe/net.cpp
@@ -419,11 +419,10 @@ void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
   for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
     const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
     const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
-    const Dtype asum = blob.asum_data();
-    const Dtype asum_mean = asum / blob.count();
+    const Dtype asum_mean = blob.asum_data() / blob.count();
     LOG(INFO) << "    [Forward] "
         << "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
-        << " data: " << asum << " (" << asum_mean << ")";
+        << " data: " << asum_mean;
   }
 }
 
@@ -434,21 +433,19 @@ void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
   const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
   for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
     if (!bottom_need_backward_[layer_id][bottom_id]) { continue; }
     const Blob<Dtype>& blob = *bottom_vec[bottom_id];
     const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
-    const Dtype asum = blob.asum_diff();
-    const Dtype asum_mean = asum / blob.count();
+    const Dtype asum_mean = blob.asum_diff() / blob.count();
     LOG(INFO) << "    [Backward] "
         << "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
-        << " diff: " << asum << " (" << asum_mean << ")";
+        << " diff: " << asum_mean;
   }
   for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
        ++param_id) {
     if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
     const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
-    const Dtype asum = blob.asum_diff();
-    const Dtype asum_mean = asum / blob.count();
+    const Dtype asum_mean = blob.asum_diff() / blob.count();
     LOG(INFO) << "    [Backward] "
         << "Layer " << layer_names_[layer_id] << ", param blob " << param_id
-        << " diff: " << asum << " (" << asum_mean << ")";
+        << " diff: " << asum_mean;
   }
 }
@@ -458,15 +455,12 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
   const Blob<Dtype>& blob = *params_[param_id];
   const int param_owner = param_owners_[param_id];
   const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
   const string& param_display_name = param_display_names_[param_id];
-  const Dtype asum_diff = blob.asum_diff();
-  const Dtype asum_diff_mean = asum_diff / blob.count();
+  const Dtype asum_diff_mean = blob.asum_diff() / blob.count();
   if (param_owner < 0) {
-    const Dtype asum_data = blob.asum_data();
-    const Dtype asum_data_mean = asum_data / blob.count();
+    const Dtype asum_data_mean = blob.asum_data() / blob.count();
     LOG(INFO) << "    [Update] Layer " << layer_name
         << ", param " << param_display_name
-        << " data: " << asum_data << " (" << asum_data_mean << ");"
-        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+        << " data: " << asum_data_mean << "; diff: " << asum_diff_mean;
   } else {
     const string& owner_layer_name =
         layer_names_[param_layer_indices_[param_owner].first];
@@ -474,7 +468,7 @@ void Net<Dtype>::UpdateDebugInfo(const int param_id) {
     LOG(INFO) << "    [Update] Layer " << layer_name
         << ", param blob " << param_display_name
         << " (owned by layer " << owner_layer_name << ", "
         << "param " << param_display_names_[param_owners_[param_id]] << ")"
-        << " diff: " << asum_diff << " (" << asum_diff_mean << ")";
+        << " diff: " << asum_diff_mean;
   }
 }
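
Usage sketch (not part of the patches above): with both patches applied, the per-blob debug output is controlled entirely from the solver configuration, and patch 1 only turns it on for display iterations. A minimal SolverParameter fragment might look like the following; the net path and hyperparameter values are placeholders shown for context, and only the debug_info field comes from these patches:

    # solver.prototxt sketch (hypothetical values)
    train_net: "examples/mnist/lenet_train.prototxt"  # placeholder path
    base_lr: 0.01
    max_iter: 10000
    display: 100        # debug info is printed only on display iterations
    debug_info: true    # new field (id 23, default false) added by patch 1

When enabled, each forward pass logs the mean absolute value of every top blob's data, each backward pass logs the same for bottom-blob and parameter diffs, and Net::Update() logs parameter data/diff magnitudes; patch 2 reduces these logs from "sum (mean)" to the mean alone.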