Print blob debug info during training if SolverParameter "debug_info" field is set #796

Merged (2 commits) on Jul 26, 2014
Changes from all commits
4 changes: 4 additions & 0 deletions include/caffe/blob.hpp
@@ -76,6 +76,10 @@ class Blob {
void FromProto(const BlobProto& proto);
void ToProto(BlobProto* proto, bool write_diff = false) const;

// Compute the sum of absolute values (L1 norm) of the data or diff.
Dtype asum_data() const;
Dtype asum_diff() const;

// Set the data_/diff_ shared_ptr to point to the SyncedMemory holding the
// data_/diff_ of Blob other -- useful in layers which simply perform a copy
// in their forward or backward pass.
18 changes: 15 additions & 3 deletions include/caffe/net.hpp
@@ -95,8 +95,9 @@ class Net {
// returns the parameters
inline vector<shared_ptr<Blob<Dtype> > >& params() { return params_; }
// returns the parameter learning rate multipliers
inline vector<float>& params_lr() {return params_lr_; }
inline vector<float>& params_lr() { return params_lr_; }
inline vector<float>& params_weight_decay() { return params_weight_decay_; }
const map<string, int>& param_names_index() { return param_names_index_; }
// Input and output blob numbers
inline int num_inputs() { return net_input_blobs_.size(); }
inline int num_outputs() { return net_output_blobs_.size(); }
@@ -111,7 +112,8 @@
const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name);
bool has_layer(const string& layer_name);
const shared_ptr<Layer<Dtype> > layer_by_name(const string& layer_name);
const map<string, int>& param_names_index() { return param_names_index_; }

void set_debug_info(const bool value) { debug_info_ = value; }

protected:
// Helpers for Init.
@@ -125,6 +127,12 @@
map<string, int>* blob_name_to_idx);
void AppendParam(const NetParameter& param, const int layer_id,
const int param_id);

// Helpers for displaying debug info.
void ForwardDebugInfo(const int layer_id);
void BackwardDebugInfo(const int layer_id);
void UpdateDebugInfo(const int param_id);

// Function to get misc parameters, e.g. the learning rate multiplier and
// weight decay.
void GetLearningRateAndWeightDecay();
@@ -150,7 +158,8 @@
vector<vector<Blob<Dtype>*> > top_vecs_;
vector<vector<int> > top_id_vecs_;
vector<int> param_owners_;
vector<pair<int, int> > layer_param_indices_;
vector<string> param_display_names_;
vector<pair<int, int> > param_layer_indices_;
map<string, int> param_names_index_;
// blob indices for the input and the output of the net
vector<int> net_input_blob_indices_;
@@ -166,6 +175,9 @@
vector<float> params_weight_decay_;
// The bytes of memory used by this net
size_t memory_used_;
// Whether to compute and display debug info for the net.
bool debug_info_;

DISABLE_COPY_AND_ASSIGN(Net);
};

70 changes: 70 additions & 0 deletions src/caffe/blob.cpp
@@ -137,6 +137,76 @@ void Blob<Dtype>::Update() {
}
}

template <> unsigned int Blob<unsigned int>::asum_data() const {
NOT_IMPLEMENTED;
return 0;
}

template <> int Blob<int>::asum_data() const {
NOT_IMPLEMENTED;
return 0;
}

template <typename Dtype>
Dtype Blob<Dtype>::asum_data() const {
if (!data_) { return 0; }
switch (data_->head()) {
case SyncedMemory::HEAD_AT_CPU:
return caffe_cpu_asum(count_, cpu_data());
case SyncedMemory::HEAD_AT_GPU:
case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
{
Dtype asum;
caffe_gpu_asum(count_, gpu_data(), &asum);
return asum;
}
#else
NO_GPU;
#endif
case SyncedMemory::UNINITIALIZED:
return 0;
default:
LOG(FATAL) << "Unknown SyncedMemory head state: " << data_->head();
}
return 0;
}

template <> unsigned int Blob<unsigned int>::asum_diff() const {
NOT_IMPLEMENTED;
return 0;
}

template <> int Blob<int>::asum_diff() const {
NOT_IMPLEMENTED;
return 0;
}

template <typename Dtype>
Dtype Blob<Dtype>::asum_diff() const {
if (!diff_) { return 0; }
switch (diff_->head()) {
case SyncedMemory::HEAD_AT_CPU:
return caffe_cpu_asum(count_, cpu_diff());
case SyncedMemory::HEAD_AT_GPU:
case SyncedMemory::SYNCED:
#ifndef CPU_ONLY
{
Dtype asum;
caffe_gpu_asum(count_, gpu_diff(), &asum);
return asum;
}
#else
NO_GPU;
#endif
case SyncedMemory::UNINITIALIZED:
return 0;
default:
LOG(FATAL) << "Unknown SyncedMemory head state: " << diff_->head();
}
return 0;
}

template <typename Dtype>
void Blob<Dtype>::CopyFrom(const Blob& source, bool copy_diff, bool reshape) {
if (num_ != source.num() || channels_ != source.channels() ||
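The new asum_data() / asum_diff() accessors return the L1 norm of a blob's data or diff; the debug-info helpers in net.cpp divide this by count() to report a mean absolute value per element. A minimal illustrative sketch of that computation (the helper name and the assumption of a non-empty Blob<float> are mine, not part of this PR):

#include <glog/logging.h>
#include "caffe/blob.hpp"

// Illustrative helper: log the mean absolute value of a blob's data and diff,
// mirroring what Net's debug-info helpers compute from asum_data()/asum_diff().
// Assumes blob.count() > 0.
template <typename Dtype>
void PrintBlobStats(const caffe::Blob<Dtype>& blob) {
  const Dtype data_mean = blob.asum_data() / blob.count();  // L1 norm / #elements
  const Dtype diff_mean = blob.asum_diff() / blob.count();
  LOG(INFO) << "data: " << data_mean << "; diff: " << diff_mean;
}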
89 changes: 78 additions & 11 deletions src/caffe/net.cpp
@@ -163,6 +163,8 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
GetLearningRateAndWeightDecay();
LOG(INFO) << "Network initialization done.";
LOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
// Don't display debug info by default.
debug_info_ = false;
}

// Helper for Net::Init: add a new input or top blob to the net. (Inputs have
@@ -242,13 +244,17 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
const int param_id) {
const LayerParameter& layer_param = layers_[layer_id]->layer_param();
const int param_size = layer_param.param_size();
string param_name;
if (param_size) {
param_name = layer_param.param(param_id);
string param_name = param_size ? layer_param.param(param_id) : "";
if (param_name.size()) {
param_display_names_.push_back(param_name);
} else {
ostringstream param_display_name;
param_display_name << param_id;
param_display_names_.push_back(param_display_name.str());
}
const int net_param_id = params_.size();
params_.push_back(layers_[layer_id]->blobs()[param_id]);
layer_param_indices_.push_back(make_pair(layer_id, param_id));
param_layer_indices_.push_back(make_pair(layer_id, param_id));
if (!param_size || !param_name.size() || (param_name.size() &&
param_names_index_.find(param_name) == param_names_index_.end())) {
// This layer "owns" this parameter blob -- it is either anonymous
@@ -263,7 +269,7 @@ void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
const int owner_net_param_id = param_names_index_[param_name];
param_owners_.push_back(owner_net_param_id);
const pair<int, int>& owner_index =
layer_param_indices_[owner_net_param_id];
param_layer_indices_[owner_net_param_id];
const int owner_layer_id = owner_index.first;
const int owner_param_id = owner_index.second;
LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
@@ -339,6 +345,7 @@ Dtype Net<Dtype>::ForwardFromTo(int start, int end) {
// LOG(ERROR) << "Forwarding " << layer_names_[i];
Dtype layer_loss = layers_[i]->Forward(bottom_vecs_[i], &top_vecs_[i]);
loss += layer_loss;
if (debug_info_) { ForwardDebugInfo(i); }
}
return loss;
}
@@ -402,10 +409,69 @@ void Net<Dtype>::BackwardFromTo(int start, int end) {
if (layer_need_backward_[i]) {
layers_[i]->Backward(
top_vecs_[i], bottom_need_backward_[i], &bottom_vecs_[i]);
if (debug_info_) { BackwardDebugInfo(i); }
}
}
}

template <typename Dtype>
void Net<Dtype>::ForwardDebugInfo(const int layer_id) {
for (int top_id = 0; top_id < top_vecs_[layer_id].size(); ++top_id) {
const Blob<Dtype>& blob = *top_vecs_[layer_id][top_id];
const string& blob_name = blob_names_[top_id_vecs_[layer_id][top_id]];
const Dtype asum_mean = blob.asum_data() / blob.count();
LOG(INFO) << " [Forward] "
<< "Layer " << layer_names_[layer_id] << ", top blob " << blob_name
<< " data: " << asum_mean;
}
}

template <typename Dtype>
void Net<Dtype>::BackwardDebugInfo(const int layer_id) {
const vector<Blob<Dtype>*>& bottom_vec = bottom_vecs_[layer_id];
for (int bottom_id = 0; bottom_id < bottom_vec.size(); ++bottom_id) {
if (!bottom_need_backward_[layer_id][bottom_id]) { continue; }
const Blob<Dtype>& blob = *bottom_vec[bottom_id];
const string& blob_name = blob_names_[bottom_id_vecs_[layer_id][bottom_id]];
const Dtype asum_mean = blob.asum_diff() / blob.count();
LOG(INFO) << " [Backward] "
<< "Layer " << layer_names_[layer_id] << ", bottom blob " << blob_name
<< " diff: " << asum_mean;
}
for (int param_id = 0; param_id < layers_[layer_id]->blobs().size();
++param_id) {
if (!layers_[layer_id]->param_propagate_down(param_id)) { continue; }
const Blob<Dtype>& blob = *layers_[layer_id]->blobs()[param_id];
const Dtype asum_mean = blob.asum_diff() / blob.count();
LOG(INFO) << " [Backward] "
<< "Layer " << layer_names_[layer_id] << ", param blob " << param_id
<< " diff: " << asum_mean;
}
}

template <typename Dtype>
void Net<Dtype>::UpdateDebugInfo(const int param_id) {
const Blob<Dtype>& blob = *params_[param_id];
const int param_owner = param_owners_[param_id];
const string& layer_name = layer_names_[param_layer_indices_[param_id].first];
const string& param_display_name = param_display_names_[param_id];
const Dtype asum_diff_mean = blob.asum_diff() / blob.count();
if (param_owner < 0) {
const Dtype asum_data_mean = blob.asum_data() / blob.count();
LOG(INFO) << " [Update] Layer " << layer_name
<< ", param " << param_display_name
<< " data: " << asum_data_mean << "; diff: " << asum_diff_mean;
} else {
const string& owner_layer_name =
layer_names_[param_layer_indices_[param_owner].first];
LOG(INFO) << " [Update] Layer " << layer_name
<< ", param blob " << param_display_name
<< " (owned by layer " << owner_layer_name << ", "
<< "param " << param_display_names_[param_owners_[param_id]] << ")"
<< " diff: " << asum_diff_mean;
}
}

template <typename Dtype>
void Net<Dtype>::ShareTrainedLayersWith(Net* other) {
int num_source_layers = other->layers().size();
@@ -516,9 +582,8 @@ void Net<Dtype>::Update() {
// diff. (Assumes that the learning rate, weight decay, etc. have already been
// accounted for in the current diff.)
for (int i = 0; i < params_.size(); ++i) {
if (param_owners_[i] < 0) {
continue;
}
if (param_owners_[i] < 0) { continue; }
if (debug_info_) { UpdateDebugInfo(i); }
const int count = params_[i]->count();
const Dtype* this_diff;
Dtype* owner_diff;
@@ -534,16 +599,18 @@
owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
break;
#else
NO_GPU;
#endif
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}
// Now, update the owned parameters.
for (int i = 0; i < params_.size(); ++i) {
if (param_owners_[i] < 0) {
params_[i]->Update();
}
if (param_owners_[i] >= 0) { continue; }
if (debug_info_) { UpdateDebugInfo(i); }
params_[i]->Update();
}
}

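Since set_debug_info() is public, the same per-blob logging can also be switched on for a standalone Net, outside the solver path. A rough sketch, assuming CPU mode and a hypothetical model definition "train_val.prototxt" whose data layers produce their own inputs (so the dummy bottom vector suffices, as in Solver::Solve):

#include <vector>
#include <glog/logging.h>
#include "caffe/net.hpp"

int main() {
  caffe::Caffe::set_mode(caffe::Caffe::CPU);     // assumption: run on CPU
  caffe::Net<float> net("train_val.prototxt");   // hypothetical model definition
  net.set_debug_info(true);                      // turn on the new per-blob logging
  std::vector<caffe::Blob<float>*> bottom_vec;   // dummy inputs, as in Solver::Solve
  float loss = net.ForwardBackward(bottom_vec);  // logs [Forward]/[Backward] blob stats
  LOG(INFO) << "loss = " << loss;
  return 0;
}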
4 changes: 4 additions & 0 deletions src/caffe/proto/caffe.proto
@@ -105,6 +105,10 @@ message SolverParameter {
// random number generator -- useful for reproducible results. Otherwise,
// (and by default) initialize using a seed derived from the system clock.
optional int64 random_seed = 20 [default = -1];

// If true, print information about the state of the net that may help with
// debugging learning problems.
optional bool debug_info = 23 [default = false];
}

// A message that stores the solver snapshots
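With the new field, turning the output on from a solver prototxt is a one-line addition, `debug_info: true`. The flag can also be set programmatically through the generated protobuf setter; a sketch, assuming a hypothetical solver definition at "solver.prototxt". Note that, per the solver.cpp change below, the training net only emits debug info on iterations where the loss is displayed.

#include "caffe/proto/caffe.pb.h"
#include "caffe/solver.hpp"
#include "caffe/util/io.hpp"

int main() {
  caffe::SolverParameter solver_param;
  // Hypothetical path; parse the solver prototxt into the message.
  caffe::ReadProtoFromTextFileOrDie("solver.prototxt", &solver_param);
  solver_param.set_debug_info(true);  // generated setter for the new proto field
  caffe::SGDSolver<float> solver(solver_param);
  solver.Solve();                     // debug info is printed on display iterations
  return 0;
}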
10 changes: 9 additions & 1 deletion src/caffe/solver.cpp
@@ -53,6 +53,8 @@ void Solver<Dtype>::Init(const SolverParameter& param) {
LOG(INFO) << "Creating training net from file: " << param_.train_net();
net_.reset(new Net<Dtype>(param_.train_net()));
}
CHECK(net_) << "Training net uninitialized.";
net_->set_debug_info(param_.debug_info());
const int num_test_net_params = param_.test_net_param_size();
const int num_test_net_files = param_.test_net_size();
const int num_test_nets = num_test_net_params + num_test_net_files;
@@ -100,11 +102,17 @@ void Solver<Dtype>::Solve(const char* resume_file) {
// should be given, and we will just provide dummy vecs.
vector<Blob<Dtype>*> bottom_vec;
while (iter_++ < param_.max_iter()) {
const bool display = param_.display() && iter_ % param_.display() == 0;
if (display) {
net_->set_debug_info(param_.debug_info());
} else {
net_->set_debug_info(false);
}
Dtype loss = net_->ForwardBackward(bottom_vec);
ComputeUpdateValue();
net_->Update();

if (param_.display() && iter_ % param_.display() == 0) {
if (display) {
LOG(INFO) << "Iteration " << iter_ << ", loss = " << loss;
}
if (param_.test_interval() && iter_ % param_.test_interval() == 0) {