
Merge pull request BVLC#6 from jdemouth/caffe-0.14-cnmem-fp16
Fix accuracy issues with FP16 for GoogLeNet.
jdemouth committed Nov 4, 2015
2 parents 1ac13b0 + 5927757 commit 31eff30
Showing 5 changed files with 121 additions and 54 deletions.
2 changes: 1 addition & 1 deletion include/caffe/loss_layers.hpp
@@ -91,7 +91,7 @@ class AccuracyLayer : public Layer<Dtype,Mtype> {
/// The label indicating that an instance should be ignored.
int ignore_label_;
/// Keeps counts of the number of samples per class.
Blob<Dtype,Mtype> nums_buffer_;
Blob<int, int> counts_, valid_counts_;
};

/**
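Note on the change above: the per-class sample counter moves from a Blob<Dtype,Mtype> to a pair of Blob<int,int>. A plausible reading, consistent with the commit title (stated here as an assumption, the patch itself gives no rationale): when Dtype is float16, a counter stored in half precision stops incrementing once it reaches 2048, because integers above 2^11 are not exactly representable. The small standalone sketch below shows the same failure mode with float, which hits the wall at 2^24; half precision simply hits it much earlier.

    #include <cstdio>

    int main() {
      // A float counter silently stops advancing once the count passes 2^24,
      // because +1 falls below the rounding granularity at that magnitude.
      // Half precision has the same problem from 2^11 = 2048 onward, which is
      // why per-class counts are better kept in plain int blobs.
      float count = 16777216.f;      // 2^24
      count += 1.f;                  // rounds back down to 2^24
      std::printf("%.1f\n", count);  // prints 16777216.0
      return 0;
    }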
32 changes: 23 additions & 9 deletions models/bvlc_alexnet/train_val.prototxt
@@ -10,10 +10,10 @@ layer {
transform_param {
mirror: true
crop_size: 227
mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
mean_file: "/home/ubuntu/Devtech/imagenet/mean.binaryproto"
}
data_param {
source: "examples/imagenet/ilsvrc12_train_lmdb"
source: "/home/ubuntu/Devtech/imagenet/val_db"
batch_size: 256
backend: LMDB
}
@@ -29,10 +29,10 @@ layer {
transform_param {
mirror: false
crop_size: 227
mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
mean_file: "/home/ubuntu/Devtech/imagenet/mean.binaryproto"
}
data_param {
source: "examples/imagenet/ilsvrc12_val_lmdb"
source: "/home/ubuntu/Devtech/imagenet/val_db"
batch_size: 50
backend: LMDB
}
@@ -366,19 +366,33 @@ layer {
}
}
layer {
name: "accuracy"
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
layer {
name: "top-1"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "accuracy"
top: "top-1"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
name: "top-5"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "loss"
top: "top-5"
include {
phase: TEST
}
accuracy_param {
top_k: 5
}
}
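The prototxt above replaces the single "accuracy" output with two Accuracy layers, "top-1" and "top-5", the latter using accuracy_param { top_k: 5 }. As a rough standalone sketch of what a top-k hit means (function and variable names here are illustrative, not part of the patch):

    #include <cstddef>
    #include <vector>

    // A prediction counts as correct under top-k if the true label's score is
    // among the k largest scores (ties resolved in the label's favor here).
    bool in_top_k(const std::vector<float>& scores, int label, int k) {
      int higher = 0;
      for (std::size_t c = 0; c < scores.size(); ++c)
        if (scores[c] > scores[label]) ++higher;
      return higher < k;
    }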

48 changes: 29 additions & 19 deletions src/caffe/layers/accuracy_layer.cpp
@@ -43,49 +43,55 @@ void AccuracyLayer<Dtype,Mtype>::Reshape(
vector<int> top_shape_per_class(1);
top_shape_per_class[0] = bottom[0]->shape(label_axis_);
top[1]->Reshape(top_shape_per_class);
nums_buffer_.Reshape(top_shape_per_class);
counts_.Reshape(top_shape_per_class);
valid_counts_.Reshape(top_shape_per_class);
}
}

template <typename Dtype, typename Mtype>
void AccuracyLayer<Dtype,Mtype>::Forward_cpu(const vector<Blob<Dtype,Mtype>*>& bottom,
const vector<Blob<Dtype,Mtype>*>& top) {
Mtype accuracy(0.f);
const Dtype* bottom_data = bottom[0]->cpu_data();
const Dtype* bottom_label = bottom[1]->cpu_data();
const int dim = bottom[0]->count() / outer_num_;
const int num_labels = bottom[0]->shape(label_axis_);
vector<Dtype> maxval(top_k_+1);
vector<int> max_id(top_k_+1);

if (top.size() > 1) {
caffe_set(nums_buffer_.count(), Get<Dtype>(0), nums_buffer_.mutable_cpu_data());
caffe_set(top[1]->count(), Get<Dtype>(0), top[1]->mutable_cpu_data());
caffe_set(counts_.count(), 0, counts_.mutable_cpu_data());
caffe_set(valid_counts_.count(), 0, valid_counts_.mutable_cpu_data());
}
int count = 0;

int count = 0, validCount = 0;
for (int i = 0; i < outer_num_; ++i) {
for (int j = 0; j < inner_num_; ++j) {
const int label_value =
static_cast<int>(Get<Mtype>(bottom_label[i * inner_num_ + j]));
Get<int>(bottom_label[i * inner_num_ + j]);
if (has_ignore_label_ && label_value == ignore_label_) {
continue;
}
if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
if (top.size() > 1)
++counts_.mutable_cpu_data()[label_value];
DCHECK_GE(label_value, 0);
DCHECK_LT(label_value, num_labels);
// Top-k accuracy
std::vector<std::pair<Mtype, int> > bottom_data_vector;
std::vector<std::pair<Dtype, int> > bottom_data_vector;
for (int k = 0; k < num_labels; ++k) {
bottom_data_vector.push_back(std::make_pair(
Get<Mtype>(bottom_data[i * dim + k * inner_num_ + j]), k));
bottom_data[i * dim + k * inner_num_ + j], k));
}
std::partial_sort(
bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
bottom_data_vector.end(), std::greater<std::pair<Mtype, int> >());
bottom_data_vector.begin(),
bottom_data_vector.begin() + top_k_,
bottom_data_vector.end(),
std::greater<std::pair<Dtype, int> >());

// check if true label is in top k predictions
for (int k = 0; k < top_k_; k++) {
if (bottom_data_vector[k].second == label_value) {
++accuracy;
if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
++validCount;
if (top.size() > 1) {
valid_counts_.mutable_cpu_data()[label_value]++;
}
break;
}
}
@@ -94,12 +100,16 @@ void AccuracyLayer<Dtype,Mtype>::Forward_cpu(const vector<Blob<Dtype,Mtype>*>& b
}

// LOG(INFO) << "Accuracy: " << accuracy;
top[0]->mutable_cpu_data()[0] = Get<Dtype>(accuracy / count);
double ratio = (double) validCount / count;
top[0]->mutable_cpu_data()[0] = Get<Dtype>(ratio);
if (top.size() > 1) {
for (int i = 0; i < top[1]->count(); ++i) {
top[1]->mutable_cpu_data()[i] =
nums_buffer_.cpu_data()[i] == 0. ? 0.
: top[1]->cpu_data()[i] / nums_buffer_.cpu_data()[i];
ratio =
counts_.cpu_data()[i] == 0 ?
0.0 :
(double) valid_counts_.cpu_data()[i] / counts_.cpu_data()[i];

top[1]->mutable_cpu_data()[i] = (Dtype) ratio;
}
}
// Accuracy layer should not be used as a loss function.
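The rewritten Forward_cpu above keeps two integer tallies per class, counts_ (samples seen) and valid_counts_ (samples whose label landed in the top k), and only converts to Dtype when writing the outputs, doing the division in double. A minimal standalone sketch of that final step, with hypothetical names and plain float outputs:

    // Per-class accuracy from integer tallies; divide in double, cast late.
    void per_class_accuracy(const int* counts, const int* valid_counts,
                            float* out, int num_classes) {
      for (int i = 0; i < num_classes; ++i) {
        out[i] = counts[i] == 0
            ? 0.f
            : static_cast<float>(
                  static_cast<double>(valid_counts[i]) / counts[i]);
      }
    }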
83 changes: 63 additions & 20 deletions src/caffe/layers/softmax_loss_layer.cu
@@ -4,31 +4,66 @@

#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/gpu_memory.hpp"
#include "caffe/vision_layers.hpp"
#include "cub/cub/cub.cuh"

#define NUM_CUDA_THREADS 128

namespace caffe {

template <typename Dtype, typename Mtype>
__global__ void SoftmaxLossForwardGPU(const int nthreads,
const Dtype* prob_data, const Dtype* label, Dtype* loss,
const int num, const int dim, const int spatial_dim,
const bool has_ignore_label_, const int ignore_label_,
Dtype* counts) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / spatial_dim;
const int s = index % spatial_dim;
const Dtype* prob_data,
const Dtype* label,
Dtype* loss,
const int num,
const int dim,
const int spatial_dim,
const bool has_ignore_label_,
const int ignore_label_,
Dtype* counts,
Mtype* results) {

typedef cub::BlockReduce<Mtype, NUM_CUDA_THREADS> BlockReduceF;
typedef cub::BlockReduce<int, NUM_CUDA_THREADS> BlockReduceI;

__shared__ typename BlockReduceF::TempStorage tempStorageF;
__shared__ typename BlockReduceI::TempStorage tempStorageI;

Mtype lossSum(0);
int count(0);
for( int idx = blockIdx.x*blockDim.x + threadIdx.x ; idx < nthreads ; idx += blockDim.x*gridDim.x ) {
const int n = idx / spatial_dim;
const int s = idx % spatial_dim;
const int label_value = Get<int>(label[n * spatial_dim + s]);
if (has_ignore_label_ && label_value == ignore_label_) {
loss[index] = Get<Dtype>(0);
counts[index] = Get<Dtype>(0);
loss[idx] = Get<Dtype>(0);
counts[idx] = Get<Dtype>(0);
} else {
loss[index] = Get<Dtype>( -log(max(Get<Mtype>(prob_data[n * dim + label_value * spatial_dim + s]),
Mtype(FLT_MIN))) );
counts[index] = Get<Dtype>(1);
Mtype tmp = -log(max(Get<Mtype>(prob_data[n * dim + label_value * spatial_dim + s]), Mtype(FLT_MIN)));
loss[idx] = Get<Dtype>(tmp);
counts[idx] = Get<Dtype>(1);
lossSum += tmp;
count += 1;
}
}

lossSum = BlockReduceF(tempStorageF).Sum(lossSum);
count = BlockReduceI(tempStorageI).Sum(count);

if( threadIdx.x == 0 ) {
results[0] = lossSum;
results[1] = Mtype(count);
}
}

template< typename Dtype >
struct GetFtype { typedef Dtype Type; };

template<>
struct GetFtype<float16> { typedef float Type; };

template <typename Dtype, typename Mtype>
void SoftmaxWithLossLayer<Dtype,Mtype>::Forward_gpu(
const vector<Blob<Dtype,Mtype>*>& bottom, const vector<Blob<Dtype,Mtype>*>& top) {
@@ -44,16 +79,24 @@ void SoftmaxWithLossLayer<Dtype,Mtype>::Forward_gpu(
// Similarly, this memory is never used elsewhere, and thus we can use it
// to avoid having to allocate additional GPU memory.
Dtype* counts = prob_.mutable_gpu_diff();

// TODO: Use 0-copy instead of a memcpy!
typedef typename GetFtype<Dtype>::Type Ftype;
Ftype *workspace;
gpu_memory::allocate((void**) &workspace, 2*sizeof(Ftype));

// NOLINT_NEXT_LINE(whitespace/operators)
SoftmaxLossForwardGPU<Dtype,Mtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, loss_data,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
Mtype loss;
caffe_gpu_asum<Dtype,Mtype>(nthreads, loss_data, &loss);
SoftmaxLossForwardGPU<Dtype, Ftype><<<1, NUM_CUDA_THREADS>>>(
nthreads, prob_data, label, loss_data,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts, workspace);

Ftype results[2];
CUDA_CHECK(cudaMemcpy(results, workspace, sizeof(results), cudaMemcpyDeviceToHost));
gpu_memory::deallocate(workspace);

Ftype loss = results[0];
if (normalize_) {
Mtype count;
caffe_gpu_asum<Dtype,Mtype>(nthreads, counts, &count);
loss /= count;
loss /= results[1];
} else {
loss /= outer_num_;
}
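The rewritten kernel above changes the reduction strategy: instead of writing a per-element loss and summing it afterwards with caffe_gpu_asum (which can lose precision when the per-element losses are stored in float16), each thread accumulates a partial sum over a grid-stride loop in Mtype, cub::BlockReduce collapses the partials within the block, and thread 0 writes the block-wide loss and valid count to a two-element workspace. The GetFtype trait picks float as the accumulator when Dtype is float16, and the <<<1, NUM_CUDA_THREADS>>> launch uses a single block, so the block-wide sum is the global sum. A minimal sketch of the cub::BlockReduce pattern in isolation (kernel and buffer names are illustrative):

    #include <cub/cub.cuh>  // bundled as "cub/cub/cub.cuh" in this tree

    // One block of 128 threads sums n floats: grid-stride partials per thread,
    // a block-wide reduction, and a single write from thread 0.
    // Launch as block_sum<<<1, 128>>>(in, n, out).
    __global__ void block_sum(const float* in, int n, float* out) {
      typedef cub::BlockReduce<float, 128> BlockReduce;
      __shared__ typename BlockReduce::TempStorage temp_storage;

      float partial = 0.f;
      for (int i = threadIdx.x; i < n; i += blockDim.x)
        partial += in[i];

      float total = BlockReduce(temp_storage).Sum(partial);  // valid in thread 0 only
      if (threadIdx.x == 0)
        *out = total;
    }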
10 changes: 5 additions & 5 deletions tools/caffe_fp16.cpp
@@ -178,18 +178,18 @@ int test() {

vector<Blob<float16,CAFFE_FP16_MTYPE>* > bottom_vec;
vector<int> test_score_output_id;
vector<CAFFE_FP16_MTYPE> test_score;
vector<float> test_score;
float loss = 0;
for (int i = 0; i < FLAGS_iterations; ++i) {
CAFFE_FP16_MTYPE iter_loss;
const vector<Blob<float16,CAFFE_FP16_MTYPE>*>& result =
caffe_net.Forward(bottom_vec, &iter_loss);
loss += iter_loss;
loss += (float) iter_loss;
int idx = 0;
for (int j = 0; j < result.size(); ++j) {
const float16* result_vec = result[j]->cpu_data();
for (int k = 0; k < result[j]->count(); ++k, ++idx) {
const CAFFE_FP16_MTYPE score = Get<CAFFE_FP16_MTYPE>(result_vec[k]);
const float score = Get<float>(result_vec[k]);
if (i == 0) {
test_score.push_back(score);
test_score_output_id.push_back(j);
@@ -202,15 +202,15 @@
}
}
}
loss /= FLAGS_iterations;
loss /= (float) FLAGS_iterations;
LOG(INFO) << "Loss: " << loss;
for (int i = 0; i < test_score.size(); ++i) {
const std::string& output_name = caffe_net.blob_names()[
caffe_net.output_blob_indices()[test_score_output_id[i]]];
const float loss_weight = caffe_net.blob_loss_weights()[
caffe_net.output_blob_indices()[test_score_output_id[i]]];
std::ostringstream loss_msg_stream;
const float mean_score = test_score[i] / FLAGS_iterations;
const float mean_score = test_score[i] / (float) FLAGS_iterations;
if (loss_weight) {
loss_msg_stream << " (* " << loss_weight
<< " = " << loss_weight * mean_score << " loss)";
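The caffe_fp16 test driver now keeps its running totals in float: test_score becomes vector<float>, each fp16 network output is widened with Get<float>() before being accumulated, and the loss and mean-score divisions are done in float as well, so precision is not lost across many test iterations. A trivial standalone sketch of that accumulate-then-average pattern (it assumes the scores have already been widened to float, as Get<float>() does in the real code; names are illustrative):

    #include <cstddef>
    #include <vector>

    // Average per-iteration scores in float; the half-precision values are
    // converted before they ever enter the accumulator.
    float mean_score(const std::vector<float>& per_iter_scores) {
      if (per_iter_scores.empty()) return 0.f;
      float sum = 0.f;
      for (std::size_t i = 0; i < per_iter_scores.size(); ++i)
        sum += per_iter_scores[i];
      return sum / static_cast<float>(per_iter_scores.size());
    }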

