
Merge pull request BVLC#6 from jdemouth/caffe-0.14-cnmem-fp16
Fix accuracy issues with FP16 for GoogLeNet.
jdemouth committed Nov 4, 2015
2 parents 1ac13b0 + 5927757 commit 31eff30
Showing 5 changed files with 121 additions and 54 deletions.
2 changes: 1 addition & 1 deletion include/caffe/loss_layers.hpp
@@ -91,7 +91,7 @@ class AccuracyLayer : public Layer<Dtype,Mtype> {
/// The label indicating that an instance should be ignored.
int ignore_label_;
/// Keeps counts of the number of samples per class.
Blob<Dtype,Mtype> nums_buffer_;
Blob<int, int> counts_, valid_counts_;
};

/**
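Note on the change above: the per-class sample counter moves from a Blob<Dtype,Mtype> to a pair of Blob<int,int>. A plausible reading, consistent with the commit title (stated here as an assumption, the patch itself gives no rationale): when Dtype is float16, a counter stored in half precision stops incrementing once it reaches 2048, because integers above 2^11 are not exactly representable. The small standalone sketch below shows the same failure mode with float, which hits the wall at 2^24; half precision simply hits it much earlier.

    #include <cstdio>

    int main() {
      // A float counter silently stops advancing once the count passes 2^24,
      // because +1 falls below the rounding granularity at that magnitude.
      // Half precision has the same problem from 2^11 = 2048 onward, which is
      // why per-class counts are better kept in plain int blobs.
      float count = 16777216.f;      // 2^24
      count += 1.f;                  // rounds back down to 2^24
      std::printf("%.1f\n", count);  // prints 16777216.0
      return 0;
    }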
32 changes: 23 additions & 9 deletions models/bvlc_alexnet/train_val.prototxt
@@ -10,10 +10,10 @@ layer {
transform_param {
mirror: true
crop_size: 227
mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
mean_file: "/home/ubuntu/Devtech/imagenet/mean.binaryproto"
}
data_param {
source: "examples/imagenet/ilsvrc12_train_lmdb"
source: "/home/ubuntu/Devtech/imagenet/val_db"
batch_size: 256
backend: LMDB
}
@@ -29,10 +29,10 @@ layer {
transform_param {
mirror: false
crop_size: 227
mean_file: "data/ilsvrc12/imagenet_mean.binaryproto"
mean_file: "/home/ubuntu/Devtech/imagenet/mean.binaryproto"
}
data_param {
source: "examples/imagenet/ilsvrc12_val_lmdb"
source: "/home/ubuntu/Devtech/imagenet/val_db"
batch_size: 50
backend: LMDB
}
@@ -366,19 +366,33 @@ layer {
}
}
layer {
name: "accuracy"
name: "loss"
type: "SoftmaxWithLoss"
bottom: "fc8"
bottom: "label"
top: "loss"
}
layer {
name: "top-1"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "accuracy"
top: "top-1"
include {
phase: TEST
}
}
layer {
name: "loss"
type: "SoftmaxWithLoss"
name: "top-5"
type: "Accuracy"
bottom: "fc8"
bottom: "label"
top: "loss"
top: "top-5"
include {
phase: TEST
}
accuracy_param {
top_k: 5
}
}
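The prototxt above replaces the single "accuracy" output with two Accuracy layers, "top-1" and "top-5", the latter using accuracy_param { top_k: 5 }. As a rough standalone sketch of what a top-k hit means (function and variable names here are illustrative, not part of the patch):

    #include <cstddef>
    #include <vector>

    // A prediction counts as correct under top-k if the true label's score is
    // among the k largest scores (ties resolved in the label's favor here).
    bool in_top_k(const std::vector<float>& scores, int label, int k) {
      int higher = 0;
      for (std::size_t c = 0; c < scores.size(); ++c)
        if (scores[c] > scores[label]) ++higher;
      return higher < k;
    }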

48 changes: 29 additions & 19 deletions src/caffe/layers/accuracy_layer.cpp
@@ -43,49 +43,55 @@ void AccuracyLayer<Dtype,Mtype>::Reshape(
vector<int> top_shape_per_class(1);
top_shape_per_class[0] = bottom[0]->shape(label_axis_);
top[1]->Reshape(top_shape_per_class);
nums_buffer_.Reshape(top_shape_per_class);
counts_.Reshape(top_shape_per_class);
valid_counts_.Reshape(top_shape_per_class);
}
}

template <typename Dtype, typename Mtype>
void AccuracyLayer<Dtype,Mtype>::Forward_cpu(const vector<Blob<Dtype,Mtype>*>& bottom,
const vector<Blob<Dtype,Mtype>*>& top) {
Mtype accuracy(0.f);
const Dtype* bottom_data = bottom[0]->cpu_data();
const Dtype* bottom_label = bottom[1]->cpu_data();
const int dim = bottom[0]->count() / outer_num_;
const int num_labels = bottom[0]->shape(label_axis_);
vector<Dtype> maxval(top_k_+1);
vector<int> max_id(top_k_+1);

if (top.size() > 1) {
caffe_set(nums_buffer_.count(), Get<Dtype>(0), nums_buffer_.mutable_cpu_data());
caffe_set(top[1]->count(), Get<Dtype>(0), top[1]->mutable_cpu_data());
caffe_set(counts_.count(), 0, counts_.mutable_cpu_data());
caffe_set(valid_counts_.count(), 0, valid_counts_.mutable_cpu_data());
}
int count = 0;

int count = 0, validCount = 0;
for (int i = 0; i < outer_num_; ++i) {
for (int j = 0; j < inner_num_; ++j) {
const int label_value =
static_cast<int>(Get<Mtype>(bottom_label[i * inner_num_ + j]));
Get<int>(bottom_label[i * inner_num_ + j]);
if (has_ignore_label_ && label_value == ignore_label_) {
continue;
}
if (top.size() > 1) ++nums_buffer_.mutable_cpu_data()[label_value];
if (top.size() > 1)
++counts_.mutable_cpu_data()[label_value];
DCHECK_GE(label_value, 0);
DCHECK_LT(label_value, num_labels);
// Top-k accuracy
std::vector<std::pair<Mtype, int> > bottom_data_vector;
std::vector<std::pair<Dtype, int> > bottom_data_vector;
for (int k = 0; k < num_labels; ++k) {
bottom_data_vector.push_back(std::make_pair(
Get<Mtype>(bottom_data[i * dim + k * inner_num_ + j]), k));
bottom_data[i * dim + k * inner_num_ + j], k));
}
std::partial_sort(
bottom_data_vector.begin(), bottom_data_vector.begin() + top_k_,
bottom_data_vector.end(), std::greater<std::pair<Mtype, int> >());
bottom_data_vector.begin(),
bottom_data_vector.begin() + top_k_,
bottom_data_vector.end(),
std::greater<std::pair<Dtype, int> >());

// check if true label is in top k predictions
for (int k = 0; k < top_k_; k++) {
if (bottom_data_vector[k].second == label_value) {
++accuracy;
if (top.size() > 1) ++top[1]->mutable_cpu_data()[label_value];
++validCount;
if (top.size() > 1) {
valid_counts_.mutable_cpu_data()[label_value]++;
}
break;
}
}
@@ -94,12 +100,16 @@ void AccuracyLayer<Dtype,Mtype>::Forward_cpu(const vector<Blob<Dtype,Mtype>*>& b
}

// LOG(INFO) << "Accuracy: " << accuracy;
top[0]->mutable_cpu_data()[0] = Get<Dtype>(accuracy / count);
double ratio = (double) validCount / count;
top[0]->mutable_cpu_data()[0] = Get<Dtype>(ratio);
if (top.size() > 1) {
for (int i = 0; i < top[1]->count(); ++i) {
top[1]->mutable_cpu_data()[i] =
nums_buffer_.cpu_data()[i] == 0. ? 0.
: top[1]->cpu_data()[i] / nums_buffer_.cpu_data()[i];
ratio =
counts_.cpu_data()[i] == 0 ?
0.0 :
(double) valid_counts_.cpu_data()[i] / counts_.cpu_data()[i];

top[1]->mutable_cpu_data()[i] = (Dtype) ratio;
}
}
// Accuracy layer should not be used as a loss function.
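The rewritten Forward_cpu above keeps two integer tallies per class, counts_ (samples seen) and valid_counts_ (samples whose label landed in the top k), and only converts to Dtype when writing the outputs, doing the division in double. A minimal standalone sketch of that final step, with hypothetical names and plain float outputs:

    // Per-class accuracy from integer tallies; divide in double, cast late.
    void per_class_accuracy(const int* counts, const int* valid_counts,
                            float* out, int num_classes) {
      for (int i = 0; i < num_classes; ++i) {
        out[i] = counts[i] == 0
            ? 0.f
            : static_cast<float>(
                  static_cast<double>(valid_counts[i]) / counts[i]);
      }
    }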
83 changes: 63 additions & 20 deletions src/caffe/layers/softmax_loss_layer.cu
@@ -4,31 +4,66 @@

#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/gpu_memory.hpp"
#include "caffe/vision_layers.hpp"
#include "cub/cub/cub.cuh"

#define NUM_CUDA_THREADS 128

namespace caffe {

template <typename Dtype, typename Mtype>
__global__ void SoftmaxLossForwardGPU(const int nthreads,
const Dtype* prob_data, const Dtype* label, Dtype* loss,
const int num, const int dim, const int spatial_dim,
const bool has_ignore_label_, const int ignore_label_,
Dtype* counts) {
CUDA_KERNEL_LOOP(index, nthreads) {
const int n = index / spatial_dim;
const int s = index % spatial_dim;
const Dtype* prob_data,
const Dtype* label,
Dtype* loss,
const int num,
const int dim,
const int spatial_dim,
const bool has_ignore_label_,
const int ignore_label_,
Dtype* counts,
Mtype* results) {

typedef cub::BlockReduce<Mtype, NUM_CUDA_THREADS> BlockReduceF;
typedef cub::BlockReduce<int, NUM_CUDA_THREADS> BlockReduceI;

__shared__ typename BlockReduceF::TempStorage tempStorageF;
__shared__ typename BlockReduceI::TempStorage tempStorageI;

Mtype lossSum(0);
int count(0);
for( int idx = blockIdx.x*blockDim.x + threadIdx.x ; idx < nthreads ; idx += blockDim.x*gridDim.x ) {
const int n = idx / spatial_dim;
const int s = idx % spatial_dim;
const int label_value = Get<int>(label[n * spatial_dim + s]);
if (has_ignore_label_ && label_value == ignore_label_) {
loss[index] = Get<Dtype>(0);
counts[index] = Get<Dtype>(0);
loss[idx] = Get<Dtype>(0);
counts[idx] = Get<Dtype>(0);
} else {
loss[index] = Get<Dtype>( -log(max(Get<Mtype>(prob_data[n * dim + label_value * spatial_dim + s]),
Mtype(FLT_MIN))) );
counts[index] = Get<Dtype>(1);
Mtype tmp = -log(max(Get<Mtype>(prob_data[n * dim + label_value * spatial_dim + s]), Mtype(FLT_MIN)));
loss[idx] = Get<Dtype>(tmp);
counts[idx] = Get<Dtype>(1);
lossSum += tmp;
count += 1;
}
}

lossSum = BlockReduceF(tempStorageF).Sum(lossSum);
count = BlockReduceI(tempStorageI).Sum(count);

if( threadIdx.x == 0 ) {
results[0] = lossSum;
results[1] = Mtype(count);
}
}

template< typename Dtype >
struct GetFtype { typedef Dtype Type; };

template<>
struct GetFtype<float16> { typedef float Type; };

template <typename Dtype, typename Mtype>
void SoftmaxWithLossLayer<Dtype,Mtype>::Forward_gpu(
const vector<Blob<Dtype,Mtype>*>& bottom, const vector<Blob<Dtype,Mtype>*>& top) {
@@ -44,16 +79,24 @@ void SoftmaxWithLossLayer<Dtype,Mtype>::Forward_gpu(
// Similarly, this memory is never used elsewhere, and thus we can use it
// to avoid having to allocate additional GPU memory.
Dtype* counts = prob_.mutable_gpu_diff();

// TODO: Use 0-copy instead of a memcpy!
typedef typename GetFtype<Dtype>::Type Ftype;
Ftype *workspace;
gpu_memory::allocate((void**) &workspace, 2*sizeof(Ftype));

// NOLINT_NEXT_LINE(whitespace/operators)
SoftmaxLossForwardGPU<Dtype,Mtype><<<CAFFE_GET_BLOCKS(nthreads),
CAFFE_CUDA_NUM_THREADS>>>(nthreads, prob_data, label, loss_data,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts);
Mtype loss;
caffe_gpu_asum<Dtype,Mtype>(nthreads, loss_data, &loss);
SoftmaxLossForwardGPU<Dtype, Ftype><<<1, NUM_CUDA_THREADS>>>(
nthreads, prob_data, label, loss_data,
outer_num_, dim, inner_num_, has_ignore_label_, ignore_label_, counts, workspace);

Ftype results[2];
CUDA_CHECK(cudaMemcpy(results, workspace, sizeof(results), cudaMemcpyDeviceToHost));
gpu_memory::deallocate(workspace);

Ftype loss = results[0];
if (normalize_) {
Mtype count;
caffe_gpu_asum<Dtype,Mtype>(nthreads, counts, &count);
loss /= count;
loss /= results[1];
} else {
loss /= outer_num_;
}
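The rewritten kernel above changes the reduction strategy: instead of writing a per-element loss and summing it afterwards with caffe_gpu_asum (which can lose precision when the per-element losses are stored in float16), each thread accumulates a partial sum over a grid-stride loop in Mtype, cub::BlockReduce collapses the partials within the block, and thread 0 writes the block-wide loss and valid count to a two-element workspace. The GetFtype trait picks float as the accumulator when Dtype is float16, and the <<<1, NUM_CUDA_THREADS>>> launch uses a single block, so the block-wide sum is the global sum. A minimal sketch of the cub::BlockReduce pattern in isolation (kernel and buffer names are illustrative):

    #include <cub/cub.cuh>  // bundled as "cub/cub/cub.cuh" in this tree

    // One block of 128 threads sums n floats: grid-stride partials per thread,
    // a block-wide reduction, and a single write from thread 0.
    // Launch as block_sum<<<1, 128>>>(in, n, out).
    __global__ void block_sum(const float* in, int n, float* out) {
      typedef cub::BlockReduce<float, 128> BlockReduce;
      __shared__ typename BlockReduce::TempStorage temp_storage;

      float partial = 0.f;
      for (int i = threadIdx.x; i < n; i += blockDim.x)
        partial += in[i];

      float total = BlockReduce(temp_storage).Sum(partial);  // valid in thread 0 only
      if (threadIdx.x == 0)
        *out = total;
    }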
10 changes: 5 additions & 5 deletions tools/caffe_fp16.cpp
@@ -178,18 +178,18 @@ int test() {

vector<Blob<float16,CAFFE_FP16_MTYPE>* > bottom_vec;
vector<int> test_score_output_id;
vector<CAFFE_FP16_MTYPE> test_score;
vector<float> test_score;
float loss = 0;
for (int i = 0; i < FLAGS_iterations; ++i) {
CAFFE_FP16_MTYPE iter_loss;
const vector<Blob<float16,CAFFE_FP16_MTYPE>*>& result =
caffe_net.Forward(bottom_vec, &iter_loss);
loss += iter_loss;
loss += (float) iter_loss;
int idx = 0;
for (int j = 0; j < result.size(); ++j) {
const float16* result_vec = result[j]->cpu_data();
for (int k = 0; k < result[j]->count(); ++k, ++idx) {
const CAFFE_FP16_MTYPE score = Get<CAFFE_FP16_MTYPE>(result_vec[k]);
const float score = Get<float>(result_vec[k]);
if (i == 0) {
test_score.push_back(score);
test_score_output_id.push_back(j);
@@ -202,15 +202,15 @@
}
}
}
loss /= FLAGS_iterations;
loss /= (float) FLAGS_iterations;
LOG(INFO) << "Loss: " << loss;
for (int i = 0; i < test_score.size(); ++i) {
const std::string& output_name = caffe_net.blob_names()[
caffe_net.output_blob_indices()[test_score_output_id[i]]];
const float loss_weight = caffe_net.blob_loss_weights()[
caffe_net.output_blob_indices()[test_score_output_id[i]]];
std::ostringstream loss_msg_stream;
const float mean_score = test_score[i] / FLAGS_iterations;
const float mean_score = test_score[i] / (float) FLAGS_iterations;
if (loss_weight) {
loss_msg_stream << " (* " << loss_weight
<< " = " << loss_weight * mean_score << " loss)";
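The caffe_fp16 test driver now keeps its running totals in float: test_score becomes vector<float>, each fp16 network output is widened with Get<float>() before being accumulated, and the loss and mean-score divisions are done in float as well, so precision is not lost across many test iterations. A trivial standalone sketch of that accumulate-then-average pattern (it assumes the scores have already been widened to float, as Get<float>() does in the real code; names are illustrative):

    #include <cstddef>
    #include <vector>

    // Average per-iteration scores in float; the half-precision values are
    // converted before they ever enter the accumulator.
    float mean_score(const std::vector<float>& per_iter_scores) {
      if (per_iter_scores.empty()) return 0.f;
      float sum = 0.f;
      for (std::size_t i = 0; i < per_iter_scores.size(); ++i)
        sum += per_iter_scores[i];
      return sum / static_cast<float>(per_iter_scores.size());
    }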

