Optimize 1x1 convolution for Network-in-Network style operation #1118

Merged 2 commits on Sep 20, 2014
1 change: 1 addition & 0 deletions include/caffe/vision_layers.hpp
@@ -97,6 +97,7 @@ class ConvolutionLayer : public Layer<Dtype> {
int num_output_;
int height_out_, width_out_;
bool bias_term_;
bool is_1x1_;

/// M_ is the channel dimension of the output for a single group, which is the
/// leading dimension of the filter matrix.
55 changes: 39 additions & 16 deletions src/caffe/layers/conv_layer.cpp
@@ -47,6 +47,10 @@ void ConvolutionLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
stride_h_ = conv_param.stride_h();
stride_w_ = conv_param.stride_w();
}
// Special case: im2col is the identity for 1x1 convolution with stride 1
// and no padding, so flag for skipping the buffer and transformation.
is_1x1_ = kernel_w_ == 1 && kernel_h_ == 1
&& stride_h_ == 1 && stride_w_ == 1 && pad_h_ == 0 && pad_w_ == 0;
// Configure output channels and groups.
channels_ = bottom[0]->channels();
num_output_ = this->layer_param_.convolution_param().num_output();
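
Note: the identity claim in the comment above is easy to verify. With a 1x1 kernel, stride 1, and no padding, height_out = (height + 2*pad - kernel)/stride + 1 = height (and likewise for width), and each im2col "column" is just the channel vector at one spatial position, so the (channels*1*1) x (height*width) column matrix is exactly the input blob in its native CHW layout. A minimal standalone sketch that checks this numerically; naive_im2col below is a hypothetical stand-in for Caffe's im2col_cpu, written here only for illustration:

```cpp
#include <cassert>
#include <vector>

// Naive im2col for a single image in CHW layout (hypothetical helper),
// following the same column ordering conventions as Caffe's im2col_cpu.
std::vector<float> naive_im2col(const std::vector<float>& img, int C, int H, int W,
                                int K, int pad, int stride) {
  const int H_out = (H + 2 * pad - K) / stride + 1;
  const int W_out = (W + 2 * pad - K) / stride + 1;
  std::vector<float> col(C * K * K * H_out * W_out, 0.f);
  for (int c = 0; c < C * K * K; ++c) {
    const int w_off = c % K, h_off = (c / K) % K, c_im = c / K / K;
    for (int h = 0; h < H_out; ++h) {
      for (int w = 0; w < W_out; ++w) {
        const int h_im = h * stride - pad + h_off;
        const int w_im = w * stride - pad + w_off;
        col[(c * H_out + h) * W_out + w] =
            (h_im >= 0 && h_im < H && w_im >= 0 && w_im < W)
                ? img[(c_im * H + h_im) * W + w_im] : 0.f;
      }
    }
  }
  return col;
}

int main() {
  const int C = 3, H = 4, W = 5;
  std::vector<float> img(C * H * W);
  for (int i = 0; i < C * H * W; ++i) img[i] = 0.1f * i;
  // For K = 1, stride = 1, pad = 0 the column matrix equals the input blob.
  std::vector<float> col = naive_im2col(img, C, H, W, /*K=*/1, /*pad=*/0, /*stride=*/1);
  assert(col == img);
  return 0;
}
```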
@@ -118,7 +122,8 @@ void ConvolutionLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
K_ = channels_ * kernel_h_ * kernel_w_ / group_;
N_ = height_out_ * width_out_;
// The im2col result buffer will only hold one image at a time to avoid
// overly large memory usage.
// overly large memory usage. In the special case of 1x1 convolution
// it goes lazily unused to save memory.
col_buffer_.Reshape(
1, channels_ * kernel_h_ * kernel_w_, height_out_, width_out_);
for (int top_id = 0; top_id < top.size(); ++top_id) {
@@ -137,21 +142,28 @@ void ConvolutionLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
for (int i = 0; i < bottom.size(); ++i) {
const Dtype* bottom_data = bottom[i]->cpu_data();
Dtype* top_data = top[i]->mutable_cpu_data();
Dtype* col_data = col_buffer_.mutable_cpu_data();
Dtype* col_buff = NULL;
if (!is_1x1_) {
col_buff = col_buffer_.mutable_cpu_data();
}
const Dtype* weight = this->blobs_[0]->cpu_data();
int weight_offset = M_ * K_; // number of filter parameters in a group
int col_offset = K_ * N_; // number of values in an input region / column
int top_offset = M_ * N_; // number of values in an output region / column
for (int n = 0; n < num_; ++n) {
// im2col transformation: unroll input regions for filtering
// into column matrix for multiplication.
im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
col_data);
if (!is_1x1_) {
im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
col_buff);
} else { // special case for 1x1 convolution
col_buff = bottom[i]->mutable_cpu_data() + bottom[i]->offset(n);
}
// Take inner products for groups.
for (int g = 0; g < group_; ++g) {
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
(Dtype)1., weight + weight_offset * g, col_data + col_offset * g,
(Dtype)1., weight + weight_offset * g, col_buff + col_offset * g,
(Dtype)0., top_data + top[i]->offset(n) + top_offset * g);
}
// Add bias.
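
When is_1x1_ is set, the GEMM above reads the bottom blob in place as the column matrix, so the per-image, per-group computation is simply top = weight * bottom, with weight of shape (M_ x K_) and the bottom blob viewed as a (K_ x N_) = (channels x height*width) matrix. A minimal single-image, single-group sketch of that product, with plain loops standing in for caffe_cpu_gemm (names here are illustrative, not part of the patch):

```cpp
#include <vector>

// 1x1 convolution forward for one image, one group: top = weight * bottom,
// where weight is (M x K) = (num_output x channels) and the bottom blob is
// read in place as a (K x N) = (channels x H*W) matrix -- no im2col buffer.
void conv1x1_forward(const std::vector<float>& weight,  // M * K
                     const std::vector<float>& bottom,  // K * N (CHW layout)
                     std::vector<float>& top,           // M * N
                     int M, int K, int N) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) {
        acc += weight[m * K + k] * bottom[k * N + n];
      }
      top[m * N + n] = acc;  // bias, if any, is added afterwards as in the layer
    }
  }
}
```

The bottom blob is accessed through mutable_cpu_data() in the 1x1 branch presumably because col_buff is a non-const pointer that doubles as a write target in the non-1x1 path.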
@@ -199,22 +211,28 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
if (!top_diff) {
top_diff = top[i]->cpu_diff();
}
Dtype* col_data = col_buffer_.mutable_cpu_data();
Dtype* col_diff = col_buffer_.mutable_cpu_diff();
Dtype* col_buff = NULL;
if (!is_1x1_) {
col_buff = col_buffer_.mutable_cpu_data();
}
const Dtype* bottom_data = bottom[i]->cpu_data();
Dtype* bottom_diff = bottom[i]->mutable_cpu_diff();
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
if (!is_1x1_) {
im2col_cpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_buff);
} else {
col_buff = bottom[i]->mutable_cpu_data() + bottom[i]->offset(n);
}
// gradient w.r.t. weight. Note that we will accumulate diffs.
if (this->param_propagate_down_[0]) {
for (int g = 0; g < group_; ++g) {
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
col_buff + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
}
}
@@ -223,16 +241,21 @@ void ConvolutionLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
if (weight == NULL) {
weight = this->blobs_[0]->cpu_data();
}
if (is_1x1_) {
col_buff = bottom[i]->mutable_cpu_diff() + bottom[i]->offset(n);
}
for (int g = 0; g < group_; ++g) {
caffe_cpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g,
top_diff + top[i]->offset(n) + top_offset * g,
(Dtype)0., col_diff + col_offset * g);
(Dtype)0., col_buff + col_offset * g);
}
// col2im back to the data
col2im_cpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, bottom_diff + bottom[i]->offset(n));
if (!is_1x1_) {
col2im_cpu(col_buff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, bottom_diff + bottom[i]->offset(n));
}
}
}
}
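
In the backward pass the same substitution applies on both sides: the weight-gradient GEMM reads its columns directly from bottom_data, and the data-gradient GEMM writes directly into bottom_diff, since col2im is also the identity for this configuration. For reference, a minimal single-image, single-group sketch of the two products, with illustrative loops in place of the BLAS calls; writing the forward pass as Y = W * X, these are dW += dY * X^T and dX = W^T * dY:

```cpp
#include <vector>

// 1x1 convolution backward for one image, one group. With Y = W * X, where X
// is the bottom blob read as (K x N) = (channels x H*W):
//   dW += dY * X^T   (accumulated, matching the beta = 1 GEMM in the layer)
//   dX  = W^T * dY   (overwritten into bottom_diff; col2im is skipped)
void conv1x1_backward(const std::vector<float>& weight,    // M * K
                      const std::vector<float>& bottom,    // K * N
                      const std::vector<float>& top_diff,  // M * N
                      std::vector<float>& weight_diff,     // M * K, accumulated
                      std::vector<float>& bottom_diff,     // K * N, overwritten
                      int M, int K, int N) {
  for (int m = 0; m < M; ++m)
    for (int k = 0; k < K; ++k)
      for (int n = 0; n < N; ++n)
        weight_diff[m * K + k] += top_diff[m * N + n] * bottom[k * N + n];
  for (int k = 0; k < K; ++k)
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int m = 0; m < M; ++m)
        acc += weight[m * K + k] * top_diff[m * N + n];
      bottom_diff[k * N + n] = acc;
    }
}
```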
48 changes: 33 additions & 15 deletions src/caffe/layers/conv_layer.cu
@@ -15,21 +15,28 @@ void ConvolutionLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
for (int i = 0; i < bottom.size(); ++i) {
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* top_data = top[i]->mutable_gpu_data();
Dtype* col_data = col_buffer_.mutable_gpu_data();
Dtype* col_buff = NULL;
if (!is_1x1_) {
col_buff = col_buffer_.mutable_gpu_data();
}
const Dtype* weight = this->blobs_[0]->gpu_data();
int weight_offset = M_ * K_;
int col_offset = K_ * N_;
int top_offset = M_ * N_;
for (int n = 0; n < num_; ++n) {
// im2col transformation: unroll input regions for filtering
// into column matrix for multiplication.
im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
col_data);
if (!is_1x1_) {
im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
col_buff);
} else {
col_buff = bottom[i]->mutable_gpu_data() + bottom[i]->offset(n);
}
// Take inner products for groups.
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, M_, N_, K_,
(Dtype)1., weight + weight_offset * g, col_data + col_offset * g,
(Dtype)1., weight + weight_offset * g, col_buff + col_offset * g,
(Dtype)0., top_data + top[i]->offset(n) + top_offset * g);
}
// Add bias.
@@ -78,22 +85,28 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
if (!top_diff) {
top_diff = top[i]->gpu_diff();
}
Dtype* col_data = col_buffer_.mutable_gpu_data();
Dtype* col_diff = col_buffer_.mutable_gpu_diff();
Dtype* col_buff = NULL;
if (!is_1x1_) {
col_buff = col_buffer_.mutable_gpu_data();
}
const Dtype* bottom_data = bottom[i]->gpu_data();
Dtype* bottom_diff = bottom[i]->mutable_gpu_diff();
for (int n = 0; n < num_; ++n) {
// Since we saved memory in the forward pass by not storing all col
// data, we will need to recompute them.
im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_data);
if (!is_1x1_) {
im2col_gpu(bottom_data + bottom[i]->offset(n), channels_, height_,
width_, kernel_h_, kernel_w_, pad_h_, pad_w_,
stride_h_, stride_w_, col_buff);
} else {
col_buff = bottom[i]->mutable_gpu_data() + bottom[i]->offset(n);
}
// gradient w.r.t. weight. Note that we will accumulate diffs.
if (this->param_propagate_down_[0]) {
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasTrans, M_, K_, N_,
(Dtype)1., top_diff + top[i]->offset(n) + top_offset * g,
col_data + col_offset * g, (Dtype)1.,
col_buff + col_offset * g, (Dtype)1.,
weight_diff + weight_offset * g);
}
}
@@ -102,16 +115,21 @@ void ConvolutionLayer<Dtype>::Backward_gpu(const vector<Blob<Dtype>*>& top,
if (weight == NULL) {
weight = this->blobs_[0]->gpu_data();
}
if (is_1x1_) {
col_buff = bottom[i]->mutable_gpu_diff() + bottom[i]->offset(n);
}
for (int g = 0; g < group_; ++g) {
caffe_gpu_gemm<Dtype>(CblasTrans, CblasNoTrans, K_, N_, M_,
(Dtype)1., weight + weight_offset * g,
top_diff + top[i]->offset(n) + top_offset * g,
(Dtype)0., col_diff + col_offset * g);
(Dtype)0., col_buff + col_offset * g);
}
// col2im back to the data
col2im_gpu(col_diff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + bottom[i]->offset(n));
if (!is_1x1_) {
col2im_gpu(col_buff, channels_, height_, width_,
kernel_h_, kernel_w_, pad_h_, pad_w_, stride_h_, stride_w_,
bottom_diff + bottom[i]->offset(n));
}
}
}
}
45 changes: 45 additions & 0 deletions src/caffe/test/test_convolution_layer.cpp
@@ -217,6 +217,33 @@ TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolution) {
}
}

TYPED_TEST(ConvolutionLayerTest, Test1x1Convolution) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
ConvolutionParameter* convolution_param =
layer_param.mutable_convolution_param();
convolution_param->set_kernel_size(1);
convolution_param->set_stride(1);
convolution_param->set_num_output(4);
convolution_param->mutable_weight_filler()->set_type("gaussian");
convolution_param->mutable_bias_filler()->set_type("constant");
convolution_param->mutable_bias_filler()->set_value(0.1);
shared_ptr<Layer<Dtype> > layer(
new ConvolutionLayer<Dtype>(layer_param));
layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_);
layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_);
// Check against reference convolution.
const Dtype* top_data;
const Dtype* ref_top_data;
caffe_conv(this->blob_bottom_, convolution_param, layer->blobs(),
this->MakeReferenceTop(this->blob_top_));
top_data = this->blob_top_->cpu_data();
ref_top_data = this->ref_blob_top_->cpu_data();
for (int i = 0; i < this->blob_top_->count(); ++i) {
EXPECT_NEAR(top_data[i], ref_top_data[i], 1e-4);
}
}

TYPED_TEST(ConvolutionLayerTest, TestSimpleConvolutionGroup) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
@@ -359,6 +386,24 @@ TYPED_TEST(ConvolutionLayerTest, TestGradient) {
this->blob_top_vec_);
}

TYPED_TEST(ConvolutionLayerTest, Test1x1Gradient) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;
ConvolutionParameter* convolution_param =
layer_param.mutable_convolution_param();
this->blob_bottom_vec_.push_back(this->blob_bottom_2_);
this->blob_top_vec_.push_back(this->blob_top_2_);
convolution_param->set_kernel_size(1);
convolution_param->set_stride(1);
convolution_param->set_num_output(2);
convolution_param->mutable_weight_filler()->set_type("gaussian");
convolution_param->mutable_bias_filler()->set_type("gaussian");
ConvolutionLayer<Dtype> layer(layer_param);
GradientChecker<Dtype> checker(1e-2, 1e-3);
checker.CheckGradientExhaustive(&layer, this->blob_bottom_vec_,
this->blob_top_vec_);
}

TYPED_TEST(ConvolutionLayerTest, TestGradientGroup) {
typedef typename TypeParam::Dtype Dtype;
LayerParameter layer_param;