Skip to content

Commit

Permalink
Update computation of variance and global stats in BatchNormLayer
Browse files Browse the repository at this point in the history
  • Loading branch information
kkhoot committed Nov 12, 2015
1 parent f5fd18b commit 0ad1d8a
Show file tree
Hide file tree
Showing 2 changed files with 57 additions and 54 deletions.
55 changes: 28 additions & 27 deletions src/caffe/layers/batch_norm_layer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <vector>

#include "caffe/common_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {
Expand Down Expand Up @@ -80,65 +79,67 @@ void BatchNormLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
int num = bottom[0]->shape(0);
int spatial_dim = bottom[0]->count()/(bottom[0]->shape(0)*channels_);

// elementwise square
caffe_powx(bottom[0]->count(), bottom_data, Dtype(2),
temp_.mutable_cpu_data());
if (bottom[0] != top[0]) {
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}

if (use_global_stats_) {
// use the stored mean/variance estimates. TODO(cdoersch): allow an option
// to use an unbiased variance estimate, like the paper does.
const Dtype scale_factor = 1 / this->blobs_[2]->cpu_data()[0];
const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
0 : 1 / this->blobs_[2]->cpu_data()[0];
caffe_cpu_scale(variance_.count(), scale_factor,
this->blobs_[0]->cpu_data(), mean_.mutable_cpu_data());
caffe_cpu_scale(variance_.count(), scale_factor,
this->blobs_[1]->cpu_data(), variance_.mutable_cpu_data());
} else {
// computes variance using var(X) = E(X^2) - (EX)^2
// compute mean
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), bottom_data,
spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
mean_.mutable_cpu_data());
}

// subtract mean
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, -1, num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 1., top_data);

if (!use_global_stats_) {
// compute variance using var(X) = E((X-EX)^2)
caffe_powx(top[0]->count(), top_data, Dtype(2),
temp_.mutable_cpu_data()); // (X-EX)^2
caffe_cpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), temp_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.cpu_data(), batch_sum_multiplier_.cpu_data(), 0.,
variance_.mutable_cpu_data());
variance_.mutable_cpu_data()); // E((X_EX)^2)

// compute and save moving average
this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
this->blobs_[2]->mutable_cpu_data()[0] += 1;
caffe_cpu_axpby(mean_.count(), Dtype(1), mean_.cpu_data(),
moving_average_fraction_, this->blobs_[0]->mutable_cpu_data());
Dtype m = Dtype(bottom[0]->count()/channels_);
caffe_cpu_axpby(variance_.count(), m/(m-1), variance_.cpu_data(),
moving_average_fraction_, this->blobs_[1]->mutable_cpu_data());
int m = bottom[0]->count()/channels_;
Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
caffe_cpu_axpby(variance_.count(), bias_correction_factor,
variance_.cpu_data(), moving_average_fraction_,
this->blobs_[1]->mutable_cpu_data());
}
// elementwise square of mean
caffe_powx(mean_.count(), mean_.cpu_data(), Dtype(2),
temp_.mutable_cpu_data());

caffe_sub(mean_.count(), variance_.cpu_data(), temp_.cpu_data(),
variance_.mutable_cpu_data()); // variance

// normalize variance
caffe_add_scalar(variance_.count(), eps_, variance_.mutable_cpu_data());
caffe_powx(variance_.count(), variance_.cpu_data(), Dtype(0.5),
variance_.mutable_cpu_data());

// do mean and variance normalization
if (bottom[0] != top[0]) {
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}
// subtract mean
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), mean_.cpu_data(), 0.,
num_by_chans_.mutable_cpu_data());
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, -1, num_by_chans_.cpu_data(),
spatial_sum_multiplier_.cpu_data(), 1., top_data);
// replicate variance to input size
caffe_cpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.cpu_data(), variance_.cpu_data(), 0.,
Expand Down
56 changes: 29 additions & 27 deletions src/caffe/layers/batch_norm_layer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
#include <vector>

#include "caffe/common_layers.hpp"
#include "caffe/layer.hpp"
#include "caffe/util/math_functions.hpp"

namespace caffe {
Expand All @@ -15,65 +14,68 @@ void BatchNormLayer<Dtype>::Forward_gpu(const vector<Blob<Dtype>*>& bottom,
int num = bottom[0]->shape(0);
int spatial_dim = bottom[0]->count()/(channels_*bottom[0]->shape(0));

// elementwise square
caffe_gpu_powx(bottom[0]->count(), bottom_data, Dtype(2),
temp_.mutable_gpu_data());
if (bottom[0] != top[0]) {
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}


if (use_global_stats_) {
// use the stored mean/variance estimates. TODO(cdoersch): allow an option
// to use an unbiased variance estimate, like the paper does.
const Dtype scale_factor = 1 / this->blobs_[2]->cpu_data()[0];
const Dtype scale_factor = this->blobs_[2]->cpu_data()[0] == 0 ?
0 : 1 / this->blobs_[2]->cpu_data()[0];
caffe_gpu_scale(variance_.count(), scale_factor,
this->blobs_[0]->gpu_data(), mean_.mutable_gpu_data());
caffe_gpu_scale(variance_.count(), scale_factor,
this->blobs_[1]->gpu_data(), variance_.mutable_gpu_data());
} else {
// computes variance using var(X) = E(X^2) - (EX)^2
// compute mean
caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), bottom_data,
spatial_sum_multiplier_.gpu_data(), 0.,
num_by_chans_.mutable_gpu_data());
caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
mean_.mutable_gpu_data());
}

// subtract mean
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
num_by_chans_.mutable_gpu_data());
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, -1, num_by_chans_.gpu_data(),
spatial_sum_multiplier_.gpu_data(), 1., top_data);

if (!use_global_stats_) {
// compute variance using var(X) = E((X-EX)^2)
caffe_gpu_powx(top[0]->count(), top_data, Dtype(2),
temp_.mutable_gpu_data()); // (X-EX)^2
caffe_gpu_gemv<Dtype>(CblasNoTrans, channels_ * num, spatial_dim,
1. / (num * spatial_dim), temp_.gpu_data(),
spatial_sum_multiplier_.gpu_data(), 0.,
num_by_chans_.mutable_gpu_data());
caffe_gpu_gemv<Dtype>(CblasTrans, num, channels_, 1.,
num_by_chans_.gpu_data(), batch_sum_multiplier_.gpu_data(), 0.,
variance_.mutable_gpu_data());
variance_.mutable_gpu_data()); // E((X_EX)^2)

// compute and save moving average
this->blobs_[2]->mutable_cpu_data()[0] *= moving_average_fraction_;
this->blobs_[2]->mutable_cpu_data()[0] += 1;
caffe_gpu_axpby(mean_.count(), Dtype(1), mean_.gpu_data(),
moving_average_fraction_, this->blobs_[0]->mutable_gpu_data());
Dtype m = Dtype(bottom[0]->count()/channels_);
caffe_gpu_axpby(variance_.count(), m/(m-1), variance_.gpu_data(),
moving_average_fraction_, this->blobs_[1]->mutable_gpu_data());
int m = bottom[0]->count()/channels_;
Dtype bias_correction_factor = m > 1 ? Dtype(m)/(m-1) : 1;
caffe_gpu_axpby(variance_.count(), bias_correction_factor,
variance_.gpu_data(), moving_average_fraction_,
this->blobs_[1]->mutable_gpu_data());
}
// elementwise square of mean
caffe_gpu_powx(mean_.count(), mean_.gpu_data(), Dtype(2),
temp_.mutable_gpu_data());

caffe_gpu_sub(mean_.count(), variance_.gpu_data(), temp_.gpu_data(),
variance_.mutable_gpu_data()); // variance

// normalize variance
caffe_gpu_add_scalar(variance_.count(), eps_, variance_.mutable_gpu_data());
caffe_gpu_powx(variance_.count(), variance_.gpu_data(), Dtype(0.5),
variance_.mutable_gpu_data());

// do mean and variance normalization
if (bottom[0] != top[0]) {
caffe_copy(bottom[0]->count(), bottom_data, top_data);
}
// subtract mean
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.gpu_data(), mean_.gpu_data(), 0.,
num_by_chans_.mutable_gpu_data());
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, channels_ * num,
spatial_dim, 1, -1, num_by_chans_.gpu_data(),
spatial_sum_multiplier_.gpu_data(), 1., top_data);
// replicate variance to input size
caffe_gpu_gemm<Dtype>(CblasNoTrans, CblasNoTrans, num, channels_, 1, 1,
batch_sum_multiplier_.gpu_data(), variance_.gpu_data(), 0.,
Expand Down

0 comments on commit 0ad1d8a

Please sign in to comment.