diff --git a/examples/mnist/lenet_adadelta_solver.prototxt b/examples/mnist/lenet_adadelta_solver.prototxt
index b77b451d56a..776d1e06139 100644
--- a/examples/mnist/lenet_adadelta_solver.prototxt
+++ b/examples/mnist/lenet_adadelta_solver.prototxt
@@ -7,6 +7,8 @@ test_iter: 100
 # Carry out testing every 500 training iterations.
 test_interval: 500
 # The base learning rate, momentum and the weight decay of the network.
+base_lr: 1.0
+lr_policy: "fixed"
 momentum: 0.95
 weight_decay: 0.0005
 # Display every 100 iterations
diff --git a/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt b/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
index 4e43468a71f..065647df31b 100644
--- a/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
+++ b/examples/mnist/mnist_autoencoder_solver_adadelta.prototxt
@@ -5,6 +5,8 @@ test_state: { stage: 'test-on-test' }
 test_iter: 100
 test_interval: 500
 test_compute_loss: true
+base_lr: 1.0
+lr_policy: "fixed"
 momentum: 0.95
 delta: 1e-8
 display: 100
diff --git a/include/caffe/solver.hpp b/include/caffe/solver.hpp
index 495cd4f159e..5fefd01e549 100644
--- a/include/caffe/solver.hpp
+++ b/include/caffe/solver.hpp
@@ -82,12 +82,12 @@ class SGDSolver : public Solver<Dtype> {
   const vector<shared_ptr<Blob<Dtype> > >& history() { return history_; }
 
  protected:
+  void PreSolve();
   Dtype GetLearningRate();
   virtual void ApplyUpdate();
   virtual void Normalize(int param_id);
   virtual void Regularize(int param_id);
   virtual void ComputeUpdateValue(int param_id, Dtype rate);
-  virtual void PreSolve();
   virtual void ClipGradients();
   virtual void SnapshotSolverState(const string& model_filename);
   virtual void SnapshotSolverStateToBinaryProto(const string& model_filename);
@@ -162,19 +162,13 @@ template <typename Dtype>
 class AdaDeltaSolver : public SGDSolver<Dtype> {
  public:
   explicit AdaDeltaSolver(const SolverParameter& param)
-      : SGDSolver<Dtype>(param) { PreSolve(); constructor_sanity_check(); }
+      : SGDSolver<Dtype>(param) { AdaDeltaPreSolve(); }
   explicit AdaDeltaSolver(const string& param_file)
-      : SGDSolver<Dtype>(param_file) { PreSolve(); constructor_sanity_check(); }
+      : SGDSolver<Dtype>(param_file) { AdaDeltaPreSolve(); }
 
  protected:
-  virtual void PreSolve();
-  virtual void ComputeUpdateValue();
-  void constructor_sanity_check() {
-    CHECK_EQ(0, this->param_.base_lr())
-        << "Learning rate cannot be used with AdaDelta.";
-    CHECK_EQ("", this->param_.lr_policy())
-        << "Learning rate policy cannot be applied to AdaDelta.";
-  }
+  void AdaDeltaPreSolve();
+  virtual void ComputeUpdateValue(int param_id, Dtype rate);
 
   DISABLE_COPY_AND_ASSIGN(AdaDeltaSolver);
 };
diff --git a/src/caffe/solver.cpp b/src/caffe/solver.cpp
index 34a290ffe3d..78902ca0ebc 100644
--- a/src/caffe/solver.cpp
+++ b/src/caffe/solver.cpp
@@ -935,10 +935,10 @@ void RMSPropSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
 }
 
 template <typename Dtype>
-void AdaDeltaSolver<Dtype>::PreSolve() {
+void AdaDeltaSolver<Dtype>::AdaDeltaPreSolve() {
   // Add the extra history entries for AdaDelta after those from
   // SGDSolver::PreSolve
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
   for (int i = 0; i < net_params.size(); ++i) {
     const vector<int>& shape = net_params[i]->shape();
     this->history_.push_back(
@@ -947,172 +947,134 @@
 }
 
 template <typename Dtype>
-void AdaDeltaSolver<Dtype>::ComputeUpdateValue() {
-  const vector<shared_ptr<Blob<Dtype> > >& net_params = this->net_->params();
-  const vector<float>& net_params_weight_decay =
-      this->net_->params_weight_decay();
+void AdaDeltaSolver<Dtype>::ComputeUpdateValue(int param_id, Dtype rate) {
+  const vector<Blob<Dtype>*>& net_params = this->net_->learnable_params();
+  const vector<float>& net_params_lr = this->net_->params_lr();
   Dtype delta = this->param_.delta();
   Dtype momentum = this->param_.momentum();
-  Dtype weight_decay = this->param_.weight_decay();
-  string regularization_type = this->param_.regularization_type();
+  Dtype local_rate = rate * net_params_lr[param_id];
   size_t update_history_offset = net_params.size();
   switch (Caffe::mode()) {
-  case Caffe::CPU:
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_axpy(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_cpu_sign(net_params[param_id]->count(),
-              net_params[param_id]->cpu_data(),
-              this->temp_[param_id]->mutable_cpu_data());
-          caffe_axpy(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->cpu_data(),
-              net_params[param_id]->mutable_cpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
+  case Caffe::CPU: {
+    // compute square of gradient in update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
 
-      // compute square of gradient in update
-      caffe_powx(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_cpu_data());
-
-      // update history of gradients
-      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-          this->update_[param_id]->cpu_data(), momentum,
-          this->history_[param_id]->mutable_cpu_data());
-
-      // add delta to history to guard against dividing by zero later
-      caffe_set(net_params[param_id]->count(), delta,
-          this->temp_[param_id]->mutable_cpu_data());
-
-      caffe_add(net_params[param_id]->count(),
-          this->temp_[param_id]->cpu_data(),
-          this->history_[update_history_offset + param_id]->cpu_data(),
-          this->update_[param_id]->mutable_cpu_data());
-
-      caffe_add(net_params[param_id]->count(),
-          this->temp_[param_id]->cpu_data(),
-          this->history_[param_id]->cpu_data(),
-          this->temp_[param_id]->mutable_cpu_data());
-
-      // divide history of updates by history of gradients
-      caffe_div(net_params[param_id]->count(),
-          this->update_[param_id]->cpu_data(),
-          this->temp_[param_id]->cpu_data(),
-          this->update_[param_id]->mutable_cpu_data());
-
-      // jointly compute the RMS of both for update and gradient history
-      caffe_powx(net_params[param_id]->count(),
-          this->update_[param_id]->cpu_data(), Dtype(0.5),
-          this->update_[param_id]->mutable_cpu_data());
-
-      // compute the update
-      caffe_mul(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(),
-          this->update_[param_id]->cpu_data(),
-          net_params[param_id]->mutable_cpu_diff());
-
-      // compute square of update
-      caffe_powx(net_params[param_id]->count(),
-          net_params[param_id]->cpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_cpu_data());
-
-      // update history of updates
-      caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-          this->update_[param_id]->cpu_data(), momentum,
-          this->history_[update_history_offset + param_id]->mutable_cpu_data());
-    }
+    // update history of gradients
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[param_id]->mutable_cpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[update_history_offset + param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    caffe_add(net_params[param_id]->count(),
+        this->temp_[param_id]->cpu_data(),
+        this->history_[param_id]->cpu_data(),
+        this->temp_[param_id]->mutable_cpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_div(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(),
+        this->temp_[param_id]->cpu_data(),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // jointly compute the RMS of both for update and gradient history
+    caffe_powx(net_params[param_id]->count(),
+        this->update_[param_id]->cpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // compute the update
+    caffe_mul(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(),
+        this->update_[param_id]->cpu_data(),
+        net_params[param_id]->mutable_cpu_diff());
+
+    // compute square of update
+    caffe_powx(net_params[param_id]->count(),
+        net_params[param_id]->cpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_cpu_data());
+
+    // update history of updates
+    caffe_cpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->cpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_cpu_data());
+
+    // apply learning rate
+    caffe_cpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->cpu_diff(),
+        net_params[param_id]->mutable_cpu_diff());
     break;
-  case Caffe::GPU:
+  }
+  case Caffe::GPU: {
 #ifndef CPU_ONLY
-    for (int param_id = 0; param_id < net_params.size(); ++param_id) {
-      Dtype local_decay = weight_decay * net_params_weight_decay[param_id];
-
-      if (local_decay) {
-        if (regularization_type == "L2") {
-          // add weight decay
-          caffe_gpu_axpy(net_params[param_id]->count(),
-              local_decay,
-              net_params[param_id]->gpu_data(),
-              net_params[param_id]->mutable_gpu_diff());
-        } else if (regularization_type == "L1") {
-          caffe_gpu_sign(net_params[param_id]->count(),
-              net_params[param_id]->gpu_data(),
-              this->temp_[param_id]->mutable_gpu_data());
-          caffe_gpu_axpy(net_params[param_id]->count(),
-              local_decay,
-              this->temp_[param_id]->gpu_data(),
-              net_params[param_id]->mutable_gpu_diff());
-        } else {
-          LOG(FATAL) << "Unknown regularization type: " << regularization_type;
-        }
-      }
+    // compute square of gradient in update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
 
-      // compute square of gradient in update
-      caffe_gpu_powx(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_gpu_data());
-
-      // update history of gradients
-      caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-          this->update_[param_id]->gpu_data(), momentum,
-          this->history_[param_id]->mutable_gpu_data());
-
-      // add delta to history to guard against dividing by zero later
-      caffe_gpu_set(net_params[param_id]->count(), delta,
-          this->temp_[param_id]->mutable_gpu_data());
-
-      caffe_gpu_add(net_params[param_id]->count(),
-          this->temp_[param_id]->gpu_data(),
-          this->history_[update_history_offset + param_id]->gpu_data(),
-          this->update_[param_id]->mutable_gpu_data());
-
-      caffe_gpu_add(net_params[param_id]->count(),
-          this->temp_[param_id]->gpu_data(),
-          this->history_[param_id]->gpu_data(),
-          this->temp_[param_id]->mutable_gpu_data());
-
-      // divide history of updates by history of gradients
-      caffe_gpu_div(net_params[param_id]->count(),
-          this->update_[param_id]->gpu_data(),
-          this->temp_[param_id]->gpu_data(),
-          this->update_[param_id]->mutable_gpu_data());
-
-      // jointly compute the RMS of both for update and gradient history
-      caffe_gpu_powx(net_params[param_id]->count(),
-          this->update_[param_id]->gpu_data(), Dtype(0.5),
-          this->update_[param_id]->mutable_gpu_data());
-
-      // compute the update and copy to net_diff
-      caffe_gpu_mul(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(),
-          this->update_[param_id]->gpu_data(),
-          net_params[param_id]->mutable_gpu_diff());
-
-      // compute square of update
-      caffe_gpu_powx(net_params[param_id]->count(),
-          net_params[param_id]->gpu_diff(), Dtype(2),
-          this->update_[param_id]->mutable_gpu_data());
-
-      // update history of updates
-      caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
-          this->update_[param_id]->gpu_data(), momentum,
-          this->history_[update_history_offset + param_id]->mutable_gpu_data());
-    }
+    // update history of gradients
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[param_id]->mutable_gpu_data());
+
+    // add delta to history to guard against dividing by zero later
+    caffe_gpu_set(net_params[param_id]->count(), delta,
+        this->temp_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[update_history_offset + param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    caffe_gpu_add(net_params[param_id]->count(),
+        this->temp_[param_id]->gpu_data(),
+        this->history_[param_id]->gpu_data(),
+        this->temp_[param_id]->mutable_gpu_data());
+
+    // divide history of updates by history of gradients
+    caffe_gpu_div(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(),
+        this->temp_[param_id]->gpu_data(),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // jointly compute the RMS of both for update and gradient history
+    caffe_gpu_powx(net_params[param_id]->count(),
+        this->update_[param_id]->gpu_data(), Dtype(0.5),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // compute the update and copy to net_diff
+    caffe_gpu_mul(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(),
+        this->update_[param_id]->gpu_data(),
+        net_params[param_id]->mutable_gpu_diff());
+
+    // compute square of update
+    caffe_gpu_powx(net_params[param_id]->count(),
+        net_params[param_id]->gpu_diff(), Dtype(2),
+        this->update_[param_id]->mutable_gpu_data());
+
+    // update history of updates
+    caffe_gpu_axpby(net_params[param_id]->count(), Dtype(1) - momentum,
+        this->update_[param_id]->gpu_data(), momentum,
+        this->history_[update_history_offset + param_id]->mutable_gpu_data());
+
+    // apply learning rate
+    caffe_gpu_scale(net_params[param_id]->count(), local_rate,
+        net_params[param_id]->gpu_diff(),
+        net_params[param_id]->mutable_gpu_diff());
 #else
     NO_GPU;
 #endif
     break;
+  }
   default:
     LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
   }
diff --git a/src/caffe/test/test_gradient_based_solver.cpp b/src/caffe/test/test_gradient_based_solver.cpp
index 277aa3a5c8e..1d84b1a72ed 100644
--- a/src/caffe/test/test_gradient_based_solver.cpp
+++ b/src/caffe/test/test_gradient_based_solver.cpp
@@ -165,10 +165,6 @@ class GradientBasedSolverTest : public MultiDeviceTest<TypeParam> {
        "      bottom: 'targets' "
        "  } "
        "} ";
-    if (learning_rate != 0) {
-      proto << "base_lr: " << learning_rate << " ";
-      proto << "lr_policy: 'fixed' ";
-    }
     if (weight_decay != 0) {
       proto << "weight_decay: " << weight_decay << " ";
     }
@@ -897,6 +893,138 @@ TYPED_TEST(NesterovSolverTest, TestSnapshotShare) {
   }
 }
 
+template <typename TypeParam>
+class AdaDeltaSolverTest : public GradientBasedSolverTest<TypeParam> {
+  typedef typename TypeParam::Dtype Dtype;
+
+ protected:
+  virtual void InitSolver(const SolverParameter& param) {
+    this->solver_.reset(new AdaDeltaSolver<Dtype>(param));
+  }
+
+  virtual SolverParameter_SolverType solver_type() {
+    return SolverParameter_SolverType_ADADELTA;
+  }
+};
+
+TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices);
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdate) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  this->TestLeastSquaresUpdate(kLearningRate);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithWeightDecay) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.5;
+  const Dtype kMomentum = 0.95;
+  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithHalfMomentum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.5;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithMomentum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 1;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.0;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithEverything) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest,
+           TestAdaDeltaLeastSquaresUpdateWithEverythingShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 0; i <= kNumIters; ++i) {
+    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithEverythingAccum) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdaDeltaSolverTest,
+           TestLeastSquaresUpdateWithEverythingAccumShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  const int kIterSize = 2;
+  this->share_ = true;
+  this->CheckAccumulation(kLearningRate, kWeightDecay, kMomentum, kNumIters,
+      kIterSize);
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestSnapshot) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
+TYPED_TEST(AdaDeltaSolverTest, TestSnapshotShare) {
+  typedef typename TypeParam::Dtype Dtype;
+  const Dtype kLearningRate = 1.0;
+  const Dtype kWeightDecay = 0.1;
+  const Dtype kMomentum = 0.95;
+  const int kNumIters = 4;
+  this->share_ = true;
+  for (int i = 1; i <= kNumIters; ++i) {
+    this->TestSnapshot(kLearningRate, kWeightDecay, kMomentum, i);
+  }
+}
+
 template <typename TypeParam>
 class RMSPropSolverTest : public GradientBasedSolverTest<TypeParam> {
   typedef typename TypeParam::Dtype Dtype;
@@ -1003,78 +1131,4 @@ TYPED_TEST(RMSPropSolverTest, TestSnapshotShare) {
   }
 }
 
-template <typename TypeParam>
-class AdaDeltaSolverTest : public GradientBasedSolverTest<TypeParam> {
-  typedef typename TypeParam::Dtype Dtype;
-
- protected:
-  virtual void InitSolver(const SolverParameter& param) {
-    this->solver_.reset(new AdaDeltaSolver<Dtype>(param));
-  }
-
-  virtual SolverParameter_SolverType solver_type() {
-    return SolverParameter_SolverType_ADADELTA;
-  }
-};
-
-TYPED_TEST_CASE(AdaDeltaSolverTest, TestDtypesAndDevices);
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdate) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.0;
-  this->TestLeastSquaresUpdate(kLearningRate);
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithWeightDecay) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.0;
-  const Dtype kWeightDecay = 0.5;
-  const Dtype kMomentum = 0.95;
-  this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithHalfMomentum) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.5;
-  const int kNumIters = 1;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
-  }
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithMomentum) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.95;
-  const int kNumIters = 1;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum);
-  }
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestLeastSquaresUpdateWithMomentumMultiIter) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.0;
-  const Dtype kWeightDecay = 0.0;
-  const Dtype kMomentum = 0.95;
-  const int kNumIters = 4;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
-  }
-}
-
-TYPED_TEST(AdaDeltaSolverTest, TestAdaDeltaLeastSquaresUpdateWithEverything) {
-  typedef typename TypeParam::Dtype Dtype;
-  const Dtype kLearningRate = 0.0;
-  const Dtype kWeightDecay = 0.1;
-  const Dtype kMomentum = 0.95;
-  const int kNumIters = 4;
-  for (int i = 0; i <= kNumIters; ++i) {
-    this->TestLeastSquaresUpdate(kLearningRate, kWeightDecay, kMomentum, i);
-  }
-}
-
 }  // namespace caffe
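
For reference: element by element, the sequence of caffe_powx / caffe_cpu_axpby /
caffe_add / caffe_div / caffe_mul / caffe_cpu_scale calls in the new
ComputeUpdateValue(param_id, rate) reduces to the scalar AdaDelta rule sketched
below. This is an illustrative sketch only, not part of the patch; the function
and variable names (adadelta_step, h_g2, h_dx2) are hypothetical.

#include <cmath>
#include <cstdio>

// One AdaDelta step for a single weight.
// g     : current gradient                      (net_params[param_id]->diff)
// h_g2  : running average of squared gradients  (history_[param_id])
// h_dx2 : running average of squared updates    (history_[offset + param_id])
// rho   : momentum (0.95 in the example prototxts)
// eps   : delta (1e-8), guards against division by zero
// lr    : local_rate = rate * per-param lr multiplier, new in this patch
double adadelta_step(double g, double* h_g2, double* h_dx2,
                     double rho, double eps, double lr) {
  *h_g2 = rho * *h_g2 + (1.0 - rho) * g * g;      // update E[g^2] first
  double dx = std::sqrt(*h_dx2 + eps)             // RMS of past updates...
            / std::sqrt(*h_g2 + eps) * g;         // ...over RMS of gradients
  *h_dx2 = rho * *h_dx2 + (1.0 - rho) * dx * dx;  // E[dx^2] from unscaled dx
  return lr * dx;                                 // the caffe_*_scale call
}

int main() {
  double h_g2 = 0.0, h_dx2 = 0.0;
  for (int t = 0; t < 3; ++t) {
    std::printf("iter %d: step %.3e\n", t,
                adadelta_step(0.1, &h_g2, &h_dx2, 0.95, 1e-8, 1.0));
  }
  return 0;
}

Because h_dx2 accumulates the unscaled dx (the solver squares the diff before
the final scale call), the two history banks stay independent of the learning
rate; with the base_lr: 1.0 / lr_policy: "fixed" settings added to the example
prototxts, the step matches the previous learning-rate-free AdaDelta behavior.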