Merge with latest develop branch. Optimizer lib2 #2386
@@ -0,0 +1,17 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})

set(OPITMIZER_SRCS
    adadelta_optimizer.cc
    adagrad_optimizer.cc
    adam_optimizer.cc
    optimizer.cc
    parameter_optimizer.cc
    sgd_optmizer.cc
    )

add_library(optimizer STATIC ${OPITMIZER_SRCS})
add_dependencies(optimizer gen_proto_cpp)

add_simple_unittest(tensor_test)
add_simple_unittest(parameter_optimizer_test)
add_dependencies(parameter_optimizer_test optimizer)
@@ -0,0 +1,52 @@
#pragma once
/**
 * @brief tensor used by optimizer
 */

#include <string.h>
#include <memory>
#include "paddle/utils/Common.h"
#include "paddle/utils/Logging.h"

namespace paddle {
namespace optimizer {

template <class T>
class TensorT {
public:
  // Allocates a new buffer of `size` elements, owned by this tensor.
  TensorT(size_t size) : height_(1), width_(size) { data_ = new T[size]; }
  // Wraps an existing buffer without copying it.
  TensorT(T* data, size_t size) : height_(1), width_(size), data_(data) {}
  TensorT(T* data, size_t h, size_t w) : height_(h), width_(w), data_(data) {}
  // Copy constructor: the new tensor shares the old tensor's buffer.
  TensorT(const TensorT& t) : TensorT(t.data_, t.size()) {}
I guess this is a copy constructor: it creates a new tensor from the old one, and the two tensors share the same buffer.
👍 fix done.
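For illustration only, a minimal sketch of the two options discussed in this thread; these are fragments meant to live inside the TensorT class above, and the deep-copy variant is a hypothetical alternative, not what this PR does:

// Shallow copy, as in the fix above: both tensors point at the same buffer,
// so writes through either tensor are visible to both and the buffer must
// only be freed once.
TensorT(const TensorT& t) : TensorT(t.data_, t.size()) {}

// Hypothetical deep-copy alternative: allocate fresh storage and copy the
// contents, so each tensor owns an independent buffer.
// TensorT(const TensorT& t) : height_(t.height_), width_(t.width_) {
//   data_ = new T[t.size()];
//   memcpy(data_, t.data_, t.size() * sizeof(T));
// }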
  ~TensorT() {
    if (data_) delete[] data_;
  }

  TensorT& operator=(const TensorT& t) {
    this->width_ = t.size();
    this->data_ = t.data_;
    return *this;
  }
  T* get_buffer() { return this->data_; }
  T& operator[](const size_t idx) {
    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
    return data_[idx];
  }
  T& operator[](const size_t idx) const {
    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
    return data_[idx];
  }
  // TODO: replace with tensorshape
  size_t size() const { return this->width_ * this->height_; }

protected:
  size_t height_;
  size_t width_;
  T* data_;
};

// TODO(zhihong): design problem of dynamic datatype, need to fix it
It seems that when porting "majel" to PaddlePaddle, we already included boost/variant.hpp for the "single value multiple type" container.
Agreed, 👍. Either we can wait for their majel port job to finish, or implement another one with boost::variant ourselves; see the sketch below.
I see.
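As a rough illustration of the boost::variant idea mentioned above (not code from this PR; the alias name, the float/double instantiations, and the visitor are assumptions), a tensor that can hold either element type could look roughly like this:

#include <boost/variant.hpp>

// Hypothetical "single value, multiple type" tensor: one object that holds
// either a float tensor or a double tensor, resolved at runtime.
typedef boost::variant<TensorT<float>, TensorT<double>> VariantTensor;

// Callers dispatch on the stored type with a visitor, e.g. to query the size:
struct SizeVisitor : public boost::static_visitor<size_t> {
  template <class T>
  size_t operator()(const TensorT<T>& t) const {
    return t.size();
  }
};
// size_t n = boost::apply_visitor(SizeVisitor(), some_variant_tensor);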
typedef TensorT<float> Tensor;

}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,20 @@
#include <iostream>
maybe we should add …
#include "gtest/gtest.h"
#include "tensor.h"

using namespace paddle;
using namespace paddle::optimizer;

TEST(Tensor, indexer) {
  Tensor t(3);
  for (size_t i = 0; i < t.size(); ++i) {
    t[i] = i;
  }
  ASSERT_EQ(t[2], 2);
  ASSERT_EQ(t[1], 1);
}

int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  return RUN_ALL_TESTS();
}
@@ -0,0 +1,36 @@
#include "adadelta_optimizer.h"
#include <algorithm>
#include <cmath>

namespace paddle {
namespace optimizer {

void AdadeltaOptimizer::set_weight(Tensor* p) {
Why the content of … ?
fix done
Need Google-style function names. Please replace all C++ function names with CamelCase: https://google.github.io/styleguide/cppguide.html#Function_Names
Thanks, sorry!
  parameter_ = p;
  size_t size = p->size();
  accum_gradient_ = new Tensor(size);
Should we free the previous accum_gradient_, accum_delta_, update_delta_ if they are not NULL? Same for the other optimizers.
In my mind, an optimizer instance maps to exactly one parameter and we never reuse the instance, even when restarting from a checkpoint. In that situation we should not need a pointer guard.
In my opinion, the contract should be the API, not an oral or documentation contract like "we never reuse the instance". An oral contract does not prevent a client from reusing the instance; people easily forget an oral contract, and a documentation contract easily gets out of date or simply is not read.
That's true in real life. Thanks a lot! Fixed.
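One way to make that contract explicit in the API, sketched here only as an illustration (a class fragment, not necessarily how the PR fixed it): hold the accumulator tensors in std::unique_ptr so a repeated set_weight call releases the old buffers instead of leaking them.

#include <memory>

class AdadeltaOptimizer : public ParameterOptimizer {
public:
  // ... constructor as in the header below ...
  void set_weight(Tensor* p) {
    parameter_ = p;
    size_t size = p->size();
    // reset() frees the previously held tensor (if any) before taking the
    // new one, so reusing the optimizer instance cannot leak.
    accum_gradient_.reset(new Tensor(size));
    accum_delta_.reset(new Tensor(size));
    update_delta_.reset(new Tensor(size));
  }

private:
  std::unique_ptr<Tensor> accum_gradient_;
  std::unique_ptr<Tensor> accum_delta_;
  std::unique_ptr<Tensor> update_delta_;
  // no manual deletes needed in the destructor
};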
  accum_delta_ = new Tensor(size);
  update_delta_ = new Tensor(size);
}

void AdadeltaOptimizer::Update(const Tensor* gradient) {
  num_sample_passed_ += 1;
  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
  Tensor& param = *parameter_;
  const Tensor& grad = *gradient;
  Tensor& accum_g = *accum_gradient_;
  Tensor& accum_d = *accum_delta_;
  Tensor& update_d = *update_delta_;
  for (size_t i = 0; i < param.size(); ++i) {
    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];

    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
                  std::sqrt(accum_g[i] + epsilon_) * grad[i];

    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];

    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
  }
}
}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,36 @@
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdadeltaOptimizer : public ParameterOptimizer {
public:
  AdadeltaOptimizer(double rho, double epsilon, double decay, LrPolicy *lr)
      : ParameterOptimizer(lr),
        accum_gradient_(nullptr),
        accum_delta_(nullptr),
        update_delta_(nullptr),
        rho_(rho),
        epsilon_(epsilon),
        decay_(decay) {}
  ~AdadeltaOptimizer() {
    if (accum_gradient_) delete accum_gradient_;
    if (accum_delta_) delete accum_delta_;
    if (update_delta_) delete update_delta_;
  }
  void Update(const Tensor *gradient);
  void set_weight(Tensor *p);

private:
  Tensor *accum_gradient_;
  Tensor *accum_delta_;
  Tensor *update_delta_;
  double rho_;
  double epsilon_;
  double decay_;
};

}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,28 @@
#include <cmath>

#include "adagrad_optimizer.h"

namespace paddle {
namespace optimizer {

void AdagradOptimizer::set_weight(Tensor* p) {
  parameter_ = p;
  size_t size = p->size();
Seems the content of …
That's a horrible mistake....
  accum_gradient_ = new Tensor(size);
}

void AdagradOptimizer::Update(const Tensor* gradient) {
  num_sample_passed_ += 1;
  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
  Tensor& param = *parameter_;
  Tensor& accum_g = *accum_gradient_;
  const Tensor& grad = *gradient;
  for (size_t i = 0; i < param.size(); ++i) {
    accum_g[i] += grad[i] * grad[i];
    param[i] -= learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
                learning_rate * decay_ * param[i];
  }
}

}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,28 @@
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdagradOptimizer : public ParameterOptimizer {
public:
  AdagradOptimizer(double epsilon, double decay, LrPolicy *lr)
      : ParameterOptimizer(lr),
        accum_gradient_(nullptr),
        epsilon_(epsilon),
        decay_(decay) {}
  ~AdagradOptimizer() {
    if (accum_gradient_) delete accum_gradient_;
  }
  void Update(const Tensor *gradient);
  void set_weight(Tensor *p);

private:
  Tensor *accum_gradient_;
  double epsilon_;
  double decay_;
};

}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,32 @@
#include "adam_optimizer.h"
#include <cmath>

namespace paddle {
namespace optimizer {

void AdamOptimizer::set_weight(Tensor *p) {
  parameter_ = p;
  size_t size = p->size();
  momentums_ = new Tensor(size);
  velocitys_ = new Tensor(size);
}

void AdamOptimizer::Update(const Tensor *gradient) {
  num_sample_passed_ += 1;
  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
  learning_rate *= std::sqrt(coef2) / coef1;
  Tensor &param = *parameter_;
  const Tensor &grad = *gradient;
  Tensor &m = *momentums_;
  Tensor &v = *velocitys_;
  for (size_t i = 0; i < param.size(); ++i) {
    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
    param[i] -=
        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
  }
}
}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,36 @@
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdamOptimizer : public ParameterOptimizer {
public:
  AdamOptimizer(
      double beta_1, double beta_2, double epsilon, double decay, LrPolicy *lr)
      : ParameterOptimizer(lr),
        momentums_(nullptr),
        velocitys_(nullptr),
        beta_1_(beta_1),
        beta_2_(beta_2),
        epsilon_(epsilon),
        decay_(decay) {}
  ~AdamOptimizer() {
    if (momentums_) delete momentums_;
    if (velocitys_) delete velocitys_;
  }
  void Update(const Tensor *gradient);
  void set_weight(Tensor *p);

private:
  Tensor *momentums_;
  Tensor *velocitys_;
  double beta_1_;
  double beta_2_;
  double epsilon_;
  double decay_;
};

}  // namespace optimizer
}  // namespace paddle
@@ -0,0 +1,42 @@
#pragma once

#include <algorithm>
#include "OptimizerConfig.pb.h"

namespace paddle {
namespace optimizer {

class LrPolicy {
public:
  virtual ~LrPolicy() {}
  virtual double LearningRate(const uint64_t num_sample_passed) = 0;
};

// constant learning rate policy
class ConstLr final : public LrPolicy {
public:
  ConstLr(double lr) : learning_rate(lr) {}
  double LearningRate(const uint64_t num_sample_passed) {
    return learning_rate;
  }

private:
  double learning_rate;
Why the …
fix done.
};

class LinearLr final : public LrPolicy {
public:
  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
      : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
  double LearningRate(const uint64_t num_sample_passed) {
    return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
  }

private:
  double learning_rate;
  double lr_decay_a;
  double lr_decay_b;
};

}  // namespace optimizer
}  // namespace paddle
Can you explain to me what ":" does here? Sorry, I am not too familiar with it and don't know what keyword to search for.
This is an initializer in C++, which is the idiomatic way to initialize members. Please see here for details: http://en.cppreference.com/w/cpp/language/direct_initialization
The ":" after the parentheses is a C++ initialization mechanism; it is not the same concept as the constructor body. The relationship between the initializer list and the constructor body is analogous to Python's __new__ and __init__: the initializer list runs before the constructor body (the part in braces) executes.
In Chinese it is called 初始化列表; in English it is called a constructor initializer list.
1. The initializer list completes before any function body executes.
2. Members are initialized in the order they are declared, not the order they appear in the list.
There are also restrictions for non-POD members:
https://stackoverflow.com/questions/5816218/difference-between-initializer-and-default-initializer-list-in-c
https://stackoverflow.com/questions/9903248/initializing-fields-in-constructor-initializer-list-vs-constructor-body
In general, it is recommended to initialize all non-static members this way. A small example is sketched below.
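To make the point above concrete, a minimal standalone sketch (not code from this PR; the class and member names are made up for illustration) contrasting the constructor initializer list with assignment in the constructor body:

#include <iostream>

class ConstLrExample {
public:
  // Initializer list: members are initialized before the constructor body runs.
  // They are initialized in declaration order (learning_rate, then
  // num_decay_steps), regardless of the order written after the ':'.
  explicit ConstLrExample(double lr) : learning_rate(lr), num_decay_steps(0) {}

  // Similar effect, but these are assignments inside the body: the members are
  // first default-initialized, then overwritten.
  ConstLrExample(double lr, int steps) {
    learning_rate = lr;
    num_decay_steps = steps;
  }

private:
  double learning_rate;
  int num_decay_steps;
};

int main() {
  ConstLrExample a(0.01);      // uses the initializer list
  ConstLrExample b(0.01, 10);  // uses assignment in the constructor body
  (void)a;
  (void)b;
  std::cout << "constructed both examples\n";
  return 0;
}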