Merge with latest develop branch. Optimizer lib2 #2386
Changes from 26 commits
CMakeLists.txt for the new optimizer library (@@ -0,0 +1,16 @@)

```cmake
include_directories(${CMAKE_CURRENT_BINARY_DIR})

set(OPITMIZER_SRCS
    adadelta_optimizer.cc
    adagrad_optimizer.cc
    adam_optimizer.cc
    optimizer.cc
    parameter_optimizer.cc
    sgd_optimizer.cc
)

add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
add_dependencies(paddle_optimizer gen_proto_cpp)

add_simple_unittest(serialization_test)
add_simple_unittest(parameter_optimizer_test)
```
Tensor header used by the optimizers (@@ -0,0 +1,55 @@)

```cpp
#pragma once
/**
 * @brief tensor used by optimizer
 */

#include <string.h>
#include <memory>
#include "paddle/math/MemoryHandle.h"
#include "paddle/utils/Common.h"
#include "paddle/utils/Logging.h"

namespace paddle {
namespace optimizer {

template <class T>
class TensorT {
public:
  TensorT(size_t size)
      : TensorT(std::make_shared<CpuMemoryHandle>(size * sizeof(float)), size) {
  }
  TensorT(CpuMemHandlePtr handle, size_t size)
      : height_(1),
        width_(size),
        data_(reinterpret_cast<T*>(handle->getBuf())) {}

  TensorT(T* data, size_t size) : height_(1), width_(size), data_(data) {}

  TensorT(T* data, size_t h, size_t w) : height_(h), width_(w), data_(data) {}

  virtual ~TensorT() {}

  T* get_buffer() { return this->data_; }

  T& operator[](const size_t idx) {
    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
    return data_[idx];
  }
  T& operator[](const size_t idx) const {
    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
    return data_[idx];
  }
  // TODO: replace with tensorshape
  size_t size() const { return this->width_ * this->height_; }

protected:
  size_t height_;
  size_t width_;
  T* data_;
};

// TODO(zhihong): design problem of dynamic datatype, need to fix it
typedef TensorT<float> Tensor;

}  // namespace optimizer
}  // namespace paddle
```

Review discussion on the `TODO(zhihong)` dynamic-datatype comment:

- "It seems that when porting "majel" to PaddlePaddle, we already included boost/variant.hpp for the "single value, multiple type" container."
- "Agreed, 👍. Either we can wait for their majel port to finish, or implement another one with it."
- "I see."
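As a reference for the discussion above, here is a minimal sketch (not part of this PR) of how boost/variant.hpp could serve as the "single value, multiple type" container on top of the `TensorT` template in this header. The `TensorVariant`, `SizeVisitor`, and `TensorSize` names are illustrative only, and the sketch assumes the header above is included or visible in the same translation unit.

```cpp
#include <boost/variant.hpp>
#include <cstddef>

// Illustrative only: a single object that can hold either a float or a
// double tensor, instead of fixing the element type with a typedef.
using TensorVariant = boost::variant<paddle::optimizer::TensorT<float>,
                                     paddle::optimizer::TensorT<double>>;

// A visitor that works for whichever element type the variant currently holds.
struct SizeVisitor : public boost::static_visitor<size_t> {
  template <class T>
  size_t operator()(const paddle::optimizer::TensorT<T>& t) const {
    return t.size();
  }
};

size_t TensorSize(const TensorVariant& v) {
  return boost::apply_visitor(SizeVisitor(), v);
}
```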
AdadeltaOptimizer implementation (@@ -0,0 +1,55 @@)

```cpp
#include "adadelta_optimizer.h"
#include <algorithm>
#include <cmath>

namespace paddle {
namespace optimizer {

void AdadeltaOptimizer::Update(const Tensor* gradient) {
  num_sample_passed_ += 1;
  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
  Tensor& param = *parameter_;
  const Tensor& grad = *gradient;
  Tensor& accum_g = *accum_gradient_;
  Tensor& accum_d = *accum_delta_;
  Tensor& update_d = *update_delta_;
  for (size_t i = 0; i < param.size(); ++i) {
    accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];

    update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
                  std::sqrt(accum_g[i] + epsilon_) * grad[i];

    accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];

    param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
  }
}

const char* AdadeltaOptimizer::SerializeState(int* state_len) {
  AdadeltaOptimizerState state;
  // TODO(zhihong) : add lr_policy serialization
  state.set_num_sample_passed(num_sample_passed_);

  TensorToProto(*parameter_, state.mutable_parameter());
  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
  TensorToProto(*accum_delta_, state.mutable_accum_delta());
  TensorToProto(*update_delta_, state.mutable_update_delta());
  auto str = state.SerializeAsString();
  *state_len = str.size();
  return str.c_str();
}

void AdadeltaOptimizer::DeserializeState(const std::string& str) {
  AdadeltaOptimizerState state;
  state.ParseFromString(str);
  // TODO(zhihong) : add lr_policy DeserializeState
  num_sample_passed_ = state.num_sample_passed();

  ProtoToTensor(state.parameter(), parameter_);
  ProtoToTensor(state.accum_gradient(), accum_gradient_);
  ProtoToTensor(state.accum_delta(), accum_delta_);
  ProtoToTensor(state.update_delta(), update_delta_);
}

}  // namespace optimizer
}  // namespace paddle
```

Review discussion on `DeserializeState`:

- "Here, … is missing."
- "Firstly, …; secondly, …. In that format it is obviously easy to read, but it needs a pair of ….
- "I see."
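For readability, the element-wise loop in `AdadeltaOptimizer::Update` corresponds to the following update, transcribed directly from the code above (gradient g_t, ρ = `rho_`, ε = `epsilon_`, η the learning rate returned by the LR policy, λ = `decay_`):

```latex
\[
\begin{aligned}
E[g^2]_t &= \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2 \\
\Delta\theta_t &= \frac{\sqrt{E[\Delta\theta^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t \\
E[\Delta\theta^2]_t &= \rho\, E[\Delta\theta^2]_{t-1} + (1-\rho)\, \Delta\theta_t^2 \\
\theta_{t+1} &= \theta_t - \eta\, \Delta\theta_t - \eta\,\lambda\,\theta_t
\end{aligned}
\]
```

Here `accum_g` plays the role of E[g²], `update_d` of Δθ, and `accum_d` of E[Δθ²].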
AdadeltaOptimizer header, adadelta_optimizer.h (@@ -0,0 +1,39 @@)

```cpp
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdadeltaOptimizer : public ParameterOptimizer {
public:
  AdadeltaOptimizer(
      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
      : ParameterOptimizer(parameter, lr),
        accum_gradient_(new Tensor(parameter->size())),
        accum_delta_(new Tensor(parameter->size())),
        update_delta_(new Tensor(parameter->size())),
        rho_(rho),
        epsilon_(epsilon),
        decay_(decay) {}

  ~AdadeltaOptimizer() {
    if (accum_gradient_) delete accum_gradient_;
    if (accum_delta_) delete accum_delta_;
    if (update_delta_) delete update_delta_;
  }
  void Update(const Tensor *gradient);
  const char *SerializeState(int *state_len);
  void DeserializeState(const std::string &state);

private:
  Tensor *accum_gradient_;
  Tensor *accum_delta_;
  Tensor *update_delta_;
  double rho_;
  double epsilon_;
  double decay_;
};

}  // namespace optimizer
}  // namespace paddle
```
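A side note on the raw `Tensor *` accumulator members and the manual `delete` calls above: one possible alternative (not what this PR does) is to hold the accumulators in `std::unique_ptr`, which removes the need for a hand-written destructor. A minimal sketch under that assumption, reusing the types and includes from this header; the class name is illustrative:

```cpp
#include <memory>

// Sketch only: same constructor shape as AdadeltaOptimizer above, but the
// accumulator buffers are owned by std::unique_ptr, so they are released
// automatically and no explicit destructor is required.
class AdadeltaOptimizerSketch : public ParameterOptimizer {
public:
  AdadeltaOptimizerSketch(
      Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
      : ParameterOptimizer(parameter, lr),
        accum_gradient_(new Tensor(parameter->size())),
        accum_delta_(new Tensor(parameter->size())),
        update_delta_(new Tensor(parameter->size())),
        rho_(rho),
        epsilon_(epsilon),
        decay_(decay) {}

  void Update(const Tensor *gradient);
  const char *SerializeState(int *state_len);
  void DeserializeState(const std::string &state);

private:
  std::unique_ptr<Tensor> accum_gradient_;  // freed automatically
  std::unique_ptr<Tensor> accum_delta_;
  std::unique_ptr<Tensor> update_delta_;
  double rho_;
  double epsilon_;
  double decay_;
};
```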
AdagradOptimizer implementation (@@ -0,0 +1,42 @@)

```cpp
#include <cmath>

#include "adagrad_optimizer.h"

namespace paddle {
namespace optimizer {

void AdagradOptimizer::Update(const Tensor* gradient) {
  num_sample_passed_ += 1;
  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
  Tensor& param = *parameter_;
  Tensor& accum_g = *accum_gradient_;
  const Tensor& grad = *gradient;
  for (size_t i = 0; i < param.size(); ++i) {
    accum_g[i] += grad[i] * grad[i];
    param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
                learning_rate * decay_ * param[i];
  }
}

const char* AdagradOptimizer::SerializeState(int* state_len) {
  AdagradOptimizerState state;
  // TODO(zhihong) : add lr_policy serialization
  state.set_num_sample_passed(num_sample_passed_);

  TensorToProto(*parameter_, state.mutable_parameter());
  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
  auto str = state.SerializeAsString();
  *state_len = str.size();
  return str.c_str();
}

void AdagradOptimizer::DeserializeState(const std::string& str) {
  AdagradOptimizerState state;
  state.ParseFromString(str);
  // TODO(zhihong) : add lr_policy DeserializeState
  num_sample_passed_ = state.num_sample_passed();
  ProtoToTensor(state.parameter(), parameter_);
  ProtoToTensor(state.accum_gradient(), accum_gradient_);
}

}  // namespace optimizer
}  // namespace paddle
```
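A direct transcription of the `AdagradOptimizer::Update` loop above, with accumulated squared gradients G_t, learning rate η from the LR policy, and decay λ; the signs follow the code as written:

```latex
\[
\begin{aligned}
G_t &= G_{t-1} + g_t^2 \\
\theta_{t+1} &= \theta_t + \frac{\eta\, g_t}{\sqrt{G_t + \epsilon}} + \eta\,\lambda\,\theta_t
\end{aligned}
\]
```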
AdagradOptimizer header, adagrad_optimizer.h (@@ -0,0 +1,32 @@)

```cpp
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdagradOptimizer : public ParameterOptimizer {
public:
  AdagradOptimizer(Tensor *parameter,
                   LrPolicy *lr,
                   double epsilon,
                   double decay)
      : ParameterOptimizer(parameter, lr),
        accum_gradient_(new Tensor(parameter->size())),
        epsilon_(epsilon),
        decay_(decay) {}
  ~AdagradOptimizer() {
    if (accum_gradient_) delete accum_gradient_;
  }
  void Update(const Tensor *gradient);
  const char *SerializeState(int *state_len);
  void DeserializeState(const std::string &state);

private:
  Tensor *accum_gradient_;
  double epsilon_;
  double decay_;
};

}  // namespace optimizer
}  // namespace paddle
```
AdamOptimizer implementation (@@ -0,0 +1,47 @@)

```cpp
#include "adam_optimizer.h"
#include <cmath>

namespace paddle {
namespace optimizer {

void AdamOptimizer::Update(const Tensor *gradient) {
  num_sample_passed_ += 1;
  double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
  double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
  double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
  learning_rate *= std::sqrt(coef2) / coef1;
  Tensor &param = *parameter_;
  const Tensor &grad = *gradient;
  Tensor &m = *momentums_;
  Tensor &v = *velocitys_;
  for (size_t i = 0; i < param.size(); ++i) {
    m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
    v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
    param[i] -=
        learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
  }
}

const char *AdamOptimizer::SerializeState(int *state_len) {
  AdamOptimizerState state;
  // TODO(zhihong) : add lr_policy serialization
  state.set_num_sample_passed(num_sample_passed_);

  TensorToProto(*parameter_, state.mutable_parameter());
  TensorToProto(*velocitys_, state.mutable_momentums());
  auto str = state.SerializeAsString();
  *state_len = str.size();
  return str.c_str();
}

void AdamOptimizer::DeserializeState(const std::string &str) {
  AdamOptimizerState state;
  state.ParseFromString(str);
  // TODO(zhihong) : add lr_policy DeserializeState
  num_sample_passed_ = state.num_sample_passed();

  ProtoToTensor(state.parameter(), parameter_);
  ProtoToTensor(state.velocitys(), velocitys_);
}
}  // namespace optimizer
}  // namespace paddle
```
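Transcribing the `AdamOptimizer::Update` loop above, with the bias-corrected step size folded into the learning rate exactly as the code does (t = `num_sample_passed_`, η the policy learning rate, λ = `decay_`):

```latex
\[
\begin{aligned}
\hat\eta_t &= \eta \cdot \frac{\sqrt{1-\beta_2^{\,t}}}{1-\beta_1^{\,t}} \\
m_t &= \beta_1 m_{t-1} + (1-\beta_1)\, g_t \\
v_t &= \beta_2 v_{t-1} + (1-\beta_2)\, g_t^2 \\
\theta_{t+1} &= \theta_t - \hat\eta_t \left( \frac{m_t}{\sqrt{v_t + \epsilon}} + \lambda\,\theta_t \right)
\end{aligned}
\]
```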
AdamOptimizer header, adam_optimizer.h (@@ -0,0 +1,41 @@)

```cpp
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdamOptimizer : public ParameterOptimizer {
public:
  AdamOptimizer(Tensor *parameter,
                LrPolicy *lr,
                double beta_1,
                double beta_2,
                double epsilon,
                double decay)
      : ParameterOptimizer(parameter, lr),
        momentums_(new Tensor(parameter->size())),
        velocitys_(new Tensor(parameter->size())),
        beta_1_(beta_1),
        beta_2_(beta_2),
        epsilon_(epsilon),
        decay_(decay) {}
  ~AdamOptimizer() {
    if (momentums_) delete momentums_;
    if (velocitys_) delete velocitys_;
  }
  void Update(const Tensor *gradient);
  const char *SerializeState(int *state_len);
  void DeserializeState(const std::string &state);

private:
  Tensor *momentums_;
  Tensor *velocitys_;
  double beta_1_;
  double beta_2_;
  double epsilon_;
  double decay_;
};

}  // namespace optimizer
}  // namespace paddle
```
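To make the intended call pattern of these classes concrete, here is a minimal usage sketch. It is not part of the PR: the concrete `LrPolicy` implementation is assumed to come from elsewhere in this change set, `LrPolicy` is assumed to live in the `paddle::optimizer` namespace as the headers suggest, and the buffer size and hyperparameters are arbitrary.

```cpp
#include "adam_optimizer.h"

// Sketch only (not part of this PR): one Adam step driven through the API
// above. `lr` is whatever LrPolicy implementation the caller provides; the
// concrete policies are not part of the files shown in this diff, and
// ownership of `parameter` is left to the surrounding code.
void AdamStepExample(paddle::optimizer::LrPolicy *lr) {
  using paddle::optimizer::AdamOptimizer;
  using paddle::optimizer::Tensor;

  Tensor *parameter = new Tensor(512);  // parameter vector to be optimized
  AdamOptimizer opt(parameter, lr,
                    /*beta_1=*/0.9, /*beta_2=*/0.999,
                    /*epsilon=*/1e-8, /*decay=*/0.0);

  Tensor gradient(512);    // gradient for one mini-batch, filled elsewhere
  opt.Update(&gradient);   // one in-place Adam step on *parameter
}
```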
Review discussion on the `TensorT(size_t size)` constructor in the tensor header:

- "Here the `std::make_shared<CpuMemoryHandle>(size * sizeof(float))` is not kept anywhere, so the buffer is released as soon as the constructor finishes. I was wondering whether it could be written differently, as a reference."
- "That's right, I made a mistake here. Tensor has to save the shared_ptr to extend its lifetime. Fixed it: data_ is now initialized from data_ptr_."
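The reviewer's reference snippet is not shown above. As a rough sketch of the fix the reply describes (keep the shared memory handle alive as a member and point `data_` into its buffer), assuming the member is the `data_ptr_` mentioned in the reply; this is an illustration, not the exact code that landed:

```cpp
template <class T>
class TensorT {
public:
  // Keep the CpuMemoryHandle alive for the lifetime of the tensor by storing
  // the shared_ptr in data_ptr_, and initialize data_ from that buffer.
  TensorT(size_t size)
      : height_(1),
        width_(size),
        data_ptr_(std::make_shared<CpuMemoryHandle>(size * sizeof(T))),
        data_(reinterpret_cast<T*>(data_ptr_->getBuf())) {}

  // Wrapping an externally owned buffer: no handle is held in that case.
  TensorT(T* data, size_t size)
      : height_(1), width_(size), data_ptr_(nullptr), data_(data) {}

  // ... remaining constructors and accessors unchanged ...

protected:
  size_t height_;
  size_t width_;
  std::shared_ptr<CpuMemoryHandle> data_ptr_;  // owns the buffer when allocated here
  T* data_;                                    // raw view used by operator[]
};
```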