
Merge with latest develop branch. Optimizer lib2 #2386

Merged
merged 35 commits into from
Jun 20, 2017
Changes from 26 commits
Commits (35)
62cd5c7
"failed to resolve conflict. apply to HEAD"
dzhwinter Jun 4, 2017
3158efe
"move cmake scripts too"
dzhwinter Jun 4, 2017
5b8a0c5
"optimizer remove init create with proto"
dzhwinter Jun 5, 2017
8610ba1
"remove get config proto"
dzhwinter Jun 5, 2017
b4aa0ec
"modify update interface"
dzhwinter Jun 5, 2017
26e9c4e
"add vector alias to make name clear"
dzhwinter Jun 5, 2017
5f9cd8c
"rename test file name"
dzhwinter Jun 5, 2017
b9d024e
"remove useless test file"
dzhwinter Jun 5, 2017
5ab958b
"change size_t type to avoid warning"
dzhwinter Jun 5, 2017
fd8c510
"format name with google style"
dzhwinter Jun 6, 2017
3b1294a
"add checkpoint interface: set state, get state"
dzhwinter Jun 6, 2017
81cad37
"remove comments"
dzhwinter Jun 6, 2017
beb2697
"change header guard to pragma"
dzhwinter Jun 6, 2017
5a1e678
"update macro and fix some part"
dzhwinter Jun 6, 2017
bc26df7
"polish code style and update based review comment"
dzhwinter Jun 7, 2017
b9cb0f2
"update marco"
dzhwinter Jun 7, 2017
6cbbc2e
"add comments"
dzhwinter Jun 7, 2017
f5ff283
"fix comment"
dzhwinter Jun 7, 2017
e456796
"update with comment"
dzhwinter Jun 9, 2017
33b4dee
"update serialization part"
dzhwinter Jun 9, 2017
0fc4201
"update interface"
dzhwinter Jun 9, 2017
b7e68e0
"serialization modify"
dzhwinter Jun 11, 2017
b72e8aa
"seperate serialization proto state"
dzhwinter Jun 13, 2017
1814fc2
"fix lr_policy serialization"
dzhwinter Jun 14, 2017
e148bc1
"remove unused tensor line"
dzhwinter Jun 14, 2017
a46f3fc
"fix double release tensor buffer error."
dzhwinter Jun 14, 2017
df5bc78
"fix tensor shared_ptr"
dzhwinter Jun 15, 2017
65d9e33
"modify config name"
dzhwinter Jun 19, 2017
ec65fa8
"protobuf required to optional"
dzhwinter Jun 19, 2017
baef96e
Merge branch 'develop' into optimizer_lib2
dzhwinter Jun 19, 2017
99849cf
rename Tensor.h
dzhwinter Jun 19, 2017
72b6b26
"ci formatter"
dzhwinter Jun 19, 2017
03884f0
formatter
dzhwinter Jun 19, 2017
a166e52
"formatter in docker"
dzhwinter Jun 19, 2017
33ddc89
formatter in docker
dzhwinter Jun 19, 2017
1 change: 1 addition & 0 deletions cmake/util.cmake
@@ -84,6 +84,7 @@ function(link_paddle_exe TARGET_NAME)
paddle_parameter
paddle_proto
paddle_cuda
paddle_optimizer
${EXTERNAL_LIBS}
${CMAKE_THREAD_LIBS_INIT}
${CMAKE_DL_LIBS}
1 change: 1 addition & 0 deletions paddle/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(gserver)
add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
add_subdirectory(optimizer)

# Do not build go directory until go cmake is working smoothly.
# if(CMAKE_Go_COMPILER)
1 change: 1 addition & 0 deletions paddle/math/tests/CMakeLists.txt
@@ -31,3 +31,4 @@ add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler)
add_simple_unittest(test_BaseMatrix)
add_simple_unittest(test_Matrix)
add_simple_unittest(test_Matrix2)
16 changes: 16 additions & 0 deletions paddle/optimizer/CMakeLists.txt
@@ -0,0 +1,16 @@
include_directories(${CMAKE_CURRENT_BINARY_DIR})

set(OPITMIZER_SRCS
adadelta_optimizer.cc
adagrad_optimizer.cc
adam_optimizer.cc
optimizer.cc
parameter_optimizer.cc
sgd_optimizer.cc
)

add_library(paddle_optimizer STATIC ${OPITMIZER_SRCS})
add_dependencies(paddle_optimizer gen_proto_cpp)

add_simple_unittest(serialization_test)
add_simple_unittest(parameter_optimizer_test)
55 changes: 55 additions & 0 deletions paddle/optimizer/Tensor.h
@@ -0,0 +1,55 @@
#pragma once
/**
* @brief tensor used by optimizer
*/

#include <string.h>
#include <memory>
#include "paddle/math/MemoryHandle.h"
#include "paddle/utils/Common.h"
#include "paddle/utils/Logging.h"

namespace paddle {
namespace optimizer {

template <class T>
class TensorT {
public:
TensorT(size_t size)
: TensorT(std::make_shared<CpuMemoryHandle>(size * sizeof(float)), size) {
}
TensorT(CpuMemHandlePtr handle, size_t size)
: height_(1),
width_(size),
data_(reinterpret_cast<T*>(handle->getBuf())) {}
Contributor commented:

Here nothing keeps a reference to std::make_shared<CpuMemoryHandle>(size * sizeof(float)), so the buffer is released as soon as the constructor finishes.
I was wondering whether we could write it like this instead, as a reference:

template <class T>
class TensorT {
public:
  TensorT(size_t size)
    : TensorT(std::shared_ptr<T>(new T[size], std::default_delete<T[]>()), size) {
  }
  TensorT(std::shared_ptr<T> data, size_t size)
    : height_(1),
      width_(size),
      data_ptr_(data),
      data_(nullptr) {}

  TensorT(T* data, size_t size) : TensorT(data, 1, size) {}

  TensorT(T* data, size_t h, size_t w) : height_(h), width_(w), data_(data) {}

  virtual ~TensorT() {}

  T* get_buffer() {
    auto data = data_;
    if (data == nullptr) {
      data = data_ptr_.get();
    }

    return data;
  }

  T& operator[](const size_t idx) {
    auto data = data_;
    if (data == nullptr) {
      data = data_ptr_.get();
    }
    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
    return data[idx];
  }
  T& operator[](const size_t idx) const {
    auto data = data_;
    if (data == nullptr) {
      data = data_ptr_.get();
    }
    CHECK(idx >= 0 && idx < this->width_) << "out of index range";
    return data[idx];
  }
  // TODO: replace with tensorshape
  size_t size() const { return this->width_ * this->height_; }

protected:
  size_t height_;
  size_t width_;
  std::shared_ptr<T> data_ptr_;  // managed data
  T* data_;                      // unmanaged data
};

Contributor Author commented:

That's right, I made a mistake here; the Tensor has to keep the shared_ptr to extend the buffer's lifetime.
Will fix it: initialize data_ from data_ptr_.
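A minimal sketch of the fix described above (a hedged illustration, not the final code in this PR: the allocation uses a plain new[] instead of CpuMemoryHandle for brevity, and the member names follow the suggestion in this thread):

#include <cstddef>
#include <memory>

namespace paddle {
namespace optimizer {

template <class T>
class TensorT {
public:
  // Owning constructor: keep the shared_ptr alive as a member and point the raw
  // view at the same buffer, so the allocation outlives construction.
  explicit TensorT(size_t size)
      : height_(1),
        width_(size),
        data_ptr_(new T[size], std::default_delete<T[]>()),
        data_(data_ptr_.get()) {}

  // Non-owning constructor: wrap an externally managed buffer.
  TensorT(T* data, size_t size) : height_(1), width_(size), data_(data) {}

  T* get_buffer() { return data_; }
  size_t size() const { return width_ * height_; }

protected:
  size_t height_;
  size_t width_;
  std::shared_ptr<T> data_ptr_;  // extends the buffer's lifetime
  T* data_;                      // raw view used by the optimizers
};

}  // namespace optimizer
}  // namespace paddle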


TensorT(T* data, size_t size) : height_(1), width_(size), data_(data) {}

TensorT(T* data, size_t h, size_t w) : height_(h), width_(w), data_(data) {}

virtual ~TensorT() {}

T* get_buffer() { return this->data_; }

T& operator[](const size_t idx) {
CHECK(idx >= 0 && idx < this->width_) << "out of index range";
return data_[idx];
}
T& operator[](const size_t idx) const {
CHECK(idx >= 0 && idx < this->width_) << "out of index range";
return data_[idx];
}
// TODO: replace with tensorshape
size_t size() const { return this->width_ * this->height_; }

protected:
size_t height_;
size_t width_;
T* data_;
};

// TODO(zhihong): design problem of dynamic datatype, need to fix it
Contributor commented:

It seems that when porting "majel" to PaddlePaddle, we already included boost/variant.hpp for the "single value multiple type" container.

Contributor Author commented:

Agreed, 👍. Either we wait for the majel port to finish, or we implement another one with typeid reflection. It is a follow-up question.

Contributor commented:

I see.
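As a rough illustration of the boost::variant idea discussed here (purely hypothetical, not part of this PR; it assumes TensorT from Tensor.h above and the boost/variant.hpp header mentioned for the majel port):

#include <cstddef>

#include <boost/variant.hpp>

#include "paddle/optimizer/Tensor.h"

namespace paddle {
namespace optimizer {

// A "single value, multiple type" tensor: one object that holds either a float
// tensor or a double tensor.
typedef boost::variant<TensorT<float>, TensorT<double>> VariantTensor;

// A visitor recovers the static element type at the point of use.
struct SizeVisitor : public boost::static_visitor<size_t> {
  template <class T>
  size_t operator()(const TensorT<T>& t) const {
    return t.size();
  }
};

inline size_t VariantTensorSize(const VariantTensor& t) {
  return boost::apply_visitor(SizeVisitor(), t);
}

}  // namespace optimizer
}  // namespace paddle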

typedef TensorT<float> Tensor;

} // namespace optimizer
} // namespace paddle
55 changes: 55 additions & 0 deletions paddle/optimizer/adadelta_optimizer.cc
@@ -0,0 +1,55 @@
#include "adadelta_optimizer.h"
#include <algorithm>
#include <cmath>

namespace paddle {
namespace optimizer {

void AdadeltaOptimizer::Update(const Tensor* gradient) {
num_sample_passed_ += 1;
double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
Tensor& param = *parameter_;
const Tensor& grad = *gradient;
Tensor& accum_g = *accum_gradient_;
Tensor& accum_d = *accum_delta_;
Tensor& update_d = *update_delta_;
for (size_t i = 0; i < param.size(); ++i) {
accum_g[i] = rho_ * accum_g[i] + (1.0 - rho_) * grad[i] * grad[i];

update_d[i] = std::sqrt(accum_d[i] + epsilon_) /
std::sqrt(accum_g[i] + epsilon_) * grad[i];

accum_d[i] = rho_ * accum_d[i] + (1.0 - rho_) * update_d[i] * update_d[i];

param[i] -= learning_rate * update_d[i] + learning_rate * decay_ * param[i];
}
}

const char* AdadeltaOptimizer::SerializeState(int* state_len) {
AdadeltaOptimizerState state;
// TODO(zhihong) : add lr_policy serialization
state.set_num_sample_passed(num_sample_passed_);

TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
TensorToProto(*accum_delta_, state.mutable_accum_delta());
TensorToProto(*update_delta_, state.mutable_update_delta());
auto str = state.SerializeAsString();
*state_len = str.size();
return str.c_str();
}

void AdadeltaOptimizer::DeserializeState(const std::string& str) {
AdadeltaOptimizerState state;
state.ParseFromString(str);
// TODO(zhihong) : add lr_policy DeserializeState
num_sample_passed_ = state.num_sample_passed();
helinwang (Contributor) commented on Jun 10, 2017:

epsilon_, rho_, and decay_ are missing here.
Maybe, to avoid potential errors like this, we could do the following (just a suggestion):

class AdadeltaOptimizer {
private:
  AdadeltaOptimizerState state_;
};

void AdadeltaOptimizer::Update(...) {
  // directly use state_.learningRate
}

// Clear the proto tensors when not in use, to save memory.
void AdadeltaOptimizer::ClearStateTensors() {
  state_.clear_parameter();
  state_.clear_accum_gradient();
  state_.clear_accum_delta();
  state_.clear_update_delta();
}

std::string AdadeltaOptimizer::SerializeState() {
  TensorToProto(*parameter_, state_.mutable_parameter());
  TensorToProto(*accum_gradient_, state_.mutable_accum_gradient());
  TensorToProto(*accum_delta_, state_.mutable_accum_delta());
  TensorToProto(*update_delta_, state_.mutable_update_delta());
  auto str = state_.SerializeAsString();
  ClearStateTensors();
  return str;
}

void AdadeltaOptimizer::DeserializeState(const std::string& str) {
  state_.ParseFromString(str);
  ProtoToTensor(state_.parameter(), parameter_);
  ProtoToTensor(state_.accum_gradient(), accum_gradient_);
  ProtoToTensor(state_.accum_delta(), accum_delta_);
  ProtoToTensor(state_.update_delta(), update_delta_);
  ClearStateTensors();
}

Contributor Author commented:

Firstly, epsilon_, rho_, and decay_ were not left out by carelessness. Consider that we have to call the create_with_config function with a config that already contains these three parameters, and they are constant variables during the training process. Do we really need to save them again, given that we already have to save the config proto?
If we want the training state to be self-contained, then these constant hyperparameters should be there.

Secondly, that format is obviously easy to read, but it needs a config/state pair for every optimizer, which leads to copy/pasting the config and Serialize/Deserialize code many times. We should find a more elegant and idiomatic way to do that. The idea is the same as in https://github.com/caffe2/caffe2/blob/master/caffe2/core/qtensor_serialization.h
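A rough sketch of the "write the serialization glue once" idea referenced above, in the spirit of caffe2's qtensor_serialization.h (the helper names are hypothetical and not part of this PR; the only assumption is that each optimizer state is a generated protobuf message):

#include <string>

namespace paddle {
namespace optimizer {

// Shared glue that works for AdadeltaOptimizerState, AdagradOptimizerState, ...,
// so the string-level Serialize/Deserialize code is not copy/pasted per optimizer.
template <class StateProto>
std::string SerializeStateProto(const StateProto& state) {
  return state.SerializeAsString();
}

template <class StateProto>
bool DeserializeStateProto(const std::string& str, StateProto* state) {
  return state->ParseFromString(str);
}

}  // namespace optimizer
}  // namespace paddle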

helinwang (Contributor) commented on Jun 12, 2017:

  • I see, they are constants initialized by the constructor, and the constructor arguments are already saved by OptimizerConfig. My bad. Sorry!

  • "it needs a config/state pair for every optimizer, which leads to copy/pasting the config and Serialize/Deserialize code many times"

    I understand the config proto could have duplicates (e.g., probably every config proto has a learning rate field). I would argue that having one config proto per optimizer makes it easy to understand what a specific optimizer's arguments are. Please take a look at this example: we are currently using a single config proto, LayerConfig, for all layers, and in my opinion it is too hard for anyone to read and understand what parameters each layer needs.

    You mentioned that the "Serialize/Deserialize code" needs to be copy/pasted many times. I think with the approach of having one config for all optimizers, we would still need to write the Serialize/Deserialize code for each class?

Contributor Author commented:

I see.
The config proto (e.g. SGDConfig) only contains the constant hyperparameters, and the state proto only contains the rest. Combining them recreates the optimizer together with its state, and keeping each optimizer's state proto separate makes it easier to read. I think this is better than adding a state member to each optimizer class.
Fix done.
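A hedged usage sketch of that split: the constant hyperparameters come back through the constructor (i.e. from the config proto), and DeserializeState() restores the mutable state proto on top. The RestoreAdadelta helper below is hypothetical; only the constructor and DeserializeState signatures are taken from this diff.

#include <string>

#include "adadelta_optimizer.h"

namespace paddle {
namespace optimizer {

// Hypothetical helper, not part of this PR: rebuild an Adadelta optimizer from a
// checkpoint. rho/epsilon/decay and the LrPolicy come from the config proto; the
// state blob restores num_sample_passed and the accumulator tensors.
inline AdadeltaOptimizer* RestoreAdadelta(Tensor* parameter,
                                          LrPolicy* lr,
                                          double rho,
                                          double epsilon,
                                          double decay,
                                          const std::string& checkpoint) {
  AdadeltaOptimizer* opt = new AdadeltaOptimizer(parameter, lr, rho, epsilon, decay);
  opt->DeserializeState(checkpoint);
  return opt;
}

}  // namespace optimizer
}  // namespace paddle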


ProtoToTensor(state.parameter(), parameter_);
ProtoToTensor(state.accum_gradient(), accum_gradient_);
ProtoToTensor(state.accum_delta(), accum_delta_);
ProtoToTensor(state.update_delta(), update_delta_);
}

} // namespace optimizer
} // namespace paddle
39 changes: 39 additions & 0 deletions paddle/optimizer/adadelta_optimizer.h
@@ -0,0 +1,39 @@
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdadeltaOptimizer : public ParameterOptimizer {
public:
AdadeltaOptimizer(
Tensor *parameter, LrPolicy *lr, double rho, double epsilon, double decay)
: ParameterOptimizer(parameter, lr),
accum_gradient_(new Tensor(parameter->size())),
accum_delta_(new Tensor(parameter->size())),
update_delta_(new Tensor(parameter->size())),
rho_(rho),
epsilon_(epsilon),
decay_(decay) {}

~AdadeltaOptimizer() {
if (accum_gradient_) delete accum_gradient_;
if (accum_delta_) delete accum_delta_;
if (update_delta_) delete update_delta_;
}
void Update(const Tensor *gradient);
const char *SerializeState(int *state_len);
void DeserializeState(const std::string &state);

private:
Tensor *accum_gradient_;
Tensor *accum_delta_;
Tensor *update_delta_;
double rho_;
double epsilon_;
double decay_;
};

} // namespace optimizer
} // namespace paddle
42 changes: 42 additions & 0 deletions paddle/optimizer/adagrad_optimizer.cc
@@ -0,0 +1,42 @@
#include <cmath>

#include "adagrad_optimizer.h"

namespace paddle {
namespace optimizer {

void AdagradOptimizer::Update(const Tensor* gradient) {
num_sample_passed_ += 1;
double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
Tensor& param = *parameter_;
Tensor& accum_g = *accum_gradient_;
const Tensor& grad = *gradient;
for (size_t i = 0; i < param.size(); ++i) {
accum_g[i] += grad[i] * grad[i];
param[i] += learning_rate * grad[i] / std::sqrt(accum_g[i] + epsilon_) +
learning_rate * decay_ * param[i];
}
}
const char* AdagradOptimizer::SerializeState(int* state_len) {
AdagradOptimizerState state;
// TODO(zhihong) : add lr_policy serialization
state.set_num_sample_passed(num_sample_passed_);

TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
auto str = state.SerializeAsString();
*state_len = str.size();
return str.c_str();
}

void AdagradOptimizer::DeserializeState(const std::string& str) {
AdagradOptimizerState state;
state.ParseFromString(str);
// TODO(zhihong) : add lr_policy DeserializeState
num_sample_passed_ = state.num_sample_passed();
ProtoToTensor(state.parameter(), parameter_);
ProtoToTensor(state.accum_gradient(), accum_gradient_);
}

} // namespace optimizer
} // namespace paddle
32 changes: 32 additions & 0 deletions paddle/optimizer/adagrad_optimizer.h
@@ -0,0 +1,32 @@
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdagradOptimizer : public ParameterOptimizer {
public:
AdagradOptimizer(Tensor *parameter,
LrPolicy *lr,
double epsilon,
double decay)
: ParameterOptimizer(parameter, lr),
accum_gradient_(new Tensor(parameter->size())),
epsilon_(epsilon),
decay_(decay) {}
~AdagradOptimizer() {
if (accum_gradient_) delete accum_gradient_;
}
void Update(const Tensor *gradient);
const char *SerializeState(int *state_len);
void DeserializeState(const std::string &state);

private:
Tensor *accum_gradient_;
double epsilon_;
double decay_;
};

} // namespace optimizer
} // namespace paddle
47 changes: 47 additions & 0 deletions paddle/optimizer/adam_optimizer.cc
@@ -0,0 +1,47 @@
#include "adam_optimizer.h"
#include <cmath>

namespace paddle {
namespace optimizer {

void AdamOptimizer::Update(const Tensor *gradient) {
num_sample_passed_ += 1;
double learning_rate = lr_policy_->LearningRate(num_sample_passed_);
double coef1 = 1.0 - std::pow(beta_1_, num_sample_passed_);
double coef2 = 1.0 - std::pow(beta_2_, num_sample_passed_);
learning_rate *= std::sqrt(coef2) / coef1;
Tensor &param = *parameter_;
const Tensor &grad = *gradient;
Tensor &m = *momentums_;
Tensor &v = *velocitys_;
for (size_t i = 0; i < param.size(); ++i) {
m[i] = beta_1_ * m[i] + (1.0 - beta_1_) * grad[i];
v[i] = beta_2_ * v[i] + (1.0 - beta_2_) * grad[i] * grad[i];
param[i] -=
learning_rate * (m[i] / std::sqrt(v[i] + epsilon_) + decay_ * param[i]);
}
}

const char *AdamOptimizer::SerializeState(int *state_len) {
AdamOptimizerState state;
// TODO(zhihong) : add lr_policy serialization
state.set_num_sample_passed(num_sample_passed_);

TensorToProto(*parameter_, state.mutable_parameter());
TensorToProto(*velocitys_, state.mutable_momentums());
auto str = state.SerializeAsString();
*state_len = str.size();
return str.c_str();
}

void AdamOptimizer::DeserializeState(const std::string &str) {
AdamOptimizerState state;
state.ParseFromString(str);
// TODO(zhihong) : add lr_policy DeserializeState
num_sample_passed_ = state.num_sample_passed();

ProtoToTensor(state.parameter(), parameter_);
ProtoToTensor(state.velocitys(), velocitys_);
}
} // namespace optimizer
} // namespace paddle
41 changes: 41 additions & 0 deletions paddle/optimizer/adam_optimizer.h
@@ -0,0 +1,41 @@
#pragma once

#include "parameter_optimizer.h"

namespace paddle {
namespace optimizer {

class AdamOptimizer : public ParameterOptimizer {
public:
AdamOptimizer(Tensor *parameter,
LrPolicy *lr,
double beta_1,
double beta_2,
double epsilon,
double decay)
: ParameterOptimizer(parameter, lr),
momentums_(new Tensor(parameter->size())),
velocitys_(new Tensor(parameter->size())),
beta_1_(beta_1),
beta_2_(beta_2),
epsilon_(epsilon),
decay_(decay) {}
~AdamOptimizer() {
if (momentums_) delete momentums_;
if (velocitys_) delete velocitys_;
}
void Update(const Tensor *gradient);
const char *SerializeState(int *state_len);
void DeserializeState(const std::string &state);

private:
Tensor *momentums_;
Tensor *velocitys_;
double beta_1_;
double beta_2_;
double epsilon_;
double decay_;
};

} // namespace optimizer
} // namespace paddle