fix config #44

Merged: 2 commits, Jun 22, 2022
10 changes: 6 additions & 4 deletions paddle/fluid/framework/fleet/heter_ps/heter_comm_inl.h
@@ -449,16 +449,18 @@ int HeterComm<KeyType, ValType, GradType>::get_index_by_devid(int devid) {
 template <typename KeyType, typename ValType, typename GradType>
 void HeterComm<KeyType, ValType, GradType>::set_sparse_sgd(
     const OptimizerConfig& optimizer_config) {
-  for (auto& table : tables_) {
-    table->set_sparse_sgd(optimizer_config);
+  for (int i = 0; i < resource_->total_device(); ++i) {
+    AnyDeviceGuard guard(resource_->dev_id(i));
+    ptr_tables_[i]->set_sparse_sgd(optimizer_config);
   }
 }

 template <typename KeyType, typename ValType, typename GradType>
 void HeterComm<KeyType, ValType, GradType>::set_embedx_sgd(
     const OptimizerConfig& optimizer_config) {
-  for (auto& table : tables_) {
-    table->set_embedx_sgd(optimizer_config);
+  for (int i = 0; i < resource_->total_device(); ++i) {
+    AnyDeviceGuard guard(resource_->dev_id(i));
+    ptr_tables_[i]->set_embedx_sgd(optimizer_config);
   }
 }

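The rewritten setters walk every device, make it current with AnyDeviceGuard, and configure that device's ptr_tables_ entry, instead of iterating the old tables_ container. Below is a minimal compilable sketch of the same per-device pattern; OptimizerConfig's field, HashTable, DeviceGuard, and set_sparse_sgd_all are illustrative stand-ins, not Paddle's actual definitions:

```cpp
#include <memory>
#include <vector>

// Stand-in: the real OptimizerConfig carries the full SGD hyperparameter set.
struct OptimizerConfig {
  float learning_rate = 0.05f;
};

// Stand-in for the per-device hash table that caches its SGD settings.
struct HashTable {
  OptimizerConfig sgd_config;
  void set_sparse_sgd(const OptimizerConfig& c) { sgd_config = c; }
};

// Stand-in for AnyDeviceGuard: in Paddle this would call cudaSetDevice
// (or the XPU equivalent) and restore the previous device on destruction.
struct DeviceGuard {
  explicit DeviceGuard(int dev_id) { (void)dev_id; }
};

// Same shape as the new HeterComm::set_sparse_sgd: activate each device,
// then configure the table living on it (tables.size() == dev_ids.size()).
void set_sparse_sgd_all(std::vector<std::unique_ptr<HashTable>>& tables,
                        const std::vector<int>& dev_ids,
                        const OptimizerConfig& config) {
  for (size_t i = 0; i < dev_ids.size(); ++i) {
    DeviceGuard guard(dev_ids[i]);
    tables[i]->set_sparse_sgd(config);
  }
}
```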
2 changes: 2 additions & 0 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
@@ -585,6 +585,8 @@ void PSGPUWrapper::BuildGPUTask(std::shared_ptr<HeterContext> gpu_task) {
   HeterPs_ = HeterPsBase::get_instance(size_max, resource_, feature_value_accessor_, optimizer_type_);
 #ifdef PADDLE_WITH_CUDA
   HeterPs_->set_nccl_comm_and_size(inner_comms_, inter_comms_, node_size_);
+  HeterPs_->set_sparse_sgd(optimizer_config_);
+  HeterPs_->set_embedx_sgd(optimizer_config_);
 #endif
   auto build_dynamic_mf_func = [this, &gpu_task](int i, int j) {
     this->HeterPs_->set_multi_mf_dim(multi_mf_dim_, max_mf_dim_);
8 changes: 2 additions & 6 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
@@ -318,24 +318,20 @@ void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff,
                                 float learning_rate, float initial_g2sum,
                                 float initial_range, float beta1_decay_rate,
                                 float beta2_decay_rate, float ada_epsilon) {
-  OptimizerConfig optimizer_config;
-  optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
+  optimizer_config_.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
                                   learning_rate, initial_g2sum, initial_range,
                                   beta1_decay_rate, beta2_decay_rate, ada_epsilon);
-  HeterPs_->set_sparse_sgd(optimizer_config);
 }

 void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
                                 float mf_learning_rate, float mf_initial_g2sum,
                                 float mf_initial_range, float mf_min_bound,
                                 float mf_max_bound, float mf_beta1_decay_rate,
                                 float mf_beta2_decay_rate, float mf_ada_epsilon) {
-  OptimizerConfig optimizer_config;
-  optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
+  optimizer_config_.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
                                   mf_initial_g2sum, mf_initial_range,
                                   mf_min_bound, mf_max_bound, mf_beta1_decay_rate,
                                   mf_beta2_decay_rate, mf_ada_epsilon);
-  HeterPs_->set_embedx_sgd(optimizer_config);
 }

 } // end namespace framework
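With the new optimizer_config_ member, SetSparseSGD and SetEmbedxSGD no longer build a temporary OptimizerConfig and push it straight into HeterPs_; they only record the values, and BuildGPUTask (in ps_gpu_wrapper.cc above) pushes the stored config after HeterPs_ is constructed. The apparent motivation for the fix is that the setters can run before HeterPs_ exists, in which case the old immediate push lost the configuration. A minimal sketch of this cache-then-apply flow, with signatures collapsed to a single learning-rate parameter for brevity:

```cpp
#include <memory>

struct OptimizerConfig {
  float learning_rate = 0.05f;
  void set_sparse_sgd(float lr) { learning_rate = lr; }
};

struct HeterPs {
  void set_sparse_sgd(const OptimizerConfig& /*config*/) {
    // In Paddle this fans the config out to every device's table.
  }
};

class PSGPUWrapper {
 public:
  // May run before HeterPs exists: only record the values.
  void SetSparseSGD(float lr) { optimizer_config_.set_sparse_sgd(lr); }

  // Runs later: create HeterPs, then push the recorded config once.
  void BuildGPUTask() {
    heter_ps_ = std::make_unique<HeterPs>();
    heter_ps_->set_sparse_sgd(optimizer_config_);
  }

 private:
  OptimizerConfig optimizer_config_;  // the member this PR adds
  std::unique_ptr<HeterPs> heter_ps_;
};
```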
44 changes: 19 additions & 25 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.h
@@ -319,8 +319,8 @@ class PSGPUWrapper {
   config["embedx_dim"] = sparse_table_accessor.embedx_dim();
   config["nonclk_coeff"] = sparse_table_accessor_parameter.nonclk_coeff();
   config["clk_coeff"] = sparse_table_accessor_parameter.click_coeff();
-
+  config["mf_create_thresholds"] = sparse_table_accessor.embedx_threshold();

   if (accessor_class == "CtrDymfAccessor") {
     // optimizer config for embed_w and embedx
     add_sparse_optimizer(config, sparse_table_accessor.embed_sgd_param());
@@ -348,13 +348,13 @@ class PSGPUWrapper {
                             ? 10.0
                             : config["max_bound"];
   float learning_rate = (config.find("learning_rate") == config.end())
-                            ? 1.0
+                            ? 0.05
                             : config["learning_rate"];
   float initial_g2sum = (config.find("initial_g2sum") == config.end())
-                            ? 1.0
+                            ? 3.0
                             : config["initial_g2sum"];
   float initial_range = (config.find("initial_range") == config.end())
-                            ? 1.0
+                            ? 1e-4
                             : config["initial_range"];
   float beta1_decay_rate = (config.find("beta1_decay_rate") == config.end())
                             ? 0.9
@@ -371,19 +371,19 @@ class PSGPUWrapper {
                              ? static_cast<float>(1.0)
                              : config["mf_create_thresholds"];
   float mf_learning_rate = (config.find("mf_learning_rate") == config.end())
-                               ? 1.0
+                               ? 0.05

  [Review thread on the mf_learning_rate default]
  Reviewer: So the original learning rate was 1.0?
  Author: The original learning rate is 0.05.
  Author: It uses the default configuration from optimizer_conf.h.

                               : config["mf_learning_rate"];
   float mf_initial_g2sum = (config.find("mf_initial_g2sum") == config.end())
-                               ? 1.0
+                               ? 3.0
                               : config["mf_initial_g2sum"];
   float mf_initial_range = (config.find("mf_initial_range") == config.end())
-                               ? 1.0
+                               ? 1e-4
                               : config["mf_initial_range"];
   float mf_min_bound = (config.find("mf_min_bound") == config.end())
-                           ? 1.0
+                           ? -10.0
                           : config["mf_min_bound"];
   float mf_max_bound = (config.find("mf_max_bound") == config.end())
-                           ? 1.0
+                           ? 10.0
                           : config["mf_max_bound"];
   float mf_beta1_decay_rate = (config.find("mf_beta1_decay_rate") == config.end())
                               ? 0.9
@@ -394,20 +394,14 @@ class PSGPUWrapper {
   float mf_ada_epsilon = (config.find("mf_ada_epsilon") == config.end())
                              ? 1e-8
                              : config["mf_ada_epsilon"];
-  for (size_t i = 0; i < heter_devices_.size(); i++) {
-#ifdef PADDLE_WITH_CUDA
-    PADDLE_ENFORCE_GPU_SUCCESS(cudaSetDevice(heter_devices_[i]));
-#elif defined(PADDLE_WITH_XPU_KP)
-    PADDLE_ENFORCE_XPU_SUCCESS(xpu_set_device(heter_devices_[i]));
-#endif
-    this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound,
-                       learning_rate, initial_g2sum, initial_range,
-                       beta1_decay_rate, beta2_decay_rate, ada_epsilon);
-    this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate,
-                       mf_initial_g2sum, mf_initial_range, mf_min_bound,
-                       mf_max_bound, mf_beta1_decay_rate, mf_beta2_decay_rate,
-                       mf_ada_epsilon);
-  }
+
+  this->SetSparseSGD(nonclk_coeff, clk_coeff, min_bound, max_bound,
+                     learning_rate, initial_g2sum, initial_range,
+                     beta1_decay_rate, beta2_decay_rate, ada_epsilon);
+  this->SetEmbedxSGD(mf_create_thresholds, mf_learning_rate,
+                     mf_initial_g2sum, mf_initial_range, mf_min_bound,
+                     mf_max_bound, mf_beta1_decay_rate, mf_beta2_decay_rate,
+                     mf_ada_epsilon);

   // set optimizer type(naive,adagrad,std_adagrad,adam,share_adam)
   optimizer_type_ = (config.find("optimizer_type") == config.end())
@@ -630,7 +624,7 @@ class PSGPUWrapper {
   bool running_ = false;
   std::vector<std::shared_ptr<ThreadPool>> pull_thread_pool_;
   std::vector<std::shared_ptr<ThreadPool>> hbm_thread_pool_;
-
+  OptimizerConfig optimizer_config_;

  protected:
   static bool is_initialized_;
 };
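Every hyperparameter above is read with the same fallback idiom, (config.find(key) == config.end()) ? default : config[key], so a value comes from the table config when present and from a hard-coded default otherwise. The substance of this part of the PR is making those hard-coded fallbacks agree with the defaults in optimizer_conf.h (per the review thread: learning rate 0.05 rather than 1.0, g2sum 3.0, initial range 1e-4, bounds -10/10). A small self-contained illustration of the idiom; get_or is a hypothetical helper, not a Paddle function:

```cpp
#include <iostream>
#include <string>
#include <unordered_map>

// Same fallback idiom the header inlines for every hyperparameter:
// use the map value when present, otherwise a compiled-in default.
float get_or(const std::unordered_map<std::string, float>& config,
             const std::string& key, float default_value) {
  auto it = config.find(key);
  return it == config.end() ? default_value : it->second;
}

int main() {
  std::unordered_map<std::string, float> config{{"max_bound", 10.0f}};
  // Absent key: falls back to the corrected default (0.05, not the old 1.0).
  float learning_rate = get_or(config, "learning_rate", 0.05f);
  // Present key: the value from the table config wins.
  float max_bound = get_or(config, "max_bound", 10.0f);
  std::cout << learning_rate << " " << max_bound << "\n";
}
```

This keeps every key optional on the Python side while still yielding sane optimizer defaults when a job config omits them.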
25 changes: 0 additions & 25 deletions paddle/fluid/framework/fleet/ps_gpu_wrapper.kps
@@ -258,31 +258,6 @@ void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
   xpu_wait(stream);
 }
 
-void PSGPUWrapper::SetSparseSGD(float nonclk_coeff, float clk_coeff,
-                                float min_bound, float max_bound,
-                                float learning_rate, float initial_g2sum,
-                                float initial_range, float beta1_decay_rate,
-                                float beta2_decay_rate, float ada_epsilon) {
-  OptimizerConfig optimizer_config;
-  optimizer_config.set_sparse_sgd(nonclk_coeff, clk_coeff, min_bound, max_bound,
-                                  learning_rate, initial_g2sum, initial_range,
-                                  beta1_decay_rate, beta2_decay_rate, ada_epsilon);
-  HeterPs_->set_sparse_sgd(optimizer_config);
-}
-
-void PSGPUWrapper::SetEmbedxSGD(float mf_create_thresholds,
-                                float mf_learning_rate, float mf_initial_g2sum,
-                                float mf_initial_range, float mf_min_bound,
-                                float mf_max_bound, float mf_beta1_decay_rate,
-                                float mf_beta2_decay_rate, float mf_ada_epsilon) {
-  OptimizerConfig optimizer_config;
-  optimizer_config.set_embedx_sgd(mf_create_thresholds, mf_learning_rate,
-                                  mf_initial_g2sum, mf_initial_range,
-                                  mf_min_bound, mf_max_bound,mf_beta1_decay_rate,
-                                  mf_beta2_decay_rate, mf_ada_epsilon);
-  HeterPs_->set_embedx_sgd(optimizer_config);
-}
-
} // end namespace framework
} // end namespace paddle
#endif