merge develop
liutiexing committed Mar 23, 2022
2 parents 565773a + 5b758c2 commit 2780d08
Showing 403 changed files with 15,808 additions and 7,951 deletions.
2 changes: 1 addition & 1 deletion cmake/cuda.cmake
@@ -11,7 +11,7 @@ elseif(NEW_RELEASE_ALL)
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs11 "35 50 60 61 70 75 80")
elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Packge")
add_definitions(-DNEW_RELEASE_PYPI)
6 changes: 3 additions & 3 deletions cmake/external/ascend.cmake
@@ -90,9 +90,9 @@ endif()
if (WITH_ASCEND_CL)
macro(find_ascend_toolkit_version ascend_toolkit_version_info)
file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
if(NOT ASCEND_TOOLKIT_VERSION)
set(ASCEND_TOOLKIT_VERSION "???")
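For reference, a minimal standalone sketch (not part of this commit) of what the updated version pattern accepts, using a hypothetical CANN version string such as 5.1.RC2.alpha005; std::regex stands in for CMake's regex engine here:

#include <cctype>
#include <iostream>
#include <regex>
#include <string>

int main() {
  // Hypothetical contents of the Ascend toolkit version file.
  std::string contents = "version=5.1.RC2.alpha005";

  // Same pattern as the updated rule above: an optional "RC" segment is now accepted.
  std::regex pattern(R"(version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*))");
  std::smatch match;
  if (std::regex_search(contents, match, pattern)) {
    std::string version = match[1];  // "5.1.RC2.alpha005"
    // Mirror the CANN_VERSION step: strip letters and dots, keep only the digits.
    std::string cann_version;
    for (char c : version) {
      if (std::isdigit(static_cast<unsigned char>(c))) cann_version += c;
    }
    std::cout << version << " -> CANN_VERSION_CODE=" << cann_version << "\n";  // 512005
  }
  return 0;
}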
6 changes: 6 additions & 0 deletions cmake/inference_lib.cmake
@@ -258,6 +258,12 @@ copy(inference_lib_dist
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
5 changes: 4 additions & 1 deletion cmake/phi.cmake
@@ -118,7 +118,7 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
@@ -151,6 +151,9 @@ function(kernel_library TARGET)
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc )
list(APPEND kps_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -395,7 +395,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
platform::CUDADeviceGuard gpuGuard;
for (auto& place : places) {
gpuGuard.SetDeviceIndex(place.GetDeviceId());
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU);
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
barrierTensors.push_back(dt);
}
auto task = ProcessGroupNCCL::AllReduce(barrierTensors);
6 changes: 2 additions & 4 deletions paddle/fluid/distributed/collective/reducer.cc
@@ -321,7 +321,7 @@ EagerReducer::EagerReducer(
if (find_unused_vars_each_step_) {
global_used_vars_ = paddle::experimental::empty(
ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
TransToBackend(inner_place_));
inner_place_);
}
}

@@ -363,10 +363,8 @@ void EagerReducer::InitializeGroups(
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
// experimental::Backend backend = TransToBackend(inner_place_);
group.dense_contents_ = paddle::experimental::empty(
ScalarArray({group.all_length_}), group.dtype_,
TransToBackend(inner_place_));
ScalarArray({group.all_length_}), group.dtype_, inner_place_);
}

// map tensors to this group by VariableLocator
33 changes: 15 additions & 18 deletions paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -238,7 +238,7 @@ int32_t BrpcPsClient::initialize() {
std::thread(std::bind(&BrpcPsClient::push_dense_task_consume, this));
// for debug
// _print_thread =
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));

return 0;
}
@@ -1315,11 +1315,11 @@ std::future<int32_t> BrpcPsClient::push_sparse(size_t table_id,
CostTimer parse_timer("pserver_client_push_sparse_parse");
int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) {
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:"
// << push_sparse_async_num << ", max_task_limit:" <<
// FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume,
// task_num:"
// << push_sparse_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
// push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size();
push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
}
auto put_timer = std::make_shared<CostTimer>("client_push_sparse_put");
@@ -1381,8 +1381,7 @@ void BrpcPsClient::push_sparse_task_consume() {
::ThreadPool async_push_sparse_shard_threads(
FLAGS_pserver_sparse_merge_thread);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
// process the pushTask of every sparseTable
for (auto &push_sparse_task_itr : _push_sparse_task_queue_map) {
auto table_id = push_sparse_task_itr.first;
@@ -1497,9 +1496,8 @@ void BrpcPsClient::push_sparse_task_consume() {
std::vector<std::future<int>>().swap(merge_status);
}
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
@@ -1661,9 +1659,10 @@ std::future<int32_t> BrpcPsClient::push_dense(const Region *regions,
std::make_shared<CostTimer>("pserver_client_push_dense_parse");
int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
while (push_dense_async_num > FLAGS_pserver_max_async_call_num) {
LOG(INFO) << "push_dense Waiting for async_call_num comsume, task_num:"
<< push_dense_async_num
<< ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_dense Waiting for async_call_num comsume,
// task_num:"
// << push_dense_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
}
@@ -1701,8 +1700,7 @@ void BrpcPsClient::push_dense_task_consume() {
static bool scale_gradient = FLAGS_pserver_scale_gradient_by_merge;
::ThreadPool async_merge_dense_threads(10);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
for (auto &task_queue_itr : _push_dense_task_queue_map) {
auto &task_queue = task_queue_itr.second;
auto queue_size = task_queue->Size();
@@ -1791,9 +1789,8 @@
push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size,
closure);
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
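A compact sketch of the wait-interval pattern the consume loops now use, with std::chrono standing in for butil::gettimeofday_ms (an assumption made for a self-contained example; the actual code calls butil):

#include <chrono>
#include <thread>

// Wall-clock milliseconds, standing in for butil::gettimeofday_ms().
static long long NowMs() {
  using namespace std::chrono;
  return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

// Hypothetical consume loop: do one round of work, then sleep for whatever is
// left of the configured interval so rounds start roughly interval_ms apart.
void ConsumeLoop(const bool &running, int interval_ms) {
  while (running) {
    auto async_start_time_ms = NowMs();
    // ... merge and send the queued push tasks here ...
    auto wait_ms = interval_ms - (NowMs() - async_start_time_ms);
    if (wait_ms > 0) {
      std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    }
  }
}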
74 changes: 25 additions & 49 deletions paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"

#include <google/protobuf/text_format.h>

#include "gflags/gflags.h"
#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"

@@ -66,34 +65,9 @@ std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
void Communicator::InitBrpcClient(
const std::string &dist_desc,
const std::vector<std::string> &host_sign_list) {
// not used, just for psclient's init
std::map<uint64_t, std::vector<paddle::distributed::Region>>
_dense_pull_regions;
for (auto &iter : recv_varname_to_ctx_) {
auto tid = iter.first;
auto var_names = iter.second;

auto &regions = _dense_pull_regions[tid];
regions.reserve(var_names.size());
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
float *w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
}
}

auto fleet = paddle::distributed::FleetWrapper::GetInstance();
if (_worker_ptr.get() == nullptr) {
google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
init_gflag(_ps_param.init_gflags());
servers_ = host_sign_list.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list, servers_);
_worker_ptr = std::unique_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::create(_ps_param));
_worker_ptr->configure(_ps_param, _dense_pull_regions, _ps_env,
trainer_id_);
_worker_ptr = fleet->worker_ptr_;
}
return;
}
@@ -146,11 +120,11 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
for (auto &t : varnames) {
Variable *var = scope->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());

float *temp_recv_data = tensor->mutable_data<float>(platform::CPUPlace());
VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
VLOG(3) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
<< table_id << " Temp_data[0] " << temp_recv_data[0]
<< " Temp_data[-1] " << temp_recv_data[tensor->numel() - 1];
if (platform::is_gpu_place(tensor->place())) {
@@ -481,7 +455,7 @@ void AsyncCommunicator::RecvNoBarrier() {
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
@@ -653,7 +627,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
if (batch_size == -1) {
batch_size = cur_batch_size;
} else {
} else if (batch_size != cur_batch_size) {
// CHECK(batch_size == cur_batch_size); // NOLINT
batch_size_consist = false;
break;
@@ -676,7 +650,8 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
size_t output_len = 0;
size_t input_idx = 0;

VLOG(2) << "fleet.cc::emb_dim: " << fea_dim;
VLOG(2) << "fleet.cc::emb_dim: " << fea_dim << " batch_size: " << batch_size
<< " batch_size_consist: " << batch_size_consist;

// TODO(zhaocaibei123): check type of show/clk is int? float? uint64?
// const long int* show_tensor = shows->data<int64_t>();
@@ -687,13 +662,14 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
for (size_t index = 0; index < inputs->size(); ++index) {
framework::LoDTensor *g_tensor = outputs->at(index);
float *g = g_tensor->data<float>();
// no cvm

if (batch_size_consist) { // TODO(zhaocaibei123): add config
// scale_sparse_gradient_with_batch_size_
Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
g_mat.rightCols(fea_dim) *= batch_size;
g_mat.rightCols(fea_dim - 2) *=
batch_size; // hard code here, because of cvm_grad op
}

const framework::LoDTensor *tensor = inputs->at(index);
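A standalone sketch of the scaling in the hunk above, assuming Eigen and illustrative sizes: only the rightmost fea_dim - 2 columns are multiplied by the batch size, leaving the first two (cvm-related) columns untouched.

#include <Eigen/Dense>
#include <vector>

int main() {
  const int fea_dim = 4;          // hypothetical: 2 cvm columns + 2 embedding columns
  const int rows = 3;             // hypothetical number of feature rows
  const float batch_size = 8.0f;

  std::vector<float> g(rows * fea_dim, 1.0f);  // flat gradient buffer

  // Same mapping as above: view the flat buffer as a rows x fea_dim row-major matrix.
  Eigen::Map<
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      g_mat(g.data(), rows, fea_dim);

  // Scale only the rightmost fea_dim - 2 columns; the first two columns keep
  // their original values.
  g_mat.rightCols(fea_dim - 2) *= batch_size;
  return 0;
}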
@@ -710,16 +686,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
continue;
}
push_keys.emplace_back(real_id);
push_values.emplace_back(fea_dim + 3);
push_values.emplace_back(fea_dim + 1);
// slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] =
(i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
push_values.back()[2] =
(i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
// push_values.back()[1] =
// (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
// push_values.back()[2] =
// (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));

float *data = push_values.back().data() + 3;
float *data = push_values.back().data() + 1; // hard code here

memcpy(data, g + output_len, sizeof(float) * fea_dim);

@@ -733,16 +709,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
continue;
}
push_keys.emplace_back(real_id);
push_values.emplace_back(fea_dim + 3);
push_values.emplace_back(fea_dim + 1);
// slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] =
(i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
push_values.back()[2] =
(i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
// push_values.back()[1] =
// (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
// push_values.back()[2] =
// (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));

float *data = push_values.back().data() + 3;
float *data = push_values.back().data() + 1;

memcpy(data, g + output_len, sizeof(float) * fea_dim);

@@ -837,7 +813,7 @@ void AsyncCommunicator::Stop() {
if (!communicator_) {
VLOG(0) << "Communicator is not inited, do nothing";
} else {
_worker_ptr->finalize_worker();
// _worker_ptr->finalize_worker();
VLOG(1) << "client finalize_worker done";
if (recv_thread_) {
VLOG(1) << "stop recv thread";
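A minimal sketch of the new push-value layout used above: one leading slot field followed by the gradient, i.e. fea_dim + 1 floats per key, with show/clk no longer packed in (the values below are illustrative only):

#include <cstring>
#include <vector>

int main() {
  const int fea_dim = 8;                   // hypothetical embedding dimension
  std::vector<float> grad(fea_dim, 0.5f);  // hypothetical gradient for one key

  std::vector<float> push_value(fea_dim + 1);
  push_value[0] = 2;  // slot id, hard-coded as in the code above
  std::memcpy(push_value.data() + 1, grad.data(), sizeof(float) * fea_dim);
  return 0;
}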
paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -360,13 +360,13 @@ class Communicator {

PSClient *GetPsClient() { return _worker_ptr.get(); }

std::unique_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
std::shared_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
return std::move(_worker_ptr);
}

RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; }

std::unique_ptr<PSClient> _worker_ptr; // pointer to worker
std::shared_ptr<PSClient> _worker_ptr; // pointer to worker

protected:
bool running_ = false;
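A small sketch of the ownership change above: with shared_ptr, the PS client can be shared between the communicator and the fleet wrapper instead of being moved out of a single owner (the types here are illustrative stand-ins, not the real classes):

#include <memory>

struct PSClient {};  // stand-in for paddle::distributed::PSClient

struct FleetWrapper {
  std::shared_ptr<PSClient> worker_ptr_ = std::make_shared<PSClient>();
};

struct Communicator {
  std::shared_ptr<PSClient> _worker_ptr;  // shared, no longer unique
};

int main() {
  FleetWrapper fleet;
  Communicator comm;
  // Both objects now hold the same client; neither has to give up ownership.
  comm._worker_ptr = fleet.worker_ptr_;
  return 0;
}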
3 changes: 2 additions & 1 deletion paddle/fluid/distributed/ps/table/CMakeLists.txt
@@ -43,11 +43,12 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(downpour_ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
cc_library(ctr_double_accessor SRCS ctr_double_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(ctr_accessor SRCS ctr_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(downpour_ctr_accessor SRCS downpour_ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)


1 comment on commit 2780d08

@paddle-bot-old
Congratulation! Your pull request passed all required CI. You could ask reviewer(s) to approve and merge. 🎉
