Skip to content

Commit

Permalink
Merge branch 'develop' into support_acti
Browse files Browse the repository at this point in the history
  • Loading branch information
xiaoxiaohehe001 authored May 11, 2023
2 parents bcc59de + 04e5e7b commit 7463161
Show file tree
Hide file tree
Showing 102 changed files with 2,212 additions and 955 deletions.
3 changes: 2 additions & 1 deletion cmake/external/openblas.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ set(CBLAS_TAG v0.3.7)
# https://github.com/PaddlePaddle/Paddle/pull/52983
if(UNIX
AND NOT APPLE
AND NOT WITH_ROCM)
AND NOT WITH_ROCM
AND NOT WITH_XPU)
set(CBLAS_TAG v0.3.18)
endif()

Expand Down
27 changes: 15 additions & 12 deletions cmake/external/xpu.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ set(XPU_RT_LIB_NAME "libxpurt.so")
set(XPU_XFT_LIB_NAME "libxft.so")

set(XPU_BASE_DATE "20230427")
set(XPU_XCCL_BASE_VERSION "1.0.13")
set(XPU_XCCL_BASE_VERSION "1.0.49.2")
set(XPU_XFT_BASE_VERSION "latest")

if(NOT DEFINED XPU_BASE_URL)
Expand All @@ -30,35 +30,41 @@ if(NOT XPU_XFT_BASE_URL)
)
endif()

if(WITH_XCCL_RDMA)
set(XPU_XCCL_PREFIX "xccl_rdma")
else()
set(XPU_XCCL_PREFIX "xccl_socket")
endif()

if(WITH_AARCH64)
set(XPU_XRE_DIR_NAME "xre-kylin_aarch64")
set(XPU_XDNN_DIR_NAME "xdnn-kylin_aarch64")
set(XPU_XCCL_DIR_NAME "xccl-kylin_aarch64")
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-kylin_aarch64")
  set(XPU_XFT_DIR_NAME "") # TODO: xft has no kylin output for now.
elseif(WITH_SUNWAY)
set(XPU_XRE_DIR_NAME "xre-deepin_sw6_64")
set(XPU_XDNN_DIR_NAME "xdnn-deepin_sw6_64")
set(XPU_XCCL_DIR_NAME "xccl-deepin_sw6_64")
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-deepin_sw6_64")
  set(XPU_XFT_DIR_NAME "") # TODO: xft has no deepin output for now.
elseif(WITH_BDCENTOS)
set(XPU_XRE_DIR_NAME "xre-bdcentos_x86_64")
set(XPU_XDNN_DIR_NAME "xdnn-bdcentos_x86_64")
set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64")
set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82")
elseif(WITH_UBUNTU)
set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64")
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64")
set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64")
elseif(WITH_CENTOS)
set(XPU_XRE_DIR_NAME "xre-centos7_x86_64")
set(XPU_XDNN_DIR_NAME "xdnn-centos7_x86_64")
set(XPU_XCCL_DIR_NAME "xccl-bdcentos_x86_64")
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-bdcentos_x86_64")
set(XPU_XFT_DIR_NAME "xft_bdcentos6u3_x86_64_gcc82")
else()
set(XPU_XRE_DIR_NAME "xre-ubuntu_x86_64")
set(XPU_XDNN_DIR_NAME "xdnn-ubuntu_x86_64")
set(XPU_XCCL_DIR_NAME "xccl-ubuntu_x86_64")
set(XPU_XCCL_DIR_NAME "${XPU_XCCL_PREFIX}-ubuntu_x86_64")
set(XPU_XFT_DIR_NAME "xft_ubuntu1604_x86_64")
endif()

Expand All @@ -75,9 +81,6 @@ set(XPU_XFT_URL "${XPU_XFT_BASE_URL}/${XPU_XFT_DIR_NAME}.tar.gz")
set(XPU_PACK_DEPENCE_URL
"https://baidu-kunlun-public.su.bcebos.com/paddle_depence/pack_paddle_depence.sh"
CACHE STRING "" FORCE)
set(XPU_CHECK_DEPENCE_URL
"https://baidu-kunlun-public.su.bcebos.com/paddle_depence/check_xpu_dependence.sh"
CACHE STRING "" FORCE)
set(XPU_XFT_GET_DEPENCE_URL
"https://baidu-kunlun-public.su.bcebos.com/paddle_depence/get_xft_dependence.sh"
CACHE STRING "" FORCE)
Expand Down Expand Up @@ -115,8 +118,8 @@ ExternalProject_Add(
PREFIX ${SNAPPY_PREFIX_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND
wget ${XPU_CHECK_DEPENCE_URL} && bash check_xpu_dependence.sh
${XPU_BASE_URL} ${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash
bash ${CMAKE_SOURCE_DIR}/tools/xpu/check_xpu_dependence.sh ${XPU_BASE_URL}
${XPU_XCCL_BASE_URL} && wget ${XPU_PACK_DEPENCE_URL} && bash
pack_paddle_depence.sh ${XPU_XRE_URL} ${XPU_XRE_DIR_NAME} ${XPU_XDNN_URL}
${XPU_XDNN_DIR_NAME} ${XPU_XCCL_URL} ${XPU_XCCL_DIR_NAME} && wget
${XPU_XFT_GET_DEPENCE_URL} && bash get_xft_dependence.sh ${XPU_XFT_URL}
Expand Down
108 changes: 94 additions & 14 deletions paddle/fluid/distributed/collective/process_group_bkcl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Recv(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_recv";
VLOG(3) << "calling bkcl_recv"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", src_rank: " << src_rank << ", numel: " << output->numel()
<< ", dtype: " << output->type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_recv(comm,
output->data(),
output->numel(),
Expand Down Expand Up @@ -148,7 +154,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Send(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_send";
VLOG(3) << "calling bkcl_send"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", dst_rank: " << dst_rank
<< ", input numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_send(comm,
input.data(),
input.numel(),
Expand Down Expand Up @@ -276,7 +289,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_reduce";
VLOG(3) << "calling bkcl_all_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r =
bkcl_all_reduce(comm,
input.data(),
Expand Down Expand Up @@ -307,7 +327,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
BKCLContext_t comm,
const XPUStream& stream) {
int root = opts.source_rank + opts.source_root;
VLOG(3) << "bkcl_broadcast";
VLOG(3) << "calling bkcl_broadcast"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << root << ", numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r =
bkcl_broadcast(comm,
input.data(),
Expand Down Expand Up @@ -346,7 +372,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_gather";
VLOG(3) << "calling bkcl_all_gather"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << in_tensor_maybe_partial.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r =
bkcl_all_gather(comm,
in_tensor_maybe_partial.data(),
Expand Down Expand Up @@ -375,7 +407,15 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Reduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_reduce";
VLOG(3) << "calling bkcl_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << opts.root_rank << ", numel: " << input.numel()
<< ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_reduce(comm,
input.data(),
output->data(),
Expand Down Expand Up @@ -405,7 +445,14 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::ReduceScatter(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_reduce_scatter";
VLOG(3) << "calling bkcl_reduce_scatter"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << output->numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op
<< ", use_calc_stream: " << use_calc_stream;
int r = bkcl_reduce_scatter(
comm,
input.data(),
Expand Down Expand Up @@ -491,8 +538,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_reduce";

VLOG(3) << "calling bkcl_all_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << true << ", use_calc_stream: " << false;
int r =
bkcl_all_reduce(comm,
input.data(),
Expand Down Expand Up @@ -535,7 +587,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllReduce(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_reduce";
VLOG(3) << "calling bkcl_all_reduce"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", reduce_type: " << ToBKCLRedType(opts.reduce_op)
<< ", sync_op: " << sync_op << ", use_calc_stream: " << false;
int r =
bkcl_all_reduce(comm,
input.data(),
Expand Down Expand Up @@ -580,7 +638,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
const XPUStream& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
VLOG(3) << "bkcl_broadcast";
VLOG(3) << "calling bkcl_broadcast"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << root << ", numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << true
<< ", use_calc_stream: " << false;
int r =
bkcl_broadcast(comm,
input.data(),
Expand Down Expand Up @@ -626,7 +690,13 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::Broadcast(
const XPUStream& stream) {
const auto root =
opts.source_rank * in_tensors.size() + opts.source_root;
VLOG(3) << "bkcl_broadcast";
VLOG(3) << "calling bkcl_broadcast"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", root: " << root << ", numel: " << input.numel()
<< ", dtype: " << input.type() << ", sync_op: " << sync_op
<< ", use_calc_stream: " << false;
int r =
bkcl_broadcast(comm,
input.data(),
Expand Down Expand Up @@ -671,7 +741,12 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_gather";
VLOG(3) << "calling bkcl_all_gather"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", sync_op: " << true << ", use_calc_stream: " << false;
int r =
bkcl_all_gather(comm,
input.data(),
Expand Down Expand Up @@ -712,7 +787,12 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupBKCL::AllGather(
const phi::DenseTensor& input,
BKCLContext_t comm,
const XPUStream& stream) {
VLOG(3) << "bkcl_all_gather";
VLOG(3) << "calling bkcl_all_gather"
<< ", rank_id: " << platform::GetBKCLRankID(comm)
<< ", dev_id: " << platform::GetBKCLDevID(comm)
<< ", nranks: " << platform::GetBKCLNRanks(comm)
<< ", numel: " << input.numel() << ", dtype: " << input.type()
<< ", sync_op: " << sync_op << ", use_calc_stream: " << false;
int r =
bkcl_all_gather(comm,
input.data(),
Expand Down
6 changes: 4 additions & 2 deletions paddle/fluid/distributed/collective/process_group_custom.cc
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,14 @@ void ProcessGroupCustom::BroadcastUniqueCustomID(
std::vector<phi::ccl::CCLRootId>& ccl_ids) { // NOLINT
if (rank_ == 0) {
for (size_t i = 0; i < ccl_ids.size(); i++) {
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
std::to_string(i);
store_->set(key, ccl_ids[i]);
}
} else {
for (size_t i = 0; i < ccl_ids.size(); i++) {
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(i);
auto key = "ProcessGroupCustom/ccl_ids/" + std::to_string(gid_) + "/" +
std::to_string(i);
ccl_ids[i] = store_->get(key);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@
"matmul_double_grad",
"tanh_double_grad",
"add_double_grad",
"multiply_double_grad",
"subtract_double_grad",
]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ DeviceContext* StreamAnalyzer::ParseDeviceContext(

DeviceContext* dev_ctx = nullptr;

// only gpu needs update. xpu not need, because xpu memcpy op kernel is
// only gpu needs updating; xpu does not, because the xpu memcpy op kernel is
// synchronous.
if (platform::is_gpu_place(place_) || platform::is_custom_place(place_)) {
VLOG(6) << "Parse DeviceContext for " << op_type
Expand Down
Loading

0 comments on commit 7463161

Please sign in to comment.