merge develop
liutiexing committed Mar 23, 2022
2 parents 565773a + 5b758c2 commit 2780d08
Showing 403 changed files with 15,808 additions and 7,951 deletions.
2 changes: 1 addition & 1 deletion cmake/cuda.cmake
@@ -11,7 +11,7 @@ elseif(NEW_RELEASE_ALL)
add_definitions(-DNEW_RELEASE_ALL)
set(paddle_known_gpu_archs "35 50 52 60 61 70 75 80 86")
set(paddle_known_gpu_archs10 "35 50 52 60 61 70 75")
set(paddle_known_gpu_archs11 "35 50 52 60 61 70 75 80")
set(paddle_known_gpu_archs11 "35 50 60 61 70 75 80")
elseif(NEW_RELEASE_PYPI)
message("Using New Release Strategy - Cubin Packge")
add_definitions(-DNEW_RELEASE_PYPI)
6 changes: 3 additions & 3 deletions cmake/external/ascend.cmake
@@ -90,9 +90,9 @@ endif()
if (WITH_ASCEND_CL)
macro(find_ascend_toolkit_version ascend_toolkit_version_info)
file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
if(NOT ASCEND_TOOLKIT_VERSION)
set(ASCEND_TOOLKIT_VERSION "???")
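For reference, a minimal standalone sketch (not part of this commit) of what the updated version pattern accepts, using a hypothetical CANN version string such as 5.1.RC2.alpha005; std::regex stands in for CMake's regex engine here:

#include <cctype>
#include <iostream>
#include <regex>
#include <string>

int main() {
  // Hypothetical contents of the Ascend toolkit version file.
  std::string contents = "version=5.1.RC2.alpha005";

  // Same pattern as the updated rule above: an optional "RC" segment is now accepted.
  std::regex pattern(R"(version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*))");
  std::smatch match;
  if (std::regex_search(contents, match, pattern)) {
    std::string version = match[1];  // "5.1.RC2.alpha005"
    // Mirror the CANN_VERSION step: strip letters and dots, keep only the digits.
    std::string cann_version;
    for (char c : version) {
      if (std::isdigit(static_cast<unsigned char>(c))) cann_version += c;
    }
    std::cout << version << " -> CANN_VERSION_CODE=" << cann_version << "\n";  // 512005
  }
  return 0;
}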
6 changes: 6 additions & 0 deletions cmake/inference_lib.cmake
@@ -258,6 +258,12 @@ copy(inference_lib_dist
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/any.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/optional.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/utils/none.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/utils/)
copy(inference_lib_dist
SRCS ${PADDLE_SOURCE_DIR}/paddle/extension.h
DSTS ${PADDLE_INFERENCE_INSTALL_DIR}/paddle/include/experimental/)
5 changes: 4 additions & 1 deletion cmake/phi.cmake
@@ -118,7 +118,7 @@ function(kernel_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
@@ -151,6 +151,9 @@ function(kernel_library TARGET)
file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc )
list(APPEND kps_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
endif()
endif()
else()
# TODO(chenweihang): impl compile by source later
2 changes: 1 addition & 1 deletion paddle/fluid/distributed/collective/ProcessGroupNCCL.cc
@@ -395,7 +395,7 @@ std::shared_ptr<ProcessGroup::Task> ProcessGroupNCCL::Barrier(
platform::CUDADeviceGuard gpuGuard;
for (auto& place : places) {
gpuGuard.SetDeviceIndex(place.GetDeviceId());
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::Backend::GPU);
auto dt = full({1}, 0, phi::DataType::FLOAT32, phi::GPUPlace());
barrierTensors.push_back(dt);
}
auto task = ProcessGroupNCCL::AllReduce(barrierTensors);
6 changes: 2 additions & 4 deletions paddle/fluid/distributed/collective/reducer.cc
@@ -321,7 +321,7 @@ EagerReducer::EagerReducer(
if (find_unused_vars_each_step_) {
global_used_vars_ = paddle::experimental::empty(
ScalarArray({static_cast<int32_t>(tensors_.size())}), DataType::INT32,
TransToBackend(inner_place_));
inner_place_);
}
}

@@ -363,10 +363,8 @@ void EagerReducer::InitializeGroups(
} else {
// process the dense gradient.
InitializeDenseGroups(tensor_indices_, &group);
// experimental::Backend backend = TransToBackend(inner_place_);
group.dense_contents_ = paddle::experimental::empty(
ScalarArray({group.all_length_}), group.dtype_,
TransToBackend(inner_place_));
ScalarArray({group.all_length_}), group.dtype_, inner_place_);
}

// map tensors to this group by VariableLocator
33 changes: 15 additions & 18 deletions paddle/fluid/distributed/ps/service/brpc_ps_client.cc
@@ -238,7 +238,7 @@ int32_t BrpcPsClient::initialize() {
std::thread(std::bind(&BrpcPsClient::push_dense_task_consume, this));
// for debug
// _print_thread =
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));
// std::thread(std::bind(&BrpcPsClient::print_queue_size_thread, this));

return 0;
}
@@ -1315,11 +1315,11 @@ std::future<int32_t> BrpcPsClient::push_sparse(size_t table_id,
CostTimer parse_timer("pserver_client_push_sparse_parse");
int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) {
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:"
// << push_sparse_async_num << ", max_task_limit:" <<
// FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_sparse Waiting for async_call_num comsume,
// task_num:"
// << push_sparse_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
// push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size();
push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
}
auto put_timer = std::make_shared<CostTimer>("client_push_sparse_put");
@@ -1381,8 +1381,7 @@ void BrpcPsClient::push_sparse_task_consume() {
::ThreadPool async_push_sparse_shard_threads(
FLAGS_pserver_sparse_merge_thread);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
// process the pushTask of every sparseTable
for (auto &push_sparse_task_itr : _push_sparse_task_queue_map) {
auto table_id = push_sparse_task_itr.first;
@@ -1497,9 +1496,8 @@ void BrpcPsClient::push_sparse_task_consume() {
std::vector<std::future<int>>().swap(merge_status);
}
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
@@ -1661,9 +1659,10 @@ std::future<int32_t> BrpcPsClient::push_dense(const Region *regions,
std::make_shared<CostTimer>("pserver_client_push_dense_parse");
int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
while (push_dense_async_num > FLAGS_pserver_max_async_call_num) {
LOG(INFO) << "push_dense Waiting for async_call_num comsume, task_num:"
<< push_dense_async_num
<< ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
// LOG(INFO) << "push_dense Waiting for async_call_num comsume,
// task_num:"
// << push_dense_async_num
// << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
usleep(5000); // 5ms
push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
}
@@ -1701,8 +1700,7 @@ void BrpcPsClient::push_dense_task_consume() {
static bool scale_gradient = FLAGS_pserver_scale_gradient_by_merge;
::ThreadPool async_merge_dense_threads(10);
while (_running) {
platform::Timer timeline;
timeline.Start();
auto async_start_time_ms = butil::gettimeofday_ms();
for (auto &task_queue_itr : _push_dense_task_queue_map) {
auto &task_queue = task_queue_itr.second;
auto queue_size = task_queue->Size();
@@ -1791,9 +1789,8 @@
push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size,
closure);
}
timeline.Pause();
auto wait_ms =
FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS());
auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms -
(butil::gettimeofday_ms() - async_start_time_ms);
if (wait_ms > 0) {
usleep(wait_ms * 1000);
}
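A compact sketch of the wait-interval pattern the consume loops now use, with std::chrono standing in for butil::gettimeofday_ms (an assumption made for a self-contained example; the actual code calls butil):

#include <chrono>
#include <thread>

// Wall-clock milliseconds, standing in for butil::gettimeofday_ms().
static long long NowMs() {
  using namespace std::chrono;
  return duration_cast<milliseconds>(system_clock::now().time_since_epoch()).count();
}

// Hypothetical consume loop: do one round of work, then sleep for whatever is
// left of the configured interval so rounds start roughly interval_ms apart.
void ConsumeLoop(const bool &running, int interval_ms) {
  while (running) {
    auto async_start_time_ms = NowMs();
    // ... merge and send the queued push tasks here ...
    auto wait_ms = interval_ms - (NowMs() - async_start_time_ms);
    if (wait_ms > 0) {
      std::this_thread::sleep_for(std::chrono::milliseconds(wait_ms));
    }
  }
}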
74 changes: 25 additions & 49 deletions paddle/fluid/distributed/ps/service/communicator/communicator.cc
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"

#include <google/protobuf/text_format.h>

#include "gflags/gflags.h"
#include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/string_helper.h"

@@ -66,34 +65,9 @@ std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
void Communicator::InitBrpcClient(
const std::string &dist_desc,
const std::vector<std::string> &host_sign_list) {
// not used, just for psclient's init
std::map<uint64_t, std::vector<paddle::distributed::Region>>
_dense_pull_regions;
for (auto &iter : recv_varname_to_ctx_) {
auto tid = iter.first;
auto var_names = iter.second;

auto &regions = _dense_pull_regions[tid];
regions.reserve(var_names.size());
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
float *w = tensor->data<float>();
paddle::distributed::Region reg(w, tensor->numel());
regions.emplace_back(std::move(reg));
}
}

auto fleet = paddle::distributed::FleetWrapper::GetInstance();
if (_worker_ptr.get() == nullptr) {
google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
init_gflag(_ps_param.init_gflags());
servers_ = host_sign_list.size();
_ps_env = paddle::distributed::PaddlePSEnvironment();
_ps_env.set_ps_servers(&host_sign_list, servers_);
_worker_ptr = std::unique_ptr<paddle::distributed::PSClient>(
paddle::distributed::PSClientFactory::create(_ps_param));
_worker_ptr->configure(_ps_param, _dense_pull_regions, _ps_env,
trainer_id_);
_worker_ptr = fleet->worker_ptr_;
}
return;
}
@@ -146,11 +120,11 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
for (auto &t : varnames) {
Variable *var = scope->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());

float *temp_recv_data = tensor->mutable_data<float>(platform::CPUPlace());
VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
VLOG(3) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
<< table_id << " Temp_data[0] " << temp_recv_data[0]
<< " Temp_data[-1] " << temp_recv_data[tensor->numel() - 1];
if (platform::is_gpu_place(tensor->place())) {
@@ -481,7 +455,7 @@ void AsyncCommunicator::RecvNoBarrier() {
for (auto &t : var_names) {
Variable *var = recv_scope_->FindVar(t);
LoDTensor *tensor = var->GetMutable<LoDTensor>();
VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
<< platform::is_gpu_place(tensor->place());
if (platform::is_gpu_place(tensor->place())) {
#ifdef PADDLE_WITH_CUDA
@@ -653,7 +627,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
if (batch_size == -1) {
batch_size = cur_batch_size;
} else {
} else if (batch_size != cur_batch_size) {
// CHECK(batch_size == cur_batch_size); // NOLINT
batch_size_consist = false;
break;
@@ -676,7 +650,8 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
size_t output_len = 0;
size_t input_idx = 0;

VLOG(2) << "fleet.cc::emb_dim: " << fea_dim;
VLOG(2) << "fleet.cc::emb_dim: " << fea_dim << " batch_size: " << batch_size
<< " batch_size_consist: " << batch_size_consist;

// TODO(zhaocaibei123): check type of show/clk is int? float? uint64?
// const long int* show_tensor = shows->data<int64_t>();
@@ -687,13 +662,14 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
for (size_t index = 0; index < inputs->size(); ++index) {
framework::LoDTensor *g_tensor = outputs->at(index);
float *g = g_tensor->data<float>();
// no cvm

if (batch_size_consist) { // TODO(zhaocaibei123): add config
// scale_sparse_gradient_with_batch_size_
Eigen::Map<
Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
g_mat.rightCols(fea_dim) *= batch_size;
g_mat.rightCols(fea_dim - 2) *=
batch_size; // hard code here, because of cvm_grad op
}

const framework::LoDTensor *tensor = inputs->at(index);
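A standalone sketch of the scaling in the hunk above, assuming Eigen and illustrative sizes: only the rightmost fea_dim - 2 columns are multiplied by the batch size, leaving the first two (cvm-related) columns untouched.

#include <Eigen/Dense>
#include <vector>

int main() {
  const int fea_dim = 4;          // hypothetical: 2 cvm columns + 2 embedding columns
  const int rows = 3;             // hypothetical number of feature rows
  const float batch_size = 8.0f;

  std::vector<float> g(rows * fea_dim, 1.0f);  // flat gradient buffer

  // Same mapping as above: view the flat buffer as a rows x fea_dim row-major matrix.
  Eigen::Map<
      Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
      g_mat(g.data(), rows, fea_dim);

  // Scale only the rightmost fea_dim - 2 columns; the first two columns keep
  // their original values.
  g_mat.rightCols(fea_dim - 2) *= batch_size;
  return 0;
}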
@@ -710,16 +686,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
continue;
}
push_keys.emplace_back(real_id);
push_values.emplace_back(fea_dim + 3);
push_values.emplace_back(fea_dim + 1);
// slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] =
(i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
push_values.back()[2] =
(i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
// push_values.back()[1] =
// (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
// push_values.back()[2] =
// (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));

float *data = push_values.back().data() + 3;
float *data = push_values.back().data() + 1; // hard code here

memcpy(data, g + output_len, sizeof(float) * fea_dim);

@@ -733,16 +709,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
continue;
}
push_keys.emplace_back(real_id);
push_values.emplace_back(fea_dim + 3);
push_values.emplace_back(fea_dim + 1);
// slot show clk grad... consistent with CtrCommonPushValue defined in
// ctr_accessor.h
push_values.back()[0] = 2; // TODO(zhaocaibei123): slot
push_values.back()[1] =
(i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
push_values.back()[2] =
(i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
// push_values.back()[1] =
// (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
// push_values.back()[2] =
// (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));

float *data = push_values.back().data() + 3;
float *data = push_values.back().data() + 1;

memcpy(data, g + output_len, sizeof(float) * fea_dim);

@@ -837,7 +813,7 @@ void AsyncCommunicator::Stop() {
if (!communicator_) {
VLOG(0) << "Communicator is not inited, do nothing";
} else {
_worker_ptr->finalize_worker();
// _worker_ptr->finalize_worker();
VLOG(1) << "client finalize_worker done";
if (recv_thread_) {
VLOG(1) << "stop recv thread";
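A minimal sketch of the new push-value layout used above: one leading slot field followed by the gradient, i.e. fea_dim + 1 floats per key, with show/clk no longer packed in (the values below are illustrative only):

#include <cstring>
#include <vector>

int main() {
  const int fea_dim = 8;                   // hypothetical embedding dimension
  std::vector<float> grad(fea_dim, 0.5f);  // hypothetical gradient for one key

  std::vector<float> push_value(fea_dim + 1);
  push_value[0] = 2;  // slot id, hard-coded as in the code above
  std::memcpy(push_value.data() + 1, grad.data(), sizeof(float) * fea_dim);
  return 0;
}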
paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -360,13 +360,13 @@ class Communicator {

PSClient *GetPsClient() { return _worker_ptr.get(); }

std::unique_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
std::shared_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
return std::move(_worker_ptr);
}

RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; }

std::unique_ptr<PSClient> _worker_ptr; // pointer to worker
std::shared_ptr<PSClient> _worker_ptr; // pointer to worker

protected:
bool running_ = false;
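A small sketch of the ownership change above: with shared_ptr, the PS client can be shared between the communicator and the fleet wrapper instead of being moved out of a single owner (the types here are illustrative stand-ins, not the real classes):

#include <memory>

struct PSClient {};  // stand-in for paddle::distributed::PSClient

struct FleetWrapper {
  std::shared_ptr<PSClient> worker_ptr_ = std::make_shared<PSClient>();
};

struct Communicator {
  std::shared_ptr<PSClient> _worker_ptr;  // shared, no longer unique
};

int main() {
  FleetWrapper fleet;
  Communicator comm;
  // Both objects now hold the same client; neither has to give up ownership.
  comm._worker_ptr = fleet.worker_ptr_;
  return 0;
}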
3 changes: 2 additions & 1 deletion paddle/fluid/distributed/ps/table/CMakeLists.txt
@@ -43,11 +43,12 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(downpour_ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
cc_library(ctr_double_accessor SRCS ctr_double_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(ctr_accessor SRCS ctr_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(downpour_ctr_accessor SRCS downpour_ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)


1 comment on commit 2780d08

@paddle-bot-old
Congratulation! Your pull request passed all required CI. You could ask reviewer(s) to approve and merge. 🎉
