diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu
index ec2a5983e14e6..c252ed385513b 100644
--- a/paddle/fluid/framework/data_feed.cu
+++ b/paddle/fluid/framework/data_feed.cu
@@ -2133,8 +2133,9 @@ int GraphDataGenerator::FillInferBuf() {
     global_infer_node_type_start[infer_cursor] += total_row_;
     infer_node_end_ = global_infer_node_type_start[infer_cursor];
     cursor_ = infer_cursor;
+    return 1;
   }
-  return 1;
+  return 0;
 }
 
 void GraphDataGenerator::ClearSampleState() {
diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
index adefb4eb13de1..17a74c460147f 100644
--- a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
+++ b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
@@ -23,6 +23,9 @@
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
 #define ALIGN_INT64(LEN) (uint64_t((LEN) + 7) & uint64_t(~7))
 #define HBMPS_MAX_BUFF 1024 * 1024
+
+DECLARE_bool(enable_neighbor_list_use_uva);
+
 namespace paddle {
 namespace framework {
 /*
@@ -895,8 +898,14 @@ void GpuPsGraphTable::build_graph_on_single_gpu(const GpuPsCommGraph& g,
     gpu_graph_list_[offset].node_size = 0;
   }
   if (g.neighbor_size) {
-    cudaError_t cudaStatus = cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
+    cudaError_t cudaStatus;
+    if (!FLAGS_enable_neighbor_list_use_uva) {
+      cudaStatus = cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
+                              g.neighbor_size * sizeof(uint64_t));
+    } else {
+      cudaStatus = cudaMallocManaged(&gpu_graph_list_[offset].neighbor_list,
                                         g.neighbor_size * sizeof(uint64_t));
+    }
     PADDLE_ENFORCE_EQ(cudaStatus,
                       cudaSuccess,
                       platform::errors::InvalidArgument(
@@ -964,9 +973,13 @@ void GpuPsGraphTable::build_graph_from_cpu(
     gpu_graph_list_[offset].node_size = 0;
   }
   if (cpu_graph_list[i].neighbor_size) {
-    CUDA_CHECK(
-        cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
-                   cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
+    if (!FLAGS_enable_neighbor_list_use_uva) {
+      CUDA_CHECK(cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
+                            cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
+    } else {
+      CUDA_CHECK(cudaMallocManaged(&gpu_graph_list_[offset].neighbor_list,
+                                   cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
+    }
 
     CUDA_CHECK(
         cudaMemcpyAsync(gpu_graph_list_[offset].neighbor_list,
diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
index c983412eba50a..a6c23a2c831be 100644
--- a/paddle/phi/core/flags.cc
+++ b/paddle/phi/core/flags.cc
@@ -845,6 +845,18 @@ PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                             false,
                             "It controls whether load graph node and edge with "
                             "mutli threads parallely.");
+
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_enable_neighbor_list_use_uva
+ * Since Version: 2.2.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: Control whether to store neighbor_list with UVA
+ */
+PADDLE_DEFINE_EXPORTED_bool(enable_neighbor_list_use_uva,
+                            false,
+                            "It controls whether to store neighbor_list with UVA");
 
 /**
  * Distributed related FLAG
diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py
index 592f12f4abf80..9ef3817571c65 100755
--- a/python/paddle/distributed/ps/the_one_ps.py
+++ b/python/paddle/distributed/ps/the_one_ps.py
@@ -1630,7 +1630,8 @@ def _save_cache_model(self, dirname, **kwargs):
         return feasign_num
 
     def _save_cache_table(self, table_id, pass_id, mem_cache_key_threshold):
-        if self.role_maker._is_first_worker():
+        fleet.util.barrier()
+        if self.context['use_ps_gpu'] or self.role_maker._is_first_worker():
            self._worker.save_cache_table(
                table_id, pass_id, mem_cache_key_threshold
            )
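
For context (not part of the patch): the sketch below reduces the allocation switch introduced above to a standalone CUDA program, showing how the same neighbor_list pointer can be backed either by plain device memory or by managed (UVA) memory. The helper name AllocNeighborList and the local bool standing in for FLAGS_enable_neighbor_list_use_uva are assumptions made for the example, not Paddle APIs.

// Illustrative sketch only, not Paddle code.
#include <cuda_runtime.h>
#include <cstdint>
#include <cstdio>

// Stand-in for FLAGS_enable_neighbor_list_use_uva; the real flag defaults to false.
static bool use_uva_for_neighbor_list = false;

cudaError_t AllocNeighborList(uint64_t **neighbor_list, size_t neighbor_size) {
  size_t bytes = neighbor_size * sizeof(uint64_t);
  if (!use_uva_for_neighbor_list) {
    // Default path: the list lives only in the GPU's own (HBM) memory.
    return cudaMalloc(reinterpret_cast<void **>(neighbor_list), bytes);
  }
  // UVA path: managed memory is addressable from both host and device, and on
  // recent GPUs under Linux it can be oversubscribed, so a neighbor_list larger
  // than a single GPU's HBM can still be allocated.
  return cudaMallocManaged(reinterpret_cast<void **>(neighbor_list), bytes,
                           cudaMemAttachGlobal);
}

int main() {
  uint64_t *neighbor_list = nullptr;
  use_uva_for_neighbor_list = true;  // what enabling the flag selects
  cudaError_t status = AllocNeighborList(&neighbor_list, 1 << 20);
  if (status != cudaSuccess) {
    std::printf("allocation failed: %s\n", cudaGetErrorString(status));
    return 1;
  }
  cudaFree(neighbor_list);  // cudaFree releases both kinds of allocation
  return 0;
}

Since the flag is defined with PADDLE_DEFINE_EXPORTED_bool, it would normally be switched on through the environment (e.g. FLAGS_enable_neighbor_list_use_uva=true) before the trainer starts, rather than by a local variable as in the sketch.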