qingshui · qingshui · Mar 6, 2023 · Feb 23, 2023 · Feb 27, 2023 · Mar 1, 2023
diff --git a/paddle/fluid/framework/data_feed.cu b/paddle/fluid/framework/data_feed.cu
@@ -2133,8 +2133,9 @@ int GraphDataGenerator::FillInferBuf() {
     global_infer_node_type_start[infer_cursor] += total_row_;
     infer_node_end_ = global_infer_node_type_start[infer_cursor];
     cursor_ = infer_cursor;
+    return 1;
   }
-  return 1;
+  return 0;
 }
 
 void GraphDataGenerator::ClearSampleState() {

diff --git a/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu b/paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table_inl.cu
@@ -23,6 +23,9 @@
 #include "paddle/fluid/framework/fleet/heter_ps/graph_gpu_ps_table.h"
 #define ALIGN_INT64(LEN) (uint64_t((LEN) + 7) & uint64_t(~7))
 #define HBMPS_MAX_BUFF 1024 * 1024
+
+DECLARE_bool(enable_neighbor_list_use_uva);
+
 namespace paddle {
 namespace framework {
 /*
@@ -895,8 +898,14 @@ void GpuPsGraphTable::build_graph_on_single_gpu(const GpuPsCommGraph& g,
     gpu_graph_list_[offset].node_size = 0;
   }
   if (g.neighbor_size) {
-    cudaError_t cudaStatus = cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
+    cudaError_t cudaStatus;
+    if (!FLAGS_enable_neighbor_list_use_uva) {
+      cudaStatus = cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
+                                        g.neighbor_size * sizeof(uint64_t));
+    } else {
+      cudaStatus = cudaMallocManaged(&gpu_graph_list_[offset].neighbor_list,
                                         g.neighbor_size * sizeof(uint64_t));
+    }
     PADDLE_ENFORCE_EQ(cudaStatus,
                       cudaSuccess,
                       platform::errors::InvalidArgument(
@@ -964,9 +973,13 @@ void GpuPsGraphTable::build_graph_from_cpu(
       gpu_graph_list_[offset].node_size = 0;
     }
     if (cpu_graph_list[i].neighbor_size) {
-      CUDA_CHECK(
-          cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
-                     cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
+      if (!FLAGS_enable_neighbor_list_use_uva) {
+        CUDA_CHECK(cudaMalloc(&gpu_graph_list_[offset].neighbor_list,
+                    cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
+      } else {
+        CUDA_CHECK(cudaMallocManaged(&gpu_graph_list_[offset].neighbor_list,
+                    cpu_graph_list[i].neighbor_size * sizeof(uint64_t)));
+      }
 
       CUDA_CHECK(
           cudaMemcpyAsync(gpu_graph_list_[offset].neighbor_list,

diff --git a/paddle/phi/core/flags.cc b/paddle/phi/core/flags.cc
@@ -845,6 +845,18 @@ PADDLE_DEFINE_EXPORTED_bool(graph_load_in_parallel,
                             false,
                             "It controls whether load graph node and edge with "
                             "mutli threads parallely.");
+
+/**
+ * Distributed related FLAG
+ * Name: FLAGS_enable_neighbor_list_use_uva
+ * Since Version: 2.2.0
+ * Value Range: bool, default=false
+ * Example:
+ * Note: Controls whether neighbor_list is stored with UVA (cudaMallocManaged)
+ */
+PADDLE_DEFINE_EXPORTED_bool(enable_neighbor_list_use_uva,
+                            false,
+                            "It controls whether store neighbor_list with UVA");
 
 /**
  * Distributed related FLAG

diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py
@@ -1630,7 +1630,8 @@ def _save_cache_model(self, dirname, **kwargs):
         return feasign_num
 
     def _save_cache_table(self, table_id, pass_id, mem_cache_key_threshold):
-        if self.role_maker._is_first_worker():
+        fleet.util.barrier()
+        if self.context['use_ps_gpu'] or self.role_maker._is_first_worker():
             self._worker.save_cache_table(
                 table_id, pass_id, mem_cache_key_threshold
             )