Skip to content

Commit

Permalink
fix multi-machine save model (PaddlePaddle#217)
Browse files Browse the repository at this point in the history
  • Loading branch information
qingshui authored and danleifeng committed Sep 12, 2023
1 parent 58d3ab5 commit 975675d
Show file tree
Hide file tree
Showing 2 changed files with 4 additions and 3 deletions.
3 changes: 2 additions & 1 deletion paddle/fluid/framework/fleet/heter_ps/graph_gpu_wrapper.cu
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,8 @@ void GraphGpuWrapper::init_type_keys(
auto place = platform::CUDAPlace(gpuid);
platform::CUDADeviceGuard guard(gpuid);
  keys[f_idx][j] =
-     memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t));
+     memory::AllocShared(place, tmp_keys[j].size() * sizeof(uint64_t),
+                         phi::Stream(reinterpret_cast<phi::StreamId>(stream)));
cudaMemcpyAsync(keys[f_idx][j]->ptr(),
tmp_keys[j].data(),
sizeof(uint64_t) * tmp_keys[j].size(),
Expand Down
4 changes: 2 additions & 2 deletions python/paddle/distributed/ps/the_one_ps.py
Original file line number Diff line number Diff line change
Expand Up @@ -1719,7 +1719,7 @@ def _save_dense_params(self, *args, **kwargs):

def _save_persistables(self, *args, **kwargs):
fleet.util.barrier()
-        if self.role_maker._is_first_worker():
+        if self.context['use_ps_gpu'] or self.role_maker._is_first_worker():
self._save_distributed_persistables(*args, **kwargs)
fleet.util.barrier()

Expand All @@ -1737,7 +1737,7 @@ def _load_one_table(self, table_id, path, mode):

def _load_persistables(self, path, mode):
fleet.util.barrier()
-        if self.role_maker._is_first_worker():
+        if self.context['use_ps_gpu'] or self.role_maker._is_first_worker():
self._worker.load_model(path, mode)
fleet.util.barrier()

Expand Down

0 comments on commit 975675d

Please sign in to comment.