Skip to content

Commit

Permalink
TL/CUDA: fix cache unmap (#642)
Browse files Browse the repository at this point in the history
Co-authored-by: valentin petrov <valentinp@nvidia.com>
  • Loading branch information
Sergei-Lebedev and valentin petrov authored Oct 13, 2022
1 parent 0f70d73 commit 5c29031
Show file tree
Hide file tree
Showing 4 changed files with 65 additions and 6 deletions.
50 changes: 47 additions & 3 deletions src/components/tl/cuda/alltoallv/alltoallv_ce.c
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,47 @@ ucc_status_t ucc_tl_cuda_alltoallv_ce_post_copies(ucc_tl_cuda_task_t *task)
return status;
}

/**
 * Release the peer memory mappings referenced by an alltoallv copy-engine
 * task.
 *
 * Two groups of mappings are handed to ucc_tl_cuda_unmap_memhandle():
 *   - the source-buffer mapping (peer_map_addr_src) of every team rank that
 *     is directly reachable from the local rank; the local rank is skipped;
 *   - the destination-buffer mapping (peer_map_addr_dst) of the destination
 *     rank of every proxy entry in the team topology.
 *
 * Every call passes force == 0, i.e. the task only gives up its own reference
 * and does not ask for the mapping to be forcibly dropped from the cache.
 *
 * Cleanup is best effort: a failed unmap for one peer does not prevent the
 * remaining peers from being processed, so one failure cannot leave all the
 * later mappings referenced forever.
 *
 * @param task  alltoallv task whose peer mappings are released
 *
 * @return UCC_OK when every unmap succeeded, otherwise the status of the
 *         first unmap that failed.
 */
ucc_status_t ucc_tl_cuda_alltoallv_unmap(ucc_tl_cuda_task_t *task)
{
    ucc_tl_cuda_team_t          *team   = TASK_TEAM(task);
    ucc_status_t                 result = UCC_OK;
    ucc_rank_t                   i, dst;
    volatile ucc_tl_cuda_sync_t *peer_sync;
    ucc_tl_cuda_cache_t         *cache;
    ucc_status_t                 status;

    /* Source buffers of directly connected peers. */
    for (i = 0; i < UCC_TL_TEAM_SIZE(team); i++) {
        if (i == UCC_TL_TEAM_RANK(team) ||
            !ucc_tl_cuda_team_topo_is_direct(&team->super, team->topo,
                                             UCC_TL_TEAM_RANK(team), i)) {
            continue;
        }
        peer_sync = TASK_SYNC(task, i);
        cache     = ucc_tl_cuda_get_cache(team, i);

        status = ucc_tl_cuda_unmap_memhandle(
            (uintptr_t)peer_sync->mem_info_src.ptr,
            task->alltoallv_ce.peer_map_addr_src[i], cache, 0);
        if (ucc_unlikely(status != UCC_OK)) {
            /* Remember the first failure but keep releasing other peers. */
            if (result == UCC_OK) {
                result = status;
            }
        }
    }

    /* Destination buffers of the ranks this rank proxies data to. */
    for (i = 0; i < team->topo->num_proxies; i++) {
        dst       = team->topo->proxies[i].dst;
        peer_sync = TASK_SYNC(task, dst);
        cache     = ucc_tl_cuda_get_cache(team, dst);

        status = ucc_tl_cuda_unmap_memhandle(
            (uintptr_t)peer_sync->mem_info_dst.ptr,
            task->alltoallv_ce.peer_map_addr_dst[dst], cache, 0);
        if (ucc_unlikely(status != UCC_OK)) {
            /* Remember the first failure but keep releasing other peers. */
            if (result == UCC_OK) {
                result = status;
            }
        }
    }

    return result;
}

void ucc_tl_cuda_alltoallv_ce_progress(ucc_coll_task_t *coll_task)
{
ucc_tl_cuda_task_t *task = ucc_derived_of(coll_task, ucc_tl_cuda_task_t);
Expand Down Expand Up @@ -301,11 +342,14 @@ void ucc_tl_cuda_alltoallv_ce_progress(ucc_coll_task_t *coll_task)
ucc_assert(task->alltoallv_ce.stage == ALLTOALL_CE_STAGE_BAR);
break;
}
task->super.status =
ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar);
if (task->super.status == UCC_OK) {

status = ucc_tl_cuda_shm_barrier_test(UCC_TL_TEAM_RANK(team), task->bar);
if (status == UCC_OK) {
status = ucc_tl_cuda_alltoallv_unmap(task);
ucc_tl_cuda_put_sync(task);
}

task->super.status = status;
}

ucc_status_t ucc_tl_cuda_alltoallv_ce_start(ucc_coll_task_t *coll_task)
Expand Down
17 changes: 16 additions & 1 deletion src/components/tl/cuda/tl_cuda_cache.c
Original file line number Diff line number Diff line change
Expand Up @@ -291,21 +291,36 @@ ucc_tl_cuda_map_memhandle(const void *d_ptr, size_t size,
}

ucc_status_t ucc_tl_cuda_unmap_memhandle(uintptr_t d_bptr, void *mapped_addr,
ucc_tl_cuda_cache_t *cache)
ucc_tl_cuda_cache_t *cache, int force)
{

if ((d_bptr == 0) || (mapped_addr == 0)) {
return UCC_OK;
}

#if ENABLE_CACHE
ucs_pgt_region_t *pgt_region;
ucc_tl_cuda_cache_region_t *region;

/* use write lock because the cache may be modified */
pthread_rwlock_wrlock(&cache->lock);
pgt_region = ucs_pgtable_lookup(&cache->pgtable, d_bptr);

ucc_debug("%s: tl_cuda unmap addr:%p region:"
UCS_PGT_REGION_FMT, cache->name, (void*)d_bptr,
UCS_PGT_REGION_ARG(pgt_region));

ucc_assert(pgt_region != NULL);
region = ucc_derived_of(pgt_region, ucc_tl_cuda_cache_region_t);

ucc_assert(region->refcount >= 1);
region->refcount--;

if ((region->refcount == 0 ) && (force == 1)) {
ucs_pgtable_remove(&cache->pgtable, &region->super);
CUDA_FUNC(cudaIpcCloseMemHandle(mapped_addr));
}

pthread_rwlock_unlock(&cache->lock);
#else
CUDA_FUNC(cudaIpcCloseMemHandle(mapped_addr));
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/cuda/tl_cuda_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ ucc_status_t ucc_tl_cuda_map_memhandle(const void *dptr, size_t size,
ucc_tl_cuda_cache_t *cache);

ucc_status_t ucc_tl_cuda_unmap_memhandle(uintptr_t d_bptr, void *mapped_addr,
ucc_tl_cuda_cache_t *cache);
ucc_tl_cuda_cache_t *cache, int force);

ucc_tl_cuda_cache_t* ucc_tl_cuda_get_cache(ucc_tl_cuda_team_t *team,
ucc_rank_t rank);
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/cuda/tl_cuda_team.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ UCC_CLASS_CLEANUP_FUNC(ucc_tl_cuda_team_t)
if (self->scratch.rem[i]) {
ucc_tl_cuda_unmap_memhandle((uintptr_t)self->scratch.rem_info[i].ptr,
self->scratch.rem[i],
ucc_tl_cuda_get_cache(self, i));
ucc_tl_cuda_get_cache(self, i), 1);
}
}

Expand Down

0 comments on commit 5c29031

Please sign in to comment.