Skip to content

Commit

Permalink
TL/CUDA: fix linear algorithms
Browse files Browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev committed Mar 20, 2023
1 parent 3ff41ce commit 51cada1
Show file tree
Hide file tree
Showing 18 changed files with 261 additions and 115 deletions.
2 changes: 1 addition & 1 deletion src/components/ec/base/ucc_ec_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ typedef struct ucc_ee_executor_params {

/* Maximum number of buffers for UCC_EE_EXECUTOR_TASK_REDUCE_MULTI_DST and
UCC_EE_EXECUTOR_TASK_COPY_MULTI operations */
#define UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS 6
#define UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS 7

/* Reduces "n_srcs" buffers (each contains "count" elements of type "dt")
into "dst" buffer.
Expand Down
43 changes: 41 additions & 2 deletions src/components/ec/cuda/ec_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,11 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
ucc_offsetof(ucc_ec_cuda_config_t, exec_num_streams),
UCC_CONFIG_TYPE_ULUNITS},

{"EXEC_COPY_LARGE_THRESH", "1M",
"Memcopy size to switch from kernel copy to cudaMemcpy",
ucc_offsetof(ucc_ec_cuda_config_t, exec_copy_thresh),
UCC_CONFIG_TYPE_MEMUNITS},

{"REDUCE_NUM_BLOCKS", "auto",
"Number of thread blocks to use for reduction in interruptible mode",
ucc_offsetof(ucc_ec_cuda_config_t, reduce_num_blocks),
Expand Down Expand Up @@ -146,6 +151,40 @@ static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = {
.obj_cleanup = ucc_ec_cuda_event_cleanup,
};

/*
 * Memory pool object constructor for interruptible executor tasks.
 *
 * Builds a CUDA graph in task->graph holding
 * UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS memcpy nodes, none of which has any
 * dependency, and instantiates it into task->graph_exec. Each node is
 * created with a dummy 1-byte copy between dummy addresses.
 * NOTE(review): the dummy parameters look like placeholders that are
 * expected to be overwritten on the executable graph before launch - confirm
 * against the task post path.
 *
 * Return values of the CUDA calls are only passed through CUDA_FUNC; this
 * callback returns void, so nothing is reported to the caller.
 */
static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp and chunk are unused
{
    ucc_ec_cuda_executor_interruptible_task_t *itask =
        (ucc_ec_cuda_executor_interruptible_task_t *) obj;
    int                                        remaining =
        UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS;
    cudaGraphNode_t                            node;

    CUDA_FUNC(cudaGraphCreate(&itask->graph, 0));

    /* one memcpy node per possible buffer, no dependencies between nodes */
    while (remaining > 0) {
        CUDA_FUNC(cudaGraphAddMemcpyNode1D(&node, itask->graph, NULL, 0,
                                           (void*)1, (void*)1, 1,
                                           cudaMemcpyDefault));
        remaining--;
    }

    CUDA_FUNC(cudaGraphInstantiate(&itask->graph_exec, itask->graph, NULL,
                                   NULL, 0));
}

/*
 * Memory pool object destructor for interruptible executor tasks.
 *
 * Destroys the executable graph first and then the graph it was
 * instantiated from. Failures are only passed through CUDA_FUNC; the
 * callback itself returns void.
 */
static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
{
    ucc_ec_cuda_executor_interruptible_task_t *itask;

    itask = (ucc_ec_cuda_executor_interruptible_task_t *) obj;
    CUDA_FUNC(cudaGraphExecDestroy(itask->graph_exec));
    CUDA_FUNC(cudaGraphDestroy(itask->graph));
}

/*
 * Memory pool callbacks for the interruptible executor task pool: every
 * pooled object gets its CUDA graph built on init and destroyed on cleanup.
 */
static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = {
    /* chunk storage management */
    .chunk_alloc   = ucc_mpool_hugetlb_malloc,
    .chunk_release = ucc_mpool_hugetlb_free,
    /* per-object CUDA graph setup and teardown */
    .obj_init      = ucc_ec_cuda_graph_init,
    .obj_cleanup   = ucc_ec_cuda_graph_cleanup,
};

static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock)
{
if (*nt != UCC_ULUNITS_AUTO) {
Expand Down Expand Up @@ -243,8 +282,8 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
status = ucc_mpool_init(
&ucc_ec_cuda.executor_interruptible_tasks, 0,
sizeof(ucc_ec_cuda_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE,
16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE,
"interruptible executor tasks");
16, UINT_MAX, &ucc_ec_cuda_interruptible_task_mpool_ops,
UCC_THREAD_MULTIPLE, "interruptible executor tasks");
if (status != UCC_OK) {
ec_error(&ucc_ec_cuda.super, "failed to create interruptible tasks pool");
return status;
Expand Down
13 changes: 8 additions & 5 deletions src/components/ec/cuda/ec_cuda.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -48,6 +48,7 @@ typedef struct ucc_ec_cuda_config {
unsigned long reduce_num_blocks;
int reduce_num_threads;
int use_cooperative_launch;
unsigned long exec_copy_thresh;
} ucc_ec_cuda_config_t;

typedef struct ucc_ec_cuda {
Expand Down Expand Up @@ -75,12 +76,14 @@ typedef struct ucc_ec_cuda_stream_request {
cudaStream_t stream;
} ucc_ec_cuda_stream_request_t;

#define MAX_SUBTASKS 12
/* Task object used by the interruptible CUDA executor. */
typedef struct ucc_ec_cuda_executor_interruptible_task {
/* generic executor task; presumably kept first so the two pointer types
   can be cast to each other - TODO confirm */
ucc_ee_executor_task_t super;
/* opaque event handle associated with the task */
void *event;
/* CUDA graph owned by this task */
cudaGraph_t graph;
/* executable (instantiated) form of the graph above */
cudaGraphExec_t graph_exec;
} ucc_ec_cuda_executor_interruptible_task_t;

#define MAX_SUBTASKS 12
typedef struct ucc_ec_cuda_executor_persistent_task {
ucc_ee_executor_task_t super;
int num_subtasks;
Expand Down Expand Up @@ -133,9 +136,9 @@ extern ucc_ec_cuda_t ucc_ec_cuda;
ucc_ec_cuda.stream_initialized = 1; \
} \
ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); \
if (ucc_unlikely(cudaSuccess != cuda_st)) { \
return cuda_error_to_ucc_status(cuda_st); \
} \
if (ucc_unlikely(cudaSuccess != cuda_st)) { \
return cuda_error_to_ucc_status(cuda_st); \
} \
} \
} while(0)

Expand Down
56 changes: 50 additions & 6 deletions src/components/ec/cuda/ec_cuda_executor_interruptible.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down Expand Up @@ -54,7 +54,11 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
{
cudaStream_t stream = NULL;
ucc_ec_cuda_executor_interruptible_task_t *ee_task;
ucc_status_t status;
ucc_status_t status;
cudaGraphNode_t nodes[UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS];
size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS;
int i;


status = ucc_cuda_executor_interruptible_get_stream(&stream);
if (ucc_unlikely(status != UCC_OK)) {
Expand Down Expand Up @@ -85,10 +89,46 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
}
break;
case UCC_EE_EXECUTOR_TASK_COPY_MULTI:
status = ucc_ec_cuda_copy_multi_kernel(task_args, stream);
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to start copy multi op");
goto free_task;
if ((task_args->copy_multi.counts[0] > EC_CUDA_CONFIG->exec_copy_thresh) &&
(task_args->copy_multi.num_vectors > 2)) {
cudaGraphGetNodes(ee_task->graph, nodes, &num_nodes);
for (i = 0; i < task_args->copy_multi.num_vectors; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
task_args->copy_multi.dst[i],
task_args->copy_multi.src[i],
task_args->copy_multi.counts[i],
cudaMemcpyDefault));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}

}
for (; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
task_args->copy_multi.dst[0],
task_args->copy_multi.src[0],
1, cudaMemcpyDefault));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}
}

status = CUDA_FUNC(cudaGraphLaunch(ee_task->graph_exec, stream));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}

} else {
status = ucc_ec_cuda_copy_multi_kernel(task_args, stream);
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to start copy multi op");
goto free_task;
}
}
break;
case UCC_EE_EXECUTOR_TASK_REDUCE:
Expand Down Expand Up @@ -141,6 +181,10 @@ ucc_cuda_executor_interruptible_task_finalize(ucc_ee_executor_task_t *task)

ucc_assert(task->status == UCC_OK);
status = ucc_ec_cuda_event_destroy(ee_task->event);
// if (ee_task->graph) {
// cudaGraphExecDestroy(ee_task->graph_exec);
// cudaGraphDestroy(ee_task->graph);
// }
ucc_mpool_put(task);
return status;
}
Expand Down
3 changes: 2 additions & 1 deletion src/components/tl/cuda/allgather/allgather_linear.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo))) {
if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
return UCC_ERR_NOT_SUPPORTED;
}

Expand Down
Loading

0 comments on commit 51cada1

Please sign in to comment.