TL/CUDA: fix linear algorithms #751

Merged
4 changes: 2 additions & 2 deletions src/components/ec/base/ucc_ec_base.h
@@ -1,5 +1,5 @@
/**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
@@ -79,7 +79,7 @@ typedef struct ucc_ee_executor_params {

/* Maximum number of buffers for UCC_EE_EXECUTOR_TASK_REDUCE_MULTI_DST and
UCC_EE_EXECUTOR_TASK_COPY_MULTI operations */
-#define UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS 6
+#define UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS 7

/* Reduces "n_srcs" buffers (each contains "count" elements of type "dt")
into "dst" buffer.
43 changes: 41 additions & 2 deletions src/components/ec/cuda/ec_cuda.c
@@ -50,6 +50,11 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
ucc_offsetof(ucc_ec_cuda_config_t, exec_num_streams),
UCC_CONFIG_TYPE_ULUNITS},

{"EXEC_COPY_LARGE_THRESH", "1M",
"Single memcopy size to switch from kernel copy to cudaMemcpy",
ucc_offsetof(ucc_ec_cuda_config_t, exec_copy_thresh),
UCC_CONFIG_TYPE_MEMUNITS},

{"REDUCE_NUM_BLOCKS", "auto",
"Number of thread blocks to use for reduction in interruptible mode",
ucc_offsetof(ucc_ec_cuda_config_t, reduce_num_blocks),
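
The new EXEC_COPY_LARGE_THRESH knob (presumably exposed as UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH, following the component's usual env-var prefix) only matters for interruptible COPY_MULTI tasks. A rough sketch of the dispatch it drives, mirroring the branch added further down in ec_cuda_executor_interruptible.c:

/* Illustration only, not a public API: the CUDA-graph/cudaMemcpy path is
 * taken when there are more than two vectors and the first one exceeds the
 * threshold; everything else stays on the batched copy kernel. */
static int use_graph_copy(const ucc_ee_executor_task_args_t *args,
                          unsigned long exec_copy_thresh)
{
    return (args->copy_multi.counts[0] > exec_copy_thresh) &&
           (args->copy_multi.num_vectors > 2);
}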
@@ -146,6 +151,40 @@ static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = {
.obj_cleanup = ucc_ec_cuda_event_cleanup,
};

static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
{
ucc_ec_cuda_executor_interruptible_task_t *task =
(ucc_ec_cuda_executor_interruptible_task_t *) obj;
cudaGraphNode_t memcpy_node;
int i;

CUDA_FUNC(cudaGraphCreate(&task->graph, 0));
for (i = 0; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) {
CUDA_FUNC(
cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0,
(void*)1, (void*)1, 1, cudaMemcpyDefault));
}

CUDA_FUNC(
cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0));
}

static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
{
ucc_ec_cuda_executor_interruptible_task_t *task =
(ucc_ec_cuda_executor_interruptible_task_t *) obj;

CUDA_FUNC(cudaGraphExecDestroy(task->graph_exec));
CUDA_FUNC(cudaGraphDestroy(task->graph));
}

static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = {
.chunk_alloc = ucc_mpool_hugetlb_malloc,
.chunk_release = ucc_mpool_hugetlb_free,
.obj_init = ucc_ec_cuda_graph_init,
.obj_cleanup = ucc_ec_cuda_graph_cleanup,
};
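
The two pool hooks above implement a build-once/patch-per-post pattern: each pooled task gets a CUDA graph with a fixed number of placeholder 1-byte memcpy nodes, instantiated a single time, so posting a task later only has to update node parameters and launch. A standalone sketch of that pattern in plain CUDA (not UCC code; needs CUDA 11.4+ for cudaGraphInstantiateWithFlags, error checking omitted for brevity):

#include <cuda_runtime.h>
#include <stdio.h>

#define N_NODES 4

int main(void)
{
    cudaGraph_t     graph;
    cudaGraphExec_t graph_exec;
    cudaGraphNode_t node, nodes[N_NODES];
    size_t          num_nodes = N_NODES;
    cudaStream_t    stream;
    char           *src[N_NODES], *dst[N_NODES];
    size_t          len = 1 << 20;
    int             i;

    cudaStreamCreate(&stream);
    for (i = 0; i < N_NODES; i++) {
        cudaMalloc((void **)&src[i], len);
        cudaMalloc((void **)&dst[i], len);
    }

    /* Build once: dummy 1-byte copies fix the node count of the graph. */
    cudaGraphCreate(&graph, 0);
    for (i = 0; i < N_NODES; i++) {
        cudaGraphAddMemcpyNode1D(&node, graph, NULL, 0, dst[0], src[0], 1,
                                 cudaMemcpyDefault);
    }
    cudaGraphInstantiateWithFlags(&graph_exec, graph, 0);

    /* Per post: patch src/dst/size of every node in the executable graph,
     * then launch the whole batch with a single call. */
    cudaGraphGetNodes(graph, nodes, &num_nodes);
    for (i = 0; i < (int)num_nodes; i++) {
        cudaGraphExecMemcpyNodeSetParams1D(graph_exec, nodes[i], dst[i],
                                           src[i], len, cudaMemcpyDefault);
    }
    cudaGraphLaunch(graph_exec, stream);
    cudaStreamSynchronize(stream);
    printf("launched %zu memcpy nodes as one graph\n", num_nodes);

    for (i = 0; i < N_NODES; i++) {
        cudaFree(src[i]);
        cudaFree(dst[i]);
    }
    cudaGraphExecDestroy(graph_exec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    return 0;
}

Compared to issuing the copies as separate cudaMemcpyAsync calls, the single graph launch amortizes per-call overhead, which is presumably why the new code only takes this path for sufficiently large copies.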

static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock)
{
if (*nt != UCC_ULUNITS_AUTO) {
@@ -243,8 +282,8 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
status = ucc_mpool_init(
&ucc_ec_cuda.executor_interruptible_tasks, 0,
sizeof(ucc_ec_cuda_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE,
-16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE,
-"interruptible executor tasks");
+16, UINT_MAX, &ucc_ec_cuda_interruptible_task_mpool_ops,
+UCC_THREAD_MULTIPLE, "interruptible executor tasks");
if (status != UCC_OK) {
ec_error(&ucc_ec_cuda.super, "failed to create interruptible tasks pool");
return status;
14 changes: 9 additions & 5 deletions src/components/ec/cuda/ec_cuda.h
@@ -1,5 +1,5 @@
/**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
@@ -14,6 +14,8 @@
#include <cuda_runtime.h>

#define WARP_SIZE 32
+#define MAX_SUBTASKS 12

typedef enum ucc_ec_cuda_strm_task_mode {
UCC_EC_CUDA_TASK_KERNEL,
UCC_EC_CUDA_TASK_MEM_OPS,
@@ -48,6 +50,7 @@ typedef struct ucc_ec_cuda_config {
unsigned long reduce_num_blocks;
int reduce_num_threads;
int use_cooperative_launch;
unsigned long exec_copy_thresh;
} ucc_ec_cuda_config_t;

typedef struct ucc_ec_cuda {
@@ -78,9 +81,10 @@ typedef struct ucc_ec_cuda_stream_request {
typedef struct ucc_ec_cuda_executor_interruptible_task {
ucc_ee_executor_task_t super;
void *event;
cudaGraph_t graph;
cudaGraphExec_t graph_exec;
} ucc_ec_cuda_executor_interruptible_task_t;

-#define MAX_SUBTASKS 12
typedef struct ucc_ec_cuda_executor_persistent_task {
ucc_ee_executor_task_t super;
int num_subtasks;
@@ -133,9 +137,9 @@ extern ucc_ec_cuda_t ucc_ec_cuda;
ucc_ec_cuda.stream_initialized = 1; \
} \
ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); \
if (ucc_unlikely(cudaSuccess != cuda_st)) { \
return cuda_error_to_ucc_status(cuda_st); \
} \
} \
} while(0)

57 changes: 51 additions & 6 deletions src/components/ec/cuda/ec_cuda_executor_interruptible.c
@@ -1,5 +1,5 @@
/**
- * Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
@@ -54,7 +54,11 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
{
cudaStream_t stream = NULL;
ucc_ec_cuda_executor_interruptible_task_t *ee_task;
ucc_status_t status;
cudaGraphNode_t nodes[UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS];
size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS;
int i;


status = ucc_cuda_executor_interruptible_get_stream(&stream);
if (ucc_unlikely(status != UCC_OK)) {
@@ -85,10 +89,51 @@
}
break;
case UCC_EE_EXECUTOR_TASK_COPY_MULTI:
-status = ucc_ec_cuda_copy_multi_kernel(task_args, stream);
-if (ucc_unlikely(status != UCC_OK)) {
-ec_error(&ucc_ec_cuda.super, "failed to start copy multi op");
-goto free_task;
if ((task_args->copy_multi.counts[0] > EC_CUDA_CONFIG->exec_copy_thresh) &&
(task_args->copy_multi.num_vectors > 2)) {
status = CUDA_FUNC(cudaGraphGetNodes(ee_task->graph, nodes,
&num_nodes));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to get graph nodes");
goto free_task;
}
for (i = 0; i < task_args->copy_multi.num_vectors; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
task_args->copy_multi.dst[i],
task_args->copy_multi.src[i],
task_args->copy_multi.counts[i],
cudaMemcpyDefault));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}
}

Collaborator: nit: zero-length operations are not supported, so maybe skip when counts[i]=0
Contributor Author: can't skip it since all nodes of the graphs should be valid

for (; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
task_args->copy_multi.dst[0],
task_args->copy_multi.src[0],
1, cudaMemcpyDefault));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}
}

status = CUDA_FUNC(cudaGraphLaunch(ee_task->graph_exec, stream));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}

} else {
status = ucc_ec_cuda_copy_multi_kernel(task_args, stream);
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to start copy multi op");
goto free_task;
}
}
break;
case UCC_EE_EXECUTOR_TASK_REDUCE:
3 changes: 2 additions & 1 deletion src/components/tl/cuda/allgather/allgather_linear.c
@@ -15,7 +15,8 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo))) {
if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
return UCC_ERR_NOT_SUPPORTED;
}
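
For context (an inference from this guard, not spelled out in the PR): the linear allgather appears to fan data out to team_size - 1 peers through a single COPY_MULTI task, so the supported team size is bounded by the executor's buffer limit. Bumping UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS from 6 to 7 lets the check pass for a fully connected 8-GPU team:

/* Worked example with hypothetical numbers for one 8-GPU node. */
int team_size   = 8;
int peer_copies = team_size - 1;                                    /* 7       */
int supported   = peer_copies <= UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; /* 7 <= 7  */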
