diff --git a/src/components/ec/base/ucc_ec_base.h b/src/components/ec/base/ucc_ec_base.h
index 5e91c57865..52a2427318 100644
--- a/src/components/ec/base/ucc_ec_base.h
+++ b/src/components/ec/base/ucc_ec_base.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
diff --git a/src/components/ec/cuda/ec_cuda.c b/src/components/ec/cuda/ec_cuda.c
index 9b149e4f41..2357023fed 100644
--- a/src/components/ec/cuda/ec_cuda.c
+++ b/src/components/ec/cuda/ec_cuda.c
@@ -51,7 +51,7 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
      UCC_CONFIG_TYPE_ULUNITS},
 
     {"EXEC_COPY_LARGE_THRESH", "1M",
-     "Memcopy size to switch from kernel copy to cudaMemcpy",
+     "Single memcopy size to switch from kernel copy to cudaMemcpy",
      ucc_offsetof(ucc_ec_cuda_config_t, exec_copy_thresh),
      UCC_CONFIG_TYPE_MEMUNITS},
 
@@ -164,9 +164,9 @@ static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
             cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0,
                                      (void*)1, (void*)1, 1, cudaMemcpyDefault));
     }
+
     CUDA_FUNC(
-        cudaGraphInstantiate(&task->graph_exec, task->graph, NULL,
-                             NULL, 0));
+        cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0));
 }
 
 static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
diff --git a/src/components/ec/cuda/ec_cuda.h b/src/components/ec/cuda/ec_cuda.h
index 012ba65a15..a8310303ab 100644
--- a/src/components/ec/cuda/ec_cuda.h
+++ b/src/components/ec/cuda/ec_cuda.h
@@ -14,6 +14,8 @@
 #include <cuda_runtime.h>
 
 #define WARP_SIZE 32
+#define MAX_SUBTASKS 12
+
 typedef enum ucc_ec_cuda_strm_task_mode {
     UCC_EC_CUDA_TASK_KERNEL,
     UCC_EC_CUDA_TASK_MEM_OPS,
@@ -76,7 +78,6 @@ typedef struct ucc_ec_cuda_stream_request {
     cudaStream_t stream;
 } ucc_ec_cuda_stream_request_t;
 
-#define MAX_SUBTASKS 12
 typedef struct ucc_ec_cuda_executor_interruptible_task {
     ucc_ee_executor_task_t  super;
     void                   *event;
diff --git a/src/components/ec/cuda/ec_cuda_executor_interruptible.c b/src/components/ec/cuda/ec_cuda_executor_interruptible.c
index b3d2e68b68..74cc80b96e 100644
--- a/src/components/ec/cuda/ec_cuda_executor_interruptible.c
+++ b/src/components/ec/cuda/ec_cuda_executor_interruptible.c
@@ -91,7 +91,12 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
     case UCC_EE_EXECUTOR_TASK_COPY_MULTI:
         if ((task_args->copy_multi.counts[0] > EC_CUDA_CONFIG->exec_copy_thresh) &&
             (task_args->copy_multi.num_vectors > 2)) {
-            cudaGraphGetNodes(ee_task->graph, nodes, &num_nodes);
+            status = CUDA_FUNC(cudaGraphGetNodes(ee_task->graph, nodes,
+                                                 &num_nodes));
+            if (ucc_unlikely(status != UCC_OK)) {
+                ec_error(&ucc_ec_cuda.super, "failed to get graph nodes");
+                goto free_task;
+            }
             for (i = 0; i < task_args->copy_multi.num_vectors; i++) {
                 status = CUDA_FUNC(
                     cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
@@ -181,10 +186,6 @@ ucc_cuda_executor_interruptible_task_finalize(ucc_ee_executor_task_t *task)
 
     ucc_assert(task->status == UCC_OK);
     status = ucc_ec_cuda_event_destroy(ee_task->event);
-    // if (ee_task->graph) {
-    //     cudaGraphExecDestroy(ee_task->graph_exec);
-    //     cudaGraphDestroy(ee_task->graph);
-    // }
     ucc_mpool_put(task);
     return status;
 }
diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c
index 572035c6a5..b6c2bd5c62 100644
--- a/src/components/tl/cuda/allgather/allgather_linear.c
+++ b/src/components/tl/cuda/allgather/allgather_linear.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
index c1600541a1..41a044a7c4 100644
--- a/src/components/tl/cuda/allgatherv/allgatherv_linear.c
+++ b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
@@ -157,7 +157,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
     ucc_status_t st;
     int          step, i;
     void        *sbuf, *dbuf;
-    ucc_rank_t   peer;//, nv;
+    ucc_rank_t   peer;
     size_t send_size, frag_size, frag_offset, local_offset,
            remote_offset, scratch_offset, rank_offset;
     ucc_ee_executor_task_args_t eargs;
@@ -191,7 +191,6 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
             continue;
         }
 
-
         for (i = 0; i < tsize; i++) {
             if (get_rank_step(task, i, 0) < step) {
                 return UCC_INPROGRESS;
@@ -222,7 +221,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
         }
         eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
         for (i = 0; i < tsize - 1; i++) {
-            peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+            peer = (trank + i + 1) % tsize;
             scratch_offset = get_scratch_offset(team, dt, trank);
             dbuf = PTR_OFFSET(TASK_SCRATCH(task, peer),
                               remote_offset + scratch_offset);
@@ -252,7 +251,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
     } else if (step == (num_steps - 1)) {
         eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
         for (i = 0; i < tsize - 1; i++) {
-            peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+            peer = (trank + i + 1) % tsize;
             scratch_offset = get_scratch_offset(team, dt, peer);
             rank_offset = task->allgatherv_linear.get_offset(task, peer) *
                           dt_size;
@@ -290,7 +289,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
         scratch_offset = get_scratch_offset(team, dt, trank);
         eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
         for (i = 0; i < tsize - 1; i++) {
-            peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+            peer = (trank + i + 1) % tsize;
             dbuf = PTR_OFFSET(TASK_SCRATCH(task, peer),
                               remote_offset + scratch_offset);
             eargs.copy_multi.src[i] = sbuf;
@@ -306,7 +305,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
 
         eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
         for (i = 0; i < tsize - 1; i++) {
-            peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+            peer = (trank + i + 1) % tsize;
             scratch_offset = get_scratch_offset(team, dt, peer);
             rank_offset = task->allgatherv_linear.get_offset(task, peer) *
                           dt_size;
diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
index 8635aa3760..f76dfc4771 100644
--- a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
+++ b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */
diff --git a/tools/perf/ucc_pt_benchmark.cc b/tools/perf/ucc_pt_benchmark.cc
index be04ff5088..80ec28dad2 100644
--- a/tools/perf/ucc_pt_benchmark.cc
+++ b/tools/perf/ucc_pt_benchmark.cc
@@ -1,3 +1,9 @@
+/**
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
 #include
 #include "ucc_pt_benchmark.h"
 #include "components/mc/ucc_mc.h"
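
Review note on the EXEC_COPY_LARGE_THRESH wording change: the threshold is compared against the size of an individual copy (copy_multi.counts[0] in the check above), not the combined size of all vectors, which the new "Single memcopy size" description makes explicit.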
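On the ucc_ec_cuda_graph_init change: CUDA 12 replaces the five-argument cudaGraphInstantiate(&exec, graph, pErrorNode, pLogBuffer, bufferSize) signature, while cudaGraphInstantiateWithFlags has been available since CUDA 11.4, so switching keeps the pool initializer building on both toolkit generations. Below is a minimal standalone sketch of the same pattern, outside UCC's mpool machinery; graph_pool_elem_init and N_PLACEHOLDERS are illustrative names, not UCC identifiers.

/* Standalone sketch (not UCC code): pre-build a graph with placeholder
 * 1D memcpy nodes, then instantiate it once. */
#include <cuda_runtime.h>
#include <stdio.h>

#define N_PLACEHOLDERS 12 /* mirrors MAX_SUBTASKS in the diff */

static int graph_pool_elem_init(cudaGraph_t *graph, cudaGraphExec_t *exec)
{
    cudaGraphNode_t node;
    cudaError_t     st;
    int             i;

    st = cudaGraphCreate(graph, 0);
    if (st != cudaSuccess) {
        goto err;
    }
    for (i = 0; i < N_PLACEHOLDERS; i++) {
        /* dummy (void*)1 pointers, as in the diff: the node only reserves
         * a slot; the real src/dst/size are patched in at post time */
        st = cudaGraphAddMemcpyNode1D(&node, *graph, NULL, 0, (void *)1,
                                      (void *)1, 1, cudaMemcpyDefault);
        if (st != cudaSuccess) {
            goto err;
        }
    }
    /* flags == 0; replaces the removed five-argument cudaGraphInstantiate */
    st = cudaGraphInstantiateWithFlags(exec, *graph, 0);
    if (st != cudaSuccess) {
        goto err;
    }
    return 0;
err:
    fprintf(stderr, "graph init failed: %s\n", cudaGetErrorString(st));
    return -1;
}

Instantiation is the costly step, so building MAX_SUBTASKS placeholder nodes once per pool element lets the executor retarget each node later with cudaGraphExecMemcpyNodeSetParams1D instead of re-instantiating the graph on every post.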
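The ec_cuda_executor_interruptible.c hunk stops ignoring the return value of cudaGraphGetNodes and routes failures to the existing free_task cleanup. A hedged sketch of that check-then-update pattern follows; CHECK_CUDA is a simplified stand-in for UCC's CUDA_FUNC (using the same GCC statement-expression style), and update_graph_copies plus the fixed 12-node bound are illustrative, not UCC code.

/* Sketch of checked graph-exec updates; CHECK_CUDA maps cudaError_t to an
 * int status (0 == OK) and logs on failure, loosely like UCC's CUDA_FUNC. */
#include <cuda_runtime.h>
#include <stdio.h>

#define CHECK_CUDA(_call)                                                 \
    ({                                                                    \
        cudaError_t _e = (_call);                                         \
        if (_e != cudaSuccess) {                                          \
            fprintf(stderr, "%s failed: %s\n", #_call,                    \
                    cudaGetErrorString(_e));                              \
        }                                                                 \
        (_e == cudaSuccess) ? 0 : -1;                                     \
    })

static int update_graph_copies(cudaGraph_t graph, cudaGraphExec_t exec,
                               void **dst, void **src, size_t *len, int n)
{
    cudaGraphNode_t nodes[12]; /* MAX_SUBTASKS-sized bound, as in the diff */
    size_t          num_nodes = 12;
    int             i;

    /* previously unchecked: on failure nodes[] stays uninitialized and the
     * SetParams1D loop below would operate on garbage handles */
    if (CHECK_CUDA(cudaGraphGetNodes(graph, nodes, &num_nodes)) != 0) {
        return -1; /* caller cleans up, as the diff's goto free_task does */
    }
    if ((size_t)n > num_nodes) {
        return -1; /* more copies requested than placeholder nodes */
    }
    for (i = 0; i < n; i++) {
        if (CHECK_CUDA(cudaGraphExecMemcpyNodeSetParams1D(
                exec, nodes[i], dst[i], src[i], len[i],
                cudaMemcpyDefault)) != 0) {
            return -1;
        }
    }
    return 0;
}

Propagating a status here matters because the subsequent exec-update calls depend on the node handles that cudaGraphGetNodes fills in; continuing past a failure would silently corrupt the posted task.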