Skip to content

Commit

Permalink
REVIEW: fix review comments
Browse files: browse the repository at this point in the history
  • Loading branch information
Sergei-Lebedev committed Mar 28, 2023
1 parent 3eab731 commit 0ed5edd
Show file tree
Hide file tree
Showing 8 changed files with 25 additions and 18 deletions.
2 changes: 1 addition & 1 deletion src/components/ec/base/ucc_ec_base.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down
6 changes: 3 additions & 3 deletions src/components/ec/cuda/ec_cuda.c
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
UCC_CONFIG_TYPE_ULUNITS},

{"EXEC_COPY_LARGE_THRESH", "1M",
"Memcopy size to switch from kernel copy to cudaMemcpy",
"Single memcopy size to switch from kernel copy to cudaMemcpy",
ucc_offsetof(ucc_ec_cuda_config_t, exec_copy_thresh),
UCC_CONFIG_TYPE_MEMUNITS},

Expand Down Expand Up @@ -164,9 +164,9 @@ static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NO
cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0,
(void*)1, (void*)1, 1, cudaMemcpyDefault));
}

CUDA_FUNC(
cudaGraphInstantiate(&task->graph_exec, task->graph, NULL,
NULL, 0));
cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0));
}

static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
Expand Down
3 changes: 2 additions & 1 deletion src/components/ec/cuda/ec_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
#include <cuda_runtime.h>

#define WARP_SIZE 32
#define MAX_SUBTASKS 12

typedef enum ucc_ec_cuda_strm_task_mode {
UCC_EC_CUDA_TASK_KERNEL,
UCC_EC_CUDA_TASK_MEM_OPS,
Expand Down Expand Up @@ -76,7 +78,6 @@ typedef struct ucc_ec_cuda_stream_request {
cudaStream_t stream;
} ucc_ec_cuda_stream_request_t;

#define MAX_SUBTASKS 12
typedef struct ucc_ec_cuda_executor_interruptible_task {
ucc_ee_executor_task_t super;
void *event;
Expand Down
11 changes: 6 additions & 5 deletions src/components/ec/cuda/ec_cuda_executor_interruptible.c
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,12 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
case UCC_EE_EXECUTOR_TASK_COPY_MULTI:
if ((task_args->copy_multi.counts[0] > EC_CUDA_CONFIG->exec_copy_thresh) &&
(task_args->copy_multi.num_vectors > 2)) {
cudaGraphGetNodes(ee_task->graph, nodes, &num_nodes);
status = CUDA_FUNC(cudaGraphGetNodes(ee_task->graph, nodes,
&num_nodes));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to get graph nodes");
goto free_task;
}
for (i = 0; i < task_args->copy_multi.num_vectors; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
Expand Down Expand Up @@ -181,10 +186,6 @@ ucc_cuda_executor_interruptible_task_finalize(ucc_ee_executor_task_t *task)

ucc_assert(task->status == UCC_OK);
status = ucc_ec_cuda_event_destroy(ee_task->event);
// if (ee_task->graph) {
// cudaGraphExecDestroy(ee_task->graph_exec);
// cudaGraphDestroy(ee_task->graph);
// }
ucc_mpool_put(task);
return status;
}
Expand Down
2 changes: 1 addition & 1 deletion src/components/tl/cuda/allgather/allgather_linear.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down
11 changes: 5 additions & 6 deletions src/components/tl/cuda/allgatherv/allgatherv_linear.c
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
ucc_status_t st;
int step, i;
void * sbuf, *dbuf;
ucc_rank_t peer;//, nv;
ucc_rank_t peer;
size_t send_size, frag_size, frag_offset, local_offset, remote_offset,
scratch_offset, rank_offset;
ucc_ee_executor_task_args_t eargs;
Expand Down Expand Up @@ -191,7 +191,6 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
continue;
}


for (i = 0; i < tsize; i++) {
if (get_rank_step(task, i, 0) < step) {
return UCC_INPROGRESS;
Expand Down Expand Up @@ -222,7 +221,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
}
eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
for (i = 0; i < tsize - 1; i++) {
peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
peer = (trank + i + 1) % tsize;
scratch_offset = get_scratch_offset(team, dt, trank);
dbuf = PTR_OFFSET(TASK_SCRATCH(task, peer),
remote_offset + scratch_offset);
Expand Down Expand Up @@ -252,7 +251,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
} else if (step == (num_steps - 1)) {
eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
for (i = 0; i < tsize - 1; i++) {
peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
peer = (trank + i + 1) % tsize;
scratch_offset = get_scratch_offset(team, dt, peer);
rank_offset =
task->allgatherv_linear.get_offset(task, peer) * dt_size;
Expand Down Expand Up @@ -290,7 +289,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
scratch_offset = get_scratch_offset(team, dt, trank);
eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
for (i = 0; i < tsize - 1; i++) {
peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
peer = (trank + i + 1) % tsize;
dbuf = PTR_OFFSET(TASK_SCRATCH(task, peer),
remote_offset + scratch_offset);
eargs.copy_multi.src[i] = sbuf;
Expand All @@ -306,7 +305,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)

eargs.task_type = UCC_EE_EXECUTOR_TASK_COPY_MULTI;
for (i = 0; i < tsize - 1; i++) {
peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
peer = (trank + i + 1) % tsize;
scratch_offset = get_scratch_offset(team, dt, peer);
rank_offset =
task->allgatherv_linear.get_offset(task, peer) * dt_size;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
Expand Down
6 changes: 6 additions & 0 deletions tools/perf/ucc_pt_benchmark.cc
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
/**
* Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/

#include <iomanip>
#include "ucc_pt_benchmark.h"
#include "components/mc/ucc_mc.h"
Expand Down

0 comments on commit 0ed5edd

Please sign in to comment.