REVIEW: fix review comments

openucx · Mar 28, 2023 · 0ed5edd
1 parent 3eab731
commit 0ed5edd
Show file tree

Hide file tree

Showing 8 changed files with 25 additions and 18 deletions.
diff --git a/src/components/ec/base/ucc_ec_base.h b/src/components/ec/base/ucc_ec_base.h
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

diff --git a/src/components/ec/cuda/ec_cuda.c b/src/components/ec/cuda/ec_cuda.c
@@ -51,7 +51,7 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
      UCC_CONFIG_TYPE_ULUNITS},
 
     {"EXEC_COPY_LARGE_THRESH", "1M",
-     "Memcopy size to switch from kernel copy to cudaMemcpy",
+     "Single memcopy size to switch from kernel copy to cudaMemcpy",
      ucc_offsetof(ucc_ec_cuda_config_t, exec_copy_thresh),
      UCC_CONFIG_TYPE_MEMUNITS},
 
@@ -164,9 +164,9 @@ static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NO
             cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0,
                                      (void*)1, (void*)1, 1, cudaMemcpyDefault));
     }
+
     CUDA_FUNC(
-        cudaGraphInstantiate(&task->graph_exec, task->graph, NULL,
-                             NULL, 0));
+        cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0));
 }
 
 static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused

diff --git a/src/components/ec/cuda/ec_cuda.h b/src/components/ec/cuda/ec_cuda.h
@@ -14,6 +14,8 @@
 #include <cuda_runtime.h>
 
 #define WARP_SIZE 32
+#define MAX_SUBTASKS 12
+
 typedef enum ucc_ec_cuda_strm_task_mode {
     UCC_EC_CUDA_TASK_KERNEL,
     UCC_EC_CUDA_TASK_MEM_OPS,
@@ -76,7 +78,6 @@ typedef struct ucc_ec_cuda_stream_request {
     cudaStream_t        stream;
 } ucc_ec_cuda_stream_request_t;
 
-#define MAX_SUBTASKS 12
 typedef struct ucc_ec_cuda_executor_interruptible_task {
     ucc_ee_executor_task_t  super;
     void                   *event;

diff --git a/src/components/ec/cuda/ec_cuda_executor_interruptible.c b/src/components/ec/cuda/ec_cuda_executor_interruptible.c
@@ -91,7 +91,12 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
     case UCC_EE_EXECUTOR_TASK_COPY_MULTI:
         if ((task_args->copy_multi.counts[0] > EC_CUDA_CONFIG->exec_copy_thresh) &&
             (task_args->copy_multi.num_vectors > 2)) {
-            cudaGraphGetNodes(ee_task->graph, nodes, &num_nodes);
+            status = CUDA_FUNC(cudaGraphGetNodes(ee_task->graph, nodes,
+                                                 &num_nodes));
+            if (ucc_unlikely(status != UCC_OK)) {
+                ec_error(&ucc_ec_cuda.super, "failed to get graph nodes");
+                goto free_task;
+            }
             for (i = 0; i < task_args->copy_multi.num_vectors; i++) {
                 status = CUDA_FUNC(
                     cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
@@ -181,10 +186,6 @@ ucc_cuda_executor_interruptible_task_finalize(ucc_ee_executor_task_t *task)
 
     ucc_assert(task->status == UCC_OK);
     status = ucc_ec_cuda_event_destroy(ee_task->event);
-    // if (ee_task->graph) {
-    //     cudaGraphExecDestroy(ee_task->graph_exec);
-    //     cudaGraphDestroy(ee_task->graph);
-    // }
     ucc_mpool_put(task);
     return status;
 }

diff --git a/src/components/tl/cuda/allgather/allgather_linear.c b/src/components/tl/cuda/allgather/allgather_linear.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

diff --git a/src/components/tl/cuda/allgatherv/allgatherv_linear.c b/src/components/tl/cuda/allgatherv/allgatherv_linear.c
@@ -157,7 +157,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
     ucc_status_t            st;
     int                     step, i;
     void *                  sbuf, *dbuf;
-    ucc_rank_t              peer;//, nv;
+    ucc_rank_t              peer;
     size_t send_size, frag_size, frag_offset, local_offset, remote_offset,
         scratch_offset, rank_offset;
     ucc_ee_executor_task_args_t eargs;
@@ -191,7 +191,6 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
             continue;
         }
 
-
         for (i = 0; i < tsize; i++) {
             if (get_rank_step(task, i, 0) < step) {
                 return UCC_INPROGRESS;
@@ -222,7 +221,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
             }
             eargs.task_type =  UCC_EE_EXECUTOR_TASK_COPY_MULTI;
             for (i = 0; i < tsize - 1; i++) {
-                peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+                peer = (trank + i + 1) % tsize;
                 scratch_offset = get_scratch_offset(team, dt, trank);
                 dbuf           = PTR_OFFSET(TASK_SCRATCH(task, peer),
                                   remote_offset + scratch_offset);
@@ -252,7 +251,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
         } else if (step == (num_steps - 1)) {
             eargs.task_type =  UCC_EE_EXECUTOR_TASK_COPY_MULTI;
             for (i = 0; i < tsize - 1; i++) {
-                peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+                peer = (trank + i + 1) % tsize;
                 scratch_offset = get_scratch_offset(team, dt, peer);
                 rank_offset =
                     task->allgatherv_linear.get_offset(task, peer) * dt_size;
@@ -290,7 +289,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
             scratch_offset = get_scratch_offset(team, dt, trank);
             eargs.task_type =  UCC_EE_EXECUTOR_TASK_COPY_MULTI;
             for (i = 0; i < tsize - 1; i++) {
-                peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+                peer = (trank + i + 1) % tsize;
                 dbuf = PTR_OFFSET(TASK_SCRATCH(task, peer),
                                   remote_offset + scratch_offset);
                 eargs.copy_multi.src[i]    = sbuf;
@@ -306,7 +305,7 @@ ucc_tl_cuda_allgatherv_linear_progress_frag(ucc_tl_cuda_task_t *task)
 
             eargs.task_type =  UCC_EE_EXECUTOR_TASK_COPY_MULTI;
             for (i = 0; i < tsize - 1; i++) {
-                peer = (trank + i + 1) % UCC_TL_TEAM_SIZE(team);
+                peer = (trank + i + 1) % tsize;
                 scratch_offset = get_scratch_offset(team, dt, peer);
                 rank_offset =
                     task->allgatherv_linear.get_offset(task, peer) * dt_size;

diff --git a/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c b/src/components/tl/cuda/reduce_scatter/reduce_scatter_linear.c
@@ -1,5 +1,5 @@
 /**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
  *
  * See file LICENSE for terms.
  */

diff --git a/tools/perf/ucc_pt_benchmark.cc b/tools/perf/ucc_pt_benchmark.cc
@@ -1,3 +1,9 @@
+/**
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ *
+ * See file LICENSE for terms.
+ */
+
 #include <iomanip>
 #include "ucc_pt_benchmark.h"
 #include "components/mc/ucc_mc.h"