TL/CUDA: fix linear algorithms #751

Merged
4 changes: 2 additions & 2 deletions src/components/ec/base/ucc_ec_base.h
@@ -1,5 +1,5 @@
/**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
@@ -79,7 +79,7 @@ typedef struct ucc_ee_executor_params {

/* Maximum number of buffers for UCC_EE_EXECUTOR_TASK_REDUCE_MULTI_DST and
UCC_EE_EXECUTOR_TASK_COPY_MULTI operations */
-#define UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS 6
+#define UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS 7

/* Reduces "n_srcs" buffers (each contains "count" elements of type "dt")
into "dst" buffer.
43 changes: 41 additions & 2 deletions src/components/ec/cuda/ec_cuda.c
@@ -50,6 +50,11 @@ static ucc_config_field_t ucc_ec_cuda_config_table[] = {
ucc_offsetof(ucc_ec_cuda_config_t, exec_num_streams),
UCC_CONFIG_TYPE_ULUNITS},

{"EXEC_COPY_LARGE_THRESH", "1M",
"Single memcopy size to switch from kernel copy to cudaMemcpy",
ucc_offsetof(ucc_ec_cuda_config_t, exec_copy_thresh),
UCC_CONFIG_TYPE_MEMUNITS},

{"REDUCE_NUM_BLOCKS", "auto",
"Number of thread blocks to use for reduction in interruptible mode",
ucc_offsetof(ucc_ec_cuda_config_t, reduce_num_blocks),
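
The new EXEC_COPY_LARGE_THRESH knob (presumably exposed as UCC_EC_CUDA_EXEC_COPY_LARGE_THRESH, following the component's usual env-var prefix) only matters for interruptible COPY_MULTI tasks. A rough sketch of the dispatch it drives, mirroring the branch added further down in ec_cuda_executor_interruptible.c:

/* Illustration only, not a public API: the CUDA-graph/cudaMemcpy path is
 * taken when there are more than two vectors and the first one exceeds the
 * threshold; everything else stays on the batched copy kernel. */
static int use_graph_copy(const ucc_ee_executor_task_args_t *args,
                          unsigned long exec_copy_thresh)
{
    return (args->copy_multi.counts[0] > exec_copy_thresh) &&
           (args->copy_multi.num_vectors > 2);
}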
@@ -146,6 +151,40 @@ static ucc_mpool_ops_t ucc_ec_cuda_event_mpool_ops = {
.obj_cleanup = ucc_ec_cuda_event_cleanup,
};

static void ucc_ec_cuda_graph_init(ucc_mpool_t *mp, void *obj, void *chunk) //NOLINT: mp is unused
{
ucc_ec_cuda_executor_interruptible_task_t *task =
(ucc_ec_cuda_executor_interruptible_task_t *) obj;
cudaGraphNode_t memcpy_node;
int i;

CUDA_FUNC(cudaGraphCreate(&task->graph, 0));
for (i = 0; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) {
CUDA_FUNC(
cudaGraphAddMemcpyNode1D(&memcpy_node, task->graph, NULL, 0,
(void*)1, (void*)1, 1, cudaMemcpyDefault));
}

CUDA_FUNC(
cudaGraphInstantiateWithFlags(&task->graph_exec, task->graph, 0));
}

static void ucc_ec_cuda_graph_cleanup(ucc_mpool_t *mp, void *obj) //NOLINT: mp is unused
{
ucc_ec_cuda_executor_interruptible_task_t *task =
(ucc_ec_cuda_executor_interruptible_task_t *) obj;

CUDA_FUNC(cudaGraphExecDestroy(task->graph_exec));
CUDA_FUNC(cudaGraphDestroy(task->graph));
}

static ucc_mpool_ops_t ucc_ec_cuda_interruptible_task_mpool_ops = {
.chunk_alloc = ucc_mpool_hugetlb_malloc,
.chunk_release = ucc_mpool_hugetlb_free,
.obj_init = ucc_ec_cuda_graph_init,
.obj_cleanup = ucc_ec_cuda_graph_cleanup,
};
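
The two pool hooks above implement a build-once/patch-per-post pattern: each pooled task gets a CUDA graph with a fixed number of placeholder 1-byte memcpy nodes, instantiated a single time, so posting a task later only has to update node parameters and launch. A standalone sketch of that pattern in plain CUDA (not UCC code; needs CUDA 11.4+ for cudaGraphInstantiateWithFlags, error checking omitted for brevity):

#include <cuda_runtime.h>
#include <stdio.h>

#define N_NODES 4

int main(void)
{
    cudaGraph_t     graph;
    cudaGraphExec_t graph_exec;
    cudaGraphNode_t node, nodes[N_NODES];
    size_t          num_nodes = N_NODES;
    cudaStream_t    stream;
    char           *src[N_NODES], *dst[N_NODES];
    size_t          len = 1 << 20;
    int             i;

    cudaStreamCreate(&stream);
    for (i = 0; i < N_NODES; i++) {
        cudaMalloc((void **)&src[i], len);
        cudaMalloc((void **)&dst[i], len);
    }

    /* Build once: dummy 1-byte copies fix the node count of the graph. */
    cudaGraphCreate(&graph, 0);
    for (i = 0; i < N_NODES; i++) {
        cudaGraphAddMemcpyNode1D(&node, graph, NULL, 0, dst[0], src[0], 1,
                                 cudaMemcpyDefault);
    }
    cudaGraphInstantiateWithFlags(&graph_exec, graph, 0);

    /* Per post: patch src/dst/size of every node in the executable graph,
     * then launch the whole batch with a single call. */
    cudaGraphGetNodes(graph, nodes, &num_nodes);
    for (i = 0; i < (int)num_nodes; i++) {
        cudaGraphExecMemcpyNodeSetParams1D(graph_exec, nodes[i], dst[i],
                                           src[i], len, cudaMemcpyDefault);
    }
    cudaGraphLaunch(graph_exec, stream);
    cudaStreamSynchronize(stream);
    printf("launched %zu memcpy nodes as one graph\n", num_nodes);

    for (i = 0; i < N_NODES; i++) {
        cudaFree(src[i]);
        cudaFree(dst[i]);
    }
    cudaGraphExecDestroy(graph_exec);
    cudaGraphDestroy(graph);
    cudaStreamDestroy(stream);
    return 0;
}

Compared to issuing the copies as separate cudaMemcpyAsync calls, the single graph launch amortizes per-call overhead, which is presumably why the new code only takes this path for sufficiently large copies.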

static inline void ucc_ec_cuda_set_threads_nbr(int *nt, int maxThreadsPerBlock)
{
if (*nt != UCC_ULUNITS_AUTO) {
@@ -243,8 +282,8 @@ static ucc_status_t ucc_ec_cuda_init(const ucc_ec_params_t *ec_params)
status = ucc_mpool_init(
&ucc_ec_cuda.executor_interruptible_tasks, 0,
sizeof(ucc_ec_cuda_executor_interruptible_task_t), 0, UCC_CACHE_LINE_SIZE,
-16, UINT_MAX, NULL, UCC_THREAD_MULTIPLE,
-"interruptible executor tasks");
+16, UINT_MAX, &ucc_ec_cuda_interruptible_task_mpool_ops,
+UCC_THREAD_MULTIPLE, "interruptible executor tasks");
if (status != UCC_OK) {
ec_error(&ucc_ec_cuda.super, "failed to create interruptible tasks pool");
return status;
14 changes: 9 additions & 5 deletions src/components/ec/cuda/ec_cuda.h
@@ -1,5 +1,5 @@
/**
- * Copyright (c) 2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
@@ -14,6 +14,8 @@
#include <cuda_runtime.h>

#define WARP_SIZE 32
+#define MAX_SUBTASKS 12

typedef enum ucc_ec_cuda_strm_task_mode {
UCC_EC_CUDA_TASK_KERNEL,
UCC_EC_CUDA_TASK_MEM_OPS,
@@ -48,6 +50,7 @@ typedef struct ucc_ec_cuda_config {
unsigned long reduce_num_blocks;
int reduce_num_threads;
int use_cooperative_launch;
unsigned long exec_copy_thresh;
} ucc_ec_cuda_config_t;

typedef struct ucc_ec_cuda {
@@ -78,9 +81,10 @@ typedef struct ucc_ec_cuda_stream_request {
typedef struct ucc_ec_cuda_executor_interruptible_task {
ucc_ee_executor_task_t super;
void *event;
cudaGraph_t graph;
cudaGraphExec_t graph_exec;
} ucc_ec_cuda_executor_interruptible_task_t;

-#define MAX_SUBTASKS 12
typedef struct ucc_ec_cuda_executor_persistent_task {
ucc_ee_executor_task_t super;
int num_subtasks;
@@ -133,9 +137,9 @@ extern ucc_ec_cuda_t ucc_ec_cuda;
ucc_ec_cuda.stream_initialized = 1; \
} \
ucc_spin_unlock(&ucc_ec_cuda.init_spinlock); \
if (ucc_unlikely(cudaSuccess != cuda_st)) { \
return cuda_error_to_ucc_status(cuda_st); \
} \
} \
} while(0)

57 changes: 51 additions & 6 deletions src/components/ec/cuda/ec_cuda_executor_interruptible.c
@@ -1,5 +1,5 @@
/**
- * Copyright (c) 2020-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* See file LICENSE for terms.
*/
@@ -54,7 +54,11 @@ ucc_cuda_executor_interruptible_task_post(ucc_ee_executor_t *executor,
{
cudaStream_t stream = NULL;
ucc_ec_cuda_executor_interruptible_task_t *ee_task;
ucc_status_t status;
cudaGraphNode_t nodes[UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS];
size_t num_nodes = UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS;
int i;


status = ucc_cuda_executor_interruptible_get_stream(&stream);
if (ucc_unlikely(status != UCC_OK)) {
@@ -85,10 +89,51 @@
}
break;
case UCC_EE_EXECUTOR_TASK_COPY_MULTI:
-status = ucc_ec_cuda_copy_multi_kernel(task_args, stream);
-if (ucc_unlikely(status != UCC_OK)) {
-ec_error(&ucc_ec_cuda.super, "failed to start copy multi op");
-goto free_task;
if ((task_args->copy_multi.counts[0] > EC_CUDA_CONFIG->exec_copy_thresh) &&
(task_args->copy_multi.num_vectors > 2)) {
status = CUDA_FUNC(cudaGraphGetNodes(ee_task->graph, nodes,
&num_nodes));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to get graph nodes");
goto free_task;
}
for (i = 0; i < task_args->copy_multi.num_vectors; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
task_args->copy_multi.dst[i],
task_args->copy_multi.src[i],
task_args->copy_multi.counts[i],
cudaMemcpyDefault));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}
}

Collaborator: nit: zero-length operations are not supported, so maybe skip when counts[i]=0
Contributor Author: can't skip it since all nodes of the graphs should be valid

for (; i < UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; i++) {
status = CUDA_FUNC(
cudaGraphExecMemcpyNodeSetParams1D(ee_task->graph_exec, nodes[i],
task_args->copy_multi.dst[0],
task_args->copy_multi.src[0],
1, cudaMemcpyDefault));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}
}

status = CUDA_FUNC(cudaGraphLaunch(ee_task->graph_exec, stream));
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to instantiate graph");
goto free_task;
}

} else {
status = ucc_ec_cuda_copy_multi_kernel(task_args, stream);
if (ucc_unlikely(status != UCC_OK)) {
ec_error(&ucc_ec_cuda.super, "failed to start copy multi op");
goto free_task;
}
}
break;
case UCC_EE_EXECUTOR_TASK_REDUCE:
3 changes: 2 additions & 1 deletion src/components/tl/cuda/allgather/allgather_linear.c
@@ -15,7 +15,8 @@ ucc_status_t ucc_tl_cuda_allgather_linear_init(ucc_base_coll_args_t *coll_args,
ucc_tl_cuda_task_t *task;
ucc_status_t status;

if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo))) {
if (ucc_unlikely(!ucc_tl_cuda_team_topo_is_fully_conntected(team->topo) ||
UCC_TL_TEAM_SIZE(team) - 1 > UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS)) {
return UCC_ERR_NOT_SUPPORTED;
}
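
For context (an inference from this guard, not spelled out in the PR): the linear allgather appears to fan data out to team_size - 1 peers through a single COPY_MULTI task, so the supported team size is bounded by the executor's buffer limit. Bumping UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS from 6 to 7 lets the check pass for a fully connected 8-GPU team:

/* Worked example with hypothetical numbers for one 8-GPU node. */
int team_size   = 8;
int peer_copies = team_size - 1;                                    /* 7       */
int supported   = peer_copies <= UCC_EE_EXECUTOR_MULTI_OP_NUM_BUFS; /* 7 <= 7  */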
