Skip to content

Commit

Permalink
TOOLS: use separate cuda alloc in perftest (#872)
Browse files Browse the repository at this point in the history
* TOOLS: use separate cuda alloc in perftest

* REVIEW: fix review comments
  • Loading branch information
Sergei-Lebedev authored Jan 3, 2024
1 parent 7e22dde commit e038288
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 0 deletions.
59 changes: 59 additions & 0 deletions tools/perf/ucc_pt_coll.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,56 @@
*/

#include "ucc_pt_coll.h"
#include "ucc_pt_cuda.h"
#include "utils/ucc_malloc.h"

ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len,
ucc_memory_type_t mem_type)
{
ucc_status_t status;
int cuda_st;

switch (mem_type) {
case UCC_MEMORY_TYPE_CUDA:
*h_ptr = new ucc_mc_buffer_header_t;
(*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA;
cuda_st = ucc_pt_cudaMalloc(&((*h_ptr)->addr), len);
if (cuda_st != 0) {
return UCC_ERR_NO_MEMORY;
}
cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len);
if (cuda_st != 0) {
ucc_pt_cudaFree((*h_ptr)->addr);
delete *h_ptr;
return UCC_ERR_NO_MEMORY;
}
return UCC_OK;
case UCC_MEMORY_TYPE_CUDA_MANAGED:
*h_ptr = new ucc_mc_buffer_header_t;
(*h_ptr)->mt = UCC_MEMORY_TYPE_CUDA_MANAGED;
cuda_st = ucc_pt_cudaMallocManaged(&((*h_ptr)->addr), len);
if (cuda_st != 0) {
return UCC_ERR_NO_MEMORY;
}
cuda_st = ucc_pt_cudaMemset((*h_ptr)->addr, 0, len);
if (cuda_st != 0) {
ucc_pt_cudaFree((*h_ptr)->addr);
delete *h_ptr;
return UCC_ERR_NO_MEMORY;
}
return UCC_OK;
case UCC_MEMORY_TYPE_HOST:
*h_ptr = new ucc_mc_buffer_header_t;
(*h_ptr)->mt = UCC_MEMORY_TYPE_HOST;
(*h_ptr)->addr = ucc_malloc(len, "perftest data");
if (!((*h_ptr)->addr)) {
return UCC_ERR_NO_MEMORY;
}
memset((*h_ptr)->addr, 0, len);
return UCC_OK;
default:
break;
}

status = ucc_mc_alloc(h_ptr, len, mem_type);
if (status != UCC_OK) {
Expand All @@ -26,6 +71,20 @@ ucc_status_t ucc_pt_alloc(ucc_mc_buffer_header_t **h_ptr, size_t len,

ucc_status_t ucc_pt_free(ucc_mc_buffer_header_t *h_ptr)
{
switch (h_ptr->mt) {
case UCC_MEMORY_TYPE_CUDA:
case UCC_MEMORY_TYPE_CUDA_MANAGED:
ucc_pt_cudaFree(h_ptr->addr);
delete h_ptr;
return UCC_OK;
case UCC_MEMORY_TYPE_HOST:
ucc_free(h_ptr->addr);
delete h_ptr;
return UCC_OK;
default:
break;
}

return ucc_mc_free(h_ptr);
}

Expand Down
4 changes: 4 additions & 0 deletions tools/perf/ucc_pt_cuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ void ucc_pt_cuda_init(void)
LOAD_CUDA_SYM("cudaGetErrorString", getErrorString);
LOAD_CUDA_SYM("cudaStreamCreateWithFlags", streamCreateWithFlags);
LOAD_CUDA_SYM("cudaStreamDestroy", streamDestroy);
LOAD_CUDA_SYM("cudaMalloc", cudaMalloc);
LOAD_CUDA_SYM("cudaFree", cudaFree);
LOAD_CUDA_SYM("cudaMemset", cudaMemset);
LOAD_CUDA_SYM("cudaMallocManaged", cudaMallocManaged);

ucc_pt_cuda_iface.available = 1;
}
42 changes: 42 additions & 0 deletions tools/perf/ucc_pt_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

#define cudaSuccess 0
#define cudaStreamNonBlocking 0x01 /**< Stream does not synchronize with stream 0 (the NULL stream) */
#define cudaMemAttachGlobal 0x01 /**< Memory can be accessed by any stream on any device*/
typedef struct CUStream_st *cudaStream_t;

#define STR(x) #x
Expand All @@ -31,6 +32,10 @@ typedef struct ucc_pt_cuda_iface {
int (*streamCreateWithFlags)(cudaStream_t *stream, unsigned int flags);
int (*streamDestroy)(cudaStream_t stream);
char* (*getErrorString)(int err);
int (*cudaMalloc)(void **devptr, size_t size);
int (*cudaMallocManaged)(void **ptr, size_t size, unsigned int flags);
int (*cudaFree)(void *devptr);
int (*cudaMemset)(void *devptr, int value, size_t count);
} ucc_pt_cuda_iface_t;

extern ucc_pt_cuda_iface_t ucc_pt_cuda_iface;
Expand Down Expand Up @@ -74,4 +79,41 @@ static inline int ucc_pt_cudaStreamDestroy(cudaStream_t stream)
return 0;
}

static inline int ucc_pt_cudaMalloc(void **devptr, size_t size)
{
if (!ucc_pt_cuda_iface.available) {
return 1;
}
CUDA_CHECK(ucc_pt_cuda_iface.cudaMalloc(devptr, size));
return 0;
}

static inline int ucc_pt_cudaMallocManaged(void **ptr, size_t size)
{
if (!ucc_pt_cuda_iface.available) {
return 1;
}
CUDA_CHECK(ucc_pt_cuda_iface.cudaMallocManaged(ptr, size,
cudaMemAttachGlobal));
return 0;
}

static inline int ucc_pt_cudaFree(void *devptr)
{
if (!ucc_pt_cuda_iface.available) {
return 1;
}
CUDA_CHECK(ucc_pt_cuda_iface.cudaFree(devptr));
return 0;
}

static inline int ucc_pt_cudaMemset(void *devptr, int value, size_t count)
{
if (!ucc_pt_cuda_iface.available) {
return 1;
}
CUDA_CHECK(ucc_pt_cuda_iface.cudaMemset(devptr, value, count));
return 0;
}

#endif

0 comments on commit e038288

Please sign in to comment.