Skip to content

Commit

Permalink
Fixing OTF2 clock to use new timestamps.
Browse files Browse the repository at this point in the history
Also updated GPU example to create many streams.
  • Loading branch information
khuck committed Jul 28, 2020
1 parent 6286f90 commit 9bd65b9
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 8 deletions.
6 changes: 2 additions & 4 deletions src/apex/otf2_listener.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <mutex>
#include <chrono>
#include "apex_cxx_shared_lock.hpp"
#include "profiler.hpp"

namespace apex {

Expand All @@ -40,10 +41,7 @@ namespace apex {
/* All OTF2 callback functions have to be declared static, so that they
 * can be registered with the OTF2 library */
/* Timestamp callback: returns the current time, expressed relative to
 * globalOffset.
 *
 * The raw value is read from profiler::get_time_ns() (presumably
 * nanoseconds, going by the helper's name -- confirm in profiler.hpp)
 * rather than being computed locally from std::chrono::system_clock,
 * so that it comes from the same source as the profiler's own time values.
 * The earlier chrono-based initialisation of `stamp` has been dropped:
 * keeping both would declare `stamp` twice.
 *
 * NOTE(review): the subtraction is unsigned; it assumes globalOffset was
 * captured from the same clock and is not larger than the current reading.
 */
static OTF2_TimeStamp get_time( void ) {
    uint64_t stamp = profiler::get_time_ns();
    /* Shift by globalOffset to get the relative value handed to OTF2. */
    stamp = stamp - globalOffset;
    return stamp;
}
Expand Down
25 changes: 21 additions & 4 deletions src/unit_tests/CUDA/multiGpuThread.cu
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,16 @@
#define ARR_SIZE 10
#define NUM_THR 8

/* Evaluate a CUDA runtime API call and terminate the program if it fails.
 *
 * apiFuncCall: an expression yielding a cudaError_t. It is evaluated exactly
 *              once; it is parenthesized in the expansion so that any
 *              expression passed in is treated as a single initializer.
 *
 * On any status other than cudaSuccess the macro prints the source file and
 * line, the stringified call, and the text from cudaGetErrorString() to
 * stderr, then calls exit(-1). The do { ... } while (0) wrapper makes the
 * macro behave as a single statement (safe inside an unbraced if/else).
 */
#define RUNTIME_API_CALL(apiFuncCall)                                          \
do {                                                                           \
    cudaError_t _status = (apiFuncCall);                                       \
    if (_status != cudaSuccess) {                                              \
        fprintf(stderr, "%s:%d: error: function %s failed with error %s.\n",   \
                __FILE__, __LINE__, #apiFuncCall, cudaGetErrorString(_status));\
        exit(-1);                                                              \
    }                                                                          \
} while (0)

typedef struct {
int *arr;
int *dev_arr;
Expand All @@ -30,14 +40,21 @@ void *thread_func(void* struc)
cuda_st * data = (cuda_st*)struc;
printf("thread %d func start\n", data->thr_num);
printf("arr %d = ", data->dev_num);
for(int i=0; i<10; i++) {
int i;
for(i=0; i<10; i++) {
printf("%d ", data->arr[i]);
}
printf("\n");
cudaSetDevice(data->dev_num);
cudaMemcpy(data->dev_arr, data->arr, sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
kernel_fc<<<1,ARR_SIZE>>>(data->dev_arr, data->dev_result);
cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
cudaStream_t stream;
RUNTIME_API_CALL(cudaStreamCreate(&stream));
for (i=0 ; i<10 ; i++) {
cudaMemcpy(data->dev_arr, data->arr, sizeof(int)*ARR_SIZE, cudaMemcpyHostToDevice);
kernel_fc<<<1,ARR_SIZE,0,stream>>>(data->dev_arr, data->dev_result);
cudaMemcpy(data->result, data->dev_result, sizeof(int), cudaMemcpyDeviceToHost);
RUNTIME_API_CALL(cudaStreamSynchronize(stream));
}
RUNTIME_API_CALL(cudaStreamDestroy(stream));
printf("thread %d func exit\n", data->thr_num);
return NULL;
}
Expand Down

0 comments on commit 9bd65b9

Please sign in to comment.