Skip to content

Commit

Permalink
Adding extra calls to query HIP memory periodically, and to query SMI…
Browse files Browse the repository at this point in the history
… memory at alloc/free points.
  • Loading branch information
khuck committed Feb 7, 2024
1 parent 770e94f commit fa89f87
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 9 deletions.
18 changes: 12 additions & 6 deletions src/apex/apex_ompt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#ifdef APEX_HAVE_OTF2
#include "otf2_listener.hpp"
#endif
#include "apex_rocm_smi.hpp"


std::mutex apex_apex_threadid_mutex;
std::atomic<uint64_t> apex_numthreads{0};
Expand Down Expand Up @@ -153,7 +155,7 @@ class Globals{
auto iter = g.data_map.find(id);
if (iter != g.data_map.end()) {
iter->second.parent_ts_stop = apex::profiler::now_ns();
std::cout << "Updated " << id << std::endl;
// std::cout << "Updated " << id << std::endl;
}
g.unlock();
}
Expand Down Expand Up @@ -224,8 +226,8 @@ void stop_async_task(std::shared_ptr<apex::task_wrapper> tt, uint64_t start, uin
prof->set_start(start);
prof->set_end(end);
}
std::cout << "Parent prof " << correlationId << ": " << (uint64_t)as_data.parent_ts_start << " " << (uint64_t)as_data.parent_ts_stop << std::endl;
std::cout << "Final prof " << correlationId << ": " << prof->get_start_ns() << " " << prof->get_stop_ns() << std::endl;
//std::cout << "Parent prof " << correlationId << ": " << (uint64_t)as_data.parent_ts_start << " " << (uint64_t)as_data.parent_ts_stop << std::endl;
//std::cout << "Final prof " << correlationId << ": " << prof->get_start_ns() << " " << prof->get_stop_ns() << std::endl;
// important! Otherwise we might get the wrong end timestamp.
prof->stopped = true;
// Get the singleton APEX instance
Expand Down Expand Up @@ -416,7 +418,7 @@ static void print_record_ompt(ompt_record_ompt_t *rec) {
end = target_end_times[rec->target_id] + 1000;
}
}
std::cout << "Target " << rec->target_id << ": " << start << " - " << end << std::endl;
// std::cout << "Target " << rec->target_id << ": " << start << " - " << end << std::endl;
parent_thread = target_parent_thread_ids[rec->target_id];
active_target_addrs.erase(rec->target_id);
active_target_devices.erase(rec->target_id);
Expand Down Expand Up @@ -527,7 +529,7 @@ static void print_record_ompt(ompt_record_ompt_t *rec) {
target_start_times[rec->target_id] = rec->time - 1000;
}
target_end_times[rec->target_id] = target_data_op_rec.end_time;
std::cout << "Updated Target " << rec->target_id << ": " << target_start_times[rec->target_id] << " - " << target_end_times[rec->target_id] << std::endl;
//std::cout << "Updated Target " << rec->target_id << ": " << target_start_times[rec->target_id] << " - " << target_end_times[rec->target_id] << std::endl;
}
if (codeptr_ra != nullptr) {
ss << ": UNRESOLVED ADDR " << codeptr_ra;
Expand Down Expand Up @@ -584,7 +586,7 @@ static void print_record_ompt(ompt_record_ompt_t *rec) {
target_start_times[rec->target_id] = rec->time - 1000;
}
target_end_times[rec->target_id] = target_kernel_rec.end_time;
std::cout << "Updated Target " << rec->target_id << ": " << target_start_times[rec->target_id] << " - " << target_end_times[rec->target_id] << std::endl;
//std::cout << "Updated Target " << rec->target_id << ": " << target_start_times[rec->target_id] << " - " << target_end_times[rec->target_id] << std::endl;
}
if (codeptr_ra != nullptr) {
ss << ": UNRESOLVED ADDR " << codeptr_ra;
Expand Down Expand Up @@ -1043,6 +1045,7 @@ extern "C" void apex_target_data_op (
apex::sample_value("GPU: OpenMP Target Data Alloc",bytes);
std::unique_lock<std::mutex> l(allocation_lock);
allocations[src_addr] = bytes;
apex::rsmi::monitor::instance().explicitMemCheck();
break;
}
case ompt_target_data_transfer_to_device: {
Expand All @@ -1065,6 +1068,7 @@ extern "C" void apex_target_data_op (
allocations.erase(src_addr);
}
apex::sample_value("GPU: OpenMP Target Data Delete",mybytes);
apex::rsmi::monitor::instance().explicitMemCheck();
break;
}
case ompt_target_data_associate: {
Expand All @@ -1080,6 +1084,7 @@ extern "C" void apex_target_data_op (
apex::sample_value("GPU: OpenMP Target Data Alloc Async",bytes);
std::unique_lock<std::mutex> l(allocation_lock);
allocations[src_addr] = bytes;
apex::rsmi::monitor::instance().explicitMemCheck();
break;
}
case ompt_target_data_transfer_to_device_async: {
Expand All @@ -1102,6 +1107,7 @@ extern "C" void apex_target_data_op (
allocations.erase(src_addr);
}
apex::sample_value("GPU: OpenMP Target Data Delete Async",mybytes);
apex::rsmi::monitor::instance().explicitMemCheck();
break;
}
#endif
Expand Down
25 changes: 23 additions & 2 deletions src/apex/apex_rocm_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ do { \
} while (0);

#define MILLIONTH 1.0e-6 // scale to MB
#define BILLIONTH 1.0e-9 // scale to GB
// Scale a byte count to binary gigabytes (1 / 2^30). Despite the name, this
// is a 1024-based factor rather than 1.0e-9. The expansion is parenthesized
// so it behaves as a single factor in any expression (e.g. as a divisor).
#define BILLIONTH (1.0/(1024.0*1024.0*1024.0)) // scale to GB
#define PCIE_THROUGHPUT 1.0e-3 // to scale KB to MB
#define NVLINK_BW 1.0e-3 // to scale MB/s to GB/s
#define WATTS 1.0e-6 // scale uW to W
Expand Down Expand Up @@ -119,6 +119,10 @@ void monitor::stop (void) {
if (!success) return;
}

/* Forward declaration to avoid including hip headers.
 * NOTE(review): the HIP runtime headers declare the status type as the
 * hipError_t enum where plain `int` is used here; this presumably relies on
 * that enum being int-compatible at the ABI level -- confirm against the HIP
 * headers this is built with. */
extern "C" int hipMemGetInfo(size_t* free, size_t* total);
extern "C" const char* hipGetErrorString(int hipError);

void monitor::explicitMemCheck (void) {
if (!success) return;
indexMutex.lock();
Expand All @@ -135,6 +139,24 @@ void monitor::explicitMemCheck (void) {
double value = (double)(memory_usage) * BILLIONTH;
sample_value(tmp, value);
}
size_t free_byte, total_byte;
auto hip_status = hipMemGetInfo(&free_byte, &total_byte);
if (0 != hip_status){
fprintf(stderr, "Error: hipMemGetInfo fails, %s \n",
hipGetErrorString(hip_status));
} else {
std::stringstream ss;
ss << "GPU: hipMemGetInfo free (GB)";
std::string tmp = ss.str();
double value = (double)(free_byte) * BILLIONTH;
sample_value(tmp, value);
ss.str("");
ss.clear();
ss << "GPU: hipMemGetInfo used (GB)";
tmp = ss.str();
value = (double)(total_byte - free_byte) * BILLIONTH;
sample_value(tmp, value);
}
}

void monitor::query(void) {
Expand Down Expand Up @@ -386,7 +408,6 @@ double monitor::getAvailableMemory() {
avail = (double)(memory_total - memory_usage);
break;
}

return avail;
}

Expand Down
2 changes: 1 addition & 1 deletion src/scripts/apex_exec
Original file line number Diff line number Diff line change
Expand Up @@ -526,9 +526,9 @@ fi
# When OMPT support is requested, enable the OpenMP tool interface and point
# the OpenMP runtime at the APEX OMPT tool library.
# NOTE(review): $ompt and $verbose are unquoted in the tests below; if either
# is empty or unset, `[` reports a syntax error -- confirm both always have a
# value by the time this runs.
if [ $ompt = yes ]; then
export OMP_TOOL=enabled
export OMP_TOOL_LIBRARIES=${BASEDIR}/${LIBDIR}/${APEX_LIBRARY_NAME}_ompt${SHLIBX}
export OMP_TOOL_VERBOSE_INIT=stdout
# The leading ':' suggests this value is appended to a colon-separated
# library list elsewhere in the script -- TODO confirm.
OMPT_LIB=:${BASEDIR}/${LIBDIR}/${APEX_LIBRARY_NAME}_ompt${SHLIBX}
# Extra OpenMP runtime diagnostics requested in verbose mode.
if [ $verbose = yes ]; then
export OMP_TOOL_VERBOSE_INIT=stdout
export OMP_DISPLAY_ENV=true
fi
fi
Expand Down

0 comments on commit fa89f87

Please sign in to comment.