Skip to content

Commit

Permalink
Working OpenMP target support!
Browse files Browse the repository at this point in the history
  • Loading branch information
khuck committed May 3, 2022
1 parent c0d7491 commit 5e2d561
Show file tree
Hide file tree
Showing 9 changed files with 713 additions and 105 deletions.
652 changes: 634 additions & 18 deletions src/apex/apex_ompt.cpp

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion src/apex/apex_rocm_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ do { \

namespace apex { namespace rsmi {

std::set<uint32_t> monitor::activeDeviceIndices;
//std::set<uint32_t> monitor::activeDeviceIndices;
std::mutex monitor::indexMutex;

monitor::monitor (void) {
Expand Down
4 changes: 2 additions & 2 deletions src/apex/apex_rocm_smi.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,14 @@ class monitor {
~monitor (void);
void query();
void stop();
static void activateDeviceIndex(uint32_t index);
void activateDeviceIndex(uint32_t index);
private:
bool success;
uint32_t deviceCount;
std::vector<uint64_t> devices;
std::vector<DeviceInfo> deviceInfos;
std::vector<bool> queried_once;
static std::set<uint32_t> activeDeviceIndices;
std::set<uint32_t> activeDeviceIndices;
static std::mutex indexMutex;
//double convertValue(nvmlFieldValue_t &value);
}; // class monitor
Expand Down
96 changes: 53 additions & 43 deletions src/apex/async_thread_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,25 +8,54 @@

#pragma once

#include <iomanip>
#include <sstream>
#include <string>

namespace apex {

class cuda_thread_node {
/// Base key type identifying a virtual "async thread" (one GPU device +
/// one activity class). Derived types (CUDA/HIP) refine the identity with
/// context/stream/queue ids. Instances are used as ordered map keys, so
/// operator< must define a strict weak ordering.
class base_thread_node {
public:
    uint32_t _device;                  // device index reported by the runtime
    apex_async_activity_t _activity;   // kind of async activity on that device
    base_thread_node(uint32_t device, apex_async_activity_t activity) :
        _device(device), _activity(activity) { }
    /// Virtual destructor: this class declares virtual functions and is used
    /// polymorphically (listeners take base_thread_node&), so destruction
    /// through a base pointer must be well-defined.
    virtual ~base_thread_node() = default;
    virtual bool operator==(const base_thread_node &rhs) const {
        return (_device == rhs._device && _activity == rhs._activity);
    }
    /// Orders by device first; activity only participates in the ordering
    /// when OTF2 output is enabled (OTF2 needs one event writer per
    /// device+activity pair, other outputs only need one per device).
    virtual bool operator<(const base_thread_node &rhs) const {
        if (_device < rhs._device) {
            return true;
        } else if (_device == rhs._device &&
                   _activity < rhs._activity && apex_options::use_otf2()) {
            return true;
        }
        return false;
    }
    /// Human-readable label for trace viewers; note the leading quote is
    /// intentional (the caller embeds this in a JSON/OTF2 name string).
    virtual std::string name () {
        std::stringstream ss;
        ss << "\"GPU [" << _device << "]";
        std::string tmp{ss.str()};
        return tmp;
    }
};


class cuda_thread_node : public base_thread_node {
public:
uint32_t _context;
uint32_t _stream;
apex_async_activity_t _activity;
cuda_thread_node(uint32_t device, uint32_t context, uint32_t stream,
apex_async_activity_t activity) :
_device(device), _context(context), _stream(stream),
_activity(activity) { }
bool operator==(const cuda_thread_node &rhs) const {
base_thread_node(device, activity),
_context(context), _stream(stream) { }
virtual bool operator==(const cuda_thread_node &rhs) const {
return (_device == rhs._device &&
_context == rhs._context &&
_stream == rhs._stream &&
_activity == rhs._activity);
}
bool operator<(const cuda_thread_node &rhs) const {
virtual bool operator<(const cuda_thread_node &rhs) const {
if (_device<rhs._device) {
return true;
} else if (_device == rhs._device && _context < rhs._context) {
Expand All @@ -41,23 +70,28 @@ namespace apex {
}
return false;
}
virtual std::string name () {
std::stringstream ss;
ss << "\"CUDA [" << _device << ":" << _context
<< ":" << std::setfill('0') << std::setw(5) << _stream << "]";
std::string tmp{ss.str()};
return tmp;
}
};

class hip_thread_node {
class hip_thread_node : public base_thread_node {
public:
uint32_t _device;
uint32_t _queue;
apex_async_activity_t _activity;
hip_thread_node(uint32_t device, uint32_t command_queue,
apex_async_activity_t activity) :
_device(device), _queue(command_queue),
_activity(activity) { }
bool operator==(const hip_thread_node &rhs) const {
base_thread_node(device, activity),
_queue(command_queue) { }
virtual bool operator==(const hip_thread_node &rhs) const {
return (_device == rhs._device &&
_queue == rhs._queue &&
_activity == rhs._activity);
}
bool operator<(const hip_thread_node &rhs) const {
virtual bool operator<(const hip_thread_node &rhs) const {
if (_device<rhs._device) {
return true;
} else if (_device == rhs._device && _queue < rhs._queue) {
Expand All @@ -68,38 +102,14 @@ namespace apex {
}
return false;
}
};

class dummy_thread_node {
public:
uint32_t _device;
apex_async_activity_t _activity;
dummy_thread_node(uint32_t device, apex_async_activity_t activity) :
_device(device), _activity(activity) { }
bool operator==(const dummy_thread_node &rhs) const {
return (_device == rhs._device && _activity == rhs._activity);
}
bool operator<(const dummy_thread_node &rhs) const {
if (_device<rhs._device) {
return true;
} else if (_device == rhs._device &&
_activity < rhs._activity && apex_options::use_otf2()) {
return true;
}
return false;
virtual std::string name () {
std::stringstream ss;
ss << "\"HIP [" << _device
<< ":" << std::setfill('0') << std::setw(5) << _queue << "]";
std::string tmp{ss.str()};
return tmp;
}
};

}

#ifdef APEX_WITH_CUDA
using async_thread_node = apex::cuda_thread_node;
#endif

#ifdef APEX_WITH_HIP
using async_thread_node = apex::hip_thread_node;
#endif

#if !defined(APEX_WITH_CUDA) && !defined(APEX_WITH_HIP)
using async_thread_node = apex::dummy_thread_node;
#endif
3 changes: 2 additions & 1 deletion src/apex/event_listener.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ class async_event_data : public event_data {
uint64_t parent_tid;
std::string name;
bool reverse_flow;
bool flow;
async_event_data() {};
async_event_data(double _parent_ts_start, std::string _cat, uint64_t _id,
uint64_t _parent_tid, std::string _name) :
Expand All @@ -119,7 +120,7 @@ class async_event_data : public event_data {
cat(_cat),
id(_id),
parent_tid(_parent_tid),
name(_name), reverse_flow(false) {};
name(_name), reverse_flow(false), flow(true) {};
~async_event_data() {};
};

Expand Down
23 changes: 6 additions & 17 deletions src/apex/otf2_listener.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,26 +49,15 @@ using namespace std;

namespace apex {

uint32_t otf2_listener::make_vtid (async_thread_node &node) {
uint32_t otf2_listener::make_vtid (base_thread_node &node) {
size_t tid;
/* There is a potential for overlap here, but not a high potential. The CPU and the GPU
* would BOTH have to spawn 64k+ threads/streams for this to happen. */
if (vthread_map.count(node) == 0) {
// build the thread name for viewers
std::stringstream ss;
#ifdef APEX_WITH_CUPTI
ss << "CUDA[" << node._device;
ss << ":" << node._context;
ss << ":" << node._stream;
#endif
#ifdef APEX_WITH_HIP
ss << "HIP[" << node._device;
ss << ":" << node._queue;
#endif
#if !defined(APEX_WITH_CUPTI) && !defined(APEX_WITH_HIP)
ss << "GPU[" << node._device;
#endif
ss << "] " << activity_to_string(node._activity);
ss << node.name();
ss << " " << activity_to_string(node._activity);
std::string name{ss.str()};
// lock the archive lock, we need to make an event writer
write_lock_type lock(_archive_mutex);
Expand All @@ -86,7 +75,7 @@ namespace apex {
// done with the set of event threads, so unlock.
_event_set_mutex.unlock();
// use the OTF2 thread index (not reversed) for the vthread_map
vthread_map.insert(std::pair<async_thread_node, size_t>(node,id));
vthread_map.insert(std::pair<base_thread_node, size_t>(node,id));
// construct a globally unique ID for this thread on this rank
uint64_t my_node_id = my_saved_node_id;
my_node_id = (my_node_id << 32) + id;
Expand Down Expand Up @@ -2678,7 +2667,7 @@ namespace apex {

#endif

void otf2_listener::on_async_event(async_thread_node &node,
void otf2_listener::on_async_event(base_thread_node &node,
std::shared_ptr<profiler> &p) {
// This could be a callback from a library before APEX is ready
// Something like OpenMP or CUDA/CUPTI or...?
Expand Down Expand Up @@ -2739,7 +2728,7 @@ namespace apex {

}

void otf2_listener::on_async_metric(async_thread_node &node,
void otf2_listener::on_async_metric(base_thread_node &node,
std::shared_ptr<profiler> &p) {
// This could be a callback from a library before APEX is ready
// Something like OpenMP or CUDA/CUPTI or...?
Expand Down
8 changes: 4 additions & 4 deletions src/apex/otf2_listener.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -193,9 +193,9 @@ namespace apex {
uint64_t stamp, bool is_enter);
#endif
std::mutex _vthread_mutex;
std::map<async_thread_node, size_t> vthread_map;
std::map<base_thread_node, size_t> vthread_map;
std::map<uint32_t, OTF2_EvtWriter*> vthread_evt_writer_map;
uint32_t make_vtid (async_thread_node &node);
uint32_t make_vtid (base_thread_node &node);
std::map<uint32_t,uint64_t> last_ts;
uint64_t dropped;
int64_t synchronizeClocks(void);
Expand Down Expand Up @@ -241,9 +241,9 @@ namespace apex {
{ APEX_UNUSED(data); };
void on_send(message_event_data &data);
void on_recv(message_event_data &data);
void on_async_event(async_thread_node &node,
void on_async_event(base_thread_node &node,
std::shared_ptr<profiler> &p);
void on_async_metric(async_thread_node &node,
void on_async_metric(base_thread_node &node,
std::shared_ptr<profiler> &p);

};
Expand Down
22 changes: 7 additions & 15 deletions src/apex/trace_event_listener.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,32 +214,22 @@ void trace_event_listener::set_metadata(const char * name, const char * value) {
APEX_UNUSED(value);
}

std::string trace_event_listener::make_tid (async_thread_node &node) {
std::string trace_event_listener::make_tid (base_thread_node &node) {
size_t tid;
/* There is a potential for overlap here, but not a high potential. The CPU and the GPU
* would BOTH have to spawn 64k+ threads/streams for this to happen. */
if (vthread_map.count(node) == 0) {
size_t id = vthread_map.size()+1;
//uint32_t id_reversed = simple_reverse(id);
uint32_t id_shifted = id << 16;
vthread_map.insert(std::pair<async_thread_node, size_t>(node,id_shifted));
vthread_map.insert(std::pair<base_thread_node, size_t>(node,id_shifted));
std::stringstream ss;
ss << fixed;
ss << "{\"name\":\"thread_name\""
<< ",\"ph\":\"M\",\"pid\":" << saved_node_id
<< ",\"tid\":" << id_shifted
<< ",\"args\":{\"name\":";
#ifdef APEX_WITH_CUDA
ss << "\"CUDA [" << node._device << ":" << node._context
<< ":" << std::setfill('0') << setw(5) << node._stream << "]";
#endif
#ifdef APEX_WITH_HIP
ss << "\"HIP [" << node._device
<< ":" << std::setfill('0') << setw(5) << node._queue << "]";
#endif
#if !defined(APEX_WITH_CUDA) && !defined(APEX_WITH_HIP)
ss << "\"GPU [" << node._device << "]";
#endif
ss << node.name();
//ss << "" << activity_to_string(node._activity);
ss << "\"";
ss << "}},\n";
Expand All @@ -258,7 +248,7 @@ std::string trace_event_listener::make_tid (async_thread_node &node) {
return label;
}

void trace_event_listener::on_async_event(async_thread_node &node,
void trace_event_listener::on_async_event(base_thread_node &node,
std::shared_ptr<profiler> &p, const async_event_data& data) {
if (!_terminate) {
std::stringstream ss;
Expand All @@ -277,6 +267,7 @@ void trace_event_listener::on_async_event(async_thread_node &node,
<< ",\"args\":{\"GUID\":" << p->guid << ",\"Parent GUID\":" << pguid << "}},\n";
// write a flow event pair!
// make sure the start of the flow is before the end of the flow, ideally the middle of the parent
if (data.flow) {
if (data.reverse_flow) {
double begin_ts = (p->get_stop_us() + p->get_start_us()) * 0.5;
double end_ts = std::min(p->get_stop_us(), data.parent_ts_stop);
Expand All @@ -288,12 +279,13 @@ void trace_event_listener::on_async_event(async_thread_node &node,
write_flow_event(ss, begin_ts, 's', data.cat, data.id, saved_node_id, data.parent_tid, data.name);
write_flow_event(ss, end_ts, 't', data.cat, data.id, saved_node_id, atol(tid.c_str()), data.name);
}
}
write_to_trace(ss);
flush_trace_if_necessary();
}
}

void trace_event_listener::on_async_metric(async_thread_node &node,
void trace_event_listener::on_async_metric(base_thread_node &node,
std::shared_ptr<profiler> &p) {
if (!_terminate) {
std::stringstream ss;
Expand Down
8 changes: 4 additions & 4 deletions src/apex/trace_event_listener.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ class trace_event_listener : public event_listener {
void on_recv(message_event_data &data) { APEX_UNUSED(data); };
void set_node_id(int node_id, int node_count);
void set_metadata(const char * name, const char * value);
void on_async_event(async_thread_node &node, std::shared_ptr<profiler> &p,
void on_async_event(base_thread_node &node, std::shared_ptr<profiler> &p,
const async_event_data& data);
void on_async_metric(async_thread_node &node, std::shared_ptr<profiler> &p);
void on_async_metric(base_thread_node &node, std::shared_ptr<profiler> &p);
void end_trace_time(void);

private:
Expand All @@ -65,7 +65,7 @@ class trace_event_listener : public event_listener {
void close_trace(void);
void flush_trace_if_necessary(void);
void _common_stop(std::shared_ptr<profiler> &p);
std::string make_tid (async_thread_node &node);
std::string make_tid (base_thread_node &node);
long unsigned int get_thread_id_metadata();
static bool _initialized;
size_t get_thread_index(void);
Expand All @@ -84,7 +84,7 @@ class trace_event_listener : public event_listener {
std::map<size_t, std::mutex*> mutexes;
std::map<size_t, std::stringstream*> streams;
std::mutex _vthread_mutex;
std::map<async_thread_node, size_t> vthread_map;
std::map<base_thread_node, size_t> vthread_map;
double _end_time;
};

Expand Down

0 comments on commit 5e2d561

Please sign in to comment.