Skip to content

Commit

Permalink
Collect more info on cudaEventRecord for stream wait sync events (#808)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: #808

## Overview
To effectively understand synchronization due to CUDA events, we also need to track CUDA event record calls and which CUDA event record call the sync occurs on.
With this tracking in place, when an event synchronization occurs:
1. We can figure out which cudaEventRecord() call this sync took place on.
2. Based on that, we know which kernel/memcpy launch was recorded by the event; this will typically be the previous kernel/memcpy launch in the runtime.

CUPTI's CUDA Event record tells us the correlation ID of the CUDA record; see the [documentation](https://docs.nvidia.com/cupti/annotated.html#structCUpti__ActivityEvent):
```
uint32_t CUpti_ActivityEvent::correlationId [inherited]
The correlation ID of the event. Use of this ID is user-defined, but typically this ID value will equal the correlation ID of the kernel for which the event was gathered.
```

## In this change
* Enable logging of cudaEventRecord runtime events (and cudaEventSynchronize calls too).
* Track the correlation ID for CUPTI Event records as shown above.
* In the CUPTI Stream Wait event we emit the correlation ID of the cudaEventRecord call being waited on (`wait_on_cuda_event_record_corr_id`).

Reviewed By: anupambhatnagar, davidberard98

Differential Revision: D49694339

fbshipit-source-id: 68b5ca2cc2cdfee6a4b1c90d213813695812e02a
  • Loading branch information
briancoutinho authored and facebook-github-bot committed Sep 28, 2023
1 parent ef57024 commit 4df86b0
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 18 deletions.
9 changes: 7 additions & 2 deletions libkineto/src/CuptiActivity.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,16 @@ inline bool isEventSync(CUpti_ActivitySynchronizationType type) {
}

inline std::string eventSyncInfo(
const CUpti_ActivitySynchronization& act, int32_t srcStream) {
const CUpti_ActivitySynchronization& act,
int32_t srcStream,
int32_t srcCorrId
) {
return fmt::format(R"JSON(
"wait_on_stream": {},
"wait_on_cuda_event_record_corr_id": {},
"wait_on_cuda_event_id": {},)JSON",
srcStream,
srcCorrId,
act.cudaEventId
);
}
Expand Down Expand Up @@ -75,7 +80,7 @@ inline const std::string CudaSyncActivity::metadataJson() const {
"stream": {}, "correlation": {},
"device": {}, "context": {})JSON",
syncTypeString(sync.type),
isEventSync(raw().type) ? eventSyncInfo(raw(), srcStream_) : "",
isEventSync(raw().type) ? eventSyncInfo(raw(), srcStream_, srcCorrId_) : "",
sync.streamId, sync.correlationId,
deviceId(), sync.contextId);
// clang-format on
Expand Down
8 changes: 6 additions & 2 deletions libkineto/src/CuptiActivity.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,8 +129,11 @@ struct CudaSyncActivity : public CuptiActivity<CUpti_ActivitySynchronization> {
explicit CudaSyncActivity(
const CUpti_ActivitySynchronization* activity,
const ITraceActivity* linked,
int32_t srcStream)
: CuptiActivity(activity, linked), srcStream_(srcStream) {}
int32_t srcStream,
int32_t srcCorrId)
: CuptiActivity(activity, linked),
srcStream_(srcStream),
srcCorrId_(srcCorrId) {}
int64_t correlationId() const override {return raw().correlationId;}
int64_t deviceId() const override;
int64_t resourceId() const override;
Expand All @@ -143,6 +146,7 @@ struct CudaSyncActivity : public CuptiActivity<CUpti_ActivitySynchronization> {

private:
const int32_t srcStream_;
const int32_t srcCorrId_;
};


Expand Down
44 changes: 30 additions & 14 deletions libkineto/src/CuptiActivityProfiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include <atomic>
#include <functional>
#include <iomanip>
#include <optional>
#include <string>
#include <thread>
#include <type_traits>
Expand Down Expand Up @@ -66,11 +67,18 @@ struct std::hash<CtxEventPair> {
}
};

// Information remembered about the most recent record of a CUDA event,
// stored as the mapped value of the (ctx, eventId) wait-event map.
// Members are zero-initialized so a default-constructed instance never
// carries indeterminate values; aggregate initialization
// (WaitEventInfo{stream, correlationId}) keeps working unchanged.
struct WaitEventInfo {
  // CUDA stream that the CUDA event was recorded on
  uint32_t stream{0};
  // Correlation ID of the cudaEventRecord event
  uint32_t correlationId{0};
};

namespace {

// Map (ctx, eventId) -> stream that recorded the cudaEvent
std::unordered_map<CtxEventPair, uint32_t>& waitEventMap() {
static std::unordered_map<CtxEventPair, uint32_t> waitEventMap_;
// Map (ctx, eventId) -> (stream, corr Id) that recorded the CUDA event
std::unordered_map<CtxEventPair, WaitEventInfo>& waitEventMap() {
static std::unordered_map<CtxEventPair, WaitEventInfo> waitEventMap_;
return waitEventMap_;
}

Expand Down Expand Up @@ -415,12 +423,10 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) {
if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 ||
// Don't care about cudaEvents
// Support cudaEventRecord and cudaEventSynchronize, revisit if others are needed
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 ||
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020) {
cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) {
return true;
}

Expand Down Expand Up @@ -499,13 +505,14 @@ void CuptiActivityProfiler::handleOverheadActivity(
}


int32_t getStreamForWaitEvent(uint32_t ctx, uint32_t eventId) {
std::optional<WaitEventInfo> getWaitEventInfo(
uint32_t ctx, uint32_t eventId) {
auto key = CtxEventPair{ctx, eventId};
auto it = waitEventMap().find(key);
if (it != waitEventMap().end()) {
return it->second;
}
return -1;
return std::nullopt;
}

void CuptiActivityProfiler::handleCudaEventActivity(
Expand All @@ -516,9 +523,9 @@ void CuptiActivityProfiler::handleCudaEventActivity(
<< " streamId=" << activity->streamId
<< " contextId=" << activity->contextId;

// Update the stream the cudaEvent was last recorded on
// Update the stream, corrID the cudaEvent was last recorded on
auto key = CtxEventPair{activity->contextId, activity->eventId};
waitEventMap()[key] = activity->streamId;
waitEventMap()[key] = WaitEventInfo{activity->streamId, activity->correlationId};
}

void CuptiActivityProfiler::handleCudaSyncActivity(
Expand Down Expand Up @@ -555,10 +562,19 @@ void CuptiActivityProfiler::handleCudaSyncActivity(

const ITraceActivity* linked =
linkedActivity(activity->correlationId, cpuCorrelationMap_);
int32_t src_stream = getStreamForWaitEvent(
activity->contextId, activity->cudaEventId);
int32_t src_stream = -1, src_corrid = -1;

if (isEventSync(activity->type)) {
auto maybe_wait_event_info = getWaitEventInfo(
activity->contextId, activity->cudaEventId);
if (maybe_wait_event_info) {
src_stream = maybe_wait_event_info->stream;
src_corrid = maybe_wait_event_info->correlationId;
}
}

const auto& cuda_sync_activity = traceBuffers_->addActivityWrapper(
CudaSyncActivity(activity, linked, src_stream));
CudaSyncActivity(activity, linked, src_stream, src_corrid));

if (outOfRange(cuda_sync_activity)) {
return;
Expand Down

0 comments on commit 4df86b0

Please sign in to comment.