diff --git a/libkineto/src/CuptiActivity.cpp b/libkineto/src/CuptiActivity.cpp index 3b0b03c9b..5ecfa1ad0 100644 --- a/libkineto/src/CuptiActivity.cpp +++ b/libkineto/src/CuptiActivity.cpp @@ -38,11 +38,16 @@ inline bool isEventSync(CUpti_ActivitySynchronizationType type) { } inline std::string eventSyncInfo( - const CUpti_ActivitySynchronization& act, int32_t srcStream) { + const CUpti_ActivitySynchronization& act, + int32_t srcStream, + int32_t srcCorrId + ) { return fmt::format(R"JSON( "wait_on_stream": {}, + "wait_on_cuda_event_record_corr_id": {}, "wait_on_cuda_event_id": {},)JSON", srcStream, + srcCorrId, act.cudaEventId ); } @@ -75,7 +80,7 @@ inline const std::string CudaSyncActivity::metadataJson() const { "stream": {}, "correlation": {}, "device": {}, "context": {})JSON", syncTypeString(sync.type), - isEventSync(raw().type) ? eventSyncInfo(raw(), srcStream_) : "", + isEventSync(raw().type) ? eventSyncInfo(raw(), srcStream_, srcCorrId_) : "", sync.streamId, sync.correlationId, deviceId(), sync.contextId); // clang-format on diff --git a/libkineto/src/CuptiActivity.h b/libkineto/src/CuptiActivity.h index aef4ca83a..8127eb54d 100644 --- a/libkineto/src/CuptiActivity.h +++ b/libkineto/src/CuptiActivity.h @@ -129,8 +129,11 @@ struct CudaSyncActivity : public CuptiActivity { explicit CudaSyncActivity( const CUpti_ActivitySynchronization* activity, const ITraceActivity* linked, - int32_t srcStream) - : CuptiActivity(activity, linked), srcStream_(srcStream) {} + int32_t srcStream, + int32_t srcCorrId) + : CuptiActivity(activity, linked), + srcStream_(srcStream), + srcCorrId_(srcCorrId) {} int64_t correlationId() const override {return raw().correlationId;} int64_t deviceId() const override; int64_t resourceId() const override; @@ -143,6 +146,7 @@ struct CudaSyncActivity : public CuptiActivity { private: const int32_t srcStream_; + const int32_t srcCorrId_; }; diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 1d9bb95a3..265fd8d7f 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -66,11 +67,18 @@ struct std::hash { } }; +struct WaitEventInfo { + // CUDA stream that the CUDA event was recorded on + uint32_t stream; + // Correlation ID of the cudaEventRecord event + uint32_t correlationId; +}; + namespace { -// Map (ctx, eventId) -> stream that recorded the cudaEvent -std::unordered_map& waitEventMap() { - static std::unordered_map waitEventMap_; +// Map (ctx, eventId) -> (stream, corr Id) that recorded the CUDA event +std::unordered_map& waitEventMap() { + static std::unordered_map waitEventMap_; return waitEventMap_; } @@ -415,12 +423,10 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) { if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 || - // Don't care about cudaEvents + // Support cudaEventRecord and cudaEventSynchronize, revisit if others are needed cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020) { + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) { return true; } @@ -499,13 +505,14 @@ void CuptiActivityProfiler::handleOverheadActivity( } -int32_t getStreamForWaitEvent(uint32_t ctx, uint32_t eventId) { +std::optional getWaitEventInfo( + uint32_t ctx, uint32_t eventId) { auto key = CtxEventPair{ctx, eventId}; auto it = waitEventMap().find(key); if (it != waitEventMap().end()) { return it->second; } - return -1; + return std::nullopt; } void CuptiActivityProfiler::handleCudaEventActivity( @@ -516,9 +523,9 @@ void CuptiActivityProfiler::handleCudaEventActivity( << " streamId=" << activity->streamId << " contextId=" << activity->contextId; - // Update the stream the cudaEvent was last recorded on + // Update the stream, corrID the cudaEvent was last recorded on auto key = CtxEventPair{activity->contextId, activity->eventId}; - waitEventMap()[key] = activity->streamId; + waitEventMap()[key] = WaitEventInfo{activity->streamId, activity->correlationId}; } void CuptiActivityProfiler::handleCudaSyncActivity( @@ -555,10 +562,19 @@ void CuptiActivityProfiler::handleCudaSyncActivity( const ITraceActivity* linked = linkedActivity(activity->correlationId, cpuCorrelationMap_); - int32_t src_stream = getStreamForWaitEvent( - activity->contextId, activity->cudaEventId); + int32_t src_stream = -1, src_corrid = -1; + + if (isEventSync(activity->type)) { + auto maybe_wait_event_info = getWaitEventInfo( + activity->contextId, activity->cudaEventId); + if (maybe_wait_event_info) { + src_stream = maybe_wait_event_info->stream; + src_corrid = maybe_wait_event_info->correlationId; + } + } + const auto& cuda_sync_activity = traceBuffers_->addActivityWrapper( - CudaSyncActivity(activity, linked, src_stream)); + CudaSyncActivity(activity, linked, src_stream, src_corrid)); if (outOfRange(cuda_sync_activity)) { return;