From 4df86b0b1f6dca9a49a6d92514e8ad2866d68297 Mon Sep 17 00:00:00 2001 From: Brian Coutinho Date: Wed, 27 Sep 2023 17:46:22 -0700 Subject: [PATCH] Collect more info on cudaEventRecord for stream wait sync events (#808) Summary: Pull Request resolved: https://github.com/pytorch/kineto/pull/808 ## Overview To effectively understand synchronization due to CUDA events, we also need to track CUDA event record calls and which CUDA event record call the sync occurs on. Thus, when an event synchronization occurs: 1. We can figure out which cudaEventRecord() call this sync took place on. 2. Based on that, we know which kernel/memcpy launch was recorded by the event; this will typically be the previous kernel/memcpy launch in the runtime. CUPTI's CUDA Event record tells us the correlation ID of the CUDA record [documentation](https://docs.nvidia.com/cupti/annotated.html#structCUpti__ActivityEvent) ``` uint32_t CUpti_ActivityEvent::correlationId [inherited] The correlation ID of the event. Use of this ID is user-defined, but typically this ID value will equal the correlation ID of the kernel for which the event was gathered. ``` ## In this change * Enable logging CUDA Event Record runtime events (and event synchronize calls too). * Track the correlation ID for CUPTI Event records as shown above. 
* In the CUPTI Stream Wait event we emit the correlation ID of the cudaEventRecord call being waited on. Reviewed By: anupambhatnagar, davidberard98 Differential Revision: D49694339 fbshipit-source-id: 68b5ca2cc2cdfee6a4b1c90d213813695812e02a --- libkineto/src/CuptiActivity.cpp | 9 +++-- libkineto/src/CuptiActivity.h | 8 +++-- libkineto/src/CuptiActivityProfiler.cpp | 44 +++++++++++++++++-------- 3 files changed, 43 insertions(+), 18 deletions(-) diff --git a/libkineto/src/CuptiActivity.cpp b/libkineto/src/CuptiActivity.cpp index 3b0b03c9b..5ecfa1ad0 100644 --- a/libkineto/src/CuptiActivity.cpp +++ b/libkineto/src/CuptiActivity.cpp @@ -38,11 +38,16 @@ inline bool isEventSync(CUpti_ActivitySynchronizationType type) { } inline std::string eventSyncInfo( - const CUpti_ActivitySynchronization& act, int32_t srcStream) { + const CUpti_ActivitySynchronization& act, + int32_t srcStream, + int32_t srcCorrId + ) { return fmt::format(R"JSON( "wait_on_stream": {}, + "wait_on_cuda_event_record_corr_id": {}, "wait_on_cuda_event_id": {},)JSON", srcStream, + srcCorrId, act.cudaEventId ); } @@ -75,7 +80,7 @@ inline const std::string CudaSyncActivity::metadataJson() const { "stream": {}, "correlation": {}, "device": {}, "context": {})JSON", syncTypeString(sync.type), - isEventSync(raw().type) ? eventSyncInfo(raw(), srcStream_) : "", + isEventSync(raw().type) ? 
eventSyncInfo(raw(), srcStream_, srcCorrId_) : "", sync.streamId, sync.correlationId, deviceId(), sync.contextId); // clang-format on diff --git a/libkineto/src/CuptiActivity.h b/libkineto/src/CuptiActivity.h index aef4ca83a..8127eb54d 100644 --- a/libkineto/src/CuptiActivity.h +++ b/libkineto/src/CuptiActivity.h @@ -129,8 +129,11 @@ struct CudaSyncActivity : public CuptiActivity { explicit CudaSyncActivity( const CUpti_ActivitySynchronization* activity, const ITraceActivity* linked, - int32_t srcStream) - : CuptiActivity(activity, linked), srcStream_(srcStream) {} + int32_t srcStream, + int32_t srcCorrId) + : CuptiActivity(activity, linked), + srcStream_(srcStream), + srcCorrId_(srcCorrId) {} int64_t correlationId() const override {return raw().correlationId;} int64_t deviceId() const override; int64_t resourceId() const override; @@ -143,6 +146,7 @@ struct CudaSyncActivity : public CuptiActivity { private: const int32_t srcStream_; + const int32_t srcCorrId_; }; diff --git a/libkineto/src/CuptiActivityProfiler.cpp b/libkineto/src/CuptiActivityProfiler.cpp index 1d9bb95a3..265fd8d7f 100644 --- a/libkineto/src/CuptiActivityProfiler.cpp +++ b/libkineto/src/CuptiActivityProfiler.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -66,11 +67,18 @@ struct std::hash { } }; +struct WaitEventInfo { + // CUDA stream that the CUDA event was recorded on + uint32_t stream; + // Correlation ID of the cudaEventRecord event + uint32_t correlationId; +}; + namespace { -// Map (ctx, eventId) -> stream that recorded the cudaEvent -std::unordered_map& waitEventMap() { - static std::unordered_map waitEventMap_; +// Map (ctx, eventId) -> (stream, corr Id) that recorded the CUDA event +std::unordered_map& waitEventMap() { + static std::unordered_map waitEventMap_; return waitEventMap_; } @@ -415,12 +423,10 @@ inline static bool isBlockListedRuntimeCbid(CUpti_CallbackId cbid) { if (cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetDevice_v3020 || cbid == 
CUPTI_RUNTIME_TRACE_CBID_cudaSetDevice_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaGetLastError_v3020 || - // Don't care about cudaEvents + // Support cudaEventRecord and cudaEventSynchronize, revisit if others are needed cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreate_v3020 || cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventCreateWithFlags_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventRecord_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020 || - cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventSynchronize_v3020) { + cbid == CUPTI_RUNTIME_TRACE_CBID_cudaEventDestroy_v3020) { return true; } @@ -499,13 +505,14 @@ void CuptiActivityProfiler::handleOverheadActivity( } -int32_t getStreamForWaitEvent(uint32_t ctx, uint32_t eventId) { +std::optional getWaitEventInfo( + uint32_t ctx, uint32_t eventId) { auto key = CtxEventPair{ctx, eventId}; auto it = waitEventMap().find(key); if (it != waitEventMap().end()) { return it->second; } - return -1; + return std::nullopt; } void CuptiActivityProfiler::handleCudaEventActivity( @@ -516,9 +523,9 @@ void CuptiActivityProfiler::handleCudaEventActivity( << " streamId=" << activity->streamId << " contextId=" << activity->contextId; - // Update the stream the cudaEvent was last recorded on + // Update the stream, corrID the cudaEvent was last recorded on auto key = CtxEventPair{activity->contextId, activity->eventId}; - waitEventMap()[key] = activity->streamId; + waitEventMap()[key] = WaitEventInfo{activity->streamId, activity->correlationId}; } void CuptiActivityProfiler::handleCudaSyncActivity( @@ -555,10 +562,19 @@ void CuptiActivityProfiler::handleCudaSyncActivity( const ITraceActivity* linked = linkedActivity(activity->correlationId, cpuCorrelationMap_); - int32_t src_stream = getStreamForWaitEvent( - activity->contextId, activity->cudaEventId); + int32_t src_stream = -1, src_corrid = -1; + + if (isEventSync(activity->type)) { + auto maybe_wait_event_info = getWaitEventInfo( + activity->contextId, 
activity->cudaEventId); + if (maybe_wait_event_info) { + src_stream = maybe_wait_event_info->stream; + src_corrid = maybe_wait_event_info->correlationId; + } + } + const auto& cuda_sync_activity = traceBuffers_->addActivityWrapper( - CudaSyncActivity(activity, linked, src_stream)); + CudaSyncActivity(activity, linked, src_stream, src_corrid)); if (outOfRange(cuda_sync_activity)) { return;