From dd964cb0c000a2af04974144d6b8241881a70d4c Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 4 Jun 2024 18:01:51 -0700 Subject: [PATCH 01/13] Add /Zc:__cplusplus --- cmake/CMakeLists.txt | 5 +++++ cmake/onnxruntime_providers_cuda.cmake | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 5200b447d553f..61e0f7d6c574e 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -46,6 +46,11 @@ else() set(CMAKE_CXX_STANDARD 17) endif() +if (MSVC) + # Make sure Visual Studio sets __cplusplus macro correctly: https://learn.microsoft.com/en-us/cpp/build/reference/zc-cplusplus + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Zc:__cplusplus") +endif() + set_property(GLOBAL PROPERTY USE_FOLDERS ON) # NOTE: POSITION INDEPENDENT CODE hurts performance, and it only make sense on POSIX systems set(CMAKE_POSITION_INDEPENDENT_CODE ON) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 46bc5fb3bd1ac..3b48a40bf1166 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -175,6 +175,10 @@ endif() endif() + if(MSVC) + target_compile_options(${target} PRIVATE "$<$:SHELL:-Xcompiler /Zc:__cplusplus>") + endif() + onnxruntime_add_include_to_target(${target} onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers) if (onnxruntime_ENABLE_TRAINING_OPS) onnxruntime_add_include_to_target(${target} onnxruntime_training) From e31edf74f6cd5a87628a0476831338b348084f3a Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 4 Jun 2024 18:12:44 -0700 Subject: [PATCH 02/13] update cutlass --- cmake/deps.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/deps.txt b/cmake/deps.txt index 88c1881ad82fb..703988a1513eb 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -53,7 +53,7 @@ pytorch_cpuinfo;https://github.com/pytorch/cpuinfo/archive/959002f82d7962a473d8b re2;https://github.com/google/re2/archive/refs/tags/2024-05-01.tar.gz;206cfee5ee0b4c6844680ba66275e9e8faa77405 safeint;https://github.com/dcleblanc/SafeInt/archive/refs/tags/3.0.28.zip;23f252040ff6cb9f1fd18575b32fa8fb5928daac tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2493f0949dc4be6b6a45e81.zip;67b833913605a4f3f499894ab11528a702c2b381 -cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.1.0.zip;757f90a795034a89d4f48a79d1f009f7a04c8dee +cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.5.0.zip;ae038931b9fc2c416c17d9cda91d9706b343f56d utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/5356c4a943a35e74d7cdc69486afcb8703b9a59a.zip;522382c2af437e09124287e5879ab64af5b2e299 From 4ee7731ae6790251f4b550a691532f2955a2796b Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 5 Jun 2024 10:16:16 -0700 Subject: [PATCH 03/13] Add code to use batch hook --- .../bert/cutlass_fmha/fmha_launch_template.h | 105 ++++++++++++++---- 1 file changed, 85 insertions(+), 20 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index c12cb374d9adf..918e6defe12e4 100644 --- 
a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -16,6 +16,23 @@ namespace onnxruntime { namespace contrib { namespace cuda { +// TODO: remove this flag and unused code after testing. +#define USE_MEMORY_EFFICIENT_TO_BATCH_HOOK 0 + +#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK +struct GQAToBatchHook { + template + CUTLASS_DEVICE static bool advance_to_batch(Params& p, int64_t& q_start, int64_t& k_start) { + auto batch_id = blockIdx.z; + q_start = batch_id* p.num_queries const int64_t max_sequence_length = p.v_strideB / p.v_strideM; + const bool is_kv_bsnh = (p.k_strideH == p.head_dim && p.k_strideM == p.num_heads * p.head_dim); + k_start = batch_id * (is_kv_bsnh ? max_sequence_length : p.num_heads * max_sequence_length); + return true; + } +}; + +#else + template struct RightPaddingBatchHook { using scalar_t = typename AttentionKernel::scalar_t; @@ -51,18 +68,34 @@ struct RightPaddingBatchHook { return false; } + // TODO: use GroupQueryAttentionToBatchHook + bool is_kv_bsnh = (p.k_strideH == p.head_dim && p.k_strideM == p.num_heads * p.head_dim); + int64_t q_start = batch_id * p.num_queries; + const int64_t max_sequence_length = p.v_strideB / p.v_strideM; + int64_t k_start = batch_id * (is_kv_bsnh ? max_sequence_length : p.num_heads * max_sequence_length); + // Advance to the current batch / head / query_start - p.query_ptr += batch_id * p.q_strideB + query_start * p.q_strideM + head_id * p.q_strideH; - p.key_ptr += batch_id * p.k_strideB + head_id * p.k_strideH; - p.value_ptr += batch_id * p.v_strideB + head_id * p.v_strideH; - p.output_ptr += int64_t(batch_id * p.num_queries) * p.o_strideM + int64_t(query_start) * p.o_strideM + head_id * p.head_dim_value; + // p.query_ptr += batch_id * p.q_strideB + query_start * p.q_strideM + head_id * p.q_strideH; + p.query_ptr += (q_start + query_start) * p.q_strideM + head_id * p.q_strideH; + + // p.key_ptr += batch_id * p.k_strideB + head_id * p.k_strideH; + p.key_ptr += k_start * p.k_strideM + head_id * p.k_strideH; + + // p.value_ptr += batch_id * p.v_strideB + head_id * p.v_strideH; + p.value_ptr += k_start * p.v_strideM + head_id * p.v_strideH; + + // p.output_ptr += int64_t(batch_id * p.num_queries) * p.o_strideM + int64_t(query_start) * p.o_strideM + // + head_id * p.head_dim_value; + p.output_ptr += int64_t(q_start + query_start) * p.o_strideM + head_id * p.head_dim_value; if (kSupportsBias && p.attn_bias_ptr != nullptr) { p.attn_bias_ptr += (batch_id * p.bias_strideB) + (head_id * p.bias_strideH); } if (p.output_accum_ptr != nullptr) { - p.output_accum_ptr += int64_t(batch_id * p.num_queries) * (p.head_dim_value * p.num_heads) + - int64_t(query_start) * (p.head_dim_value * p.num_heads) + + // p.output_accum_ptr += int64_t(batch_id * p.num_queries) * (p.head_dim_value * p.num_heads) + + // int64_t(query_start) * (p.head_dim_value * p.num_heads) + + // head_id * p.head_dim_value; + p.output_accum_ptr += int64_t(q_start + query_start) * (p.head_dim_value * p.num_heads) + head_id * p.head_dim_value; } else { // Accumulate directly in the destination buffer (eg for f32) @@ -76,11 +109,11 @@ struct RightPaddingBatchHook { } // Custom masking - if (p.causal_diagonal_ptr) { - p.causal_diagonal_offset = p.causal_diagonal_ptr[batch_id]; - } + // if (p.causal_diagonal_ptr) { + // p.causal_diagonal_offset = p.causal_diagonal_ptr[batch_id]; + // } if (p.custom_mask_type == AttentionKernel::CausalFromBottomRight) { - p.causal_diagonal_offset += p.num_keys - 
p.num_queries; + p.causal_diagonal_offset = p.num_keys - p.num_queries; } if (p.custom_mask_type == AttentionKernel::CausalFromTopLeft || p.custom_mask_type == AttentionKernel::CausalFromBottomRight) { @@ -126,8 +159,8 @@ struct RightPaddingBatchHook { p.num_queries = warp_uniform(p.num_queries); p.num_keys = warp_uniform(p.num_keys); p.num_heads = warp_uniform(p.num_heads); - p.head_dim = warp_uniform(p.head_dim); - p.head_dim_value = warp_uniform(p.head_dim_value); + // p.head_dim = warp_uniform(p.head_dim); + // p.head_dim_value = warp_uniform(p.head_dim_value); p.o_strideM = warp_uniform(p.o_strideM); p.custom_mask_type = warp_uniform(p.custom_mask_type); return true; @@ -142,10 +175,14 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) } AK::attention_kernel(p); } +#endif -template +#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK == 0 +template +#else +template +#endif void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { - using Attention = AttentionKernel; typename Attention::Params p; { // set parameters p.query_ptr = const_cast(reinterpret_cast(params.query)); @@ -220,9 +257,12 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { } auto kernel_fn = attention_kernel_batched_impl; + +#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK == 0 if (params.has_custom_right_padding) { kernel_fn = attention_kernel_batched_impl_right_padding; } +#endif int smem_bytes = sizeof(typename Attention::SharedStorage); if (smem_bytes > 0xc000) { @@ -237,20 +277,45 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { kernel_fn<<>>(p); } -template +template +void RunCutlassFmha(const MemoryEfficientAttentionParams& params) { + constexpr bool kSupportsDropout = false; + constexpr bool kSupportsBias = true; + +#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK + if (params.has_custom_right_padding) { + using Attention = AttentionKernel; + LaunchCutlassFmha(params); + } else { + using Attention = AttentionKernel; + LaunchCutlassFmha(params); + } +#else + using Attention = AttentionKernel; + LaunchCutlassFmha(params); +#endif +} + +template void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { - using AlignedAK = AttentionKernel; + using AlignedAK = AttentionKernel; #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(push) #pragma warning(disable : 6287 4189) // kAligned is used via capture so 4189 warning seems incorrect #endif + // Run a more efficient kernel with `isAligned=True` when memory is correctly aligned. 
bool is_aligned = params.qk_head_size % AlignedAK::kAlignmentQ == 0 && params.qk_head_size % AlignedAK::kAlignmentK == 0 && params.v_head_size % AlignedAK::kAlignmentV == 0; + DISPATCH_BOOL(is_aligned, kIsAligned, ([&]() { - LaunchCutlassFmha(params); + RunCutlassFmha(params); })); + #if defined(_MSC_VER) && !defined(__clang__) #pragma warning(pop) #endif @@ -259,11 +324,11 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { template void DispatchBlockSize(const MemoryEfficientAttentionParams& params) { if (params.v_head_size <= 64) { - DispatchIsAligned(params); + DispatchIsAligned(params); } else if (params.v_head_size <= 128) { - DispatchIsAligned(params); + DispatchIsAligned(params); } else { - DispatchIsAligned(params); + DispatchIsAligned(params); } } From 94a3b2e934ca7d82212c326a59ad80c38cc998c0 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 5 Jun 2024 10:41:07 -0700 Subject: [PATCH 04/13] update cgmanifest --- cgmanifests/generated/cgmanifest.json | 2 +- .../github/azure-pipelines/templates/download-deps.yml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 78db7d735dad9..cda0222322d14 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -306,7 +306,7 @@ "component": { "type": "git", "git": { - "commitHash": "6f47420213f757831fae65c686aa471749fa8d60", + "commitHash": "7d49e6c7e2f8896c47f586706e67e1fb215529dc", "repositoryUrl": "https://github.com/NVIDIA/cutlass.git" }, "comments": "cutlass" diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index e7b230008dad4..f97fe5ef751e5 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.156 + version: 1.0.157 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.156 + version: 1.0.157 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. 
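Patch 05/13 below moves the per-operator head-size checks into has_memory_efficient_attention and caps the supported head size at 1024. A minimal standalone sketch of the resulting eligibility gate, mirroring the inline function the patch adds (illustrative only, not part of the diff):

// Sketch of the check added in patch 05/13: memory-efficient attention needs
// SM >= 53 for fp16 (SM >= 50 for fp32), head sizes that are multiples of 8,
// and head sizes no larger than kEfficientAttentionMaxHeadSize.
#include <cstdint>

constexpr int kEfficientAttentionMaxHeadSize = 1024;

inline bool has_memory_efficient_attention(int32_t sm, bool is_half,
                                           int qk_head_size, int v_head_size) {
  return sm >= (is_half ? 53 : 50) &&
         (qk_head_size & 7) == 0 &&   // qk head size must be a multiple of 8
         (v_head_size & 7) == 0 &&    // v head size must be a multiple of 8
         qk_head_size <= kEfficientAttentionMaxHeadSize &&
         v_head_size <= kEfficientAttentionMaxHeadSize;
}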
From fd89bb956224c758da288c984098ea6563651e0f Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 5 Jun 2024 11:12:53 -0700 Subject: [PATCH 05/13] limit max head size = 1024 --- .../contrib_ops/cuda/bert/attention.cc | 4 +--- .../bert/cutlass_fmha/fmha_launch_template.h | 2 +- .../cutlass_fmha/memory_efficient_attention.h | 9 ++++++-- .../cuda/bert/group_query_attention.cc | 3 +-- .../cuda/bert/multihead_attention.cc | 21 +++++++++---------- .../contrib_ops/cuda/bert/packed_attention.cc | 9 ++++---- .../cuda/bert/packed_multihead_attention.cc | 4 +--- 7 files changed, 25 insertions(+), 27 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/attention.cc b/onnxruntime/contrib_ops/cuda/bert/attention.cc index 7a807342ad685..3e6edb162360d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/attention.cc @@ -219,11 +219,9 @@ Status Attention::ComputeInternal(OpKernelContext* context) const { !disable_memory_efficient_attention_ && nullptr == past && nullptr == present && - (parameters.head_size & 7) == 0 && - (parameters.v_head_size & 7) == 0 && (nullptr == mask_index || parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START) && (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && - has_memory_efficient_attention(sm, sizeof(T) == 2); + has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); if (use_memory_efficient_attention) { bool is_good_for_rpb = relative_position_bias != nullptr && parameters.sequence_length % (4 * sizeof(T)) == 0; diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 918e6defe12e4..9ef1cd51e4af0 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -328,7 +328,7 @@ void DispatchBlockSize(const MemoryEfficientAttentionParams& params) { } else if (params.v_head_size <= 128) { DispatchIsAligned(params); } else { - DispatchIsAligned(params); + DispatchIsAligned(params); } } diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h index 484b783db1724..08a562a12b844 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/memory_efficient_attention.h @@ -11,6 +11,8 @@ namespace onnxruntime { namespace contrib { namespace cuda { +constexpr int kEfficientAttentionMaxHeadSize = 1024; + struct MemoryEfficientAttentionParams { int32_t sm; bool is_half; @@ -49,8 +51,11 @@ struct MemoryEfficientAttentionParams { void run_memory_efficient_attention(const MemoryEfficientAttentionParams& params); -inline bool has_memory_efficient_attention(int32_t sm, bool is_half) { - return sm >= (is_half ? 53 : 50); +inline bool has_memory_efficient_attention(int32_t sm, bool is_half, int qk_head_size, int v_head_size) { + return sm >= (is_half ? 
53 : 50) && + (qk_head_size & 7) == 0 && + (v_head_size & 7) == 0 && + qk_head_size <= kEfficientAttentionMaxHeadSize && v_head_size <= kEfficientAttentionMaxHeadSize; } void run_memory_efficient_attention_sm80(const MemoryEfficientAttentionParams& params); diff --git a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc index 3c968d6c8b347..0c26f04edef99 100644 --- a/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/group_query_attention.cc @@ -161,9 +161,8 @@ Status GroupQueryAttention::ComputeInternal(OpKernelContext* context) const { !use_flash_attention && !disable_memory_efficient_attention_ && local_window_size_ == -1 && - (parameters.head_size & 7) == 0 && (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && - has_memory_efficient_attention(sm, sizeof(T) == 2); + has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.head_size); if (!use_flash_attention && !use_memory_efficient_attention && local_window_size_ != -1) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Local attention UNSUPPORTED for sm < 80 on CUDA."); diff --git a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc index 2ef011cdd9a21..5ae7c149fa05c 100644 --- a/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/multihead_attention.cc @@ -235,17 +235,16 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { bool is_good_for_rpb = relative_position_bias != nullptr && parameters.sequence_length % (4 * sizeof(T)) == 0; - bool use_memory_efficient_attention = !use_flash_attention && - fused_runner == nullptr && - fused_cross_attention_kernel == nullptr && - !disable_memory_efficient_attention_ && - (parameters.head_size & 7) == 0 && - (parameters.v_head_size & 7) == 0 && - is_long_sequence && - !past_no_bias && - (relative_position_bias == nullptr || is_good_for_rpb) && - (nullptr == key_padding_mask || parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START) && - has_memory_efficient_attention(sm, sizeof(T) == 2); + bool use_memory_efficient_attention = + !use_flash_attention && + fused_runner == nullptr && + fused_cross_attention_kernel == nullptr && + !disable_memory_efficient_attention_ && + is_long_sequence && + !past_no_bias && + (relative_position_bias == nullptr || is_good_for_rpb) && + (nullptr == key_padding_mask || parameters.mask_type == AttentionMaskType::MASK_1D_KEY_SEQ_LEN_START) && + has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); #else constexpr bool use_memory_efficient_attention = false; #endif diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc index e4b90727121cf..0146cce30c7d1 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_attention.cc @@ -288,11 +288,10 @@ Status PackedAttention::ComputeInternal(OpKernelContext* context) const { if (nullptr == fused_runner) { int sm = device_prop.major * 10 + device_prop.minor; bool is_good_for_rpb = !parameters.has_relative_position_bias || parameters.sequence_length % (4 * sizeof(T)) == 0; - use_memory_efficient_attention = is_good_for_rpb && - sizeof(T) == 2 && // only enable for fp16 - (parameters.head_size & 7) == 0 && - 
(parameters.v_head_size & 7) == 0 && - has_memory_efficient_attention(sm, sizeof(T) == 2); + use_memory_efficient_attention = + is_good_for_rpb && + sizeof(T) == 2 && // only enable for fp16 + has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); } #endif diff --git a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc index 00ab32886112b..3fbbafc01254e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc +++ b/onnxruntime/contrib_ops/cuda/bert/packed_multihead_attention.cc @@ -272,9 +272,7 @@ Status PackedMultiHeadAttention::ComputeInternal(OpKernelContext* context) co use_memory_efficient_attention = is_good_for_rpb && (sizeof(T) == 2 || parameters.sequence_length >= attention::kMinSeqLenForMemoryEfficientAttentionFp32) && - (parameters.head_size & 7) == 0 && - (parameters.v_head_size & 7) == 0 && - has_memory_efficient_attention(sm, sizeof(T) == 2); + has_memory_efficient_attention(sm, sizeof(T) == 2, parameters.head_size, parameters.v_head_size); } #endif From be8f3c658e3b97f9659536979f25c31319818407 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 5 Jun 2024 22:09:25 +0000 Subject: [PATCH 06/13] fix linux build --- .../cuda/bert/flash_attention/flash_fwd_kernel.h | 4 +--- .../cuda/bert/flash_attention/kernel_traits.h | 12 +++++------- .../contrib_ops/cuda/bert/flash_attention/utils.h | 3 +-- .../q4gemm/threadblock/quantb_mma_multistage.h | 2 +- 4 files changed, 8 insertions(+), 13 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h index 028233f66850f..1fac03882b4b1 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h @@ -10,8 +10,7 @@ #endif #include -#include -#include +#include #include #include @@ -98,7 +97,6 @@ inline __device__ void compute_attn_1rowblock(const Params& params, const int bi constexpr int kBlockN = Kernel_traits::kBlockN; constexpr int kHeadDim = Kernel_traits::kHeadDim; constexpr int kNWarps = Kernel_traits::kNWarps; - constexpr int MMA_M = kBlockM / decltype(cute::size<0>(typename Kernel_traits::TiledMma::TiledShape_MNK{}))::value; const BlockInfo binfo(params, bidb); if (m_block * kBlockM >= binfo.actual_seqlen_q || binfo.actual_seqlen_k == 0) return; diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/kernel_traits.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/kernel_traits.h index 1c0ed7f2fc2e8..52a4e56491c5e 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/kernel_traits.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/kernel_traits.h @@ -3,7 +3,7 @@ ******************************************************************************/ #pragma once -#include +#include #include #include @@ -32,10 +32,8 @@ struct Flash_kernel_traits { std::is_same_v, MMA_Atom, MMA_Atom>; - using ValLayoutMNK = cute::Layout>; #else using MMA_Atom_Arch = MMA_Atom; - using ValLayoutMNK = cute::Layout>; #endif #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 750 @@ -77,7 +75,7 @@ struct Flash_fwd_kernel_traits : public Base { using TiledMma = TiledMMA< typename Base::MMA_Atom_Arch, Layout, _1, _1>>, // 4x1x1 or 8x1x1 thread group - typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM + Tile, _16, _16>>; using SmemLayoutAtomQ = 
decltype(composition(Swizzle{}, // This has to be kBlockKSmem, using kHeadDim gives wrong results for d=128 @@ -208,17 +206,17 @@ struct Flash_bwd_kernel_traits : public Base { using TiledMmaSdP = TiledMMA< typename Base::MMA_Atom_Arch, cute::Layout, cute::Int, _1>>, - typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM + Tile, Int<16 * kNWarps / AtomLayoutMSdP>, _16>>; using TiledMmadKV = TiledMMA< typename Base::MMA_Atom_Arch, cute::Layout, cute::Int, _1>>, - typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM + Tile, Int<16 * kNWarps / AtomLayoutNdKV>, _16>>; using TiledMmadQ = TiledMMA< typename Base::MMA_Atom_Arch, cute::Layout, cute::Int, _1>>, // 2x4x1 or 4x2x1 thread group - typename Base::ValLayoutMNK>; // 1x2x1 or 1x2x2 value group for 16x16x16 MMA and LDSM + Tile, Int<16 * kNWarps / AtomLayoutMdQ>, _16>>; using SmemLayoutAtomQdO = decltype(composition(Swizzle{}, cute::Layout>, diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/utils.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/utils.h index 271112c5e890a..7aefd4799bc4d 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/utils.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/utils.h @@ -13,8 +13,7 @@ #include #endif -#include -#include +#include #include #include diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index 28364cc34f2d7..ebb58565b6e84 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -490,7 +490,7 @@ class QuantBMmaMultistage : // accuracy, where each mainloop iteration first accumulates into a temporary // set of freshly-cleared accumulators, which are subsequently added to the // final accumulator set. - static bool const kStagedAccumulation = arch::UseStagedAccumulation::value; + static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation::value; }; private: From 02886c2ec51d681da84afdd959430bc4629caa1d Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 6 Jun 2024 00:30:26 +0000 Subject: [PATCH 07/13] use GQAToBatchHook --- .../cuda/bert/cutlass_fmha/fmha_launch_template.h | 5 +++-- .../cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h | 5 ++++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 9ef1cd51e4af0..a2ce6fbbedf41 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -17,14 +17,15 @@ namespace contrib { namespace cuda { // TODO: remove this flag and unused code after testing. 
-#define USE_MEMORY_EFFICIENT_TO_BATCH_HOOK 0 +#define USE_MEMORY_EFFICIENT_TO_BATCH_HOOK 1 #if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK struct GQAToBatchHook { template CUTLASS_DEVICE static bool advance_to_batch(Params& p, int64_t& q_start, int64_t& k_start) { auto batch_id = blockIdx.z; - q_start = batch_id* p.num_queries const int64_t max_sequence_length = p.v_strideB / p.v_strideM; + q_start = batch_id * p.num_queries; + const int64_t max_sequence_length = p.v_strideB / p.v_strideM; const bool is_kv_bsnh = (p.k_strideH == p.head_dim && p.k_strideM == p.num_heads * p.head_dim); k_start = batch_id * (is_kv_bsnh ? max_sequence_length : p.num_heads * max_sequence_length); return true; diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index ebb58565b6e84..0bc442497deb8 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -490,7 +490,10 @@ class QuantBMmaMultistage : // accuracy, where each mainloop iteration first accumulates into a temporary // set of freshly-cleared accumulators, which are subsequently added to the // final accumulator set. - static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation::value; + + // Change it to false to avoid build error: class "cutlass::arch::OpMultiplyAdd" has no member "ElementA" + // static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation::value; + static bool const kStagedAccumulation = false; }; private: From 0f460d1dad1c14842894b92f41b4260d3b48c095 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 6 Jun 2024 15:49:19 +0000 Subject: [PATCH 08/13] cutlass patch to fix hrsqrt not found for SM<53 --- cmake/external/cutlass.cmake | 1 + cmake/patches/cutlass/cutlass_3.5.0.patch | 25 +++++++++++++++++++++++ 2 files changed, 26 insertions(+) create mode 100644 cmake/patches/cutlass/cutlass_3.5.0.patch diff --git a/cmake/external/cutlass.cmake b/cmake/external/cutlass.cmake index f04f4bec76cd5..1ece2e7a509ba 100644 --- a/cmake/external/cutlass.cmake +++ b/cmake/external/cutlass.cmake @@ -3,6 +3,7 @@ FetchContent_Declare( cutlass URL ${DEP_URL_cutlass} URL_HASH SHA1=${DEP_SHA1_cutlass} + PATCH_COMMAND ${Patch_EXECUTABLE} --binary --ignore-whitespace -p1 < ${PROJECT_SOURCE_DIR}/patches/cutlass/cutlass_3.5.0.patch ) FetchContent_GetProperties(cutlass) diff --git a/cmake/patches/cutlass/cutlass_3.5.0.patch b/cmake/patches/cutlass/cutlass_3.5.0.patch new file mode 100644 index 0000000000000..3b829d2f8b2cf --- /dev/null +++ b/cmake/patches/cutlass/cutlass_3.5.0.patch @@ -0,0 +1,25 @@ +diff --git a/include/cutlass/functional.h b/include/cutlass/functional.h +index 964d2ff3..b366bc14 100644 +--- a/include/cutlass/functional.h ++++ b/include/cutlass/functional.h +@@ -39,6 +39,7 @@ + #include "cutlass/numeric_types.h" + + #include ++#include + + #if defined(CUTLASS_ARCH_WMMA_ENABLED) + #include +@@ -230,8 +231,12 @@ struct inverse_square_root { + CUTLASS_HOST_DEVICE + half_t operator()(half_t const &lhs) const { + #if defined(__CUDA_ARCH__) ++#if (__CUDA_ARCH__ >= 530) + auto result = hrsqrt(reinterpret_cast<__half const &>(lhs)); + return reinterpret_cast(result); ++#else ++ return half_t::convert((rsqrtf(half_t::convert(lhs)))); ++#endif + #else + return half_t(1.f / std::sqrt(half_t::convert(lhs))); + #endif From ed63a78c6dcb06deca38fcac761f8b0fea717c1c Mon Sep 17 00:00:00 2001 From: 
Tianlei Wu Date: Thu, 6 Jun 2024 16:23:26 +0000 Subject: [PATCH 09/13] suppress TRT deprecated warnings --- cmake/onnxruntime_providers_tensorrt.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index e56de0c7124dc..77ae15710e001 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -188,6 +188,10 @@ set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime") target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1) target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT}) + + # Suppress deprecated warnings + target_compile_options(onnxruntime_providers_tensorrt PRIVATE /wd4996) + if (WIN32) target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4456) endif() From 274862bedf60fdd5b7638f048c8f2484d72fda48 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 6 Jun 2024 17:10:28 +0000 Subject: [PATCH 10/13] undo to_batch_hook --- .../bert/cutlass_fmha/fmha_launch_template.h | 72 +++---------------- 1 file changed, 9 insertions(+), 63 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index a2ce6fbbedf41..481f7f67a7e83 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -16,24 +16,6 @@ namespace onnxruntime { namespace contrib { namespace cuda { -// TODO: remove this flag and unused code after testing. -#define USE_MEMORY_EFFICIENT_TO_BATCH_HOOK 1 - -#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK -struct GQAToBatchHook { - template - CUTLASS_DEVICE static bool advance_to_batch(Params& p, int64_t& q_start, int64_t& k_start) { - auto batch_id = blockIdx.z; - q_start = batch_id * p.num_queries; - const int64_t max_sequence_length = p.v_strideB / p.v_strideM; - const bool is_kv_bsnh = (p.k_strideH == p.head_dim && p.k_strideM == p.num_heads * p.head_dim); - k_start = batch_id * (is_kv_bsnh ? max_sequence_length : p.num_heads * max_sequence_length); - return true; - } -}; - -#else - template struct RightPaddingBatchHook { using scalar_t = typename AttentionKernel::scalar_t; @@ -69,34 +51,18 @@ struct RightPaddingBatchHook { return false; } - // TODO: use GroupQueryAttentionToBatchHook - bool is_kv_bsnh = (p.k_strideH == p.head_dim && p.k_strideM == p.num_heads * p.head_dim); - int64_t q_start = batch_id * p.num_queries; - const int64_t max_sequence_length = p.v_strideB / p.v_strideM; - int64_t k_start = batch_id * (is_kv_bsnh ? 
max_sequence_length : p.num_heads * max_sequence_length); - // Advance to the current batch / head / query_start - // p.query_ptr += batch_id * p.q_strideB + query_start * p.q_strideM + head_id * p.q_strideH; - p.query_ptr += (q_start + query_start) * p.q_strideM + head_id * p.q_strideH; - - // p.key_ptr += batch_id * p.k_strideB + head_id * p.k_strideH; - p.key_ptr += k_start * p.k_strideM + head_id * p.k_strideH; - - // p.value_ptr += batch_id * p.v_strideB + head_id * p.v_strideH; - p.value_ptr += k_start * p.v_strideM + head_id * p.v_strideH; - - // p.output_ptr += int64_t(batch_id * p.num_queries) * p.o_strideM + int64_t(query_start) * p.o_strideM - // + head_id * p.head_dim_value; - p.output_ptr += int64_t(q_start + query_start) * p.o_strideM + head_id * p.head_dim_value; + p.query_ptr += batch_id * p.q_strideB + query_start * p.q_strideM + head_id * p.q_strideH; + p.key_ptr += batch_id * p.k_strideB + head_id * p.k_strideH; + p.value_ptr += batch_id * p.v_strideB + head_id * p.v_strideH; + p.output_ptr += int64_t(batch_id * p.num_queries) * p.o_strideM + int64_t(query_start) * p.o_strideM + head_id * p.head_dim_value; if (kSupportsBias && p.attn_bias_ptr != nullptr) { p.attn_bias_ptr += (batch_id * p.bias_strideB) + (head_id * p.bias_strideH); } if (p.output_accum_ptr != nullptr) { - // p.output_accum_ptr += int64_t(batch_id * p.num_queries) * (p.head_dim_value * p.num_heads) + - // int64_t(query_start) * (p.head_dim_value * p.num_heads) + - // head_id * p.head_dim_value; - p.output_accum_ptr += int64_t(q_start + query_start) * (p.head_dim_value * p.num_heads) + + p.output_accum_ptr += int64_t(batch_id * p.num_queries) * (p.head_dim_value * p.num_heads) + + int64_t(query_start) * (p.head_dim_value * p.num_heads) + head_id * p.head_dim_value; } else { // Accumulate directly in the destination buffer (eg for f32) @@ -160,8 +126,8 @@ struct RightPaddingBatchHook { p.num_queries = warp_uniform(p.num_queries); p.num_keys = warp_uniform(p.num_keys); p.num_heads = warp_uniform(p.num_heads); - // p.head_dim = warp_uniform(p.head_dim); - // p.head_dim_value = warp_uniform(p.head_dim_value); + p.head_dim = warp_uniform(p.head_dim); + p.head_dim_value = warp_uniform(p.head_dim_value); p.o_strideM = warp_uniform(p.o_strideM); p.custom_mask_type = warp_uniform(p.custom_mask_type); return true; @@ -176,13 +142,8 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) } AK::attention_kernel(p); } -#endif -#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK == 0 template -#else -template -#endif void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { typename Attention::Params p; { // set parameters @@ -259,11 +220,9 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { auto kernel_fn = attention_kernel_batched_impl; -#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK == 0 if (params.has_custom_right_padding) { kernel_fn = attention_kernel_batched_impl_right_padding; } -#endif int smem_bytes = sizeof(typename Attention::SharedStorage); if (smem_bytes > 0xc000) { @@ -280,24 +239,11 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { template void RunCutlassFmha(const MemoryEfficientAttentionParams& params) { - constexpr bool kSupportsDropout = false; + constexpr bool kSupportsDropout = true; constexpr bool kSupportsBias = true; - -#if USE_MEMORY_EFFICIENT_TO_BATCH_HOOK - if (params.has_custom_right_padding) { - using Attention = AttentionKernel; - LaunchCutlassFmha(params); - } else { - using Attention = AttentionKernel; - LaunchCutlassFmha(params); 
- } -#else using Attention = AttentionKernel; LaunchCutlassFmha(params); -#endif } template From 8d5c4c0888b010000f1690b56d7981ef3ccc46ec Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 6 Jun 2024 23:04:27 +0000 Subject: [PATCH 11/13] suppress trt deprecate warning and clean up --- cmake/onnxruntime_providers_tensorrt.cmake | 8 ++++---- .../bert/cutlass_fmha/fmha_launch_template.h | 19 ++++--------------- .../threadblock/quantb_mma_multistage.h | 8 ++++---- 3 files changed, 12 insertions(+), 23 deletions(-) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 77ae15710e001..0a771554d126a 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -15,7 +15,7 @@ if (WIN32) add_definitions(-D_SILENCE_EXPERIMENTAL_FILESYSTEM_DEPRECATION_WARNING=1) set(OLD_CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4099 /wd4551 /wd4505 /wd4515 /wd4706 /wd4456 /wd4324 /wd4701 /wd4804 /wd4702 /wd4458 /wd4703") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4099 /wd4551 /wd4505 /wd4515 /wd4706 /wd4456 /wd4324 /wd4701 /wd4804 /wd4702 /wd4458 /wd4703 /wd4996") if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4805") endif() @@ -189,10 +189,10 @@ target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1) target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT}) - # Suppress deprecated warnings - target_compile_options(onnxruntime_providers_tensorrt PRIVATE /wd4996) - if (WIN32) + # Suppress deprecated warnings for TRT 10 + target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4996) + target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4456) endif() diff --git a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h index 481f7f67a7e83..a5de20e44be1a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h +++ b/onnxruntime/contrib_ops/cuda/bert/cutlass_fmha/fmha_launch_template.h @@ -75,10 +75,6 @@ struct RightPaddingBatchHook { batch_id * lse_dim * p.num_heads + head_id * lse_dim + query_start; } - // Custom masking - // if (p.causal_diagonal_ptr) { - // p.causal_diagonal_offset = p.causal_diagonal_ptr[batch_id]; - // } if (p.custom_mask_type == AttentionKernel::CausalFromBottomRight) { p.causal_diagonal_offset = p.num_keys - p.num_queries; } @@ -143,8 +139,10 @@ __global__ void __launch_bounds__(AK::kNumThreads, AK::kMinBlocksPerSm) AK::attention_kernel(p); } -template +template void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { + constexpr bool dropout = false; + using Attention = AttentionKernel; typename Attention::Params p; { // set parameters p.query_ptr = const_cast(reinterpret_cast(params.query)); @@ -237,15 +235,6 @@ void LaunchCutlassFmha(const MemoryEfficientAttentionParams& params) { kernel_fn<<>>(p); } -template -void RunCutlassFmha(const MemoryEfficientAttentionParams& params) { - constexpr bool kSupportsDropout = true; - constexpr bool kSupportsBias = true; - using Attention = AttentionKernel; - LaunchCutlassFmha(params); -} - template void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { using AlignedAK = AttentionKernel; @@ -260,7 +249,7 @@ void DispatchIsAligned(const MemoryEfficientAttentionParams& params) { params.v_head_size % AlignedAK::kAlignmentV == 0; DISPATCH_BOOL(is_aligned, 
kIsAligned, ([&]() { - RunCutlassFmha(params); + LaunchCutlassFmha(params); })); #if defined(_MSC_VER) && !defined(__clang__) diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index 0bc442497deb8..6e281241a3427 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -490,10 +490,10 @@ class QuantBMmaMultistage : // accuracy, where each mainloop iteration first accumulates into a temporary // set of freshly-cleared accumulators, which are subsequently added to the // final accumulator set. - - // Change it to false to avoid build error: class "cutlass::arch::OpMultiplyAdd" has no member "ElementA" - // static bool const kStagedAccumulation = arch::detail::UseStagedAccumulation::value; - static bool const kStagedAccumulation = false; + + // Change the following to avoid build error: class "cutlass::arch::OpMultiplyAdd" has no member "ElementA". + // kStagedAccumulation = arch::detail::UseStagedAccumulation::value; + static bool const kStagedAccumulation = false; }; private: From a02c612dabf7aa141a0738c1755aeae56e2fbb5a Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 10 Jun 2024 12:57:46 -0700 Subject: [PATCH 12/13] address review feedback --- cmake/onnxruntime_providers_tensorrt.cmake | 6 +----- onnxruntime/core/providers/tensorrt/nv_includes.h | 7 +++---- onnxruntime/test/unittest_main/test_main.cc | 1 + 3 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cmake/onnxruntime_providers_tensorrt.cmake b/cmake/onnxruntime_providers_tensorrt.cmake index 0a771554d126a..e56de0c7124dc 100644 --- a/cmake/onnxruntime_providers_tensorrt.cmake +++ b/cmake/onnxruntime_providers_tensorrt.cmake @@ -15,7 +15,7 @@ if (WIN32) add_definitions(-D_SILENCE_EXPERIMENTAL_FILESYSTEM_DEPRECATION_WARNING=1) set(OLD_CMAKE_CUDA_FLAGS ${CMAKE_CUDA_FLAGS}) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4099 /wd4551 /wd4505 /wd4515 /wd4706 /wd4456 /wd4324 /wd4701 /wd4804 /wd4702 /wd4458 /wd4703 /wd4996") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4099 /wd4551 /wd4505 /wd4515 /wd4706 /wd4456 /wd4324 /wd4701 /wd4804 /wd4702 /wd4458 /wd4703") if (CMAKE_BUILD_TYPE STREQUAL "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /wd4805") endif() @@ -188,11 +188,7 @@ set_target_properties(onnxruntime_providers_tensorrt PROPERTIES FOLDER "ONNXRuntime") target_compile_definitions(onnxruntime_providers_tensorrt PRIVATE ONNXIFI_BUILD_LIBRARY=1) target_compile_options(onnxruntime_providers_tensorrt PRIVATE ${DISABLED_WARNINGS_FOR_TRT}) - if (WIN32) - # Suppress deprecated warnings for TRT 10 - target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4996) - target_compile_options(onnxruntime_providers_tensorrt INTERFACE /wd4456) endif() diff --git a/onnxruntime/core/providers/tensorrt/nv_includes.h b/onnxruntime/core/providers/tensorrt/nv_includes.h index c3e9f7a3a2a77..047f325f49b70 100644 --- a/onnxruntime/core/providers/tensorrt/nv_includes.h +++ b/onnxruntime/core/providers/tensorrt/nv_includes.h @@ -2,12 +2,11 @@ // Licensed under the MIT License. #pragma once -// File to include the required TRT headers with workarounds for warnings we can't fix. - -// Ignore warning C4100: unreferenced formal parameter +// File to include the required TRT headers with workarounds for warnings we can't fix or not fixed yet. 
#if defined(_MSC_VER) #pragma warning(push) -#pragma warning(disable : 4100) +#pragma warning(disable : 4100) // Ignore warning C4100: unreferenced formal parameter +#pragma warning(disable : 4996) // Ignore warning C4996: 'nvinfer1::IPluginV2' was declared deprecated #endif #include diff --git a/onnxruntime/test/unittest_main/test_main.cc b/onnxruntime/test/unittest_main/test_main.cc index d7e8bf9063645..b7c3b38538421 100644 --- a/onnxruntime/test/unittest_main/test_main.cc +++ b/onnxruntime/test/unittest_main/test_main.cc @@ -36,6 +36,7 @@ void ortenv_setup() { #if defined(_MSC_VER) #pragma warning(push) #pragma warning(disable : 4100) // Ignore warning C4100: unreferenced format parameter. +#pragma warning(disable : 4996) // Ignore warning C4996: 'nvinfer1::IPluginV2' was declared deprecated #endif // TensorRT will load/unload libraries as builder objects are created and torn down. This will happen for From 415c5e1083895f4f5df0ce27a5a4e2cafdd6e517 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 10 Jun 2024 15:13:56 -0700 Subject: [PATCH 13/13] fix more 4996 warnings --- .../tensorrt/tensorrt_execution_provider.cc | 22 +++++++++++++++++++ .../tensorrt_execution_provider_custom_ops.cc | 10 +++++++++ 2 files changed, 32 insertions(+) diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index dff74a404a456..9c2db494f0e41 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -3142,7 +3142,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } + +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) // nvinfer1::ICudaEngine::createExecutionContextWithoutDeviceMemory was deprecated +#endif trt_context = std::unique_ptr(trt_engine->createExecutionContextWithoutDeviceMemory()); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif } else { trt_context = std::unique_ptr(trt_engine->createExecutionContext()); } @@ -3588,8 +3596,15 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromGraph(const GraphView if (context_update) { if (trt_state->context_memory_sharing_enable) { +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) // nvinfer1::ICudaEngine::createExecutionContextWithoutDeviceMemory was deprecated +#endif *(trt_state->context) = std::unique_ptr( trt_state->engine->get()->createExecutionContextWithoutDeviceMemory()); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif } else { *(trt_state->context) = std::unique_ptr( trt_state->engine->get()->createExecutionContext()); @@ -3805,7 +3820,14 @@ Status TensorrtExecutionProvider::CreateNodeComputeInfoFromPrecompiledEngine(con if (mem_size > max_ctx_mem_size_) { max_ctx_mem_size_ = mem_size; } +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) // nvinfer1::ICudaEngine::createExecutionContextWithoutDeviceMemory was deprecated +#endif trt_context = std::unique_ptr(trt_engine->createExecutionContextWithoutDeviceMemory()); +#if defined(_MSC_VER) +#pragma warning(pop) +#endif } else { trt_context = std::unique_ptr(trt_engine->createExecutionContext()); } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc index 58a1afd005563..a4d2d6c9d65f3 100644 --- 
a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_custom_ops.cc @@ -60,6 +60,11 @@ common::Status CreateTensorRTCustomOpDomainList(std::vector& TensorrtLogger trt_logger = GetTensorrtLogger(false); initLibNvInferPlugins(&trt_logger, ""); +#if defined(_MSC_VER) +#pragma warning(push) +#pragma warning(disable : 4996) // Ignore warning C4996: 'nvinfer1::*' was declared deprecated +#endif + int num_plugin_creator = 0; auto plugin_creators = getPluginRegistry()->getPluginCreatorList(&num_plugin_creator); std::unordered_set registered_plugin_names; @@ -79,6 +84,11 @@ common::Status CreateTensorRTCustomOpDomainList(std::vector& custom_op_domain->custom_ops_.push_back(created_custom_op_list.back().get()); registered_plugin_names.insert(plugin_name); } + +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + custom_op_domain->domain_ = "trt.plugins"; domain_list.push_back(custom_op_domain.get()); } catch (const std::exception&) {
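Patches 12/13 and 13/13 drop the project-wide /wd4996 compile option in favor of #pragma warning push/disable/pop blocks around each deprecated nvinfer1 (TensorRT 10) usage. A minimal sketch of that localized suppression pattern, using one of the calls the patch wraps (the helper function itself is illustrative, not from the diff):

// Wrap only the deprecated TensorRT call so C4996 stays enabled elsewhere.
#include <NvInfer.h>

nvinfer1::IExecutionContext* CreateContextWithoutDeviceMemory(nvinfer1::ICudaEngine& engine) {
#if defined(_MSC_VER)
#pragma warning(push)
#pragma warning(disable : 4996)  // createExecutionContextWithoutDeviceMemory is deprecated in TensorRT 10
#endif
  return engine.createExecutionContextWithoutDeviceMemory();
#if defined(_MSC_VER)
#pragma warning(pop)
#endif
}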