From 07e36ffa2f1d0036fdc84be12a2665b77ebe5ff6 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Fri, 11 Aug 2023 15:30:19 +0000 Subject: [PATCH 01/17] Hide triple chevron and CUB kernels --- .../dispatch/dispatch_adjacent_difference.cuh | 22 ++---- .../device/dispatch/dispatch_batch_memcpy.cuh | 41 +++++------ .../device/dispatch/dispatch_histogram.cuh | 6 +- .../device/dispatch/dispatch_merge_sort.cuh | 68 +++++++++---------- .../device/dispatch/dispatch_radix_sort.cuh | 22 +++--- cub/cub/device/dispatch/dispatch_reduce.cuh | 42 ++++++------ .../dispatch/dispatch_reduce_by_key.cuh | 3 +- cub/cub/device/dispatch/dispatch_rle.cuh | 18 ++--- cub/cub/device/dispatch/dispatch_scan.cuh | 22 +++--- .../device/dispatch/dispatch_scan_by_key.cuh | 34 +++++----- .../dispatch/dispatch_segmented_sort.cuh | 30 ++++---- .../device/dispatch/dispatch_select_if.cuh | 20 +++--- .../device/dispatch/dispatch_spmv_orig.cuh | 55 +++++++-------- .../dispatch/dispatch_three_way_partition.cuh | 10 +-- .../dispatch/dispatch_unique_by_key.cuh | 2 +- cub/cub/util_device.cuh | 2 +- cub/cub/util_macro.cuh | 5 ++ .../cuda/detail/core/triple_chevron_launch.h | 5 +- 18 files changed, 203 insertions(+), 204 deletions(-) diff --git a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh index 25f5c25e7f4..b005f361d7f 100644 --- a/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh +++ b/cub/cub/device/dispatch/dispatch_adjacent_difference.cuh @@ -42,22 +42,14 @@ CUB_NAMESPACE_BEGIN - -template -void __global__ DeviceAdjacentDifferenceInitKernel(InputIteratorT first, - InputT *result, - OffsetT num_tiles, - int items_per_tile) +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceInitKernel(InputIteratorT first, + InputT *result, + OffsetT num_tiles, + int items_per_tile) { const int tile_idx = static_cast(blockIdx.x * blockDim.x + threadIdx.x); - AgentDifferenceInitT::Process(tile_idx, - first, - result, - num_tiles, - items_per_tile); + AgentDifferenceInitT::Process(tile_idx, first, result, num_tiles, items_per_tile); } template -void __global__ +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceAdjacentDifferenceDifferenceKernel(InputIteratorT input, InputT *first_tile_previous, OutputIteratorT result, diff --git a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh index 4dfddce59e2..04384ae0451 100644 --- a/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh +++ b/cub/cub/device/dispatch/dispatch_batch_memcpy.cuh @@ -70,9 +70,10 @@ struct AgentBatchMemcpyLargeBuffersPolicy template -__global__ void InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state, - BlockOffsetScanTileStateT block_offset_scan_tile_state, - TileOffsetT num_tiles) +CUB_DETAIL_KERNEL_ATTRIBUTES void +InitTileStateKernel(BufferOffsetScanTileStateT buffer_offset_scan_tile_state, + BlockOffsetScanTileStateT block_offset_scan_tile_state, + TileOffsetT num_tiles) { // Initialize tile status buffer_offset_scan_tile_state.InitializeStatus(num_tiles); @@ -93,12 +94,13 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT::BLOCK_THREADS)) - __global__ void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - BufferTileOffsetItT buffer_tile_offsets, - TileT buffer_offset_tile, - TileOffsetT last_tile_offset) + CUB_DETAIL_KERNEL_ATTRIBUTES + void MultiBlockBatchMemcpyKernel(InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + BufferTileOffsetItT buffer_tile_offsets, + TileT buffer_offset_tile, + TileOffsetT last_tile_offset) { using StatusWord = typename TileT::StatusWord; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::AgentLargeBufferPolicyT; @@ -219,16 +221,17 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentSmallBufferPolicyT::BLOCK_THREADS)) - __global__ void BatchMemcpyKernel(InputBufferIt input_buffer_it, - OutputBufferIt output_buffer_it, - BufferSizeIteratorT buffer_sizes, - BufferOffsetT num_buffers, - BlevBufferSrcsOutItT blev_buffer_srcs, - BlevBufferDstsOutItT blev_buffer_dsts, - BlevBufferSizesOutItT blev_buffer_sizes, - BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, - BLevBufferOffsetTileState blev_buffer_scan_state, - BLevBlockOffsetTileState blev_block_scan_state) + CUB_DETAIL_KERNEL_ATTRIBUTES + void BatchMemcpyKernel(InputBufferIt input_buffer_it, + OutputBufferIt output_buffer_it, + BufferSizeIteratorT buffer_sizes, + BufferOffsetT num_buffers, + BlevBufferSrcsOutItT blev_buffer_srcs, + BlevBufferDstsOutItT blev_buffer_dsts, + BlevBufferSizesOutItT blev_buffer_sizes, + BlevBufferTileOffsetsOutItT blev_buffer_tile_offsets, + BLevBufferOffsetTileState blev_buffer_scan_state, + BLevBlockOffsetTileState blev_block_scan_state) { // Internal type used for storing a buffer's size using BufferSizeT = cub::detail::value_t; diff --git a/cub/cub/device/dispatch/dispatch_histogram.cuh b/cub/cub/device/dispatch/dispatch_histogram.cuh index b0b8d6fa879..400ea363051 100644 --- a/cub/cub/device/dispatch/dispatch_histogram.cuh +++ b/cub/cub/device/dispatch/dispatch_histogram.cuh @@ -89,7 +89,7 @@ CUB_NAMESPACE_BEGIN * Drain queue descriptor for dynamically mapping tile data onto thread blocks */ template -__global__ void +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramInitKernel(ArrayWrapper num_output_bins_wrapper, ArrayWrapper d_output_histograms_wrapper, GridQueue tile_queue) @@ -193,8 +193,8 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) __global__ - void DeviceHistogramSweepKernel( +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::AgentHistogramPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceHistogramSweepKernel( SampleIteratorT d_samples, ArrayWrapper num_output_bins_wrapper, ArrayWrapper num_privatized_bins_wrapper, diff --git a/cub/cub/device/dispatch/dispatch_merge_sort.cuh b/cub/cub/device/dispatch/dispatch_merge_sort.cuh index 7b6c73a1ddf..015a25ed01a 100644 --- a/cub/cub/device/dispatch/dispatch_merge_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_merge_sort.cuh @@ -38,7 +38,6 @@ CUB_NAMESPACE_BEGIN - template -void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) -DeviceMergeSortBlockSortKernel(bool ping, - KeyInputIteratorT keys_in, - ValueInputIteratorT items_in, - KeyIteratorT keys_out, - ValueIteratorT items_out, - OffsetT keys_count, - KeyT *tmp_keys_out, - ValueT *tmp_items_out, - CompareOpT compare_op, - char *vshmem) +__launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortBlockSortKernel(bool ping, + KeyInputIteratorT keys_in, + ValueInputIteratorT items_in, + KeyIteratorT keys_out, + ValueIteratorT items_out, + OffsetT keys_count, + KeyT *tmp_keys_out, + ValueT *tmp_items_out, + CompareOpT compare_op, + char *vshmem) { extern __shared__ char shmem[]; using ActivePolicyT = typename ChainedPolicyT::ActivePolicy::MergeSortPolicy; @@ -95,19 +94,16 @@ DeviceMergeSortBlockSortKernel(bool ping, agent.Process(); } -template -__global__ void DeviceMergeSortPartitionKernel(bool ping, - KeyIteratorT keys_ping, - KeyT *keys_pong, - OffsetT keys_count, - OffsetT num_partitions, - OffsetT *merge_partitions, - CompareOpT compare_op, - OffsetT target_merged_tiles_number, - int items_per_tile) +template +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortPartitionKernel(bool ping, + KeyIteratorT keys_ping, + KeyT *keys_pong, + OffsetT keys_count, + OffsetT num_partitions, + OffsetT *merge_partitions, + CompareOpT compare_op, + OffsetT target_merged_tiles_number, + int items_per_tile) { OffsetT partition_idx = blockDim.x * blockIdx.x + threadIdx.x; @@ -136,17 +132,17 @@ template -void __global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) -DeviceMergeSortMergeKernel(bool ping, - KeyIteratorT keys_ping, - ValueIteratorT items_ping, - OffsetT keys_count, - KeyT *keys_pong, - ValueT *items_pong, - CompareOpT compare_op, - OffsetT *merge_partitions, - OffsetT target_merged_tiles_number, - char *vshmem) +__launch_bounds__(ChainedPolicyT::ActivePolicy::MergeSortPolicy::BLOCK_THREADS) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceMergeSortMergeKernel(bool ping, + KeyIteratorT keys_ping, + ValueIteratorT items_ping, + OffsetT keys_count, + KeyT *keys_pong, + ValueT *items_pong, + CompareOpT compare_op, + OffsetT *merge_partitions, + OffsetT target_merged_tiles_number, + char *vshmem) { extern __shared__ char shmem[]; diff --git a/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/cub/device/dispatch/dispatch_radix_sort.cuh index 9a75e6fe0bb..d5d2ef93b79 100644 --- a/cub/cub/device/dispatch/dispatch_radix_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_radix_sort.cuh @@ -33,9 +33,6 @@ #pragma once -#include -#include - #include #include #include @@ -52,6 +49,9 @@ #include +#include +#include + // suppress warnings triggered by #pragma unroll: // "warning: loop not unrolled: the optimizer was unable to perform the requested transformation; the transformation might be disabled or specified as part of an unsupported transformation ordering [-Wpass-failed=transform-warning]" #if defined(__clang__) @@ -79,7 +79,7 @@ template < __launch_bounds__ (int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltUpsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::UpsweepPolicy::BLOCK_THREADS))) -__global__ void DeviceRadixSortUpsweepKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortUpsweepKernel( const KeyT *d_keys, ///< [in] Input keys buffer OffsetT *d_spine, ///< [out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) OffsetT /*num_items*/, ///< [in] Total number of input data items @@ -138,7 +138,7 @@ template < typename ChainedPolicyT, ///< Chained tuning policy typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::ScanPolicy::BLOCK_THREADS), 1) -__global__ void RadixSortScanBinsKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void RadixSortScanBinsKernel( OffsetT *d_spine, ///< [in,out] Privatized (per block) digit histograms (striped, i.e., 0s counts from each block, then 1s counts from each block, etc.) int num_counts) ///< [in] Total number of bin-counts { @@ -191,7 +191,7 @@ template < __launch_bounds__ (int((ALT_DIGIT_BITS) ? int(ChainedPolicyT::ActivePolicy::AltDownsweepPolicy::BLOCK_THREADS) : int(ChainedPolicyT::ActivePolicy::DownsweepPolicy::BLOCK_THREADS))) -__global__ void DeviceRadixSortDownsweepKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortDownsweepKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer @@ -255,7 +255,7 @@ template < typename OffsetT, ///< Signed integer type for global offsets typename DecomposerT = detail::identity_decomposer_t> __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceRadixSortSingleTileKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortSingleTileKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer @@ -380,7 +380,7 @@ template < __launch_bounds__ (int((ALT_DIGIT_BITS) ? ChainedPolicyT::ActivePolicy::AltSegmentedPolicy::BLOCK_THREADS : ChainedPolicyT::ActivePolicy::SegmentedPolicy::BLOCK_THREADS)) -__global__ void DeviceSegmentedRadixSortKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedRadixSortKernel( const KeyT *d_keys_in, ///< [in] Input keys buffer KeyT *d_keys_out, ///< [in] Output keys buffer const ValueT *d_values_in, ///< [in] Input values buffer @@ -552,7 +552,7 @@ template -__global__ __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) +CUB_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(ChainedPolicyT::ActivePolicy::HistogramPolicy::BLOCK_THREADS) void DeviceRadixSortHistogramKernel(OffsetT *d_bins_out, const KeyT *d_keys_in, OffsetT num_items, @@ -576,7 +576,7 @@ template < typename PortionOffsetT, typename AtomicOffsetT = PortionOffsetT, typename DecomposerT = detail::identity_decomposer_t> -__global__ void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) +CUB_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(ChainedPolicyT::ActivePolicy::OnesweepPolicy::BLOCK_THREADS) DeviceRadixSortOnesweepKernel (AtomicOffsetT* d_lookback, AtomicOffsetT* d_ctrs, OffsetT* d_bins_out, const OffsetT* d_bins_in, KeyT* d_keys_out, const KeyT* d_keys_in, ValueT* d_values_out, @@ -600,7 +600,7 @@ DeviceRadixSortOnesweepKernel template < typename ChainedPolicyT, typename OffsetT> -__global__ void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRadixSortExclusiveSumKernel(OffsetT* d_bins) { typedef typename ChainedPolicyT::ActivePolicy::ExclusiveSumPolicy ExclusiveSumPolicyT; const int RADIX_BITS = ExclusiveSumPolicyT::RADIX_BITS; diff --git a/cub/cub/device/dispatch/dispatch_reduce.cuh b/cub/cub/device/dispatch/dispatch_reduce.cuh index 698ce0552e8..2dbcdc76fdd 100644 --- a/cub/cub/device/dispatch/dispatch_reduce.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce.cuh @@ -153,12 +153,12 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceReduceKernel(InputIteratorT d_in, - AccumT* d_out, - OffsetT num_items, - GridEvenShare even_share, - ReductionOpT reduction_op) +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceKernel(InputIteratorT d_in, + AccumT *d_out, + OffsetT num_items, + GridEvenShare even_share, + ReductionOpT reduction_op) { // Thread block type for reducing input tiles using AgentReduceT = @@ -232,12 +232,12 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) -__global__ void DeviceReduceSingleTileKernel(InputIteratorT d_in, - OutputIteratorT d_out, - OffsetT num_items, - ReductionOpT reduction_op, - InitT init) +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SingleTilePolicy::BLOCK_THREADS), 1) // + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceSingleTileKernel(InputIteratorT d_in, + OutputIteratorT d_out, + OffsetT num_items, + ReductionOpT reduction_op, + InitT init) { // Thread block type for reducing input tiles using AgentReduceT = @@ -358,15 +358,15 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) -__global__ void DeviceSegmentedReduceKernel( - InputIteratorT d_in, - OutputIteratorT d_out, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets, - int /*num_segments*/, - ReductionOpT reduction_op, - InitT init) +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReducePolicy::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceSegmentedReduceKernel(InputIteratorT d_in, + OutputIteratorT d_out, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets, + int /*num_segments*/, + ReductionOpT reduction_op, + InitT init) { // Thread block type for reducing input tiles using AgentReduceT = diff --git a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh index a29c1376a41..5040e39f7b3 100644 --- a/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_reduce_by_key.cuh @@ -131,7 +131,8 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS)) __global__ +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ReduceByKeyPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceReduceByKeyKernel(KeysInputIteratorT d_keys_in, UniqueOutputIteratorT d_unique_out, ValuesInputIteratorT d_values_in, diff --git a/cub/cub/device/dispatch/dispatch_rle.cuh b/cub/cub/device/dispatch/dispatch_rle.cuh index 06c6fc90f7b..4401a7c1240 100644 --- a/cub/cub/device/dispatch/dispatch_rle.cuh +++ b/cub/cub/device/dispatch/dispatch_rle.cuh @@ -119,15 +119,15 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) __global__ - void DeviceRleSweepKernel(InputIteratorT d_in, - OffsetsOutputIteratorT d_offsets_out, - LengthsOutputIteratorT d_lengths_out, - NumRunsOutputIteratorT d_num_runs_out, - ScanTileStateT tile_status, - EqualityOpT equality_op, - OffsetT num_items, - int num_tiles) +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::RleSweepPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceRleSweepKernel(InputIteratorT d_in, + OffsetsOutputIteratorT d_offsets_out, + LengthsOutputIteratorT d_lengths_out, + NumRunsOutputIteratorT d_num_runs_out, + ScanTileStateT tile_status, + EqualityOpT equality_op, + OffsetT num_items, + int num_tiles) { using AgentRlePolicyT = typename ChainedPolicyT::ActivePolicy::RleSweepPolicyT; diff --git a/cub/cub/device/dispatch/dispatch_scan.cuh b/cub/cub/device/dispatch/dispatch_scan.cuh index f16f1c0fd96..6893f24e1dc 100644 --- a/cub/cub/device/dispatch/dispatch_scan.cuh +++ b/cub/cub/device/dispatch/dispatch_scan.cuh @@ -68,7 +68,7 @@ CUB_NAMESPACE_BEGIN * Number of tiles */ template -__global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); @@ -94,9 +94,9 @@ __global__ void DeviceScanInitKernel(ScanTileStateT tile_state, int num_tiles) * (i.e., length of `d_selected_out`) */ template -__global__ void DeviceCompactInitKernel(ScanTileStateT tile_state, - int num_tiles, - NumSelectedIteratorT d_num_selected_out) +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceCompactInitKernel(ScanTileStateT tile_state, + int num_tiles, + NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); @@ -165,13 +165,13 @@ template __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanPolicyT::BLOCK_THREADS)) - __global__ void DeviceScanKernel(InputIteratorT d_in, - OutputIteratorT d_out, - ScanTileStateT tile_state, - int start_tile, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanKernel(InputIteratorT d_in, + OutputIteratorT d_out, + ScanTileStateT tile_state, + int start_tile, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items) { using RealInitValueT = typename InitValueT::value_type; typedef typename ChainedPolicyT::ActivePolicy::ScanPolicyT ScanPolicyT; diff --git a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh index b70e49be272..62df5c6b913 100644 --- a/cub/cub/device/dispatch/dispatch_scan_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_scan_by_key.cuh @@ -124,17 +124,17 @@ template > -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) -__global__ void DeviceScanByKeyKernel(KeysInputIteratorT d_keys_in, - KeyT *d_keys_prev_in, - ValuesInputIteratorT d_values_in, - ValuesOutputIteratorT d_values_out, - ScanByKeyTileStateT tile_state, - int start_tile, - EqualityOp equality_op, - ScanOpT scan_op, - InitValueT init_value, - OffsetT num_items) +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceScanByKeyKernel(KeysInputIteratorT d_keys_in, + KeyT *d_keys_prev_in, + ValuesInputIteratorT d_values_in, + ValuesOutputIteratorT d_values_out, + ScanByKeyTileStateT tile_state, + int start_tile, + EqualityOp equality_op, + ScanOpT scan_op, + InitValueT init_value, + OffsetT num_items) { using ScanByKeyPolicyT = typename ChainedPolicyT::ActivePolicy::ScanByKeyPolicyT; @@ -166,12 +166,12 @@ __global__ void DeviceScanByKeyKernel(KeysInputIteratorT d_keys_in, } template -__global__ void DeviceScanByKeyInitKernel( - ScanTileStateT tile_state, - KeysInputIteratorT d_keys_in, - cub::detail::value_t *d_keys_prev_in, - unsigned items_per_tile, - int num_tiles) +CUB_DETAIL_KERNEL_ATTRIBUTES void +DeviceScanByKeyInitKernel(ScanTileStateT tile_state, + KeysInputIteratorT d_keys_in, + cub::detail::value_t *d_keys_prev_in, + unsigned items_per_tile, + int num_tiles) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh index 2eec9290bb9..8cc2d01697f 100644 --- a/cub/cub/device/dispatch/dispatch_segmented_sort.cuh +++ b/cub/cub/device/dispatch/dispatch_segmented_sort.cuh @@ -104,7 +104,7 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) -__global__ void DeviceSegmentedSortFallbackKernel( + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortFallbackKernel( const KeyT *d_keys_in_orig, KeyT *d_keys_out_orig, cub::detail::device_double_buffer d_keys_double_buffer, @@ -299,18 +299,18 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::SmallAndMediumSegmentedSortPolicyT::BLOCK_THREADS) -__global__ void DeviceSegmentedSortKernelSmall( - unsigned int small_segments, - unsigned int medium_segments, - unsigned int medium_blocks, - const unsigned int *d_small_segments_indices, - const unsigned int *d_medium_segments_indices, - const KeyT *d_keys_in, - KeyT *d_keys_out, - const ValueT *d_values_in, - ValueT *d_values_out, - BeginOffsetIteratorT d_begin_offsets, - EndOffsetIteratorT d_end_offsets) + CUB_DETAIL_KERNEL_ATTRIBUTES + void DeviceSegmentedSortKernelSmall(unsigned int small_segments, + unsigned int medium_segments, + unsigned int medium_blocks, + const unsigned int *d_small_segments_indices, + const unsigned int *d_medium_segments_indices, + const KeyT *d_keys_in, + KeyT *d_keys_out, + const ValueT *d_values_in, + ValueT *d_values_out, + BeginOffsetIteratorT d_begin_offsets, + EndOffsetIteratorT d_end_offsets) { const unsigned int tid = threadIdx.x; const unsigned int bid = blockIdx.x; @@ -428,7 +428,7 @@ template __launch_bounds__(ChainedPolicyT::ActivePolicy::LargeSegmentPolicy::BLOCK_THREADS) -__global__ void DeviceSegmentedSortKernelLarge( + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortKernelLarge( const unsigned int *d_segments_indices, const KeyT *d_keys_in_orig, KeyT *d_keys_out_orig, @@ -687,7 +687,7 @@ template -__launch_bounds__(1) __global__ void +__launch_bounds__(1) CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentedSortContinuationKernel( LargeKernelT large_kernel, SmallKernelT small_kernel, diff --git a/cub/cub/device/dispatch/dispatch_select_if.cuh b/cub/cub/device/dispatch/dispatch_select_if.cuh index 56fa86e2ad9..6d7dba3186a 100644 --- a/cub/cub/device/dispatch/dispatch_select_if.cuh +++ b/cub/cub/device/dispatch/dispatch_select_if.cuh @@ -131,16 +131,16 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREADS)) __global__ - void DeviceSelectSweepKernel(InputIteratorT d_in, - FlagsInputIteratorT d_flags, - SelectedOutputIteratorT d_selected_out, - NumSelectedIteratorT d_num_selected_out, - ScanTileStateT tile_status, - SelectOpT select_op, - EqualityOpT equality_op, - OffsetT num_items, - int num_tiles) +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::SelectIfPolicyT::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSelectSweepKernel(InputIteratorT d_in, + FlagsInputIteratorT d_flags, + SelectedOutputIteratorT d_selected_out, + NumSelectedIteratorT d_num_selected_out, + ScanTileStateT tile_status, + SelectOpT select_op, + EqualityOpT equality_op, + OffsetT num_items, + int num_tiles) { using AgentSelectIfPolicyT = typename ChainedPolicyT::ActivePolicy::SelectIfPolicyT; diff --git a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh index c38c4bfb488..227c2a42caf 100644 --- a/cub/cub/device/dispatch/dispatch_spmv_orig.cuh +++ b/cub/cub/device/dispatch/dispatch_spmv_orig.cuh @@ -64,37 +64,33 @@ CUB_NAMESPACE_BEGIN /** * Spmv search kernel. Identifies merge path starting coordinates for each tile. */ -template < - typename AgentSpmvPolicyT, ///< Parameterized SpmvPolicy tuning policy type - typename ValueT, ///< Matrix and vector value type - typename OffsetT> ///< Signed integer type for sequence offsets -__global__ void DeviceSpmv1ColKernel( - SpmvParams spmv_params) ///< [in] SpMV input parameter bundle +template ///< Signed integer type for sequence offsets +CUB_DETAIL_KERNEL_ATTRIBUTES void +DeviceSpmv1ColKernel(SpmvParams spmv_params) ///< [in] SpMV input parameter bundle { - typedef CacheModifiedInputIterator< - AgentSpmvPolicyT::VECTOR_VALUES_LOAD_MODIFIER, - ValueT, - OffsetT> - VectorValueIteratorT; - - VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); + typedef CacheModifiedInputIterator + VectorValueIteratorT; - int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; - if (row_idx < spmv_params.num_rows) - { - OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; - OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; + VectorValueIteratorT wrapped_vector_x(spmv_params.d_vector_x); - ValueT value = 0.0; - if (end_nonzero_idx != nonzero_idx) - { - value = spmv_params.d_values[nonzero_idx] * wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; - } + int row_idx = (blockIdx.x * blockDim.x) + threadIdx.x; + if (row_idx < spmv_params.num_rows) + { + OffsetT end_nonzero_idx = spmv_params.d_row_end_offsets[row_idx]; + OffsetT nonzero_idx = spmv_params.d_row_end_offsets[row_idx - 1]; - spmv_params.d_vector_y[row_idx] = value; + ValueT value = 0.0; + if (end_nonzero_idx != nonzero_idx) + { + value = spmv_params.d_values[nonzero_idx] * + wrapped_vector_x[spmv_params.d_column_indices[nonzero_idx]]; } -} + spmv_params.d_vector_y[row_idx] = value; + } +} /** * Spmv search kernel. Identifies merge path starting coordinates for each tile. @@ -104,7 +100,7 @@ template < typename OffsetT, ///< Signed integer type for sequence offsets typename CoordinateT, ///< Merge path coordinate type typename SpmvParamsT> ///< SpmvParams type -__global__ void DeviceSpmvSearchKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvSearchKernel( int num_merge_tiles, ///< [in] Number of SpMV merge tiles (spmv grid size) CoordinateT* d_tile_coordinates, ///< [out] Pointer to the temporary array of tile starting coordinates SpmvParamsT spmv_params) ///< [in] SpMV input parameter bundle @@ -158,7 +154,7 @@ template < bool HAS_ALPHA, ///< Whether the input parameter Alpha is 1 bool HAS_BETA> ///< Whether the input parameter Beta is 0 __launch_bounds__ (int(SpmvPolicyT::BLOCK_THREADS)) -__global__ void DeviceSpmvKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSpmvKernel( SpmvParams spmv_params, ///< [in] SpMV input parameter bundle CoordinateT* d_tile_coordinates, ///< [in] Pointer to the temporary array of tile starting coordinates KeyValuePair* d_tile_carry_pairs, ///< [out] Pointer to the temporary array carry-out dot product row-ids, one per block @@ -191,7 +187,8 @@ __global__ void DeviceSpmvKernel( template ///< Whether the input parameter Beta is 0 -__global__ void DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) +CUB_DETAIL_KERNEL_ATTRIBUTES void +DeviceSpmvEmptyMatrixKernel(SpmvParams spmv_params) { const int row = static_cast(threadIdx.x + blockIdx.x * blockDim.x); @@ -218,7 +215,7 @@ template < typename OffsetT, ///< Signed integer type for global offsets typename ScanTileStateT> ///< Tile status interface type __launch_bounds__ (int(AgentSegmentFixupPolicyT::BLOCK_THREADS)) -__global__ void DeviceSegmentFixupKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceSegmentFixupKernel( PairsInputIteratorT d_pairs_in, ///< [in] Pointer to the array carry-out dot product row-ids, one per spmv block AggregatesOutputIteratorT d_aggregates_out, ///< [in,out] Output value aggregates OffsetT num_items, ///< [in] Total number of items to select from diff --git a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh index 2277956e24c..52f8dec7cde 100644 --- a/cub/cub/device/dispatch/dispatch_three_way_partition.cuh +++ b/cub/cub/device/dispatch/dispatch_three_way_partition.cuh @@ -59,7 +59,8 @@ template -__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) __global__ +__launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLOCK_THREADS)) + CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceThreeWayPartitionKernel(InputIteratorT d_in, FirstOutputIteratorT d_first_part_out, SecondOutputIteratorT d_second_part_out, @@ -122,9 +123,10 @@ __launch_bounds__(int(ChainedPolicyT::ActivePolicy::ThreeWayPartitionPolicy::BLO * (i.e., length of @p d_selected_out) */ template -__global__ void DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, - int num_tiles, - NumSelectedIteratorT d_num_selected_out) +CUB_DETAIL_KERNEL_ATTRIBUTES void +DeviceThreeWayPartitionInitKernel(ScanTileStateT tile_state, + int num_tiles, + NumSelectedIteratorT d_num_selected_out) { // Initialize tile status tile_state.InitializeStatus(num_tiles); diff --git a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh index e70d28f2291..c924e71ef7e 100644 --- a/cub/cub/device/dispatch/dispatch_unique_by_key.cuh +++ b/cub/cub/device/dispatch/dispatch_unique_by_key.cuh @@ -60,7 +60,7 @@ template < typename EqualityOpT, ///< Equality operator type typename OffsetT> ///< Signed integer type for global offsets __launch_bounds__ (int(ChainedPolicyT::ActivePolicy::UniqueByKeyPolicyT::BLOCK_THREADS)) -__global__ void DeviceUniqueByKeySweepKernel( +CUB_DETAIL_KERNEL_ATTRIBUTES void DeviceUniqueByKeySweepKernel( KeyInputIteratorT d_keys_in, ///< [in] Pointer to the input sequence of keys ValueInputIteratorT d_values_in, ///< [in] Pointer to the input sequence of values KeyOutputIteratorT d_keys_out, ///< [out] Pointer to the output sequence of selected data items diff --git a/cub/cub/util_device.cuh b/cub/cub/util_device.cuh index d8caaedbb43..c7e15cafe06 100644 --- a/cub/cub/util_device.cuh +++ b/cub/cub/util_device.cuh @@ -70,7 +70,7 @@ CUB_NAMESPACE_BEGIN * \brief Empty kernel for querying PTX manifest metadata (e.g., version) for the current device */ template -__global__ void EmptyKernel(void) { } +CUB_DETAIL_KERNEL_ATTRIBUTES void EmptyKernel(void) { } #endif // DOXYGEN_SHOULD_SKIP_THIS diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index c486aa439f6..b6b72bb1d22 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -32,6 +32,7 @@ #pragma once +#include #include #include "util_namespace.cuh" @@ -113,6 +114,10 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, #define CUB_STATIC_ASSERT(cond, msg) typedef int CUB_CAT(cub_static_assert, __LINE__)[(cond) ? 1 : -1] #endif +#ifndef CUB_DETAIL_KERNEL_ATTRIBUTES +#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +#endif + /** @} */ // end group UtilModule CUB_NAMESPACE_END diff --git a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h index 65a7283b742..3cbc0be9bd5 100644 --- a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h +++ b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h @@ -29,6 +29,9 @@ #include #include #include + +#include + #include @@ -37,7 +40,7 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace launcher { - struct triple_chevron + struct _LIBCUDACXX_HIDDEN triple_chevron { typedef size_t Size; dim3 const grid; From ad0fcf55ce7aa8fb70d6a1438141bb4d4306d274 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Fri, 11 Aug 2023 15:37:25 +0000 Subject: [PATCH 02/17] Silence unused attribute warning --- cub/cub/util_namespace.cuh | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index cc8e353767f..b8423d9d6ef 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -40,6 +40,8 @@ // version.cuh. #include "version.cuh" +#include + // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users // that they must now define the qualifier macro, too. #if (defined(CUB_NS_PREFIX) || defined(CUB_NS_POSTFIX)) && !defined(CUB_NS_QUALIFIER) @@ -189,7 +191,10 @@ CUB_NS_PREFIX \ namespace cub \ { \ - CUB_DETAIL_MAGIC_NS_BEGIN + CUB_DETAIL_MAGIC_NS_BEGIN \ + _LIBCUDACXX_DIAGNOSTIC_PUSH \ + _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") \ + _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") /** * \def CUB_NAMESPACE_END @@ -198,6 +203,7 @@ * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ + _LIBCUDACXX_DIAGNOSTIC_PUSH \ CUB_DETAIL_MAGIC_NS_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX From e1bea1d1cbb156120ff641fb7a4267d98e3cc743 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 11 Sep 2023 20:34:31 +0000 Subject: [PATCH 03/17] Hide Thrust kernels --- cub/cub/util_namespace.cuh | 2 +- .../system/cuda/detail/core/agent_launcher.h | 148 ++++++++++-------- 2 files changed, 82 insertions(+), 68 deletions(-) diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index b8423d9d6ef..6006ddc7779 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -203,7 +203,7 @@ * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ - _LIBCUDACXX_DIAGNOSTIC_PUSH \ + _LIBCUDACXX_DIAGNOSTIC_POP \ CUB_DETAIL_MAGIC_NS_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index dbb26f33f72..94353318b32 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -31,9 +31,14 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include +#include + #include #include +#include + +#include + #include #include @@ -41,12 +46,19 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { +CUB_DETAIL_MAGIC_NS_BEGIN +_LIBCUDACXX_DIAGNOSTIC_PUSH +_LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") +_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") +#ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES +#define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +#endif #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) #if 0 template - void __global__ + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(Args... args) { @@ -55,105 +67,105 @@ namespace core { } #else template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0) { extern __shared__ char shmem[]; Agent::entry(x0, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1) { extern __shared__ char shmem[]; Agent::entry(x0, x1, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, shmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) { extern __shared__ char shmem[]; @@ -166,7 +178,7 @@ namespace core { #if 0 template - void __global__ + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, Args... args) { @@ -176,7 +188,7 @@ namespace core { } #else template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0) { extern __shared__ char shmem[]; @@ -184,7 +196,7 @@ namespace core { Agent::entry(x0, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1) { extern __shared__ char shmem[]; @@ -192,7 +204,7 @@ namespace core { Agent::entry(x0, x1, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2) { extern __shared__ char shmem[]; @@ -200,7 +212,7 @@ namespace core { Agent::entry(x0, x1, x2, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3) { extern __shared__ char shmem[]; @@ -208,7 +220,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) { extern __shared__ char shmem[]; @@ -216,7 +228,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) { extern __shared__ char shmem[]; @@ -224,7 +236,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) { extern __shared__ char shmem[]; @@ -232,7 +244,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) { extern __shared__ char shmem[]; @@ -240,7 +252,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) { extern __shared__ char shmem[]; @@ -248,7 +260,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) { extern __shared__ char shmem[]; @@ -256,7 +268,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) { extern __shared__ char shmem[]; @@ -264,7 +276,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) { extern __shared__ char shmem[]; @@ -272,7 +284,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) { extern __shared__ char shmem[]; @@ -280,7 +292,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) { extern __shared__ char shmem[]; @@ -288,7 +300,7 @@ namespace core { Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, vshmem); } template - void __global__ __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) { extern __shared__ char shmem[]; @@ -299,71 +311,71 @@ namespace core { #else #if 0 template - void __global__ _kernel_agent(Args... args) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(Args... args) {} template - void __global__ _kernel_agent_vshmem(char*, Args... args) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*, Args... args) {} #else template - void __global__ _kernel_agent(_0) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0) {} template - void __global__ _kernel_agent(_0,_1) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1) {} template - void __global__ _kernel_agent(_0,_1,_2) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2) {} template - void __global__ _kernel_agent(_0,_1,_2,_3) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3) {} template - void __global__ _kernel_agent(_0,_1,_2,_3, _4) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4) {} template - void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5) {} template - void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5, _6) {} template - void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7) {} template - void __global__ _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7, _8) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7, _8) {} template - void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} template - void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} template - void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} template - void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC) {} template - void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD) {} template - void __global__ _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD, _xE) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD, _xE) {} //////////////////////////////////////////////////////////// template - void __global__ _kernel_agent_vshmem(char*,_0) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7) {} template - void __global__ _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7, _8) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7, _8) {} template - void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} template - void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} template - void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} template - void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) {} template - void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) {} template - void __global__ _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) {} + void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) {} #endif #endif @@ -1139,7 +1151,9 @@ namespace core { }; -} // namespace core -} +_LIBCUDACXX_DIAGNOSTIC_POP +CUB_DETAIL_MAGIC_NS_END +} // namespace core +} // namespace cuda_cub THRUST_NAMESPACE_END #endif From 7774e41f64455bacfd10c8c4aaa5a340ce8cf1e7 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 11 Sep 2023 21:52:52 +0000 Subject: [PATCH 04/17] Document symbols visibility practices --- cub/docs/developer_overview.rst | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/cub/docs/developer_overview.rst b/cub/docs/developer_overview.rst index c60886d0d0d..14d8265797d 100644 --- a/cub/docs/developer_overview.rst +++ b/cub/docs/developer_overview.rst @@ -714,3 +714,32 @@ we introduced ``cub::detail::temporary_storage::layout``: // `allocation_2` alias `allocation_1`, safe to use in stream order use(allocation_2.get(), stream); + +Symbols visibility +==================================== + +Using CUB/Thrust in shared libraries is a known source of issues. +For a while, the solution to these issues consisted of wrapping CUB/Thrust namespaces with +the ``THRUST_CUB_WRAPPED_NAMESPACE`` macro so that different shared libraries have different symbols. +This solution has poor discoverability, +since issues present themselves in forms of segmentation faults, hangs, wrong results, etc. +To eliminate the symbol visibility issues on our end, we follow the following rules: + + #. Hiding kernel launchers: + it's important that kernel launchers like Thrust ``triple_chevron`` always reside in the same + library as the API using these kernel launchers. + + #. Hiding all kernels: + it's important that kernels always reside in the same library as the API using these kernels. + + #. Incorporating GPU architectures into symbol names: + it's important that kernels compiled for a given GPU architecture are always used by the host + API compiled for that architecture. + +To satisfy (1), ``thrust::cuda_cub::launcher::triple_chevron`` visibility is hidden. + +To satisfy (2), instead of annotating kernels as ``__global__`` we annotate them as +``CUB_DETAIL_KERNEL_ATTRIBUTES``. + +To satisfy (3), CUB symbols are placed inside an inline namespace containing the set of +GPU architectures for which the TU is being compiled. From e21f700c44d6d4c8af8f7f6b38d5b2f650f25764 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 11 Sep 2023 22:10:56 +0000 Subject: [PATCH 05/17] Use inline arch namespace in Thrust --- thrust/thrust/detail/config/namespace.h | 87 ++++++++++++++++++- .../system/cuda/detail/core/agent_launcher.h | 6 +- 2 files changed, 88 insertions(+), 5 deletions(-) diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 9c790461693..0a8133fc140 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -16,6 +16,9 @@ #pragma once +#include +#include + /** * \file namespace.h * \brief Utilities that allow `thrust::` to be placed inside an @@ -84,6 +87,86 @@ #define THRUST_NS_QUALIFIER ::thrust #endif +#if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + +#if !defined(THRUST_DETAIL_MAGIC_NS_NAME) +#define THRUST_DETAIL_COUNT_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, N, ...) \ + N +#define THRUST_DETAIL_COUNT(...) \ + THRUST_DETAIL_IDENTITY(THRUST_DETAIL_COUNT_N(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, \ + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)) +#define THRUST_DETAIL_IDENTITY(N) N +#define THRUST_DETAIL_APPLY(MACRO, ...) THRUST_DETAIL_IDENTITY(MACRO(__VA_ARGS__)) +#define THRUST_DETAIL_MAGIC_NS_NAME1(P1) \ + THRUST_##P1##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME2(P1, P2) \ + THRUST_##P1##_##P2##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME3(P1, P2, P3) \ + THRUST_##P1##_##P2##_##P3##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME4(P1, P2, P3, P4) \ + THRUST_##P1##_##P2##_##P3##_##P4##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME5(P1, P2, P3, P4, P5) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME6(P1, P2, P3, P4, P5, P6) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME7(P1, P2, P3, P4, P5, P6, P7) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME8(P1, P2, P3, P4, P5, P6, P7, P8) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME9(P1, P2, P3, P4, P5, P6, P7, P8, P9) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME10(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME11(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME12(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME13(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME14(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME15(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME16(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME17(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME18(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME19(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_NS +#define THRUST_DETAIL_MAGIC_NS_NAME20(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_##P20##_NS +#define THRUST_DETAIL_DISPATCH(N) THRUST_DETAIL_MAGIC_NS_NAME ## N +#define THRUST_DETAIL_MAGIC_NS_NAME(...) THRUST_DETAIL_IDENTITY(THRUST_DETAIL_APPLY(THRUST_DETAIL_DISPATCH, THRUST_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) +#endif // !defined(THRUST_DETAIL_MAGIC_NS_NAME) + +#if defined(THRUST_DISABLE_NAMESPACE_MAGIC) +#if !defined(THRUST_WRAPPED_NAMESPACE) +#if !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) +#error "Disabling namespace magic is unsafe without wrapping namespace" +#endif // !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) +#endif // !defined(THRUST_WRAPPED_NAMESPACE) +#define THRUST_DETAIL_MAGIC_NS_BEGIN +#define THRUST_DETAIL_MAGIC_NS_END +#else // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) +#if defined(_NVHPC_CUDA) +#define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { +#define THRUST_DETAIL_MAGIC_NS_END } +#else // not defined(_NVHPC_CUDA) +#define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { +#define THRUST_DETAIL_MAGIC_NS_END } +#endif // not defined(_NVHPC_CUDA) +#endif // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) + +#else // THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA + +#define THRUST_DETAIL_MAGIC_NS_BEGIN +#define THRUST_DETAIL_MAGIC_NS_END + +#endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA + /** * \def THRUST_NAMESPACE_BEGIN * This macro is used to open a `thrust::` namespace block, along with any @@ -93,7 +176,8 @@ #define THRUST_NAMESPACE_BEGIN \ THRUST_NS_PREFIX \ namespace thrust \ - { + { \ + THRUST_DETAIL_MAGIC_NS_BEGIN /** * \def THRUST_NAMESPACE_END @@ -102,6 +186,7 @@ * This macro is defined by Thrust and may not be overridden. */ #define THRUST_NAMESPACE_END \ + THRUST_DETAIL_MAGIC_NS_END \ } /* end namespace thrust */ \ THRUST_NS_POSTFIX diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index 94353318b32..4964a9fcfe5 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -31,8 +31,6 @@ #include #if THRUST_DEVICE_COMPILER == THRUST_DEVICE_COMPILER_NVCC -#include - #include #include #include @@ -46,7 +44,7 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { -CUB_DETAIL_MAGIC_NS_BEGIN + _LIBCUDACXX_DIAGNOSTIC_PUSH _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") @@ -1152,7 +1150,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") }; _LIBCUDACXX_DIAGNOSTIC_POP -CUB_DETAIL_MAGIC_NS_END + } // namespace core } // namespace cuda_cub THRUST_NAMESPACE_END From d1aabfe7db7c3a4036ac12875aa6d70439e20f1e Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 12 Sep 2023 17:34:28 +0000 Subject: [PATCH 06/17] Fix CI overview --- ci-overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci-overview.md b/ci-overview.md index b23f49abf72..17033b9cbbd 100644 --- a/ci-overview.md +++ b/ci-overview.md @@ -65,7 +65,7 @@ The syntax of the build and test scripts is the same: ./ci/test_thrust.sh #examples -./ci/build_thrust.sh g++ 17 70;80;86 +./ci/build_thrust.sh g++ 17 "70;80;86" ``` In summary, the heart of our build and test jobs is the corresponding build or test script. From 7d8eb69cd6efc1260b4e2f0024af8574e612249d Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 12 Sep 2023 17:01:16 -0700 Subject: [PATCH 07/17] Static kernels --- cub/cub/util_macro.cuh | 2 +- cub/cub/util_namespace.cuh | 8 +------- thrust/thrust/system/cuda/detail/core/agent_launcher.h | 10 +--------- 3 files changed, 3 insertions(+), 17 deletions(-) diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index b6b72bb1d22..bd36dc5ab06 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -115,7 +115,7 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, #endif #ifndef CUB_DETAIL_KERNEL_ATTRIBUTES -#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +#define CUB_DETAIL_KERNEL_ATTRIBUTES static __global__ #endif /** @} */ // end group UtilModule diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index 6006ddc7779..a3854c0952e 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -40,8 +40,6 @@ // version.cuh. #include "version.cuh" -#include - // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users // that they must now define the qualifier macro, too. #if (defined(CUB_NS_PREFIX) || defined(CUB_NS_POSTFIX)) && !defined(CUB_NS_QUALIFIER) @@ -191,10 +189,7 @@ CUB_NS_PREFIX \ namespace cub \ { \ - CUB_DETAIL_MAGIC_NS_BEGIN \ - _LIBCUDACXX_DIAGNOSTIC_PUSH \ - _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") \ - _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") + CUB_DETAIL_MAGIC_NS_BEGIN /** * \def CUB_NAMESPACE_END @@ -203,7 +198,6 @@ * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ - _LIBCUDACXX_DIAGNOSTIC_POP \ CUB_DETAIL_MAGIC_NS_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index 4964a9fcfe5..a32231b6d57 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -35,8 +35,6 @@ #include #include -#include - #include #include @@ -45,12 +43,8 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { -_LIBCUDACXX_DIAGNOSTIC_PUSH -_LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") -_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") - #ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES -#define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +#define THRUST_DETAIL_KERNEL_ATTRIBUTES static __global__ #endif #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) @@ -1149,8 +1143,6 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") }; -_LIBCUDACXX_DIAGNOSTIC_POP - } // namespace core } // namespace cuda_cub THRUST_NAMESPACE_END From 3e1fd8438a422f59f5a7a978934291a47754d148 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 16:33:55 +0000 Subject: [PATCH 08/17] Include RDC into the inline namespace --- cub/cub/detail/detect_cuda_runtime.cuh | 11 +- cub/cub/util_macro.cuh | 11 +- cub/cub/util_namespace.cuh | 41 +++-- .../detail/config/detect_cuda_runtime.h | 87 ++++++++++ thrust/thrust/detail/config/namespace.h | 159 +++++++++--------- thrust/thrust/system/cuda/config.h | 59 +------ 6 files changed, 206 insertions(+), 162 deletions(-) create mode 100644 thrust/thrust/detail/config/detect_cuda_runtime.h diff --git a/cub/cub/detail/detect_cuda_runtime.cuh b/cub/cub/detail/detect_cuda_runtime.cuh index a2af93b7189..b8e776db748 100644 --- a/cub/cub/detail/detect_cuda_runtime.cuh +++ b/cub/cub/detail/detect_cuda_runtime.cuh @@ -27,20 +27,14 @@ ******************************************************************************/ /** - * \file + * @file * Utilities for CUDA dynamic parallelism. */ #pragma once -#include - #include -CUB_NAMESPACE_BEGIN -namespace detail -{ - #ifdef DOXYGEN_SHOULD_SKIP_THIS // Only parse this during doxygen passes: /** @@ -111,6 +105,3 @@ namespace detail #endif #endif // Do not document - -} // namespace detail -CUB_NAMESPACE_END diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index bd36dc5ab06..5c1772407c5 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -32,11 +32,12 @@ #pragma once +#include +#include + #include #include -#include "util_namespace.cuh" - CUB_NAMESPACE_BEGIN @@ -115,7 +116,11 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, #endif #ifndef CUB_DETAIL_KERNEL_ATTRIBUTES -#define CUB_DETAIL_KERNEL_ATTRIBUTES static __global__ +#if defined(CUB_RDC_ENABLED) +#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ +#else +#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ static +#endif #endif /** @} */ // end group UtilModule diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index a3854c0952e..4a54e11e2b6 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -38,7 +38,8 @@ // This is not used by this file; this is a hack so that we can detect the // CUB version from Thrust on older versions of CUB that did not have // version.cuh. -#include "version.cuh" +#include +#include // Prior to 1.13.1, only the PREFIX/POSTFIX macros were used. Notify users // that they must now define the qualifier macro, too. @@ -161,23 +162,33 @@ #define CUB_DETAIL_MAGIC_NS_NAME(...) CUB_DETAIL_IDENTITY(CUB_DETAIL_APPLY(CUB_DETAIL_DISPATCH, CUB_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) #endif // !defined(CUB_DETAIL_MAGIC_NS_NAME) +// clang-format off #if defined(CUB_DISABLE_NAMESPACE_MAGIC) -#if !defined(CUB_WRAPPED_NAMESPACE) -#if !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) -#error "Disabling namespace magic is unsafe without wrapping namespace" -#endif // !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) -#endif // !defined(CUB_WRAPPED_NAMESPACE) -#define CUB_DETAIL_MAGIC_NS_BEGIN -#define CUB_DETAIL_MAGIC_NS_END +# if !defined(CUB_WRAPPED_NAMESPACE) +# if !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) +# error "Disabling namespace magic is unsafe without wrapping namespace" +# endif // !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) +# endif // !defined(CUB_WRAPPED_NAMESPACE) +# define CUB_DETAIL_MAGIC_NS_BEGIN +# define CUB_DETAIL_MAGIC_NS_END #else // not defined(CUB_DISABLE_NAMESPACE_MAGIC) -#if defined(_NVHPC_CUDA) -#define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, NV_TARGET_SM_INTEGER_LIST) { -#define CUB_DETAIL_MAGIC_NS_END } -#else // not defined(_NVHPC_CUDA) -#define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, __CUDA_ARCH_LIST__) { -#define CUB_DETAIL_MAGIC_NS_END } -#endif // not defined(_NVHPC_CUDA) +# if defined(_NVHPC_CUDA) +# if defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, RDC, NV_TARGET_SM_INTEGER_LIST) { +# else // !defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, NV_TARGET_SM_INTEGER_LIST) { +# endif // !defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_END } +# else // not defined(_NVHPC_CUDA) +# if defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, RDC, __CUDA_ARCH_LIST__) { +# else // !defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, __CUDA_ARCH_LIST__) { +# endif // !defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_END } +# endif // not defined(_NVHPC_CUDA) #endif // not defined(CUB_DISABLE_NAMESPACE_MAGIC) +// clang-format on /** * \def CUB_NAMESPACE_BEGIN diff --git a/thrust/thrust/detail/config/detect_cuda_runtime.h b/thrust/thrust/detail/config/detect_cuda_runtime.h new file mode 100644 index 00000000000..ef5eb8e0b32 --- /dev/null +++ b/thrust/thrust/detail/config/detect_cuda_runtime.h @@ -0,0 +1,87 @@ +/****************************************************************************** + * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the NVIDIA CORPORATION nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + ******************************************************************************/ +#pragma once + +#include + +/** + * \def THRUST_RUNTIME_FUNCTION + * + * Execution space for functions that can use the CUDA runtime API (`__host__` + * when RDC is off, `__host__ __device__` when RDC is on). + */ +#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION + +/** + * \def THRUST_RDC_ENABLED + * + * Defined if RDC is enabled. + */ +#ifdef CUB_RDC_ENABLED +#define THRUST_RDC_ENABLED +#endif + +/** + * \def __THRUST_HAS_CUDART__ + * + * Whether or not the active compiler pass is allowed to invoke device kernels + * or methods from the CUDA runtime API. + * + * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__` + * and is not compatible with `NV_IF_TARGET`. It is provided for legacy + * purposes only. + * + * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`. + */ +#ifdef CUB_RUNTIME_ENABLED +#define __THRUST_HAS_CUDART__ 1 +#else +#define __THRUST_HAS_CUDART__ 0 +#endif + +// These definitions were intended for internal use only and are now obsolete. +// If you relied on them, consider porting your code to use the functionality +// in libcu++'s header. +// +// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make +// them available again. These should be considered deprecated and will be +// fully removed in a future version. +#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS +#ifdef __CUDA_ARCH__ +#define THRUST_DEVICE_CODE +#endif // __CUDA_ARCH__ +#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS + +#ifdef THRUST_AGENT_ENTRY_NOINLINE +#define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__ +#else +#define THRUST_AGENT_ENTRY_INLINE_ATTR __forceinline__ +#endif + +#define THRUST_DEVICE_FUNCTION __device__ __forceinline__ +#define THRUST_HOST_FUNCTION __host__ __forceinline__ +#define THRUST_FUNCTION __host__ __device__ __forceinline__ diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 0a8133fc140..90f76668266 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -17,6 +17,7 @@ #pragma once #include +#include #include /** @@ -87,85 +88,91 @@ #define THRUST_NS_QUALIFIER ::thrust #endif +// clang-format off #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA - -#if !defined(THRUST_DETAIL_MAGIC_NS_NAME) -#define THRUST_DETAIL_COUNT_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ - _14, _15, _16, _17, _18, _19, _20, N, ...) \ - N -#define THRUST_DETAIL_COUNT(...) \ - THRUST_DETAIL_IDENTITY(THRUST_DETAIL_COUNT_N(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, \ - 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)) -#define THRUST_DETAIL_IDENTITY(N) N -#define THRUST_DETAIL_APPLY(MACRO, ...) THRUST_DETAIL_IDENTITY(MACRO(__VA_ARGS__)) -#define THRUST_DETAIL_MAGIC_NS_NAME1(P1) \ - THRUST_##P1##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME2(P1, P2) \ - THRUST_##P1##_##P2##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME3(P1, P2, P3) \ - THRUST_##P1##_##P2##_##P3##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME4(P1, P2, P3, P4) \ - THRUST_##P1##_##P2##_##P3##_##P4##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME5(P1, P2, P3, P4, P5) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME6(P1, P2, P3, P4, P5, P6) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME7(P1, P2, P3, P4, P5, P6, P7) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME8(P1, P2, P3, P4, P5, P6, P7, P8) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME9(P1, P2, P3, P4, P5, P6, P7, P8, P9) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME10(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME11(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME12(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME13(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME14(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME15(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME16(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME17(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME18(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME19(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_NS -#define THRUST_DETAIL_MAGIC_NS_NAME20(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20) \ - THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_##P20##_NS -#define THRUST_DETAIL_DISPATCH(N) THRUST_DETAIL_MAGIC_NS_NAME ## N -#define THRUST_DETAIL_MAGIC_NS_NAME(...) THRUST_DETAIL_IDENTITY(THRUST_DETAIL_APPLY(THRUST_DETAIL_DISPATCH, THRUST_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) -#endif // !defined(THRUST_DETAIL_MAGIC_NS_NAME) - -#if defined(THRUST_DISABLE_NAMESPACE_MAGIC) -#if !defined(THRUST_WRAPPED_NAMESPACE) -#if !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) -#error "Disabling namespace magic is unsafe without wrapping namespace" -#endif // !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) -#endif // !defined(THRUST_WRAPPED_NAMESPACE) -#define THRUST_DETAIL_MAGIC_NS_BEGIN -#define THRUST_DETAIL_MAGIC_NS_END -#else // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) -#if defined(_NVHPC_CUDA) -#define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { -#define THRUST_DETAIL_MAGIC_NS_END } -#else // not defined(_NVHPC_CUDA) -#define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { -#define THRUST_DETAIL_MAGIC_NS_END } -#endif // not defined(_NVHPC_CUDA) -#endif // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) - +# if !defined(THRUST_DETAIL_MAGIC_NS_NAME) +# define THRUST_DETAIL_COUNT_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, N, ...) \ + N +# define THRUST_DETAIL_COUNT(...) \ + THRUST_DETAIL_IDENTITY(THRUST_DETAIL_COUNT_N(__VA_ARGS__, 20, 19, 18, 17, 16, 15, 14, 13, 12, \ + 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)) +# define THRUST_DETAIL_IDENTITY(N) N +# define THRUST_DETAIL_APPLY(MACRO, ...) THRUST_DETAIL_IDENTITY(MACRO(__VA_ARGS__)) +# define THRUST_DETAIL_MAGIC_NS_NAME1(P1) \ + THRUST_##P1##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME2(P1, P2) \ + THRUST_##P1##_##P2##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME3(P1, P2, P3) \ + THRUST_##P1##_##P2##_##P3##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME4(P1, P2, P3, P4) \ + THRUST_##P1##_##P2##_##P3##_##P4##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME5(P1, P2, P3, P4, P5) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME6(P1, P2, P3, P4, P5, P6) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME7(P1, P2, P3, P4, P5, P6, P7) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME8(P1, P2, P3, P4, P5, P6, P7, P8) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME9(P1, P2, P3, P4, P5, P6, P7, P8, P9) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME10(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME11(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME12(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME13(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME14(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME15(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME16(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME17(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME18(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME19(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_NS +# define THRUST_DETAIL_MAGIC_NS_NAME20(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20) \ + THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_##P20##_NS +# define THRUST_DETAIL_DISPATCH(N) THRUST_DETAIL_MAGIC_NS_NAME ## N +# define THRUST_DETAIL_MAGIC_NS_NAME(...) THRUST_DETAIL_IDENTITY(THRUST_DETAIL_APPLY(THRUST_DETAIL_DISPATCH, THRUST_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) +# endif // !defined(THRUST_DETAIL_MAGIC_NS_NAME) + +# if defined(THRUST_DISABLE_NAMESPACE_MAGIC) +# if !defined(THRUST_WRAPPED_NAMESPACE) +# if !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) +# error "Disabling namespace magic is unsafe without wrapping namespace" +# endif // !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) +# endif // !defined(THRUST_WRAPPED_NAMESPACE) +# define THRUST_DETAIL_MAGIC_NS_BEGIN +# define THRUST_DETAIL_MAGIC_NS_END +# else // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) +# if defined(_NVHPC_CUDA) +# if defined(THRUST_RDC_ENABLED) +# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, RDC, NV_TARGET_SM_INTEGER_LIST) { +# else +# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { +# endif +# define THRUST_DETAIL_MAGIC_NS_END } +# else // not defined(_NVHPC_CUDA) +# if defined(THRUST_RDC_ENABLED) +# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, RDC, __CUDA_ARCH_LIST__) { +# else +# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { +# endif +# define THRUST_DETAIL_MAGIC_NS_END } +# endif // not defined(_NVHPC_CUDA) +# endif // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) #else // THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA - -#define THRUST_DETAIL_MAGIC_NS_BEGIN -#define THRUST_DETAIL_MAGIC_NS_END - +# define THRUST_DETAIL_MAGIC_NS_BEGIN +# define THRUST_DETAIL_MAGIC_NS_END #endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA +// clang-format on /** * \def THRUST_NAMESPACE_BEGIN diff --git a/thrust/thrust/system/cuda/config.h b/thrust/thrust/system/cuda/config.h index f6c8b9cb38d..637c3565fe6 100644 --- a/thrust/thrust/system/cuda/config.h +++ b/thrust/thrust/system/cuda/config.h @@ -42,65 +42,8 @@ #include #include -#include +#include -/** - * \def THRUST_RUNTIME_FUNCTION - * - * Execution space for functions that can use the CUDA runtime API (`__host__` - * when RDC is off, `__host__ __device__` when RDC is on). - */ -#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION - -/** - * \def THRUST_RDC_ENABLED - * - * Defined if RDC is enabled. - */ -#ifdef CUB_RDC_ENABLED -#define THRUST_RDC_ENABLED -#endif - -/** - * \def __THRUST_HAS_CUDART__ - * - * Whether or not the active compiler pass is allowed to invoke device kernels - * or methods from the CUDA runtime API. - * - * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__` - * and is not compatible with `NV_IF_TARGET`. It is provided for legacy - * purposes only. - * - * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`. - */ -#ifdef CUB_RUNTIME_ENABLED -#define __THRUST_HAS_CUDART__ 1 -#else -#define __THRUST_HAS_CUDART__ 0 -#endif - -// These definitions were intended for internal use only and are now obsolete. -// If you relied on them, consider porting your code to use the functionality -// in libcu++'s header. -// -// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make -// them available again. These should be considered deprecated and will be -// fully removed in a future version. -#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS -#ifdef __CUDA_ARCH__ -#define THRUST_DEVICE_CODE -#endif // __CUDA_ARCH__ -#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS - -#ifdef THRUST_AGENT_ENTRY_NOINLINE -#define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__ -#else -#define THRUST_AGENT_ENTRY_INLINE_ATTR __forceinline__ -#endif - -#define THRUST_DEVICE_FUNCTION __device__ __forceinline__ -#define THRUST_HOST_FUNCTION __host__ __forceinline__ -#define THRUST_FUNCTION __host__ __device__ __forceinline__ #if 0 #define THRUST_ARGS(...) __VA_ARGS__ #define THRUST_STRIP_PARENS(X) X From a97e4f633032283a5c924dac2b9d043c9539d202 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 16:41:52 +0000 Subject: [PATCH 09/17] Silense some kernel warnings --- cub/cub/util_macro.cuh | 10 +++++----- cub/cub/util_namespace.cuh | 18 +++++++++++++++++- .../system/cuda/detail/core/agent_launcher.h | 18 +++++++++++++++++- 3 files changed, 39 insertions(+), 7 deletions(-) diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 5c1772407c5..f4740b70c70 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -116,11 +116,11 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, #endif #ifndef CUB_DETAIL_KERNEL_ATTRIBUTES -#if defined(CUB_RDC_ENABLED) -#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ -#else -#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ static -#endif +# if defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +# else +# define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ static +# endif #endif /** @} */ // end group UtilModule diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index 4a54e11e2b6..2cbf4490275 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -35,6 +35,8 @@ #pragma once +#include + // This is not used by this file; this is a hack so that we can detect the // CUB version from Thrust on older versions of CUB that did not have // version.cuh. @@ -188,6 +190,18 @@ # define CUB_DETAIL_MAGIC_NS_END } # endif // not defined(_NVHPC_CUDA) #endif // not defined(CUB_DISABLE_NAMESPACE_MAGIC) + +#if defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_BEGIN \ + _LIBCUDACXX_DIAGNOSTIC_PUSH \ + _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") \ + _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") +# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_END \ +# _LIBCUDACXX_DIAGNOSTIC_POP +#else +# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_BEGIN +# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_END +#endif // clang-format on /** @@ -200,7 +214,8 @@ CUB_NS_PREFIX \ namespace cub \ { \ - CUB_DETAIL_MAGIC_NS_BEGIN + CUB_DETAIL_MAGIC_NS_BEGIN \ + CUB_DETAIL_SILENCE_KERNEL_WARNINGS_BEGIN /** * \def CUB_NAMESPACE_END @@ -209,6 +224,7 @@ * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ + CUB_DETAIL_SILENCE_KERNEL_WARNINGS_END \ CUB_DETAIL_MAGIC_NS_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index a32231b6d57..cdef565d9f4 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -39,12 +39,24 @@ #include +#include + THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { +#if defined(THRUST_RDC_ENABLED) +_LIBCUDACXX_DIAGNOSTIC_PUSH +_LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") +_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") +#endif + #ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES -#define THRUST_DETAIL_KERNEL_ATTRIBUTES static __global__ +# if defined(THRUST_RDC_ENABLED) +# define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +# else +# define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ static +# endif #endif #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) @@ -1143,6 +1155,10 @@ namespace core { }; +#if defined(THRUST_RDC_ENABLED) +_LIBCUDACXX_DIAGNOSTIC_POP +#endif + } // namespace core } // namespace cuda_cub THRUST_NAMESPACE_END From 30739d5bee0bf4f51d4487c59bced065dc9856ed Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 18:48:01 +0000 Subject: [PATCH 10/17] Leak warning suppression --- cub/cub/util_macro.cuh | 11 ++++++----- cub/cub/util_namespace.cuh | 18 +----------------- .../system/cuda/detail/core/agent_launcher.h | 13 ++----------- 3 files changed, 9 insertions(+), 33 deletions(-) diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index f4740b70c70..34196e37909 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -116,11 +116,12 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, #endif #ifndef CUB_DETAIL_KERNEL_ATTRIBUTES -# if defined(CUB_RDC_ENABLED) -# define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN -# else -# define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ static -# endif +#define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN +#endif + +#if !defined(CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) +_LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") +_LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") #endif /** @} */ // end group UtilModule diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index 2cbf4490275..4a54e11e2b6 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -35,8 +35,6 @@ #pragma once -#include - // This is not used by this file; this is a hack so that we can detect the // CUB version from Thrust on older versions of CUB that did not have // version.cuh. @@ -190,18 +188,6 @@ # define CUB_DETAIL_MAGIC_NS_END } # endif // not defined(_NVHPC_CUDA) #endif // not defined(CUB_DISABLE_NAMESPACE_MAGIC) - -#if defined(CUB_RDC_ENABLED) -# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_BEGIN \ - _LIBCUDACXX_DIAGNOSTIC_PUSH \ - _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") \ - _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") -# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_END \ -# _LIBCUDACXX_DIAGNOSTIC_POP -#else -# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_BEGIN -# define CUB_DETAIL_SILENCE_KERNEL_WARNINGS_END -#endif // clang-format on /** @@ -214,8 +200,7 @@ CUB_NS_PREFIX \ namespace cub \ { \ - CUB_DETAIL_MAGIC_NS_BEGIN \ - CUB_DETAIL_SILENCE_KERNEL_WARNINGS_BEGIN + CUB_DETAIL_MAGIC_NS_BEGIN /** * \def CUB_NAMESPACE_END @@ -224,7 +209,6 @@ * This macro is defined by CUB and may not be overridden. */ #define CUB_NAMESPACE_END \ - CUB_DETAIL_SILENCE_KERNEL_WARNINGS_END \ CUB_DETAIL_MAGIC_NS_END \ } /* end namespace cub */ \ CUB_NS_POSTFIX diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index cdef565d9f4..7d617a11463 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -45,18 +45,13 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { -#if defined(THRUST_RDC_ENABLED) -_LIBCUDACXX_DIAGNOSTIC_PUSH +#if !defined(THRUST_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") #endif #ifndef THRUST_DETAIL_KERNEL_ATTRIBUTES -# if defined(THRUST_RDC_ENABLED) -# define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN -# else -# define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ static -# endif +#define THRUST_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN #endif #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) @@ -1155,10 +1150,6 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") }; -#if defined(THRUST_RDC_ENABLED) -_LIBCUDACXX_DIAGNOSTIC_POP -#endif - } // namespace core } // namespace cuda_cub THRUST_NAMESPACE_END From 9f4945c8851f50178c3ef3c71ddb79ff45b8e3cf Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 20:26:05 +0000 Subject: [PATCH 11/17] Do not separate mixed RDC builds for now --- cub/cub/util_namespace.cuh | 12 ++---------- thrust/thrust/detail/config/namespace.h | 12 ++---------- 2 files changed, 4 insertions(+), 20 deletions(-) diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index 4a54e11e2b6..6c85a8be627 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -173,18 +173,10 @@ # define CUB_DETAIL_MAGIC_NS_END #else // not defined(CUB_DISABLE_NAMESPACE_MAGIC) # if defined(_NVHPC_CUDA) -# if defined(CUB_RDC_ENABLED) -# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, RDC, NV_TARGET_SM_INTEGER_LIST) { -# else // !defined(CUB_RDC_ENABLED) -# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, NV_TARGET_SM_INTEGER_LIST) { -# endif // !defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, NV_TARGET_SM_INTEGER_LIST) { # define CUB_DETAIL_MAGIC_NS_END } # else // not defined(_NVHPC_CUDA) -# if defined(CUB_RDC_ENABLED) -# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, RDC, __CUDA_ARCH_LIST__) { -# else // !defined(CUB_RDC_ENABLED) -# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, __CUDA_ARCH_LIST__) { -# endif // !defined(CUB_RDC_ENABLED) +# define CUB_DETAIL_MAGIC_NS_BEGIN inline namespace CUB_DETAIL_MAGIC_NS_NAME(CUB_VERSION, __CUDA_ARCH_LIST__) { # define CUB_DETAIL_MAGIC_NS_END } # endif // not defined(_NVHPC_CUDA) #endif // not defined(CUB_DISABLE_NAMESPACE_MAGIC) diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 90f76668266..0b45cfeff83 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -153,18 +153,10 @@ # define THRUST_DETAIL_MAGIC_NS_END # else // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) # if defined(_NVHPC_CUDA) -# if defined(THRUST_RDC_ENABLED) -# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, RDC, NV_TARGET_SM_INTEGER_LIST) { -# else -# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { -# endif +# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { # define THRUST_DETAIL_MAGIC_NS_END } # else // not defined(_NVHPC_CUDA) -# if defined(THRUST_RDC_ENABLED) -# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, RDC, __CUDA_ARCH_LIST__) { -# else -# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { -# endif +# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { # define THRUST_DETAIL_MAGIC_NS_END } # endif // not defined(_NVHPC_CUDA) # endif // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) From 5607d6ce69541c4d7cebcb20a706c972fe3fc28a Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 20:31:49 +0000 Subject: [PATCH 12/17] Prefer wrapped namespace --- cub/cub/util_namespace.cuh | 2 +- thrust/thrust/detail/config/namespace.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cub/cub/util_namespace.cuh b/cub/cub/util_namespace.cuh index 6c85a8be627..27ff12dbbaa 100644 --- a/cub/cub/util_namespace.cuh +++ b/cub/cub/util_namespace.cuh @@ -163,7 +163,7 @@ #endif // !defined(CUB_DETAIL_MAGIC_NS_NAME) // clang-format off -#if defined(CUB_DISABLE_NAMESPACE_MAGIC) +#if defined(CUB_DISABLE_NAMESPACE_MAGIC) || defined(CUB_WRAPPED_NAMESPACE) # if !defined(CUB_WRAPPED_NAMESPACE) # if !defined(CUB_IGNORE_NAMESPACE_MAGIC_ERROR) # error "Disabling namespace magic is unsafe without wrapping namespace" diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 0b45cfeff83..1ef47508d25 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -143,7 +143,7 @@ # define THRUST_DETAIL_MAGIC_NS_NAME(...) THRUST_DETAIL_IDENTITY(THRUST_DETAIL_APPLY(THRUST_DETAIL_DISPATCH, THRUST_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) # endif // !defined(THRUST_DETAIL_MAGIC_NS_NAME) -# if defined(THRUST_DISABLE_NAMESPACE_MAGIC) +# if defined(THRUST_DISABLE_NAMESPACE_MAGIC) || defined(THRUST_WRAPPED_NAMESPACE) # if !defined(THRUST_WRAPPED_NAMESPACE) # if !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) # error "Disabling namespace magic is unsafe without wrapping namespace" From 8b0867372cdf3b7d00b9522f57a8b53294efbadf Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 20:40:52 +0000 Subject: [PATCH 13/17] Better macro names --- thrust/thrust/detail/config/namespace.h | 80 ++++++++++++------------- 1 file changed, 40 insertions(+), 40 deletions(-) diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 1ef47508d25..9cd2e72ec26 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -90,7 +90,7 @@ // clang-format off #if THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA -# if !defined(THRUST_DETAIL_MAGIC_NS_NAME) +# if !defined(THRUST_DETAIL_ABI_NS_NAME) # define THRUST_DETAIL_COUNT_N(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ _14, _15, _16, _17, _18, _19, _20, N, ...) \ N @@ -99,70 +99,70 @@ 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)) # define THRUST_DETAIL_IDENTITY(N) N # define THRUST_DETAIL_APPLY(MACRO, ...) THRUST_DETAIL_IDENTITY(MACRO(__VA_ARGS__)) -# define THRUST_DETAIL_MAGIC_NS_NAME1(P1) \ +# define THRUST_DETAIL_ABI_NS_NAME1(P1) \ THRUST_##P1##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME2(P1, P2) \ +# define THRUST_DETAIL_ABI_NS_NAME2(P1, P2) \ THRUST_##P1##_##P2##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME3(P1, P2, P3) \ +# define THRUST_DETAIL_ABI_NS_NAME3(P1, P2, P3) \ THRUST_##P1##_##P2##_##P3##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME4(P1, P2, P3, P4) \ +# define THRUST_DETAIL_ABI_NS_NAME4(P1, P2, P3, P4) \ THRUST_##P1##_##P2##_##P3##_##P4##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME5(P1, P2, P3, P4, P5) \ +# define THRUST_DETAIL_ABI_NS_NAME5(P1, P2, P3, P4, P5) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME6(P1, P2, P3, P4, P5, P6) \ +# define THRUST_DETAIL_ABI_NS_NAME6(P1, P2, P3, P4, P5, P6) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME7(P1, P2, P3, P4, P5, P6, P7) \ +# define THRUST_DETAIL_ABI_NS_NAME7(P1, P2, P3, P4, P5, P6, P7) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME8(P1, P2, P3, P4, P5, P6, P7, P8) \ +# define THRUST_DETAIL_ABI_NS_NAME8(P1, P2, P3, P4, P5, P6, P7, P8) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME9(P1, P2, P3, P4, P5, P6, P7, P8, P9) \ +# define THRUST_DETAIL_ABI_NS_NAME9(P1, P2, P3, P4, P5, P6, P7, P8, P9) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME10(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10) \ +# define THRUST_DETAIL_ABI_NS_NAME10(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME11(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11) \ +# define THRUST_DETAIL_ABI_NS_NAME11(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME12(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12) \ +# define THRUST_DETAIL_ABI_NS_NAME12(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME13(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13) \ +# define THRUST_DETAIL_ABI_NS_NAME13(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME14(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14) \ +# define THRUST_DETAIL_ABI_NS_NAME14(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME15(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15) \ +# define THRUST_DETAIL_ABI_NS_NAME15(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME16(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16) \ +# define THRUST_DETAIL_ABI_NS_NAME16(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME17(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17) \ +# define THRUST_DETAIL_ABI_NS_NAME17(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME18(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18) \ +# define THRUST_DETAIL_ABI_NS_NAME18(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME19(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19) \ +# define THRUST_DETAIL_ABI_NS_NAME19(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_NS -# define THRUST_DETAIL_MAGIC_NS_NAME20(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20) \ +# define THRUST_DETAIL_ABI_NS_NAME20(P1, P2, P3, P4, P5, P6, P7, P8, P9, P10, P11, P12, P13, P14, P15, P16, P17, P18, P19, P20) \ THRUST_##P1##_##P2##_##P3##_##P4##_##P5##_##P6##_##P7##_##P8##_##P9##_##P10##_##P11##_##P12##_##P13##_##P14##_##P15##_##P16##_##P17##_##P18##_##P19##_##P20##_NS -# define THRUST_DETAIL_DISPATCH(N) THRUST_DETAIL_MAGIC_NS_NAME ## N -# define THRUST_DETAIL_MAGIC_NS_NAME(...) THRUST_DETAIL_IDENTITY(THRUST_DETAIL_APPLY(THRUST_DETAIL_DISPATCH, THRUST_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) -# endif // !defined(THRUST_DETAIL_MAGIC_NS_NAME) +# define THRUST_DETAIL_DISPATCH(N) THRUST_DETAIL_ABI_NS_NAME ## N +# define THRUST_DETAIL_ABI_NS_NAME(...) THRUST_DETAIL_IDENTITY(THRUST_DETAIL_APPLY(THRUST_DETAIL_DISPATCH, THRUST_DETAIL_COUNT(__VA_ARGS__))(__VA_ARGS__)) +# endif // !defined(THRUST_DETAIL_ABI_NS_NAME) -# if defined(THRUST_DISABLE_NAMESPACE_MAGIC) || defined(THRUST_WRAPPED_NAMESPACE) +# if defined(THRUST_DISABLE_ABI_NAMESPACE) || defined(THRUST_WRAPPED_NAMESPACE) # if !defined(THRUST_WRAPPED_NAMESPACE) -# if !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) -# error "Disabling namespace magic is unsafe without wrapping namespace" -# endif // !defined(THRUST_IGNORE_NAMESPACE_MAGIC_ERROR) +# if !defined(THRUST_IGNORE_ABI_NAMESPACE_ERROR) +# error "Disabling ABI namespace is unsafe without wrapping namespace" +# endif // !defined(THRUST_IGNORE_ABI_NAMESPACE_ERROR) # endif // !defined(THRUST_WRAPPED_NAMESPACE) -# define THRUST_DETAIL_MAGIC_NS_BEGIN -# define THRUST_DETAIL_MAGIC_NS_END -# else // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) +# define THRUST_DETAIL_ABI_NS_BEGIN +# define THRUST_DETAIL_ABI_NS_END +# else // not defined(THRUST_DISABLE_ABI_NAMESPACE) # if defined(_NVHPC_CUDA) -# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { -# define THRUST_DETAIL_MAGIC_NS_END } +# define THRUST_DETAIL_ABI_NS_BEGIN inline namespace THRUST_DETAIL_ABI_NS_NAME(THRUST_VERSION, NV_TARGET_SM_INTEGER_LIST) { +# define THRUST_DETAIL_ABI_NS_END } # else // not defined(_NVHPC_CUDA) -# define THRUST_DETAIL_MAGIC_NS_BEGIN inline namespace THRUST_DETAIL_MAGIC_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { -# define THRUST_DETAIL_MAGIC_NS_END } +# define THRUST_DETAIL_ABI_NS_BEGIN inline namespace THRUST_DETAIL_ABI_NS_NAME(THRUST_VERSION, __CUDA_ARCH_LIST__) { +# define THRUST_DETAIL_ABI_NS_END } # endif // not defined(_NVHPC_CUDA) -# endif // not defined(THRUST_DISABLE_NAMESPACE_MAGIC) +# endif // not defined(THRUST_DISABLE_ABI_NAMESPACE) #else // THRUST_DEVICE_SYSTEM != THRUST_DEVICE_SYSTEM_CUDA -# define THRUST_DETAIL_MAGIC_NS_BEGIN -# define THRUST_DETAIL_MAGIC_NS_END +# define THRUST_DETAIL_ABI_NS_BEGIN +# define THRUST_DETAIL_ABI_NS_END #endif // THRUST_DEVICE_SYSTEM == THRUST_DEVICE_SYSTEM_CUDA // clang-format on @@ -176,7 +176,7 @@ THRUST_NS_PREFIX \ namespace thrust \ { \ - THRUST_DETAIL_MAGIC_NS_BEGIN + THRUST_DETAIL_ABI_NS_BEGIN /** * \def THRUST_NAMESPACE_END @@ -185,7 +185,7 @@ * This macro is defined by Thrust and may not be overridden. */ #define THRUST_NAMESPACE_END \ - THRUST_DETAIL_MAGIC_NS_END \ + THRUST_DETAIL_ABI_NS_END \ } /* end namespace thrust */ \ THRUST_NS_POSTFIX From b49948a728304379ca1d4555ea109fb6fcea9739 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 20:53:38 +0000 Subject: [PATCH 14/17] Return old header organization --- .../detail/config/detect_cuda_runtime.h | 87 ------------------- thrust/thrust/detail/config/namespace.h | 1 - thrust/thrust/system/cuda/config.h | 60 ++++++++++++- 3 files changed, 59 insertions(+), 89 deletions(-) delete mode 100644 thrust/thrust/detail/config/detect_cuda_runtime.h diff --git a/thrust/thrust/detail/config/detect_cuda_runtime.h b/thrust/thrust/detail/config/detect_cuda_runtime.h deleted file mode 100644 index ef5eb8e0b32..00000000000 --- a/thrust/thrust/detail/config/detect_cuda_runtime.h +++ /dev/null @@ -1,87 +0,0 @@ -/****************************************************************************** - * Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * * Neither the name of the NVIDIA CORPORATION nor the - * names of its contributors may be used to endorse or promote products - * derived from this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" - * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY - * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES - * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND - * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS - * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - ******************************************************************************/ -#pragma once - -#include - -/** - * \def THRUST_RUNTIME_FUNCTION - * - * Execution space for functions that can use the CUDA runtime API (`__host__` - * when RDC is off, `__host__ __device__` when RDC is on). - */ -#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION - -/** - * \def THRUST_RDC_ENABLED - * - * Defined if RDC is enabled. - */ -#ifdef CUB_RDC_ENABLED -#define THRUST_RDC_ENABLED -#endif - -/** - * \def __THRUST_HAS_CUDART__ - * - * Whether or not the active compiler pass is allowed to invoke device kernels - * or methods from the CUDA runtime API. - * - * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__` - * and is not compatible with `NV_IF_TARGET`. It is provided for legacy - * purposes only. - * - * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`. - */ -#ifdef CUB_RUNTIME_ENABLED -#define __THRUST_HAS_CUDART__ 1 -#else -#define __THRUST_HAS_CUDART__ 0 -#endif - -// These definitions were intended for internal use only and are now obsolete. -// If you relied on them, consider porting your code to use the functionality -// in libcu++'s header. -// -// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make -// them available again. These should be considered deprecated and will be -// fully removed in a future version. -#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS -#ifdef __CUDA_ARCH__ -#define THRUST_DEVICE_CODE -#endif // __CUDA_ARCH__ -#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS - -#ifdef THRUST_AGENT_ENTRY_NOINLINE -#define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__ -#else -#define THRUST_AGENT_ENTRY_INLINE_ATTR __forceinline__ -#endif - -#define THRUST_DEVICE_FUNCTION __device__ __forceinline__ -#define THRUST_HOST_FUNCTION __host__ __forceinline__ -#define THRUST_FUNCTION __host__ __device__ __forceinline__ diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 9cd2e72ec26..6617bcfbe1b 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -17,7 +17,6 @@ #pragma once #include -#include #include /** diff --git a/thrust/thrust/system/cuda/config.h b/thrust/thrust/system/cuda/config.h index 637c3565fe6..f29a72ac863 100644 --- a/thrust/thrust/system/cuda/config.h +++ b/thrust/thrust/system/cuda/config.h @@ -42,7 +42,65 @@ #include #include -#include +#include + +/** + * \def THRUST_RUNTIME_FUNCTION + * + * Execution space for functions that can use the CUDA runtime API (`__host__` + * when RDC is off, `__host__ __device__` when RDC is on). + */ +#define THRUST_RUNTIME_FUNCTION CUB_RUNTIME_FUNCTION + +/** + * \def THRUST_RDC_ENABLED + * + * Defined if RDC is enabled. + */ +#ifdef CUB_RDC_ENABLED +#define THRUST_RDC_ENABLED +#endif + +/** + * \def __THRUST_HAS_CUDART__ + * + * Whether or not the active compiler pass is allowed to invoke device kernels + * or methods from the CUDA runtime API. + * + * This macro should not be used in Thrust, as it depends on `__CUDA_ARCH__` + * and is not compatible with `NV_IF_TARGET`. It is provided for legacy + * purposes only. + * + * Replace any usages with `THRUST_RDC_ENABLED` and `NV_IF_TARGET`. + */ +#ifdef CUB_RUNTIME_ENABLED +#define __THRUST_HAS_CUDART__ 1 +#else +#define __THRUST_HAS_CUDART__ 0 +#endif + +// These definitions were intended for internal use only and are now obsolete. +// If you relied on them, consider porting your code to use the functionality +// in libcu++'s header. +// +// For a temporary workaround, define THRUST_PROVIDE_LEGACY_ARCH_MACROS to make +// them available again. These should be considered deprecated and will be +// fully removed in a future version. +#ifdef THRUST_PROVIDE_LEGACY_ARCH_MACROS +#ifdef __CUDA_ARCH__ +#define THRUST_DEVICE_CODE +#endif // __CUDA_ARCH__ +#endif // THRUST_PROVIDE_LEGACY_ARCH_MACROS + +#ifdef THRUST_AGENT_ENTRY_NOINLINE +#define THRUST_AGENT_ENTRY_INLINE_ATTR __noinline__ +#else +#define THRUST_AGENT_ENTRY_INLINE_ATTR __forceinline__ +#endif + +#define THRUST_DEVICE_FUNCTION __device__ __forceinline__ +#define THRUST_HOST_FUNCTION __host__ __forceinline__ +#define THRUST_FUNCTION __host__ __device__ __forceinline__ #if 0 #define THRUST_ARGS(...) __VA_ARGS__ From 73bf5f126e0d08c33d86234c69ce94928566035c Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Mon, 18 Sep 2023 21:16:37 +0000 Subject: [PATCH 15/17] Document new macro --- cub/cub/util_macro.cuh | 4 ++++ thrust/thrust/system/cuda/detail/core/agent_launcher.h | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 34196e37909..83c36683e38 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -119,6 +119,10 @@ constexpr __host__ __device__ auto max CUB_PREVENT_MACRO_SUBSTITUTION(T &&t, #define CUB_DETAIL_KERNEL_ATTRIBUTES __global__ _LIBCUDACXX_HIDDEN #endif +/** + * @def CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION + * If defined, the default suppression of kernel visibility attribute warning is disabled. + */ #if !defined(CUB_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index 7d617a11463..052182baf04 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -45,6 +45,10 @@ THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { +/** + * @def THRUST_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION + * If defined, the default suppression of kernel visibility attribute warning is disabled. + */ #if !defined(THRUST_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION) _LIBCUDACXX_GCC_DIAGNOSTIC_IGNORED("-Wattributes") _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") From 35b168bfeb7fda6c8b930d249dfab997e329d1b3 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Tue, 19 Sep 2023 17:33:04 +0000 Subject: [PATCH 16/17] Fix review notes --- cub/cub/util_macro.cuh | 2 +- thrust/thrust/detail/config/namespace.h | 2 +- .../system/cuda/detail/core/agent_launcher.h | 132 +++++++++--------- .../cuda/detail/core/triple_chevron_launch.h | 3 +- 4 files changed, 69 insertions(+), 70 deletions(-) diff --git a/cub/cub/util_macro.cuh b/cub/cub/util_macro.cuh index 83c36683e38..d8f46f09075 100644 --- a/cub/cub/util_macro.cuh +++ b/cub/cub/util_macro.cuh @@ -35,8 +35,8 @@ #include #include -#include #include +#include // _LIBCUDACXX_HIDDEN, _LIBCUDACXX_{CLANG,GCC}_DIAGNOSTIC_IGNORED CUB_NAMESPACE_BEGIN diff --git a/thrust/thrust/detail/config/namespace.h b/thrust/thrust/detail/config/namespace.h index 6617bcfbe1b..91b9f879cdd 100644 --- a/thrust/thrust/detail/config/namespace.h +++ b/thrust/thrust/detail/config/namespace.h @@ -184,7 +184,7 @@ * This macro is defined by Thrust and may not be overridden. */ #define THRUST_NAMESPACE_END \ - THRUST_DETAIL_ABI_NS_END \ + THRUST_DETAIL_ABI_NS_END \ } /* end namespace thrust */ \ THRUST_NS_POSTFIX diff --git a/thrust/thrust/system/cuda/detail/core/agent_launcher.h b/thrust/thrust/system/cuda/detail/core/agent_launcher.h index 052182baf04..825628e8b05 100644 --- a/thrust/thrust/system/cuda/detail/core/agent_launcher.h +++ b/thrust/thrust/system/cuda/detail/core/agent_launcher.h @@ -35,12 +35,12 @@ #include #include +#include // _LIBCUDACXX_HIDDEN, _LIBCUDACXX_{CLANG,GCC}_DIAGNOSTIC_IGNORED + #include #include -#include - THRUST_NAMESPACE_BEGIN namespace cuda_cub { namespace core { @@ -61,7 +61,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") #if defined(__CUDA_ARCH__) || defined(_NVHPC_CUDA) #if 0 template - void THRUST_DETAIL_KERNEL_ATTRIBUTES + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(Args... args) { @@ -70,105 +70,105 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") } #else template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0) { extern __shared__ char shmem[]; Agent::entry(x0, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1) { extern __shared__ char shmem[]; Agent::entry(x0, x1, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) { extern __shared__ char shmem[]; Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, shmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent(_0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) { extern __shared__ char shmem[]; @@ -181,7 +181,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") #if 0 template - void THRUST_DETAIL_KERNEL_ATTRIBUTES + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, Args... args) { @@ -191,7 +191,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") } #else template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0) { extern __shared__ char shmem[]; @@ -199,7 +199,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1) { extern __shared__ char shmem[]; @@ -207,7 +207,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2) { extern __shared__ char shmem[]; @@ -215,7 +215,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3) { extern __shared__ char shmem[]; @@ -223,7 +223,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4) { extern __shared__ char shmem[]; @@ -231,7 +231,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5) { extern __shared__ char shmem[]; @@ -239,7 +239,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6) { extern __shared__ char shmem[]; @@ -247,7 +247,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7) { extern __shared__ char shmem[]; @@ -255,7 +255,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8) { extern __shared__ char shmem[]; @@ -263,7 +263,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9) { extern __shared__ char shmem[]; @@ -271,7 +271,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA) { extern __shared__ char shmem[]; @@ -279,7 +279,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB) { extern __shared__ char shmem[]; @@ -287,7 +287,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC) { extern __shared__ char shmem[]; @@ -295,7 +295,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD) { extern __shared__ char shmem[]; @@ -303,7 +303,7 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") Agent::entry(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, xA, xB, xC, xD, vshmem); } template - void THRUST_DETAIL_KERNEL_ATTRIBUTES __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) + THRUST_DETAIL_KERNEL_ATTRIBUTES void __launch_bounds__(Agent::ptx_plan::BLOCK_THREADS) _kernel_agent_vshmem(char* vshmem, _0 x0, _1 x1, _2 x2, _3 x3, _4 x4, _5 x5, _6 x6, _7 x7, _8 x8, _9 x9, _xA xA, _xB xB, _xC xC, _xD xD, _xE xE) { extern __shared__ char shmem[]; @@ -314,71 +314,71 @@ _LIBCUDACXX_CLANG_DIAGNOSTIC_IGNORED("-Wattributes") #else #if 0 template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(Args... args) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(Args... args) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*, Args... args) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*, Args... args) {} #else template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2,_3) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2,_3, _4) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2,_3, _4, _5) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5, _6) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2,_3, _4, _5, _6) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7, _8) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0,_1,_2,_3, _4, _5, _6, _7, _8) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD, _xE) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB,_xC, _xD, _xE) {} //////////////////////////////////////////////////////////// template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2,_3) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7, _8) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0,_1,_2,_3, _4, _5, _6, _7, _8) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD) {} template - void THRUST_DETAIL_KERNEL_ATTRIBUTES _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) {} + THRUST_DETAIL_KERNEL_ATTRIBUTES void _kernel_agent_vshmem(char*,_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _xA, _xB, _xC, _xD, _xE) {} #endif #endif diff --git a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h index 3cbc0be9bd5..28ac1a23053 100644 --- a/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h +++ b/thrust/thrust/system/cuda/detail/core/triple_chevron_launch.h @@ -30,11 +30,10 @@ #include #include -#include +#include // _LIBCUDACXX_HIDDEN #include - THRUST_NAMESPACE_BEGIN namespace cuda_cub { From dd32e8dd7000c6e574d322baf06ed95fdbb31ab2 Mon Sep 17 00:00:00 2001 From: Georgy Evtushenko Date: Thu, 21 Sep 2023 16:23:05 +0000 Subject: [PATCH 17/17] Clarify description of kernel annotation macro --- cub/docs/developer_overview.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cub/docs/developer_overview.rst b/cub/docs/developer_overview.rst index 14d8265797d..892d0dc8f06 100644 --- a/cub/docs/developer_overview.rst +++ b/cub/docs/developer_overview.rst @@ -739,7 +739,8 @@ To eliminate the symbol visibility issues on our end, we follow the following ru To satisfy (1), ``thrust::cuda_cub::launcher::triple_chevron`` visibility is hidden. To satisfy (2), instead of annotating kernels as ``__global__`` we annotate them as -``CUB_DETAIL_KERNEL_ATTRIBUTES``. +``CUB_DETAIL_KERNEL_ATTRIBUTES``. Apart from annotating a kernel as global function, the macro +contains hidden visibility attribute. To satisfy (3), CUB symbols are placed inside an inline namespace containing the set of GPU architectures for which the TU is being compiled.