Skip to content

Commit

Permalink
Merge branch 'main' into chained_policy_prune
Browse files Browse the repository at this point in the history
  • Loading branch information
miscco authored Aug 27, 2024
2 parents 60987f2 + 0a1cddb commit 3aaad6e
Show file tree
Hide file tree
Showing 109 changed files with 2,549 additions and 599 deletions.
101 changes: 1 addition & 100 deletions cub/cub/device/dispatch/dispatch_for.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#endif // no system header

#include <cub/agent/agent_for.cuh>
#include <cub/device/dispatch/kernels/for_each.cuh>
#include <cub/device/dispatch/tuning/tuning_for.cuh>
#include <cub/thread/thread_load.cuh>
#include <cub/util_device.cuh>
Expand All @@ -56,106 +57,6 @@ namespace detail
namespace for_each
{

template <class Fn>
struct first_parameter
{
using type = void;
};

template <class C, class R, class A>
struct first_parameter<R (C::*)(A)>
{
using type = A;
};

template <class C, class R, class A>
struct first_parameter<R (C::*)(A) const>
{
using type = A;
};

template <class Fn>
using first_parameter_t = typename first_parameter<decltype(&Fn::operator())>::type;

template <class Value, class Fn, class = void>
struct has_unique_value_overload : ::cuda::std::false_type
{};

// clang-format off
template <class Value, class Fn>
struct has_unique_value_overload<
Value,
Fn,
typename ::cuda::std::enable_if<
!::cuda::std::is_reference<first_parameter_t<Fn>>::value &&
::cuda::std::is_convertible<Value, first_parameter_t<Fn>
>::value>::type>
: ::cuda::std::true_type
{};

// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable.
// This can be observable if the unary operator takes parameter by reference and modifies it or uses address.
// The trait below checks if the freedom to copy trivial types can be regained.
template <typename Value, typename Fn>
using can_regain_copy_freedom =
::cuda::std::integral_constant<
bool,
::cuda::std::is_trivially_constructible<Value>::value &&
::cuda::std::is_trivially_copy_assignable<Value>::value &&
:: cuda::std::is_trivially_move_assignable<Value>::value &&
::cuda::std::is_trivially_destructible<Value>::value &&
has_unique_value_overload<Value, Fn>::value>;
// clang-format on

// This kernel is used when the block size is not known at compile time
template <class ChainedPolicyT, class OffsetT, class OpT>
CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
{
using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
using agent_t = agent_block_striped_t<active_policy_t, OffsetT, OpT>;

const auto block_threads = static_cast<OffsetT>(blockDim.x);
const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
const auto tile_base = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
const auto num_remaining = num_items - tile_base;
const auto items_in_tile = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);

if (items_in_tile == items_per_tile)
{
agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
}
else
{
agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
}
}

// This kernel is used when the block size is known at compile time
template <class ChainedPolicyT, class OffsetT, class OpT>
CUB_DETAIL_KERNEL_ATTRIBUTES //
__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
void static_kernel(OffsetT num_items, OpT op)
{
using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
using agent_t = agent_block_striped_t<active_policy_t, OffsetT, OpT>;

constexpr auto block_threads = active_policy_t::block_threads;
constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads;

const auto tile_base = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
const auto num_remaining = num_items - tile_base;
const auto items_in_tile = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);

if (items_in_tile == items_per_tile)
{
agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
}
else
{
agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
}
}

// The dispatch layer is in the detail namespace until we figure out tuning API
template <class OffsetT, class OpT, class PolicyHubT = policy_hub_t>
struct dispatch_t : PolicyHubT
Expand Down
154 changes: 154 additions & 0 deletions cub/cub/device/dispatch/kernels/for_each.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
/******************************************************************************
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of the NVIDIA CORPORATION nor the
* names of its contributors may be used to endorse or promote products
* derived from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
* DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
******************************************************************************/

#pragma once

#include <cub/config.cuh>

#if defined(_CCCL_IMPLICIT_SYSTEM_HEADER_GCC)
# pragma GCC system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_CLANG)
# pragma clang system_header
#elif defined(_CCCL_IMPLICIT_SYSTEM_HEADER_MSVC)
# pragma system_header
#endif // no system header

#include <cub/agent/agent_for.cuh>

#include <cuda/std/type_traits>

CUB_NAMESPACE_BEGIN

namespace detail
{
namespace for_each
{

template <class Fn>
struct first_parameter
{
using type = void;
};

template <class C, class R, class A>
struct first_parameter<R (C::*)(A)>
{
using type = A;
};

template <class C, class R, class A>
struct first_parameter<R (C::*)(A) const>
{
using type = A;
};

template <class Fn>
using first_parameter_t = typename first_parameter<decltype(&Fn::operator())>::type;

template <class Value, class Fn, class = void>
struct has_unique_value_overload : ::cuda::std::false_type
{};

// clang-format off
template <class Value, class Fn>
struct has_unique_value_overload<
Value,
Fn,
typename ::cuda::std::enable_if<
!::cuda::std::is_reference<first_parameter_t<Fn>>::value &&
::cuda::std::is_convertible<Value, first_parameter_t<Fn>
>::value>::type>
: ::cuda::std::true_type
{};

// For trivial types, foreach is not allowed to copy values, even if those are trivially copyable.
// This can be observable if the unary operator takes parameter by reference and modifies it or uses address.
// The trait below checks if the freedom to copy trivial types can be regained.
template <typename Value, typename Fn>
using can_regain_copy_freedom =
::cuda::std::integral_constant<
bool,
::cuda::std::is_trivially_constructible<Value>::value &&
::cuda::std::is_trivially_copy_assignable<Value>::value &&
:: cuda::std::is_trivially_move_assignable<Value>::value &&
::cuda::std::is_trivially_destructible<Value>::value &&
has_unique_value_overload<Value, Fn>::value>;
// clang-format on

// This kernel is used when the block size is not known at compile time
template <class ChainedPolicyT, class OffsetT, class OpT>
CUB_DETAIL_KERNEL_ATTRIBUTES void dynamic_kernel(OffsetT num_items, OpT op)
{
using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
using agent_t = agent_block_striped_t<active_policy_t, OffsetT, OpT>;

const auto block_threads = static_cast<OffsetT>(blockDim.x);
const auto items_per_tile = active_policy_t::items_per_thread * block_threads;
const auto tile_base = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
const auto num_remaining = num_items - tile_base;
const auto items_in_tile = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);

if (items_in_tile == items_per_tile)
{
agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
}
else
{
agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
}
}

// This kernel is used when the block size is known at compile time
template <class ChainedPolicyT, class OffsetT, class OpT>
CUB_DETAIL_KERNEL_ATTRIBUTES //
__launch_bounds__(ChainedPolicyT::ActivePolicy::for_policy_t::block_threads) //
void static_kernel(OffsetT num_items, OpT op)
{
using active_policy_t = typename ChainedPolicyT::ActivePolicy::for_policy_t;
using agent_t = agent_block_striped_t<active_policy_t, OffsetT, OpT>;

constexpr auto block_threads = active_policy_t::block_threads;
constexpr auto items_per_tile = active_policy_t::items_per_thread * block_threads;

const auto tile_base = static_cast<OffsetT>(blockIdx.x) * items_per_tile;
const auto num_remaining = num_items - tile_base;
const auto items_in_tile = static_cast<OffsetT>(num_remaining < items_per_tile ? num_remaining : items_per_tile);

if (items_in_tile == items_per_tile)
{
agent_t{tile_base, op}.template consume_tile<true>(items_per_tile, block_threads);
}
else
{
agent_t{tile_base, op}.template consume_tile<false>(items_in_tile, block_threads);
}
}

} // namespace for_each
} // namespace detail

CUB_NAMESPACE_END
13 changes: 0 additions & 13 deletions cub/test/c2h/utility.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -38,19 +38,6 @@
namespace c2h
{

/**
* Return a value of type `T0` with the same bitwise representation of `in`.
* Types `To` and `From` must be the same size.
*/
template <typename To, typename From>
__host__ __device__ To bit_cast(const From& in)
{
static_assert(sizeof(To) == sizeof(From), "Types must be same size.");
To out;
memcpy(&out, &in, sizeof(To));
return out;
}

// TODO(bgruber): duplicated version of thrust/testing/unittest/system.h
inline std::string demangle(const char* name)
{
Expand Down
4 changes: 3 additions & 1 deletion cub/test/catch2_radix_sort_helper.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@
#include <thrust/scan.h>
#include <thrust/sequence.h>

#include <cuda/std/bit>

#include <array>
#include <climits>
#include <cstdint>
Expand Down Expand Up @@ -199,7 +201,7 @@ c2h::host_vector<KeyT> get_striped_keys(const c2h::host_vector<KeyT>& h_keys, in

for (std::size_t i = 0; i < h_keys.size(); i++)
{
bit_ordered_t key = c2h::bit_cast<bit_ordered_t>(h_keys[i]);
bit_ordered_t key = ::cuda::std::bit_cast<bit_ordered_t>(h_keys[i]);

_CCCL_IF_CONSTEXPR (traits_t::CATEGORY == cub::FLOATING_POINT)
{
Expand Down
7 changes: 3 additions & 4 deletions cub/test/catch2_test_device_histogram.cu
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,9 @@
#include <cub/device/device_histogram.cuh>
#include <cub/iterator/counting_input_iterator.cuh>

#include <cuda/std/__algorithm/copy.h>
#include <cuda/std/__cccl/dialect.h>
#include <cuda/std/__cccl/execution_space.h>
#include <cuda/std/__algorithm_>
#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/type_traits>

#include <algorithm>
Expand Down Expand Up @@ -213,7 +212,7 @@ struct bit_and_anything
_CCCL_HOST_DEVICE auto operator()(const T& a, const T& b) const -> T
{
using U = typename cub::Traits<T>::UnsignedBits;
return c2h::bit_cast<T>(static_cast<U>(c2h::bit_cast<U>(a) & c2h::bit_cast<U>(b)));
return ::cuda::std::bit_cast<T>(static_cast<U>(::cuda::std::bit_cast<U>(a) & ::cuda::std::bit_cast<U>(b)));
}
};

Expand Down
4 changes: 2 additions & 2 deletions cub/test/catch2_test_device_radix_sort_keys.cu
Original file line number Diff line number Diff line change
Expand Up @@ -192,8 +192,8 @@ CUB_TEST("DeviceRadixSort::SortKeys: negative zero handling", "[keys][radix][sor
using bits_t = typename cub::Traits<key_t>::UnsignedBits;

constexpr std::size_t num_bits = sizeof(key_t) * CHAR_BIT;
const key_t positive_zero = c2h::bit_cast<key_t>(bits_t(0));
const key_t negative_zero = c2h::bit_cast<key_t>(bits_t(1) << (num_bits - 1));
const key_t positive_zero = ::cuda::std::bit_cast<key_t>(bits_t(0));
const key_t negative_zero = ::cuda::std::bit_cast<key_t>(bits_t(1) << (num_bits - 1));

constexpr std::size_t max_num_items = 1 << 18;
const std::size_t num_items = GENERATE_COPY(take(1, random(max_num_items / 2, max_num_items)));
Expand Down
5 changes: 3 additions & 2 deletions cub/test/catch2_test_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
_CCCL_NV_DIAG_SUPPRESS(177) // catch2 may contain unused variableds
#endif // nvcc-11

#include <cuda/std/bit>
#include <cuda/std/cmath>
#include <cuda/std/utility>

Expand Down Expand Up @@ -133,8 +134,8 @@ struct bitwise_equal
bool operator()(const T& a, const T& b) const
{
using bits_t = typename cub::Traits<T>::UnsignedBits;
bits_t a_bits = c2h::bit_cast<bits_t>(a);
bits_t b_bits = c2h::bit_cast<bits_t>(b);
bits_t a_bits = ::cuda::std::bit_cast<bits_t>(a);
bits_t b_bits = ::cuda::std::bit_cast<bits_t>(b);
return a_bits == b_bits;
}
};
Expand Down
1 change: 1 addition & 0 deletions cub/test/catch2_test_nvrtc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ TEST_CASE("Test nvrtc", "[test][nvrtc]")
"#include <cub/block/block_reduce.cuh> \n"
"#include <cub/block/block_scan.cuh> \n"
"#include <cub/device/dispatch/kernels/reduce.cuh> \n"
"#include <cub/device/dispatch/kernels/for_each.cuh> \n"
" \n"
"extern \"C\" __global__ void kernel(int *ptr, int *errors) \n"
"{ \n"
Expand Down
Loading

0 comments on commit 3aaad6e

Please sign in to comment.