Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prune CUB's ChainedPolicy by __CUDA_ARCH_LIST__ #2154

Merged
merged 6 commits into from
Sep 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
89 changes: 86 additions & 3 deletions cub/cub/util_device.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -358,13 +358,16 @@ struct SmVersionCacheTag
{};

/**
* \brief Retrieves the PTX virtual architecture that will be used on \p device (major * 100 + minor * 10).
* \brief Retrieves the PTX virtual architecture that will be used on \p device (major * 100 + minor * 10). If
* __CUDA_ARCH_LIST__ is defined, this value is one of __CUDA_ARCH_LIST__.
*
* \note This function may cache the result internally.
* \note This function is thread safe.
*/
_CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device)
{
// Note: the ChainedPolicy pruning (i.e., invoke_static) requites that there's an exact match between one of the
// architectures in __CUDA_ARCH__ and the runtime queried ptx version.
auto const payload = GetPerDeviceAttributeCache<PtxVersionCacheTag>()(
// If this call fails, then we get the error code back in the payload, which we check with `CubDebug` below.
[=](int& pv) {
Expand All @@ -388,6 +391,8 @@ _CCCL_HOST inline cudaError_t PtxVersion(int& ptx_version, int device)
*/
CUB_RUNTIME_FUNCTION inline cudaError_t PtxVersion(int& ptx_version)
{
// Note: the ChainedPolicy pruning (i.e., invoke_static) requites that there's an exact match between one of the
// architectures in __CUDA_ARCH__ and the runtime queried ptx version.
cudaError_t result = cudaErrorUnknown;
NV_IF_TARGET(NV_IS_HOST,
(result = PtxVersion(ptx_version, CurrentDevice());),
Expand Down Expand Up @@ -635,18 +640,83 @@ struct ChainedPolicy
template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t Invoke(int device_ptx_version, FunctorT& op)
{
// __CUDA_ARCH_LIST__ is only available from CTK 11.5 onwards
#ifdef __CUDA_ARCH_LIST__
bernhardmgruber marked this conversation as resolved.
Show resolved Hide resolved
elstehle marked this conversation as resolved.
Show resolved Hide resolved
return runtime_to_compiletime<__CUDA_ARCH_LIST__>(device_ptx_version, op);
#else
if (device_ptx_version < PolicyPtxVersion)
{
return PrevPolicyT::Invoke(device_ptx_version, op);
}
return op.template Invoke<PolicyT>();
#endif
}

private:
template <int, typename, typename>
friend struct ChainedPolicy; // let us call invoke_static of other ChainedPolicy instantiations

template <int... CudaArches, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t runtime_to_compiletime(int device_ptx_version, FunctorT& op)
{
// We instantiate invoke_static for each CudaArches, but only call the one matching device_ptx_version.
// If there's no exact match of any of the architectures in __CUDA_ARCH_LIST__ and the runtime
// queried ptx version (i.e., the closest ptx version to the current device's architecture that the EmptyKernel was
// compiled for), we return cudaErrorInvalidDeviceFunction. Such a scenario may arise if CUB_DISABLE_NAMESPACE_MAGIC
// is set and different TUs are compiled for different sets of architecture.
cudaError_t e = cudaErrorInvalidDeviceFunction;
const cudaError_t dummy[] = {
(device_ptx_version == CudaArches ? (e = invoke_static<CudaArches>(op, ::cuda::std::true_type{}))
elstehle marked this conversation as resolved.
Show resolved Hide resolved
: cudaSuccess)...};
(void) dummy;
return e;
}

template <int DevicePtxVersion, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op, ::cuda::std::true_type)
{
// TODO(bgruber): drop diagnostic suppression in C++17
_CCCL_DIAG_PUSH
_CCCL_DIAG_SUPPRESS_MSVC(4127) // suppress Conditional Expression is Constant
_CCCL_IF_CONSTEXPR (DevicePtxVersion < PolicyPtxVersion)
{
// TODO(bgruber): drop boolean tag dispatches in C++17, since _CCCL_IF_CONSTEXPR will discard this branch properly
return PrevPolicyT::template invoke_static<DevicePtxVersion>(
op, ::cuda::std::bool_constant<(DevicePtxVersion < PolicyPtxVersion)>{});
}
else
{
return do_invoke(op, ::cuda::std::bool_constant<DevicePtxVersion >= PolicyPtxVersion>{});
}
_CCCL_DIAG_POP
}

template <int, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT&, ::cuda::std::false_type)
{
_LIBCUDACXX_UNREACHABLE();
}

template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t do_invoke(FunctorT& op, ::cuda::std::true_type)
{
return op.template Invoke<PolicyT>();
}

template <typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t do_invoke(FunctorT&, ::cuda::std::false_type)
{
_LIBCUDACXX_UNREACHABLE();
}
};

/// Helper for dispatching into a policy chain (end-of-chain specialization)
template <int PTX_VERSION, typename PolicyT>
struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
template <int PolicyPtxVersion, typename PolicyT>
struct ChainedPolicy<PolicyPtxVersion, PolicyT, PolicyT>
{
template <int, typename, typename>
friend struct ChainedPolicy; // befriend primary template, so it can call invoke_static

/// The policy for the active compiler pass
using ActivePolicy = PolicyT;

Expand All @@ -656,6 +726,19 @@ struct ChainedPolicy<PTX_VERSION, PolicyT, PolicyT>
{
return op.template Invoke<PolicyT>();
}

private:
template <int, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT& op, ::cuda::std::true_type)
{
return op.template Invoke<PolicyT>();
}

template <int, typename FunctorT>
CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t invoke_static(FunctorT&, ::cuda::std::false_type)
{
_LIBCUDACXX_UNREACHABLE();
}
};

CUB_NAMESPACE_END
195 changes: 195 additions & 0 deletions cub/test/catch2_test_util_device.cu
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
#include <thrust/detail/raw_pointer_cast.h>
#include <thrust/device_vector.h>

#include <cuda/std/__algorithm/find_if.h>
#include <cuda/std/array>

#include "catch2_test_helper.h"
#include "catch2_test_launch_helper.h"

Expand Down Expand Up @@ -87,3 +90,195 @@ CUB_TEST("CUB correctly identifies the ptx version the kernel was compiled for",
REQUIRE(ptx_version == kernel_cuda_arch);
REQUIRE(host_ptx_version == kernel_cuda_arch);
}

#ifdef __CUDA_ARCH_LIST__
CUB_TEST("PtxVersion returns a value from __CUDA_ARCH_LIST__", "[util][dispatch]")
{
int ptx_version = 0;
REQUIRE(cub::PtxVersion(ptx_version) == cudaSuccess);
const auto arch_list = std::vector<int>{__CUDA_ARCH_LIST__};
REQUIRE(std::find(arch_list.begin(), arch_list.end(), ptx_version) != arch_list.end());
}
#endif

#define GEN_POLICY(cur, prev) \
struct policy##cur : cub::ChainedPolicy<cur, policy##cur, policy##prev> \
{ \
static constexpr int value = cur; \
}

#ifdef __CUDA_ARCH_LIST__
// We list policies for all virtual architectures that __CUDA_ARCH_LIST__ can contain, so the actual architectures the
// tests are compiled for should match to one of those
struct policy_hub_all
{
// for the list of supported architectures, see libcudacxx/include/nv/target
GEN_POLICY(350, 350);
GEN_POLICY(370, 350);
GEN_POLICY(500, 370);
GEN_POLICY(520, 500);
GEN_POLICY(530, 520);
GEN_POLICY(600, 530);
GEN_POLICY(610, 600);
GEN_POLICY(620, 610);
GEN_POLICY(700, 620);
GEN_POLICY(720, 700);
GEN_POLICY(750, 720);
GEN_POLICY(800, 750);
GEN_POLICY(860, 800);
GEN_POLICY(870, 860);
GEN_POLICY(890, 870);
GEN_POLICY(900, 890);
GEN_POLICY(1000, 900);
// add more policies here when new architectures emerge
GEN_POLICY(2000, 1000); // non-existing architecture, just to test pruning

using max_policy = policy2000;
};

// Check that selected is one of arches
template <int Selected, int... ArchList>
struct check
{
static_assert(::cuda::std::_Or<::cuda::std::bool_constant<Selected == ArchList>...>::value, "");
using type = cudaError_t;
};

struct closure_all
{
int ptx_version;

// We need to fail template instantiation if ActivePolicy::value is not one from the __CUDA_ARCH_LIST__
template <typename ActivePolicy>
CUB_RUNTIME_FUNCTION auto Invoke() const -> typename check<ActivePolicy::value, __CUDA_ARCH_LIST__>::type
{
// policy_hub_all must list all PTX virtual architectures, so we can do an exact comparison here
# if TEST_LAUNCH == 0
REQUIRE(+ActivePolicy::value == ptx_version);
# endif // TEST_LAUNCH == 0
// the returned error code will be checked by the launch helper
return +ActivePolicy::value == ptx_version ? cudaSuccess : cudaErrorInvalidValue;
}
};

CUB_RUNTIME_FUNCTION cudaError_t
check_chained_policy_prunes_to_arch_list(void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t = 0)
{
if (d_temp_storage == nullptr)
{
temp_storage_bytes = 1;
return cudaSuccess;
}
int ptx_version = 0;
cub::PtxVersion(ptx_version);
closure_all c{ptx_version};
return policy_hub_all::max_policy::Invoke(ptx_version, c);
}

DECLARE_LAUNCH_WRAPPER(check_chained_policy_prunes_to_arch_list, check_wrapper_all);

CUB_TEST("ChainedPolicy prunes based on __CUDA_ARCH_LIST__", "[util][dispatch]")
{
check_wrapper_all();
}
#endif

template <int NumPolicies>
struct check_policy_closure
{
int ptx_version;
::cuda::std::array<int, NumPolicies> policies;

// quick way to get a comparator for find_if below
_CCCL_HOST_DEVICE bool operator()(int policy_ver) const
{
return policy_ver <= ptx_version;
}

template <typename ActivePolicy>
CUB_RUNTIME_FUNCTION cudaError_t Invoke() const
{
#define CHECK_EXPR +ActivePolicy::value == *::cuda::std::find_if(policies.rbegin(), policies.rend(), *this)

#if TEST_LAUNCH == 0
CAPTURE(ptx_version, policies);
REQUIRE(CHECK_EXPR);
#else // TEST_LAUNCH == 0
if (!(CHECK_EXPR))
{
printf("Check `%s` failed!\n ptx_version=%d\n ActivePolicy::value=%d\n policies=",
THRUST_PP_STRINGIZE(CHECK_EXPR),
ptx_version,
ActivePolicy::value);
for (int i = 0; i < NumPolicies; i++)
{
printf("%d,", policies[i]);
}
printf("\n");
}
#endif // TEST_LAUNCH == 0
// the returned error code will be checked by the launch helper
return (CHECK_EXPR) ? cudaSuccess : cudaErrorInvalidValue;
bernhardmgruber marked this conversation as resolved.
Show resolved Hide resolved
#undef CHECK_EXPR
}
};

template <typename PolicyHub, int NumPolicies>
CUB_RUNTIME_FUNCTION cudaError_t check_chained_policy_selects_correct_policy(
void* d_temp_storage, size_t& temp_storage_bytes, ::cuda::std::array<int, NumPolicies> policies, cudaStream_t = 0)
{
if (d_temp_storage == nullptr)
{
temp_storage_bytes = 1;
return cudaSuccess;
}
int ptx_version = 0;
cub::PtxVersion(ptx_version);
check_policy_closure<NumPolicies> c{ptx_version, std::move(policies)};
return PolicyHub::max_policy::Invoke(ptx_version, c);
}

DECLARE_TMPL_LAUNCH_WRAPPER(check_chained_policy_selects_correct_policy,
check_wrapper_some,
ESCAPE_LIST(typename PolicyHub, int NumPolicies),
ESCAPE_LIST(PolicyHub, NumPolicies));

struct policy_hub_some
{
GEN_POLICY(350, 350);
GEN_POLICY(500, 350);
GEN_POLICY(700, 500);
GEN_POLICY(900, 700);
GEN_POLICY(2000, 900); // non-existing architecture, just to test
using max_policy = policy2000;
};

struct policy_hub_few
{
GEN_POLICY(350, 350);
GEN_POLICY(860, 350);
GEN_POLICY(2000, 860); // non-existing architecture, just to test
using max_policy = policy2000;
};

struct policy_hub_minimal
{
GEN_POLICY(350, 350);
using max_policy = policy350;
};

CUB_TEST("ChainedPolicy invokes correct policy", "[util][dispatch]")
{
bernhardmgruber marked this conversation as resolved.
Show resolved Hide resolved
SECTION("policy_hub_some")
{
check_wrapper_some<policy_hub_some, 5>(::cuda::std::array<int, 5>{350, 500, 700, 900, 2000});
}
SECTION("policy_hub_few")
{
check_wrapper_some<policy_hub_few, 3>(::cuda::std::array<int, 3>{350, 860, 2000});
}
SECTION("policy_hub_minimal")
{
check_wrapper_some<policy_hub_minimal, 1>(::cuda::std::array<int, 1>{350});
}
}
12 changes: 10 additions & 2 deletions thrust/testing/transform_input_output_iterator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,16 @@

#include <unittest/unittest.h>

// There is an unfortunate miscompilation of the gcc-13 vectorizer leading to OOB writes
// Adding this attribute suffices that this miscompilation does not appear anymore
#if defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 13
# define THRUST_DISABLE_BROKEN_GCC_VECTORIZER __attribute__((optimize("no-tree-vectorize")))
#else // defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 13
# define THRUST_DISABLE_BROKEN_GCC_VECTORIZER
#endif // defined(_CCCL_COMPILER_GCC) && __GNUC__ >= 13

template <class Vector>
void TestTransformInputOutputIterator()
THRUST_DISABLE_BROKEN_GCC_VECTORIZER void TestTransformInputOutputIterator()
{
using T = typename Vector::value_type;

Expand Down Expand Up @@ -52,7 +60,7 @@ void TestTransformInputOutputIterator()
DECLARE_VECTOR_UNITTEST(TestTransformInputOutputIterator);

template <class Vector>
void TestMakeTransformInputOutputIterator()
THRUST_DISABLE_BROKEN_GCC_VECTORIZER void TestMakeTransformInputOutputIterator()
{
using T = typename Vector::value_type;

Expand Down
Loading