Skip to content

Commit

Permalink
Backport 574 ptx (#663)
Browse files Browse the repository at this point in the history
* Add `cuda::ptx::*` namespace (#574)

* fixup `___CUDA_VPTX` -> `_CUDA_VPTX` (#664)

* fixup `___CUDA_VPTX` -> `_CUDA_VPTX`

* Fix warning for unused variable in branches that are constexpr disabled.

---------

Co-authored-by: Allard Hendriksen <ahendriksen@nvidia.com>
Co-authored-by: Wesley Maxey <71408887+wmaxey@users.noreply.github.com>
  • Loading branch information
3 people authored Nov 8, 2023
1 parent db1312e commit bb37c94
Show file tree
Hide file tree
Showing 12 changed files with 2,334 additions and 30 deletions.
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
// UNSUPPORTED: libcpp-has-no-threads

// <cuda/ptx>

#include <cuda/ptx>
#include <cuda/std/utility>

#include "concurrent_agents.h"
#include "cuda_space_selector.h"
#include "test_macros.h"

template <typename ... _Ty>
__device__ inline bool __unused(_Ty...) { return true; }

__global__ void test_compilation() {
using cuda::ptx::sem_release;
using cuda::ptx::space_cluster;
using cuda::ptx::space_shared;
using cuda::ptx::scope_cluster;
using cuda::ptx::scope_cta;

__shared__ uint64_t bar;
bar = 1;
uint64_t state = 1;

#if __cccl_ptx_isa >= 700
NV_IF_TARGET(NV_PROVIDES_SM_80, (
state = cuda::ptx::mbarrier_arrive(&bar); // 1.
state = cuda::ptx::mbarrier_arrive_no_complete(&bar, 1); // 5.
));
#endif // __cccl_ptx_isa >= 700

// This guard is redundant: before PTX ISA 7.8, there was no support for SM_90
#if __cccl_ptx_isa >= 780
NV_IF_TARGET(NV_PROVIDES_SM_90, (
state = cuda::ptx::mbarrier_arrive(&bar, 1); // 2.
));
#endif // __cccl_ptx_isa >= 780

#if __cccl_ptx_isa >= 800
NV_IF_TARGET(NV_PROVIDES_SM_90, (
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar); // 3a.
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar); // 3a.

state = cuda::ptx::mbarrier_arrive(sem_release, scope_cta, space_shared, &bar, 1); // 3b.
state = cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_shared, &bar, 1); // 3b.

cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar); // 4a.

cuda::ptx::mbarrier_arrive(sem_release, scope_cluster, space_cluster, &bar, 1); // 4b.

state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cta, space_shared, &bar, 1); // 8.
state = cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_shared, &bar, 1); // 8.

cuda::ptx::mbarrier_arrive_expect_tx(sem_release, scope_cluster, space_cluster, &bar, 1); // 9.
));
#endif // __cccl_ptx_isa >= 800
__unused(bar, state);
}

int main(int, char**)
{
return 0;
}
2 changes: 2 additions & 0 deletions libcudacxx/.upstream-tests/test/support/concurrent_agents.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#endif
#endif

#include <cuda/std/cassert>

#include "test_macros.h"

TEST_EXEC_CHECK_DISABLE
Expand Down
2 changes: 2 additions & 0 deletions libcudacxx/docs/extended_api.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ nav_order: 3

{% include_relative extended_api/functional.md %}

{% include_relative extended_api/ptx.md %}

[Thread Scopes]: ./extended_api/memory_model.md#thread-scopes
[Thread Groups]: ./extended_api/thread_groups.md

669 changes: 669 additions & 0 deletions libcudacxx/docs/extended_api/ptx.md

Large diffs are not rendered by default.

23 changes: 23 additions & 0 deletions libcudacxx/include/cuda/ptx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
// -*- C++ -*-
//===----------------------------------------------------------------------===//
//
// Part of libcu++, the C++ Standard Library for your entire system,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

#ifndef _CUDA_PTX
#define _CUDA_PTX

#include "std/detail/__config"

#include "std/detail/__pragma_push"

#include "std/detail/libcxx/include/__cuda/ptx.h"

#include "std/detail/__pragma_pop"

#endif // _CUDA_PTX
3 changes: 3 additions & 0 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__config
Original file line number Diff line number Diff line change
Expand Up @@ -1505,6 +1505,9 @@ typedef __char32_t char32_t;
#define _LIBCUDACXX_END_NAMESPACE_CUDA } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE namespace cuda { namespace device { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE } } }
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_PTX namespace cuda { namespace ptx { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_PTX } } }
#define _CUDA_VPTX ::cuda::ptx::_LIBCUDACXX_ABI_NAMESPACE
#define _LIBCUDACXX_BEGIN_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL namespace cuda { namespace device { namespace experimental { inline namespace _LIBCUDACXX_ABI_NAMESPACE {
#define _LIBCUDACXX_END_NAMESPACE_CUDA_DEVICE_EXPERIMENTAL } } } }
#endif
Expand Down
48 changes: 18 additions & 30 deletions libcudacxx/include/cuda/std/detail/libcxx/include/__cuda/barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ _CCCL_IMPLICIT_SYSTEM_HEADER

#include "../cstdlib" // _LIBCUDACXX_UNREACHABLE
#include "../__type_traits/void_t.h" // _CUDA_VSTD::__void_t
#include "../__cuda/ptx.h" // cuda::ptx::*

#if defined(_LIBCUDACXX_COMPILER_NVRTC)
#define _LIBCUDACXX_OFFSET_IS_ZERO(type, member) !(&(((type *)0)->member))
Expand Down Expand Up @@ -206,29 +207,21 @@ friend class _CUDA_VSTD::__barrier_poll_tester_parity;
else if (!__isShared(&__barrier)) {
__trap();
}

asm volatile ("mbarrier.arrive.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__update))
: "memory");
// Cannot use cuda::device::barrier_native_handle here, as it is
// only defined for block-scope barriers. This barrier may be a
// non-block scoped barrier.
auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier);
__token = _CUDA_VPTX::mbarrier_arrive(__bh, __update);
), NV_PROVIDES_SM_80, (
if (!__isShared(&__barrier)) {
return __barrier.arrive(__update);
}

auto __bh = reinterpret_cast<_CUDA_VSTD::uint64_t*>(&__barrier);
// Need 2 instructions, can't finish barrier with arrive > 1
if (__update > 1) {
asm volatile ("mbarrier.arrive.noComplete.shared.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier))),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__update - 1))
: "memory");
_CUDA_VPTX::mbarrier_arrive_no_complete(__bh, __update - 1);
}
asm volatile ("mbarrier.arrive.shared.b64 %0, [%1];"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__cvta_generic_to_shared(&__barrier)))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive( __bh);
), NV_IS_DEVICE, (
if (!__isShared(&__barrier)) {
return __barrier.arrive(__update);
Expand Down Expand Up @@ -603,27 +596,22 @@ barrier<thread_scope_block>::arrival_token barrier_arrive_tx(
// us in release builds. In debug builds, the error would be caught
// by the asserts at the top of this function.

auto __bh = __cvta_generic_to_shared(barrier_native_handle(__b));
auto __native_handle = barrier_native_handle(__b);
auto __bh = __cvta_generic_to_shared(__native_handle);
if (__arrive_count_update == 1) {
asm (
"mbarrier.arrive.expect_tx.release.cta.shared::cta.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive_expect_tx(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __transaction_count_update
);
} else {
asm (
"mbarrier.expect_tx.relaxed.cta.shared::cta.b64 [%0], %1;"
:
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__transaction_count_update))
: "memory");
asm (
"mbarrier.arrive.release.cta.shared::cta.b64 %0, [%1], %2;"
: "=l"(__token)
: "r"(static_cast<_CUDA_VSTD::uint32_t>(__bh)),
"r"(static_cast<_CUDA_VSTD::uint32_t>(__arrive_count_update))
: "memory");
__token = _CUDA_VPTX::mbarrier_arrive(
_CUDA_VPTX::sem_release, _CUDA_VPTX::scope_cta, _CUDA_VPTX::space_shared, __native_handle, __arrive_count_update
);
}
)
);
Expand Down Expand Up @@ -1089,8 +1077,8 @@ __completion_mechanism __dispatch_memcpy_async_global_to_shared(_Group const & _
));

NV_IF_TARGET(NV_PROVIDES_SM_80, (
const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group);
if _LIBCUDACXX_CONSTEXPR_AFTER_CXX14 (_Align >= 4) {
const bool __can_use_async_group = __allowed_completions & uint32_t(__completion_mechanism::__async_group);
if (__can_use_async_group) {
__cp_async_shared_global_mechanism<_Align>(__group, __dest_char, __src_char, __size);
return __completion_mechanism::__async_group;
Expand Down
Loading

0 comments on commit bb37c94

Please sign in to comment.