Skip to content

Commit

Permalink
[CUDA] Add support for __grid_constant__ attribute (#114589)
Browse files Browse the repository at this point in the history
LLVM support for the attribute has been implemented already, so it just
plumbs it through to the CUDA front-end.

One notable difference from NVCC is that the attribute can be used
regardless of the targeted GPU. On the older GPUs it will just be
ignored. The attribute is a performance hint, and does not warrant a
hard error if compiler can't benefit from it on a particular GPU
variant.
  • Loading branch information
Artem-B authored Nov 5, 2024
1 parent 592c0fe commit 7c3fdcc
Show file tree
Hide file tree
Showing 13 changed files with 145 additions and 8 deletions.
1 change: 1 addition & 0 deletions clang/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -716,6 +716,7 @@ CUDA Support
^^^^^^^^^^^^
- Clang now supports CUDA SDK up to 12.6
- Added support for sm_100
- Added support for `__grid_constant__` attribute.

AIX Support
^^^^^^^^^^^
Expand Down
7 changes: 7 additions & 0 deletions clang/include/clang/Basic/Attr.td
Original file line number Diff line number Diff line change
Expand Up @@ -1451,6 +1451,13 @@ def CUDAHost : InheritableAttr {
}
def : MutualExclusions<[CUDAGlobal, CUDAHost]>;

def CUDAGridConstant : InheritableAttr {
let Spellings = [GNU<"grid_constant">, Declspec<"__grid_constant__">];
let Subjects = SubjectList<[ParmVar]>;
let LangOpts = [CUDA];
let Documentation = [CUDAGridConstantAttrDocs];
}

def NVPTXKernel : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
let Spellings = [Clang<"nvptx_kernel">];
let Subjects = SubjectList<[Function]>;
Expand Down
10 changes: 10 additions & 0 deletions clang/include/clang/Basic/AttrDocs.td
Original file line number Diff line number Diff line change
Expand Up @@ -6794,6 +6794,16 @@ unbind runtime APIs.
}];
}

def CUDAGridConstantAttrDocs : Documentation {
let Category = DocCatDecl;
let Content = [{
The ``__grid_constant__`` attribute can be applied to a ``const``-qualified kernel
function argument and allows compiler to take the address of that argument without
making a copy. The argument applies to sm_70 or newer GPUs, during compilation
with CUDA-11.7(PTX 7.7) or newer, and is ignored otherwise.
}];
}

def HIPManagedAttrDocs : Documentation {
let Category = DocCatDecl;
let Content = [{
Expand Down
2 changes: 2 additions & 0 deletions clang/include/clang/Basic/DiagnosticSemaKinds.td
Original file line number Diff line number Diff line change
Expand Up @@ -9100,6 +9100,8 @@ def err_cuda_host_shared : Error<
"%select{__device__|__global__|__host__|__host__ __device__}0 functions">;
def err_cuda_nonstatic_constdev: Error<"__constant__, __device__, and "
"__managed__ are not allowed on non-static local variables">;
def err_cuda_grid_constant_not_allowed : Error<
"__grid_constant__ is only allowed on const-qualified kernel parameters">;
def err_cuda_ovl_target : Error<
"%select{__device__|__global__|__host__|__host__ __device__}0 function %1 "
"cannot overload %select{__device__|__global__|__host__|__host__ __device__}2 function %3">;
Expand Down
36 changes: 29 additions & 7 deletions clang/lib/CodeGen/Targets/NVPTX.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "ABIInfoImpl.h"
#include "TargetInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/IntrinsicsNVPTX.h"

using namespace clang;
Expand Down Expand Up @@ -78,7 +79,13 @@ class NVPTXTargetCodeGenInfo : public TargetCodeGenInfo {
// Adds a NamedMDNode with GV, Name, and Operand as operands, and adds the
// resulting MDNode to the nvvm.annotations MDNode.
static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
int Operand);
int Operand,
const SmallVectorImpl<int> &GridConstantArgs);

static void addNVVMMetadata(llvm::GlobalValue *GV, StringRef Name,
int Operand) {
addNVVMMetadata(GV, Name, Operand, SmallVector<int, 1>(0));
}

private:
static void emitBuiltinSurfTexDeviceCopy(CodeGenFunction &CGF, LValue Dst,
Expand Down Expand Up @@ -240,7 +247,8 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
}

const FunctionDecl *FD = dyn_cast_or_null<FunctionDecl>(D);
if (!FD) return;
if (!FD)
return;

llvm::Function *F = cast<llvm::Function>(GV);

Expand All @@ -263,8 +271,13 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
// __global__ functions cannot be called from the device, we do not
// need to set the noinline attribute.
if (FD->hasAttr<CUDAGlobalAttr>()) {
SmallVector<int, 10> GCI;
for (auto IV : llvm::enumerate(FD->parameters()))
if (IV.value()->hasAttr<CUDAGridConstantAttr>())
// For some reason arg indices are 1-based in NVVM
GCI.push_back(IV.index() + 1);
// Create !{<func-ref>, metadata !"kernel", i32 1} node
addNVVMMetadata(F, "kernel", 1);
addNVVMMetadata(F, "kernel", 1, GCI);
}
if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>())
M.handleCUDALaunchBoundsAttr(F, Attr);
Expand All @@ -276,18 +289,27 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes(
}
}

void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV,
StringRef Name, int Operand) {
void NVPTXTargetCodeGenInfo::addNVVMMetadata(
llvm::GlobalValue *GV, StringRef Name, int Operand,
const SmallVectorImpl<int> &GridConstantArgs) {
llvm::Module *M = GV->getParent();
llvm::LLVMContext &Ctx = M->getContext();

// Get "nvvm.annotations" metadata node
llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

llvm::Metadata *MDVals[] = {
SmallVector<llvm::Metadata *, 5> MDVals = {
llvm::ConstantAsMetadata::get(GV), llvm::MDString::get(Ctx, Name),
llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), Operand))};
if (!GridConstantArgs.empty()) {
SmallVector<llvm::Metadata *, 10> GCM;
for (int I : GridConstantArgs)
GCM.push_back(llvm::ConstantAsMetadata::get(
llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), I)));
MDVals.append({llvm::MDString::get(Ctx, "grid_constant"),
llvm::MDNode::get(Ctx, GCM)});
}
// Append metadata to nvvm.annotations
MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}
Expand All @@ -309,7 +331,7 @@ NVPTXTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM,
return llvm::ConstantExpr::getAddrSpaceCast(
llvm::ConstantPointerNull::get(NPT), PT);
}
}
} // namespace

void CodeGenModule::handleCUDALaunchBoundsAttr(llvm::Function *F,
const CUDALaunchBoundsAttr *Attr,
Expand Down
11 changes: 10 additions & 1 deletion clang/lib/Sema/SemaDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12225,8 +12225,17 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD,
<< NewFD;
}

if (!Redeclaration && LangOpts.CUDA)
if (!Redeclaration && LangOpts.CUDA) {
bool IsKernel = NewFD->hasAttr<CUDAGlobalAttr>();
for (auto *Parm : NewFD->parameters()) {
if (!Parm->getType()->isDependentType() &&
Parm->hasAttr<CUDAGridConstantAttr>() &&
!(IsKernel && Parm->getType().isConstQualified()))
Diag(Parm->getAttr<CUDAGridConstantAttr>()->getLocation(),
diag::err_cuda_grid_constant_not_allowed);
}
CUDA().checkTargetOverload(NewFD, Previous);
}
}

// Check if the function definition uses any AArch64 SME features without
Expand Down
12 changes: 12 additions & 0 deletions clang/lib/Sema/SemaDeclAttr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4748,6 +4748,15 @@ static void handleManagedAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
D->addAttr(CUDADeviceAttr::CreateImplicit(S.Context));
}

static void handleGridConstantAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
if (D->isInvalidDecl())
return;
// Whether __grid_constant__ is allowed to be used will be checked in
// Sema::CheckFunctionDeclaration as we need complete function decl to make
// the call.
D->addAttr(::new (S.Context) CUDAGridConstantAttr(S.Context, AL));
}

static void handleGNUInlineAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
const auto *Fn = cast<FunctionDecl>(D);
if (!Fn->isInlineSpecified()) {
Expand Down Expand Up @@ -6645,6 +6654,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL,
case ParsedAttr::AT_CUDADevice:
handleDeviceAttr(S, D, AL);
break;
case ParsedAttr::AT_CUDAGridConstant:
handleGridConstantAttr(S, D, AL);
break;
case ParsedAttr::AT_HIPManaged:
handleManagedAttr(S, D, AL);
break;
Expand Down
6 changes: 6 additions & 0 deletions clang/lib/Sema/SemaTemplateInstantiateDecl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -876,6 +876,12 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs,
continue;
}

if (auto *A = dyn_cast<CUDAGridConstantAttr>(TmplAttr)) {
if (!New->hasAttr<CUDAGridConstantAttr>())
New->addAttr(A->clone(Context));
continue;
}

assert(!TmplAttr->isPackExpansion());
if (TmplAttr->isLateParsed() && LateAttrs) {
// Late parsed attributes must be instantiated and attached after the
Expand Down
2 changes: 2 additions & 0 deletions clang/test/CodeGenCUDA/Inputs/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#define __managed__ __attribute__((managed))
#endif
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))
#define __grid_constant__ __attribute__((grid_constant))
#else
#define __constant__
#define __device__
Expand All @@ -20,6 +21,7 @@
#define __shared__
#define __managed__
#define __launch_bounds__(...)
#define __grid_constant__
#endif

struct dim3 {
Expand Down
31 changes: 31 additions & 0 deletions clang/test/CodeGenCUDA/grid-constant.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals all --version 5
// RUN: %clang_cc1 "-triple" "nvptx64-nvidia-cuda" -emit-llvm -fcuda-is-device -o - %s | FileCheck %s

#include "Inputs/cuda.h"

struct S {};

__global__ void kernel(__grid_constant__ const S gc_arg1, int arg2, __grid_constant__ const int gc_arg3) {}

// dependent arguments get diagnosed after instantiation.
template <typename T>
__global__ void tkernel_const(__grid_constant__ const T arg) {}

template <typename T>
__global__ void tkernel(int dummy, __grid_constant__ T arg) {}

void foo() {
tkernel_const<const S><<<1,1>>>({});
tkernel_const<S><<<1,1>>>({});
tkernel<const S><<<1,1>>>(1, {});
}
//.
//.
// CHECK: [[META0:![0-9]+]] = !{ptr @_Z6kernel1Sii, !"kernel", i32 1, !"grid_constant", [[META1:![0-9]+]]}
// CHECK: [[META1]] = !{i32 1, i32 3}
// CHECK: [[META2:![0-9]+]] = !{ptr @_Z13tkernel_constIK1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3:![0-9]+]]}
// CHECK: [[META3]] = !{i32 1}
// CHECK: [[META4:![0-9]+]] = !{ptr @_Z13tkernel_constI1SEvT_, !"kernel", i32 1, !"grid_constant", [[META3]]}
// CHECK: [[META5:![0-9]+]] = !{ptr @_Z7tkernelIK1SEviT_, !"kernel", i32 1, !"grid_constant", [[META6:![0-9]+]]}
// CHECK: [[META6]] = !{i32 2}
//.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@
// CHECK-NEXT: CUDADeviceBuiltinSurfaceType (SubjectMatchRule_record)
// CHECK-NEXT: CUDADeviceBuiltinTextureType (SubjectMatchRule_record)
// CHECK-NEXT: CUDAGlobal (SubjectMatchRule_function)
// CHECK-NEXT: CUDAGridConstant (SubjectMatchRule_variable_is_parameter)
// CHECK-NEXT: CUDAHost (SubjectMatchRule_function)
// CHECK-NEXT: CUDALaunchBounds (SubjectMatchRule_objc_method, SubjectMatchRule_hasType_functionType)
// CHECK-NEXT: CUDAShared (SubjectMatchRule_variable)
Expand Down
1 change: 1 addition & 0 deletions clang/test/SemaCUDA/Inputs/cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#define __host__ __attribute__((host))
#define __shared__ __attribute__((shared))
#define __managed__ __attribute__((managed))
#define __grid_constant__ __attribute__((grid_constant))
#define __launch_bounds__(...) __attribute__((launch_bounds(__VA_ARGS__)))

struct dim3 {
Expand Down
33 changes: 33 additions & 0 deletions clang/test/SemaCUDA/grid-constant.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
// RUN: %clang_cc1 -fsyntax-only -verify %s
// RUN: %clang_cc1 -fsyntax-only -fcuda-is-device -verify %s
#include "Inputs/cuda.h"

struct S {};

__global__ void kernel_struct(__grid_constant__ const S arg) {}
__global__ void kernel_scalar(__grid_constant__ const int arg) {}

__global__ void gc_kernel_non_const(__grid_constant__ S arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}

void non_kernel(__grid_constant__ S arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}

// templates w/ non-dependent argument types get diagnosed right
// away, without instantiation.
template <typename T>
__global__ void tkernel_nd_const(__grid_constant__ const S arg, T dummy) {}
template <typename T>
__global__ void tkernel_nd_non_const(__grid_constant__ S arg, T dummy) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}

// dependent arguments get diagnosed after instantiation.
template <typename T>
__global__ void tkernel_const(__grid_constant__ const T arg) {}

template <typename T>
__global__ void tkernel(__grid_constant__ T arg) {} // expected-error {{__grid_constant__ is only allowed on const-qualified kernel parameters}}

void foo() {
tkernel_const<const S><<<1,1>>>({});
tkernel_const<S><<<1,1>>>({});
tkernel<const S><<<1,1>>>({});
tkernel<S><<<1,1>>>({}); // expected-note {{in instantiation of function template specialization 'tkernel<S>' requested here}}
}

0 comments on commit 7c3fdcc

Please sign in to comment.