From d42e70b3d315645e37f3b1455d39e68678e69525 Mon Sep 17 00:00:00 2001 From: Reshabh Sharma Date: Wed, 4 Aug 2021 19:47:07 +0530 Subject: [PATCH] [AMDGPU] Handle functions in llvm's global ctors and dtors list This patch introduces a new code object metadata field, ".kind" which is used to add support for init and fini kernels. HSAStreamer will use function attributes, "device-init" and "device-fini" to distinguish between init and fini kernels from the regular kernels and will emit metadata with ".kind" set to "init" and "fini" respectively. To reduce the number of init and fini kernels, the ctors and dtors present in the llvm's global.ctors and global.dtors lists are called from a single init and fini kernel respectively. Reviewed by: yaxunl Differential Revision: https://reviews.llvm.org/D105682 --- llvm/docs/AMDGPUUsage.rst | 31 +++++++ llvm/lib/Target/AMDGPU/AMDGPU.h | 4 + .../Target/AMDGPU/AMDGPUCtorDtorLowering.cpp | 91 +++++++++++++++++++ .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 4 + .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 + llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 + .../hsa-metadata-from-llvm-ctor-dtor-list.ll | 39 ++++++++ llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 5 + llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll | 21 +++++ .../AMDGPU/lower-multiple-ctor-dtor.ll | 31 +++++++ .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 + 11 files changed, 230 insertions(+) create mode 100644 llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp create mode 100644 llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index a1dc5548643070..bfbc77cfe53e3d 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -3142,6 +3142,37 @@ same *vendor-name*. a register allocator created spill location. 
+ ".kind" string The kind of the kernel + with the following + values: + + "normal" + Regular kernels. + + "init" + These kernels must be + invoked after loading + the containing code + object and must + complete before any + normal and fini + kernels in the same + code object are + invoked. + + "fini" + These kernels must be + invoked before + unloading the + containing code object + and after all init and + normal kernels in the + same code object have + been invoked and + completed. + + If omitted, "normal" is + assumed. =================================== ============== ========= ================================ .. diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index ca088e63e03c86..3d0a61877d46c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -114,6 +114,10 @@ ModulePass *createAMDGPUFixFunctionBitcastsPass(); void initializeAMDGPUFixFunctionBitcastsPass(PassRegistry &); extern char &AMDGPUFixFunctionBitcastsID; +ModulePass *createAMDGPUCtorDtorLoweringPass(); +void initializeAMDGPUCtorDtorLoweringPass(PassRegistry &); +extern char &AMDGPUCtorDtorLoweringID; + FunctionPass *createAMDGPULowerKernelArgumentsPass(); void initializeAMDGPULowerKernelArgumentsPass(PassRegistry &); extern char &AMDGPULowerKernelArgumentsID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp new file mode 100644 index 00000000000000..a94666ff53f8b8 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/AMDGPUCtorDtorLowering.cpp @@ -0,0 +1,91 @@ +//===-- AMDGPUCtorDtorLowering.cpp - Lower ctors and dtors -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This pass creates a unified init and fini kernel with the required metadata
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-lower-ctor-dtor"
+
+namespace {
+/// Module pass that wraps the functions listed in llvm.global_ctors and
+/// llvm.global_dtors in one "device-init" and one "device-fini" kernel.
+class AMDGPUCtorDtorLowering final : public ModulePass {
+  bool runOnModule(Module &M) override;
+
+public:
+  /// Create an internal, ret-only amdgpu_kernel named amdgcn.device.init
+  /// (\p IsCtor) or amdgcn.device.fini, tagged "device-init"/"device-fini".
+  Function *createInitOrFiniKernelFunction(Module &M, bool IsCtor) {
+    StringRef Name = IsCtor ? "amdgcn.device.init" : "amdgcn.device.fini";
+    Function *Kernel = Function::createWithDefaultAttr(
+        FunctionType::get(Type::getVoidTy(M.getContext()), false),
+        GlobalValue::InternalLinkage, 0, Name, &M);
+    ReturnInst::Create(M.getContext(),
+                       BasicBlock::Create(M.getContext(), "", Kernel));
+    Kernel->setCallingConv(CallingConv::AMDGPU_KERNEL);
+    Kernel->addFnAttr(IsCtor ? "device-init" : "device-fini");
+    return Kernel;
+  }
+
+  /// Add the init (\p IsCtor) or fini kernel, which calls each function in
+  /// the ctor/dtor list \p GV in list order.
+  /// \returns true if a kernel was added, false if \p GV is absent or empty.
+  bool createInitOrFiniKernel(Module &M, GlobalVariable *GV, bool IsCtor) {
+    if (!GV || !GV->hasInitializer())
+      return false;
+    // dyn_cast, not cast: an empty list's initializer is not a ConstantArray.
+    auto *GA = dyn_cast<ConstantArray>(GV->getInitializer());
+    if (!GA || GA->getNumOperands() == 0)
+      return false;
+    Function *Kernel = createInitOrFiniKernelFunction(M, IsCtor);
+    IRBuilder<> IRB(Kernel->getEntryBlock().getTerminator()); // Before ret.
+    for (Value *V : GA->operands()) {
+      auto *CS = cast<ConstantStruct>(V);
+      // Operand 1 is the function; entries that are not one are skipped.
+      if (Function *F = dyn_cast<Function>(CS->getOperand(1)))
+        IRB.CreateCall(M.getOrInsertFunction(F->getName(), IRB.getVoidTy()));
+    }
+    // The kernel has internal linkage; llvm.used keeps it from being dropped.
+    appendToUsed(M, {Kernel});
+    return true;
+  }
+
+  static char ID;
+  AMDGPUCtorDtorLowering() : ModulePass(ID) {}
+};
+} // End anonymous namespace
+
+char AMDGPUCtorDtorLowering::ID = 0;
+char &llvm::AMDGPUCtorDtorLoweringID = AMDGPUCtorDtorLowering::ID;
+INITIALIZE_PASS(AMDGPUCtorDtorLowering, DEBUG_TYPE,
+                "Lower ctors and dtors for AMDGPU", false, false)
+
+ModulePass *llvm::createAMDGPUCtorDtorLoweringPass() {
+  return new AMDGPUCtorDtorLowering();
+}
+
+bool AMDGPUCtorDtorLowering::runOnModule(Module &M) {
+  bool AddedInit = createInitOrFiniKernel(
+      M, M.getGlobalVariable("llvm.global_ctors"), /*IsCtor =*/true);
+  bool AddedFini = createInitOrFiniKernel(
+      M, M.getGlobalVariable("llvm.global_dtors"), /*IsCtor =*/false);
+  return AddedInit || AddedFini; // True iff the module was modified.
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 8eeda7b67b732b..fe07d487d7cb50 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -665,6 +665,10 @@ void MetadataStreamerV3::emitKernelAttrs(const Function &Func,
         Func.getFnAttribute("runtime-handle").getValueAsString().str(),
         /*Copy=*/true);
   }
+  if(Func.hasFnAttribute("device-init"))
+    Kern[".kind"] = Kern.getDocument()->getNode("init");
+  else if(Func.hasFnAttribute("device-fini"))
+    Kern[".kind"] = Kern.getDocument()->getNode("fini");
 }
 
 void MetadataStreamerV3::emitKernelArgs(const Function &Func,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ac25e2b28cf01f..045e1d54149079 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -349,6 +349,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeSIOptimizeVGPRLiveRangePass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
initializeAMDGPUFixFunctionBitcastsPass(*PR); + initializeAMDGPUCtorDtorLoweringPass(*PR); initializeAMDGPUAlwaysInlinePass(*PR); initializeAMDGPUAttributorPass(*PR); initializeAMDGPUAnnotateKernelFeaturesPass(*PR); @@ -1014,6 +1015,7 @@ void AMDGPUPassConfig::addIRPasses() { disablePass(&PatchableFunctionID); addPass(createAMDGPUPrintfRuntimeBinding()); + addPass(createAMDGPUCtorDtorLoweringPass()); // This must occur before inlining, as the inliner will not look through // bitcast calls. diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index fb2d1cd247c2ca..78f4f8fa874f15 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -53,6 +53,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUCodeGenPrepare.cpp AMDGPUExportClustering.cpp AMDGPUFixFunctionBitcasts.cpp + AMDGPUCtorDtorLowering.cpp AMDGPUFrameLowering.cpp AMDGPUHSAMetadataStreamer.cpp AMDGPUInstCombineIntrinsic.cpp diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll new file mode 100644 index 00000000000000..83ddad21bf17db --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-from-llvm-ctor-dtor-list.ll @@ -0,0 +1,39 @@ +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck 
--check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx802 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=3 -amdgpu-dump-hsa-metadata -amdgpu-verify-hsa-metadata -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s + +@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }] + +define internal void @foo() { + ret void + +} + +define internal void @foo.5() { + ret void + +} + +; CHECK: --- +; CHECK: .kind: init +; CHECK: .name: amdgcn.device.init + +@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }] + +define internal void @bar() { + ret void + +} + +define internal void @bar.5() { + ret void + +} + +; CHECK: .kind: fini +; CHECK: .name: amdgcn.device.fini + +; PARSER: AMDGPU HSA Metadata Parser Test: PASS diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll index 698d1161f250d7..73909dc918f0a7 100644 --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -31,6 +31,7 @@ ; GCN-O0-NEXT: AMDGPU Printf lowering ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Dominator Tree Construction +; GCN-O0-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O0-NEXT: Fix function bitcasts for AMDGPU ; GCN-O0-NEXT: FunctionPass Manager ; GCN-O0-NEXT: Early propagate attributes from kernels to functions @@ -165,6 +166,7 @@ ; GCN-O1-NEXT: AMDGPU Printf lowering ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Dominator Tree Construction +; GCN-O1-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O1-NEXT: Fix 
function bitcasts for AMDGPU ; GCN-O1-NEXT: FunctionPass Manager ; GCN-O1-NEXT: Early propagate attributes from kernels to functions @@ -415,6 +417,7 @@ ; GCN-O1-OPTS-NEXT: AMDGPU Printf lowering ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Dominator Tree Construction +; GCN-O1-OPTS-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O1-OPTS-NEXT: Fix function bitcasts for AMDGPU ; GCN-O1-OPTS-NEXT: FunctionPass Manager ; GCN-O1-OPTS-NEXT: Early propagate attributes from kernels to functions @@ -698,6 +701,7 @@ ; GCN-O2-NEXT: AMDGPU Printf lowering ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Dominator Tree Construction +; GCN-O2-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O2-NEXT: Fix function bitcasts for AMDGPU ; GCN-O2-NEXT: FunctionPass Manager ; GCN-O2-NEXT: Early propagate attributes from kernels to functions @@ -983,6 +987,7 @@ ; GCN-O3-NEXT: AMDGPU Printf lowering ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Dominator Tree Construction +; GCN-O3-NEXT: Lower ctors and dtors for AMDGPU ; GCN-O3-NEXT: Fix function bitcasts for AMDGPU ; GCN-O3-NEXT: FunctionPass Manager ; GCN-O3-NEXT: Early propagate attributes from kernels to functions diff --git a/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll new file mode 100644 index 00000000000000..1a247070b99fc5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-ctor-dtor.ll @@ -0,0 +1,21 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s + +@llvm.global_ctors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }] +@llvm.global_dtors = appending addrspace(1) global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }] + +; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0 +; CHECK-NEXT: call void @foo + +; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1 +; CHECK-NEXT: call void @bar + +define internal void @foo() 
{ + ret void +} + +define internal void @bar() { + ret void +} + +; CHECK: attributes #0 = { "device-init" } +; CHECK: attributes #1 = { "device-fini" } diff --git a/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll new file mode 100644 index 00000000000000..e23ea2329b92f8 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-multiple-ctor-dtor.ll @@ -0,0 +1,31 @@ +; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-ctor-dtor < %s | FileCheck %s + +@llvm.global_ctors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @foo, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @foo.5, i8* null }] +@llvm.global_dtors = appending addrspace(1) global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @bar, i8* null }, { i32, void ()*, i8* } { i32 1, void ()* @bar.5, i8* null }] + +; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.init() #0 +; CHECK-NEXT: call void @foo +; CHECK-NEXT: call void @foo.5 + +; CHECK-LABEL: amdgpu_kernel void @amdgcn.device.fini() #1 +; CHECK-NEXT: call void @bar +; CHECK-NEXT: call void @bar.5 + +define internal void @foo() { + ret void +} + +define internal void @bar() { + ret void +} + +define internal void @foo.5() { + ret void +} + +define internal void @bar.5() { + ret void +} + +; CHECK: attributes #0 = { "device-init" } +; CHECK: attributes #1 = { "device-fini" } diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index 9a831ba9b1f5ab..ed88d2d13e8e74 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -133,6 +133,7 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUCodeGenPrepare.cpp", "AMDGPUExportClustering.cpp", "AMDGPUFixFunctionBitcasts.cpp", + "AMDGPUCtorDtorLowering.cpp", "AMDGPUFrameLowering.cpp", "AMDGPUGlobalISelUtils.cpp", "AMDGPUHSAMetadataStreamer.cpp",