From e55caa070beda48ee923c2c474bb365208a95018 Mon Sep 17 00:00:00 2001
From: jim19930609
Date: Tue, 31 May 2022 13:43:08 +0800
Subject: [PATCH] [aot] [CUDA-AOT PR #0] Refactored compile_module_to_executable() to CUDAModuleToFunctionConverter

---
 taichi/backends/cuda/codegen_cuda.cpp | 252 ++++++++++++++------------
 taichi/backends/cuda/codegen_cuda.h   |  18 ++
 taichi/codegen/codegen_llvm.h         |   9 +-
 3 files changed, 160 insertions(+), 119 deletions(-)

diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp
index ad72bf95f1413b..c9dcb8f15dc80f 100644
--- a/taichi/backends/cuda/codegen_cuda.cpp
+++ b/taichi/backends/cuda/codegen_cuda.cpp
@@ -15,6 +15,7 @@
 #include "taichi/backends/cuda/cuda_context.h"
 #include "taichi/codegen/codegen_llvm.h"
 #include "taichi/llvm/llvm_program.h"
+#include "taichi/util/action_recorder.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -37,123 +38,12 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
   FunctionType gen() override {
     auto compiled_res = run_compilation();
 
-    return compile_module_to_executable(this->kernel, std::move(compiled_res));
-  }
-
-  static FunctionType compile_module_to_executable(
-      Kernel *kernel,
-      CompiledData &&compiled_data) {
-#ifdef TI_WITH_CUDA
-    auto *tlctx =
-        kernel->program->get_llvm_program_impl()->get_llvm_context(Arch::cuda);
-    for (auto &task : compiled_data.offloaded_tasks) {
-      llvm::Function *func = compiled_data.llvm_module->getFunction(task.name);
-      TI_ASSERT(func);
-      tlctx->mark_function_as_cuda_kernel(func, task.block_dim);
-    }
-
-    auto jit = tlctx->jit.get();
-    auto cuda_module = jit->add_module(std::move(compiled_data.llvm_module),
-                                       kernel->program->config.gpu_max_reg);
-
-    return [cuda_module, kernel,
-            offloaded_tasks =
-                compiled_data.offloaded_tasks](RuntimeContext &context) {
-      CUDAContext::get_instance().make_current();
-      auto args = kernel->args;
-      std::vector<void *> arg_buffers(args.size(), nullptr);
-      std::vector<void *> device_buffers(args.size(), nullptr);
-
-      // We could also use kernel->make_launch_context() to create
-      // |ctx_builder|, but that implies the usage of Program's context. For the
-      // sake of decoupling, let's not do that and explicitly set the context we
-      // want to modify.
-      Kernel::LaunchContextBuilder ctx_builder(kernel, &context);
-      bool transferred = false;
-      for (int i = 0; i < (int)args.size(); i++) {
-        if (args[i].is_array) {
-          const auto arr_sz = context.array_runtime_sizes[i];
-          if (arr_sz == 0) {
-            continue;
-          }
-          arg_buffers[i] = context.get_arg<void *>(i);
-          if (!context.is_device_allocations[i]) {
-            // Note: both numpy and PyTorch support arrays/tensors with zeros
-            // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes
-            // `arr_sz` zero.
-            unsigned int attr_val = 0;
-            uint32_t ret_code =
-                CUDADriver::get_instance().mem_get_attribute.call(
-                    &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
-                    (void *)arg_buffers[i]);
-            if (ret_code != CUDA_SUCCESS || attr_val != CU_MEMORYTYPE_DEVICE) {
-              // Copy to device buffer if arg is on host
-              // - ret_code != CUDA_SUCCESS:
-              //   arg_buffers[i] is not on device
-              // - attr_val != CU_MEMORYTYPE_DEVICE:
-              //   Cuda driver is aware of arg_buffers[i] but it might be on
-              //   host.
-              // See CUDA driver API `cuPointerGetAttribute` for more details.
-              transferred = true;
-              CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz);
-              CUDADriver::get_instance().memcpy_host_to_device(
-                  (void *)device_buffers[i], arg_buffers[i], arr_sz);
-            } else {
-              device_buffers[i] = arg_buffers[i];
-            }
-            // device_buffers[i] saves a raw ptr on CUDA device.
-            ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i],
-                                               arr_sz,
-                                               /*is_device_allocation=*/false);
-
-          } else if (arr_sz > 0) {
-            // arg_buffers[i] is a DeviceAllocation*
-            // TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since
-            // it's shared by cpu and cuda.
-            DeviceAllocation *ptr =
-                static_cast<DeviceAllocation *>(arg_buffers[i]);
-            device_buffers[i] = kernel->program->get_llvm_program_impl()
-                                    ->get_ndarray_alloc_info_ptr(*ptr);
-            // We compare arg_buffers[i] and device_buffers[i] later to check
-            // if transfer happened.
-            // TODO: this logic can be improved but I'll leave it to a followup
-            // PR.
-            arg_buffers[i] = device_buffers[i];
-
-            // device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i]
-            ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i],
-                                               arr_sz,
-                                               /*is_device_allocation=*/false);
-          }
-        }
-      }
-      if (transferred) {
-        CUDADriver::get_instance().stream_synchronize(nullptr);
-      }
+    CUDAModuleToFunctionConverter converter{
+        tlctx, this->kernel->program->get_llvm_program_impl()};
 
-      for (auto task : offloaded_tasks) {
-        TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
-                 task.block_dim);
-        cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0,
-                            {&context});
-      }
-      // copy data back to host
-      if (transferred) {
-        CUDADriver::get_instance().stream_synchronize(nullptr);
-        for (int i = 0; i < (int)args.size(); i++) {
-          if (device_buffers[i] != arg_buffers[i]) {
-            CUDADriver::get_instance().memcpy_device_to_host(
-                arg_buffers[i], (void *)device_buffers[i],
-                context.array_runtime_sizes[i]);
-            CUDADriver::get_instance().mem_free((void *)device_buffers[i]);
-          }
-        }
-      }
-    };
-#else
-    TI_ERROR("No CUDA");
-    return nullptr;
-#endif  // TI_WITH_CUDA
+    return converter.convert(this->kernel, std::move(compiled_res.llvm_module),
+                             std::move(compiled_res.offloaded_tasks));
   }
 
   llvm::Value *create_print(std::string tag,
@@ -934,9 +824,141 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
   }
 };
 
+static void set_arg_external_array(RuntimeContext *ctx,
+                                   const std::string &kernel_name,
+                                   int arg_id,
+                                   uintptr_t ptr,
+                                   uint64 size,
+                                   bool is_device_allocation) {
+  ActionRecorder::get_instance().record(
+      "set_kernel_arg_ext_ptr",
+      {ActionArg("kernel_name", kernel_name), ActionArg("arg_id", arg_id),
+       ActionArg("address", fmt::format("0x{:x}", ptr)),
+       ActionArg("array_size_in_bytes", (int64)size)});
+
+  ctx->set_arg(arg_id, ptr);
+  ctx->set_array_runtime_size(arg_id, size);
+  ctx->set_array_is_device_allocation(arg_id, is_device_allocation);
+}
+
 FunctionType CodeGenCUDA::codegen() {
   TI_AUTO_PROF
   return CodeGenLLVMCUDA(kernel, ir).gen();
 }
 
+FunctionType CUDAModuleToFunctionConverter::convert(
+    const std::string &kernel_name,
+    const std::vector<LlvmLaunchArgInfo> &args,
+    std::unique_ptr<llvm::Module> mod,
+    std::vector<OffloadedTask> &&tasks) const {
+#ifdef TI_WITH_CUDA
+  for (const auto &task : tasks) {
+    llvm::Function *func = mod->getFunction(task.name);
+    TI_ASSERT(func);
+    tlctx_->mark_function_as_cuda_kernel(func, task.block_dim);
+  }
+
+  auto jit = tlctx_->jit.get();
+  auto cuda_module =
+      jit->add_module(std::move(mod), program_->config->gpu_max_reg);
+
+  return [cuda_module, kernel_name, args,
+          offloaded_tasks = tasks](RuntimeContext &context) {
+    CUDAContext::get_instance().make_current();
+    std::vector<void *> arg_buffers(args.size(), nullptr);
+    std::vector<void *> device_buffers(args.size(), nullptr);
+
+    bool transferred = false;
+    for (int i = 0; i < (int)args.size(); i++) {
+      if (args[i].is_array) {
+        const auto arr_sz = context.array_runtime_sizes[i];
+        if (arr_sz == 0) {
+          continue;
+        }
+        arg_buffers[i] = context.get_arg<void *>(i);
+        if (!context.is_device_allocations[i]) {
+          // Note: both numpy and PyTorch support arrays/tensors with zeros
+          // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes
+          // `arr_sz` zero.
+          unsigned int attr_val = 0;
+          uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call(
+              &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
+              (void *)arg_buffers[i]);
+
+          if (ret_code != CUDA_SUCCESS || attr_val != CU_MEMORYTYPE_DEVICE) {
+            // Copy to device buffer if arg is on host
+            // - ret_code != CUDA_SUCCESS:
+            //   arg_buffers[i] is not on device
+            // - attr_val != CU_MEMORYTYPE_DEVICE:
+            //   Cuda driver is aware of arg_buffers[i] but it might be on
+            //   host.
+            // See CUDA driver API `cuPointerGetAttribute` for more details.
+            transferred = true;
+            CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz);
+            CUDADriver::get_instance().memcpy_host_to_device(
+                (void *)device_buffers[i], arg_buffers[i], arr_sz);
+          } else {
+            device_buffers[i] = arg_buffers[i];
+          }
+          // device_buffers[i] saves a raw ptr on CUDA device.
+          set_arg_external_array(&context, kernel_name, i,
+                                 (uint64)device_buffers[i], arr_sz,
+                                 /*is_device_allocation=*/false);
+
+        } else if (arr_sz > 0) {
+          // arg_buffers[i] is a DeviceAllocation*
+          // TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since
+          // it's shared by cpu and cuda.
+          DeviceAllocation *ptr =
+              static_cast<DeviceAllocation *>(arg_buffers[i]);
+          device_buffers[i] = program_->get_ndarray_alloc_info_ptr(*ptr);
+          // We compare arg_buffers[i] and device_buffers[i] later to check
+          // if transfer happened.
+          // TODO: this logic can be improved but I'll leave it to a followup
+          // PR.
+          arg_buffers[i] = device_buffers[i];
+
+          // device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i]
+          set_arg_external_array(&context, kernel_name, i,
+                                 (uint64)device_buffers[i], arr_sz,
+                                 /*is_device_allocation=*/false);
+        }
+      }
+    }
+    if (transferred) {
+      CUDADriver::get_instance().stream_synchronize(nullptr);
+    }
+
+    for (auto task : offloaded_tasks) {
+      TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
+               task.block_dim);
+      cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0,
+                          {&context});
+    }
+    // copy data back to host
+    if (transferred) {
+      CUDADriver::get_instance().stream_synchronize(nullptr);
+      for (int i = 0; i < (int)args.size(); i++) {
+        if (device_buffers[i] != arg_buffers[i]) {
+          CUDADriver::get_instance().memcpy_device_to_host(
+              arg_buffers[i], (void *)device_buffers[i],
+              context.array_runtime_sizes[i]);
+          CUDADriver::get_instance().mem_free((void *)device_buffers[i]);
+        }
+      }
+    }
+  };
+#else
+  TI_ERROR("No CUDA");
+  return nullptr;
+#endif  // TI_WITH_CUDA
+}
+
+FunctionType CUDAModuleToFunctionConverter::convert(
+    const Kernel *kernel,
+    std::unique_ptr<llvm::Module> mod,
+    std::vector<OffloadedTask> &&tasks) const {
+  return convert(kernel->name, infer_launch_args(kernel), std::move(mod),
+                 std::move(tasks));
+}
+
 TLANG_NAMESPACE_END
diff --git a/taichi/backends/cuda/codegen_cuda.h b/taichi/backends/cuda/codegen_cuda.h
index 0d4eec87e4b262..29b5434e75de7f 100644
--- a/taichi/backends/cuda/codegen_cuda.h
+++ b/taichi/backends/cuda/codegen_cuda.h
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "taichi/codegen/codegen.h"
+#include "taichi/codegen/codegen_llvm.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -15,4 +16,21 @@ class CodeGenCUDA : public KernelCodeGen {
   FunctionType codegen() override;
 };
 
+class CUDAModuleToFunctionConverter : public ModuleToFunctionConverter {
+ public:
+  explicit CUDAModuleToFunctionConverter(TaichiLLVMContext *tlctx,
+                                         LlvmProgramImpl *program)
+      : ModuleToFunctionConverter(tlctx, program) {
+  }
+
+  FunctionType convert(const std::string &kernel_name,
+                       const std::vector<LlvmLaunchArgInfo> &args,
+                       std::unique_ptr<llvm::Module> mod,
+                       std::vector<OffloadedTask> &&tasks) const override;
+
+  FunctionType convert(const Kernel *kernel,
+                       std::unique_ptr<llvm::Module> mod,
+                       std::vector<OffloadedTask> &&tasks) const override;
+};
+
 TLANG_NAMESPACE_END
diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h
index bb83a3151b6368..583acd5e80245c 100644
--- a/taichi/codegen/codegen_llvm.h
+++ b/taichi/codegen/codegen_llvm.h
@@ -413,7 +413,8 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder {
 
 class LlvmProgramImpl;
 
-// This is for CPU, we need one for CUDA (AMDGPU) as well.
+// TODO: Make ModuleToFunctionConverter abstract,
+//       Move CPU implementation to "taichi/backend/cpu/"
 class ModuleToFunctionConverter {
  public:
   explicit ModuleToFunctionConverter(TaichiLLVMContext *tlctx,
@@ -426,9 +427,9 @@ class ModuleToFunctionConverter {
                        std::unique_ptr<llvm::Module> mod,
                        std::vector<OffloadedTask> &&tasks) const;
 
-  FunctionType convert(const Kernel *kernel,
-                       std::unique_ptr<llvm::Module> mod,
-                       std::vector<OffloadedTask> &&tasks) const;
+  virtual FunctionType convert(const Kernel *kernel,
+                               std::unique_ptr<llvm::Module> mod,
+                               std::vector<OffloadedTask> &&tasks) const;
 
  protected:
   TaichiLLVMContext *tlctx_{nullptr};
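
Note on the host/device dispatch inside CUDAModuleToFunctionConverter::convert(): the converter decides whether an ndarray argument needs a staging copy by asking the CUDA driver for the pointer's memory type (CU_POINTER_ATTRIBUTE_MEMORY_TYPE). For ordinary pageable host memory the driver has never seen, the query fails (typically CUDA_ERROR_INVALID_VALUE); for known pointers it reports the memory type, so "not CUDA_SUCCESS, or not CU_MEMORYTYPE_DEVICE" means "copy to a device buffer first". A minimal standalone sketch of that check against the raw CUDA driver API (the helper name is illustrative and not part of this patch; the patch itself routes the call through Taichi's CUDADriver wrapper, mem_get_attribute.call):

    #include <cuda.h>

    // Returns true when `ptr` should be staged through a device buffer before
    // launch: either the driver does not recognize the pointer (plain pageable
    // host memory) or it reports a non-device memory type.
    static bool needs_host_to_device_copy(void *ptr) {
      unsigned int mem_type = 0;
      CUresult res =
          cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE,
                                reinterpret_cast<CUdeviceptr>(ptr));
      return res != CUDA_SUCCESS || mem_type != CU_MEMORYTYPE_DEVICE;
    }

The decision logic is the same as in the lambda above; only the driver entry point differs.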