From c12fb2b8ec40610db470803051736e2137ffd3cd Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Wed, 8 Jun 2022 14:00:18 +0800 Subject: [PATCH 1/3] [aot] [llvm] Implemented FieldCacheData and refactored initialize_llvm_runtime_snodes() --- taichi/llvm/llvm_offline_cache.h | 31 +++++- taichi/llvm/llvm_program.cpp | 177 ++++++++++++++++++------------- taichi/llvm/llvm_program.h | 6 +- 3 files changed, 139 insertions(+), 75 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index c82837e42521e..fe9eca8f8ba96 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -42,7 +42,36 @@ struct LlvmOfflineCache { TI_IO_DEF(kernel_key, args, offloaded_task_list); }; - std::unordered_map kernels; + struct FieldCacheData { + struct SNodeCacheData { + int id; + int type; + size_t cell_size_bytes; + size_t chunk_size; + + TI_IO_DEF(id, type, cell_size_bytes, chunk_size); + }; + + int tree_id; + size_t root_size; + std::vector snode_metas; + + TI_IO_DEF(tree_id, root_size, snode_metas); + + // TODO(zhanlue) + // Serialize/Deserialize the llvm::Module from StructCompiler + // At runtime, make sure loaded Field-Modules and Kernel-Modules are linked + // altogether. 
+ }; + + // TODO(zhanlue): we need a better identifier for each FieldCacheData + // (SNodeTree) Given that snode_tree_id is not continuous, it is ridiculous to + // ask the users to remember each of the snode_tree_ids + // ** Find a way to name each SNodeTree ** + std::unordered_map fields; // key = snode_tree_id + + std::unordered_map + kernels; // key = kernel_name TI_IO_DEF(kernels); }; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 65bc1a75e12e1..e922a9ebe8026 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -158,95 +158,105 @@ LlvmProgramImpl::clone_struct_compiler_initial_context( return tlctx->clone_runtime_module(); } -void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, - StructCompiler *scomp, - uint64 *result_buffer) { - TaichiLLVMContext *tlctx = nullptr; - if (config->arch == Arch::cuda) { +void LlvmProgramImpl::initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer); +TaichiLLVMContext *tlctx = nullptr; +if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - tlctx = llvm_context_device_.get(); + tlctx = llvm_context_device_.get(); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif - } else { - tlctx = llvm_context_host_.get(); - } +} else { + tlctx = llvm_context_host_.get(); +} - auto *const runtime_jit = tlctx->runtime_jit_module; - // By the time this creator is called, "this" is already destroyed. - // Therefore it is necessary to capture members by values. 
- const auto snodes = scomp->snodes; - const int root_id = tree->root()->id; - - TI_TRACE("Allocating data structure of size {} bytes", scomp->root_size); - std::size_t rounded_size = - taichi::iroundup(scomp->root_size, taichi_page_size); - - Ptr root_buffer = snode_tree_buffer_manager_->allocate( - runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree->id(), - result_buffer); - if (config->arch == Arch::cuda) { +auto *const runtime_jit = tlctx -> runtime_jit_module; +// By the time this creator is called, "this" is already destroyed. +// Therefore it is necessary to capture members by values. +size_t root_size = field_cache_data.root_size; +const auto snode_metas = field_cache_data.snode_metas; +const int root_id = field_cache_data.tree_id; + +TI_TRACE("Allocating data structure of size {} bytes", root_size); +std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); + +Ptr root_buffer = snode_tree_buffer_manager_->allocate(runtime_jit, + llvm_runtime_, + rounded_size, + taichi_page_size, + root_id, + result_buffer); +if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); + CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif - } else { - std::memset(root_buffer, 0, rounded_size); - } +} else { + std::memset(root_buffer, 0, rounded_size); +} - DeviceAllocation alloc{kDeviceNullAllocation}; +DeviceAllocation alloc{kDeviceNullAllocation}; - if (config->arch == Arch::cuda) { +if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - alloc = cuda_device()->import_memory(root_buffer, rounded_size); + alloc = cuda_device()->import_memory(root_buffer, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif - } else { - alloc = cpu_device()->import_memory(root_buffer, rounded_size); - } +} else { + alloc = cpu_device()->import_memory(root_buffer, rounded_size); +} - 
snode_tree_allocs_[tree->id()] = alloc; +snode_tree_allocs_[tree->id()] = alloc; - bool all_dense = config->demote_dense_struct_fors; - for (int i = 0; i < (int)snodes.size(); i++) { - if (snodes[i]->type != SNodeType::dense && - snodes[i]->type != SNodeType::place && - snodes[i]->type != SNodeType::root) { - all_dense = false; - break; - } +bool all_dense = config->demote_dense_struct_fors; +for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i]->type != SNodeType::dense && + snode_metas[i]->type != SNodeType::place && + snode_metas[i]->type != SNodeType::root) { + all_dense = false; + break; } +} - runtime_jit->call( - "runtime_initialize_snodes", llvm_runtime_, scomp->root_size, root_id, - (int)snodes.size(), tree->id(), rounded_size, root_buffer, all_dense); - - for (int i = 0; i < (int)snodes.size(); i++) { - if (is_gc_able(snodes[i]->type)) { - const auto snode_id = snodes[i]->id; - std::size_t node_size; - auto element_size = snodes[i]->cell_size_bytes; - if (snodes[i]->type == SNodeType::pointer) { - // pointer. Allocators are for single elements - node_size = element_size; - } else { - // dynamic. 
Allocators are for the chunks - node_size = sizeof(void *) + element_size * snodes[i]->chunk_size; - } - TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, - node_size); - auto rt = llvm_runtime_; - runtime_jit->call( - "runtime_NodeAllocator_initialize", rt, snode_id, node_size); - TI_TRACE("Allocating ambient element for snode {} (node size {})", - snode_id, node_size); - runtime_jit->call("runtime_allocate_ambient", rt, snode_id, - node_size); +runtime_jit->call( + "runtime_initialize_snodes", + llvm_runtime_, + root_size, + root_id, + (int)snode_metas.size(), + root_id, + rounded_size, + root_buffer, + all_dense); + +for (size_t i = 0; i < snode_metas.size(); i++) { + if (is_gc_able(snode_metas[i]->type)) { + const auto snode_id = snode_metas[i].id; + std::size_t node_size; + auto element_size = snode_metas[i].cell_size_bytes; + if (snode_metas[i].type == SNodeType::pointer) { + // pointer. Allocators are for single elements + node_size = element_size; + } else { + // dynamic. 
Allocators are for the chunks + node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; } + TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, + node_size); + auto rt = llvm_runtime_; + runtime_jit->call( + "runtime_NodeAllocator_initialize", rt, snode_id, node_size); + TI_TRACE("Allocating ambient element for snode {} (node size {})", snode_id, + node_size); + runtime_jit->call("runtime_allocate_ambient", rt, snode_id, + node_size); } } +} std::unique_ptr LlvmProgramImpl::compile_snode_tree_types_impl( SNodeTree *tree) { @@ -275,10 +285,35 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { compile_snode_tree_types_impl(tree); } +static LlvmOfflineCache::FieldCacheData construct_filed_cache_data( + const SNodeTree &tree, + const StructCompiler &struct_compiler) { + TI_ASSERT(tree.id == tree.root()->id); + + LlvmOfflineCache::FieldCacheData ret; + ret.tree_id = tree.id; + ret.root_size = struct_compiler.root_size; + + const auto &snodes = struct_compiler.snodes; + for (size_t i = 0; i < snodes.size(); i++) { + LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data; + snode_cache_data.id = snodes[i]->id; + snode_cache_data.type = snodes[i]->type; + snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes; + snode_cache_data.chunk_size = snodes[i]->chunk_size; + + ret.snode_metas.emplace_back(std::move(snode_cache_data)); + } + + return ret; +} + void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) { auto struct_compiler = compile_snode_tree_types_impl(tree); - initialize_llvm_runtime_snodes(tree, struct_compiler.get(), result_buffer); + + auto field_cache_data = construct_filed_cache_data(*tree, *struct_compiler); + initialize_llvm_runtime_snodes(field_cache_data, result_buffer); } uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 *result_buffer) { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index c9029bbcd85f0..69378ee660bf1 100644 
--- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -132,9 +132,9 @@ class LlvmProgramImpl : public ProgramImpl { /** * Initializes the SNodes for LLVM based backends. */ - void initialize_llvm_runtime_snodes(const SNodeTree *tree, - StructCompiler *scomp, - uint64 *result_buffer); + void initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer); uint64 fetch_result_uint64(int i, uint64 *result_buffer); From e3f1ab8ac70798f8b14e5e4750c9ea43c3544b74 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Wed, 8 Jun 2022 14:15:47 +0800 Subject: [PATCH 2/3] Addressed compilation errors --- taichi/llvm/llvm_offline_cache.h | 3 +- taichi/llvm/llvm_program.cpp | 152 +++++++++++++++---------------- 2 files changed, 73 insertions(+), 82 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index fe9eca8f8ba96..91386c2bffb31 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -45,7 +45,7 @@ struct LlvmOfflineCache { struct FieldCacheData { struct SNodeCacheData { int id; - int type; + SNodeType type; size_t cell_size_bytes; size_t chunk_size; @@ -53,6 +53,7 @@ struct LlvmOfflineCache { }; int tree_id; + int root_id; size_t root_size; std::vector snode_metas; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index e922a9ebe8026..eea60dad165f7 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -160,103 +160,94 @@ LlvmProgramImpl::clone_struct_compiler_initial_context( void LlvmProgramImpl::initialize_llvm_runtime_snodes( const LlvmOfflineCache::FieldCacheData &field_cache_data, - uint64 *result_buffer); -TaichiLLVMContext *tlctx = nullptr; -if (config->arch == Arch::cuda) { + uint64 *result_buffer) { + TaichiLLVMContext *tlctx = nullptr; + if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - tlctx = llvm_context_device_.get(); + tlctx = llvm_context_device_.get(); #else - 
TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif -} else { - tlctx = llvm_context_host_.get(); -} + } else { + tlctx = llvm_context_host_.get(); + } -auto *const runtime_jit = tlctx -> runtime_jit_module; -// By the time this creator is called, "this" is already destroyed. -// Therefore it is necessary to capture members by values. -size_t root_size = field_cache_data.root_size; -const auto snode_metas = field_cache_data.snode_metas; -const int root_id = field_cache_data.tree_id; - -TI_TRACE("Allocating data structure of size {} bytes", root_size); -std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); - -Ptr root_buffer = snode_tree_buffer_manager_->allocate(runtime_jit, - llvm_runtime_, - rounded_size, - taichi_page_size, - root_id, - result_buffer); -if (config->arch == Arch::cuda) { + auto *const runtime_jit = tlctx->runtime_jit_module; + // By the time this creator is called, "this" is already destroyed. + // Therefore it is necessary to capture members by values. 
+ size_t root_size = field_cache_data.root_size; + const auto snode_metas = field_cache_data.snode_metas; + const int tree_id = field_cache_data.tree_id; + const int root_id = field_cache_data.root_id; + + TI_TRACE("Allocating data structure of size {} bytes", root_size); + std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); + + Ptr root_buffer = snode_tree_buffer_manager_->allocate( + runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id, + result_buffer); + if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); + CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif -} else { - std::memset(root_buffer, 0, rounded_size); -} + } else { + std::memset(root_buffer, 0, rounded_size); + } -DeviceAllocation alloc{kDeviceNullAllocation}; + DeviceAllocation alloc{kDeviceNullAllocation}; -if (config->arch == Arch::cuda) { + if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - alloc = cuda_device()->import_memory(root_buffer, rounded_size); + alloc = cuda_device()->import_memory(root_buffer, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif -} else { - alloc = cpu_device()->import_memory(root_buffer, rounded_size); -} + } else { + alloc = cpu_device()->import_memory(root_buffer, rounded_size); + } -snode_tree_allocs_[tree->id()] = alloc; + snode_tree_allocs_[tree_id] = alloc; -bool all_dense = config->demote_dense_struct_fors; -for (size_t i = 0; i < snode_metas.size(); i++) { - if (snode_metas[i]->type != SNodeType::dense && - snode_metas[i]->type != SNodeType::place && - snode_metas[i]->type != SNodeType::root) { - all_dense = false; - break; + bool all_dense = config->demote_dense_struct_fors; + for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i].type != SNodeType::dense && + snode_metas[i].type != SNodeType::place && + snode_metas[i].type != 
SNodeType::root) { + all_dense = false; + break; + } } -} -runtime_jit->call( - "runtime_initialize_snodes", - llvm_runtime_, - root_size, - root_id, - (int)snode_metas.size(), - root_id, - rounded_size, - root_buffer, - all_dense); - -for (size_t i = 0; i < snode_metas.size(); i++) { - if (is_gc_able(snode_metas[i]->type)) { - const auto snode_id = snode_metas[i].id; - std::size_t node_size; - auto element_size = snode_metas[i].cell_size_bytes; - if (snode_metas[i].type == SNodeType::pointer) { - // pointer. Allocators are for single elements - node_size = element_size; - } else { - // dynamic. Allocators are for the chunks - node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; + runtime_jit->call( + "runtime_initialize_snodes", llvm_runtime_, root_size, root_id, + (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense); + + for (size_t i = 0; i < snode_metas.size(); i++) { + if (is_gc_able(snode_metas[i].type)) { + const auto snode_id = snode_metas[i].id; + std::size_t node_size; + auto element_size = snode_metas[i].cell_size_bytes; + if (snode_metas[i].type == SNodeType::pointer) { + // pointer. Allocators are for single elements + node_size = element_size; + } else { + // dynamic. 
Allocators are for the chunks + node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; + } + TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, + node_size); + auto rt = llvm_runtime_; + runtime_jit->call( + "runtime_NodeAllocator_initialize", rt, snode_id, node_size); + TI_TRACE("Allocating ambient element for snode {} (node size {})", + snode_id, node_size); + runtime_jit->call("runtime_allocate_ambient", rt, snode_id, + node_size); } - TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, - node_size); - auto rt = llvm_runtime_; - runtime_jit->call( - "runtime_NodeAllocator_initialize", rt, snode_id, node_size); - TI_TRACE("Allocating ambient element for snode {} (node size {})", snode_id, - node_size); - runtime_jit->call("runtime_allocate_ambient", rt, snode_id, - node_size); } } -} std::unique_ptr LlvmProgramImpl::compile_snode_tree_types_impl( SNodeTree *tree) { @@ -288,10 +279,9 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { static LlvmOfflineCache::FieldCacheData construct_filed_cache_data( const SNodeTree &tree, const StructCompiler &struct_compiler) { - TI_ASSERT(tree.id == tree.root()->id); - LlvmOfflineCache::FieldCacheData ret; - ret.tree_id = tree.id; + ret.tree_id = tree.id(); + ret.root_id = tree.root()->id; ret.root_size = struct_compiler.root_size; const auto &snodes = struct_compiler.snodes; From cf72ff8c7bd7eab6203b51bdb0107d7a69453d73 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Wed, 8 Jun 2022 16:06:02 +0800 Subject: [PATCH 3/3] [aot] [llvm] LLVM AOT Field #1: Adjust serialization/deserialization logics for FieldCacheData --- taichi/llvm/llvm_offline_cache.cpp | 14 +++++++++++++ taichi/llvm/llvm_offline_cache.h | 32 ++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index 542f02aebc2c7..92994fe9aa2f0 100644 --- a/taichi/llvm/llvm_offline_cache.cpp 
+++ b/taichi/llvm/llvm_offline_cache.cpp @@ -51,6 +51,20 @@ LlvmOfflineCacheFileReader::LlvmOfflineCacheFileReader( : path_(path), data_(std::move(data)), format_(format) { } +bool LlvmOfflineCacheFileReader::get_field_cache( + LlvmOfflineCache::FieldCacheData &res, + int snode_tree_id) { + auto itr = data_.fields.find(snode_tree_id); + if (itr == data_.fields.end()) { + TI_DEBUG("Cannot find field with snode_tree_id={}", snode_tree_id); + return false; + } + + const auto &loaded_field_cache = itr->second; + res = loaded_field_cache; // copy assign + return true; +} + bool LlvmOfflineCacheFileReader::get_kernel_cache( LlvmOfflineCache::KernelCacheData &res, const std::string &key, diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index 91386c2bffb31..1193365fe30e3 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -59,10 +59,31 @@ struct LlvmOfflineCache { TI_IO_DEF(tree_id, root_size, snode_metas); - // TODO(zhanlue) - // Serialize/Deserialize the llvm::Module from StructCompiler - // At runtime, make sure loaded Field-Modules and Kernel-Modules are linked - // altogether. + // TODO(zhanlue): refactor llvm::Modules + // + // struct_module will eventually get cloned into each kernel_module, + // so there's no need to serialize it here. + // + // We have three different types of llvm::Module + // 1. runtime_module: contains runtime functions. + // 2. struct_module: contains compiled SNodeTree in llvm::Type. + // 3. kernel_modules: contains compiled kernel codes. + // + // The way those modules work rely on a recursive clone mechanism: + // runtime_module = load("runtime.bc") + // struct_module = clone(runtime_module) + compiled-SNodeTree + // kernel_module = clone(struct_module) + compiled-Kernel + // + // As a result, every kernel_module contains a copy of struct_module + + // runtime_module. 
+ // + // This recursive clone mechanism is super fragile, + // which potentially causes inconsistency between modules if not handled + // properly. + // + // Let's turn to use llvm::link to bind the modules, + // and make runtime_module, struct_module, kernel_module independent of each + // other }; // TODO(zhanlue): we need a better identifier for each FieldCacheData @@ -83,6 +104,9 @@ class LlvmOfflineCacheFileReader { const std::string &key, llvm::LLVMContext &llvm_ctx); + bool get_field_cache(LlvmOfflineCache::FieldCacheData &res, + int snode_tree_id); + static std::unique_ptr make( const std::string &path, LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL);