diff --git a/taichi/aot/module_builder.cpp b/taichi/aot/module_builder.cpp
index d5f7668a0a430..1b1b71fb58c8f 100644
--- a/taichi/aot/module_builder.cpp
+++ b/taichi/aot/module_builder.cpp
@@ -6,7 +6,7 @@ namespace lang {
 
 void AotModuleBuilder::add(const std::string &identifier, Kernel *kernel) {
   if (!kernel->lowered() && Kernel::supports_lowering(kernel->arch)) {
-    kernel->lower();
+    kernel->lower(/*to_executable=*/!arch_uses_llvm(kernel->arch));
   }
   add_per_backend(identifier, kernel);
 }
diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp
index 8faa9bec9d974..0c945d9a36bb9 100644
--- a/taichi/codegen/codegen.cpp
+++ b/taichi/codegen/codegen.cpp
@@ -87,6 +87,8 @@ void KernelCodeGen::cache_module(const std::string &kernel_key,
 }
 
 std::vector<LLVMCompiledData> KernelCodeGen::compile_kernel_to_module() {
+  auto *llvm_prog = get_llvm_program(prog);
+  auto *tlctx = llvm_prog->get_llvm_context(kernel->arch);
   auto &config = prog->config;
   std::string kernel_key = get_hashed_offline_cache_key(&config, kernel);
   kernel->set_kernel_key_for_cache(kernel_key);
@@ -98,6 +100,7 @@ std::vector<LLVMCompiledData> KernelCodeGen::compile_kernel_to_module() {
       TI_DEBUG("Create kernel '{}' from cache (key='{}')", kernel->get_name(),
                kernel_key);
       cache_module(kernel_key, res);
+      TI_ASSERT(res.size() == 1);
       return res;
     }
   }
@@ -110,17 +113,17 @@ std::vector<LLVMCompiledData> KernelCodeGen::compile_kernel_to_module() {
   TI_ASSERT(block);
 
   auto &offloads = block->statements;
-  std::vector<LLVMCompiledData> data(offloads.size());
+  std::vector<std::unique_ptr<LLVMCompiledData>> data(offloads.size());
   using TaskFunc = int32 (*)(void *);
   std::vector<TaskFunc> task_funcs(offloads.size());
   for (int i = 0; i < offloads.size(); i++) {
     auto compile_func = [&, i] {
+      tlctx->fetch_this_thread_struct_module();
       auto offload = irpass::analysis::clone(offloads[i].get(),
                                              offloads[i]->get_kernel());
       irpass::re_id(offload.get());
      auto new_data =
           this->compile_task(nullptr, offload->as<OffloadedStmt>());
-      data[i].tasks = std::move(new_data.tasks);
-      data[i].module = std::move(new_data.module);
+      data[i] = std::make_unique<LLVMCompiledData>(std::move(new_data));
     };
     if (kernel->is_evaluator) {
       compile_func();
@@ -131,11 +134,15 @@ std::vector<LLVMCompiledData> KernelCodeGen::compile_kernel_to_module() {
   if (!kernel->is_evaluator) {
     worker.flush();
   }
+  auto linked = tlctx->link_compiled_tasks(std::move(data));
+  std::vector<LLVMCompiledData> linked_data;
+  linked_data.push_back(std::move(*linked));
+
   if (!kernel->is_evaluator) {
     TI_DEBUG("Cache kernel '{}' (key='{}')", kernel->get_name(), kernel_key);
-    cache_module(kernel_key, data);
+    cache_module(kernel_key, linked_data);
   }
-  return data;
+  return linked_data;
 }
 
 ModuleToFunctionConverter::ModuleToFunctionConverter(
diff --git a/taichi/codegen/codegen.h b/taichi/codegen/codegen.h
index f2c41ccb0870f..53233ac45d734 100644
--- a/taichi/codegen/codegen.h
+++ b/taichi/codegen/codegen.h
@@ -33,7 +33,7 @@ class KernelCodeGen {
   }
 
 #ifdef TI_WITH_LLVM
-  std::vector<LLVMCompiledData> compile_kernel_to_module();
+  virtual std::vector<LLVMCompiledData> compile_kernel_to_module();
 
   virtual LLVMCompiledData compile_task(
       std::unique_ptr<llvm::Module> &&module = nullptr,
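// ---------------------------------------------------------------------------
// Sketch (editor's addition, not part of the patch): the contract that
// compile_kernel_to_module() establishes after this change. All identifiers
// come from the diff above; `codegen` stands for any concrete KernelCodeGen.
//
//   std::vector<LLVMCompiledData> data = codegen.compile_kernel_to_module();
//   TI_ASSERT(data.size() == 1);  // per-task modules are linked into one
//   auto &linked = data[0];       // owns the module and every task entry
//   for (auto &task : linked.tasks) {
//     // all offloaded task symbols now live in this single llvm::Module
//   }
//
// Because the cache stores `linked_data`, a cache hit also yields exactly one
// entry, which is what the new `TI_ASSERT(res.size() == 1)` checks.
// ---------------------------------------------------------------------------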
diff --git a/taichi/codegen/cpu/codegen_cpu.cpp b/taichi/codegen/cpu/codegen_cpu.cpp
index e1d1b72143599..ca4742bf51e6a 100644
--- a/taichi/codegen/cpu/codegen_cpu.cpp
+++ b/taichi/codegen/cpu/codegen_cpu.cpp
@@ -235,16 +235,14 @@ FunctionType CPUModuleToFunctionConverter::convert(
     const std::string &kernel_name,
     const std::vector<LlvmLaunchArgInfo> &args,
     std::vector<LLVMCompiledData> &&data) const {
-  for (auto &datum : data) {
-    tlctx_->add_module(std::move(datum.module));
-  }
-
+  TI_AUTO_PROF;
+  auto jit_module = tlctx_->create_jit_module(std::move(data.back().module));
   using TaskFunc = int32 (*)(void *);
   std::vector<TaskFunc> task_funcs;
   task_funcs.reserve(data.size());
   for (auto &datum : data) {
     for (auto &task : datum.tasks) {
-      auto *func_ptr = tlctx_->lookup_function_pointer(task.name);
+      auto *func_ptr = jit_module->lookup_function(task.name);
       TI_ASSERT_INFO(func_ptr, "Offloaded datum function {} not found",
                      task.name);
       task_funcs.push_back((TaskFunc)(func_ptr));
diff --git a/taichi/codegen/cuda/codegen_cuda.cpp b/taichi/codegen/cuda/codegen_cuda.cpp
index eb98dfa502444..3572e1319f383 100644
--- a/taichi/codegen/cuda/codegen_cuda.cpp
+++ b/taichi/codegen/cuda/codegen_cuda.cpp
@@ -715,25 +715,20 @@ FunctionType CUDAModuleToFunctionConverter::convert(
     const std::string &kernel_name,
     const std::vector<LlvmLaunchArgInfo> &args,
     std::vector<LLVMCompiledData> &&data) const {
+  auto &mod = data[0].module;
+  auto &tasks = data[0].tasks;
 #ifdef TI_WITH_CUDA
-  std::vector<JITModule *> cuda_modules;
-  std::vector<std::vector<OffloadedTask>> offloaded_tasks;
-  cuda_modules.reserve(data.size());
-  for (auto &datum : data) {
-    auto &mod = datum.module;
-    auto &tasks = datum.tasks;
-    for (const auto &task : tasks) {
-      llvm::Function *func = mod->getFunction(task.name);
-      TI_ASSERT(func);
-      tlctx_->mark_function_as_cuda_kernel(func, task.block_dim);
-    }
-    auto jit = tlctx_->jit.get();
-    cuda_modules.push_back(
-        jit->add_module(std::move(mod), executor_->get_config()->gpu_max_reg));
-    offloaded_tasks.push_back(std::move(tasks));
+  for (const auto &task : tasks) {
+    llvm::Function *func = mod->getFunction(task.name);
+    TI_ASSERT(func);
+    tlctx_->mark_function_as_cuda_kernel(func, task.block_dim);
   }
-  return [cuda_modules, kernel_name, args, offloaded_tasks,
+  auto jit = tlctx_->jit.get();
+  auto cuda_module =
+      jit->add_module(std::move(mod), executor_->get_config()->gpu_max_reg);
+
+  return [cuda_module, kernel_name, args, offloaded_tasks = tasks,
           executor = this->executor_](RuntimeContext &context) {
     CUDAContext::get_instance().make_current();
     std::vector<void *> arg_buffers(args.size(), nullptr);
@@ -797,13 +792,11 @@ FunctionType CUDAModuleToFunctionConverter::convert(
       CUDADriver::get_instance().stream_synchronize(nullptr);
     }
 
-    for (int i = 0; i < offloaded_tasks.size(); i++) {
-      for (auto &task : offloaded_tasks[i]) {
-        TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
-                 task.block_dim);
-        cuda_modules[i]->launch(task.name, task.grid_dim, task.block_dim, 0,
-                                {&context});
-      }
+    for (auto task : offloaded_tasks) {
+      TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim,
+               task.block_dim);
+      cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0,
+                          {&context});
     }
 
     // copy data back to host
diff --git a/taichi/codegen/llvm/codegen_llvm.cpp b/taichi/codegen/llvm/codegen_llvm.cpp
index 09de38810bfd8..9ab43d0d1722b 100644
--- a/taichi/codegen/llvm/codegen_llvm.cpp
+++ b/taichi/codegen/llvm/codegen_llvm.cpp
@@ -318,11 +318,13 @@ void TaskCodeGenLLVM::emit_struct_meta_base(const std::string &name,
   // snodes, even if they have the same type.
   if (snode->parent)
     common.set("from_parent_element",
-               get_runtime_function(snode->get_ch_from_parent_func_name()));
+               get_struct_function(snode->get_ch_from_parent_func_name(),
+                                   snode->get_snode_tree_id()));
 
   if (snode->type != SNodeType::place)
     common.set("refine_coordinates",
-               get_runtime_function(snode->refine_coordinates_func_name()));
+               get_struct_function(snode->refine_coordinates_func_name(),
+                                   snode->get_snode_tree_id()));
 }
 
 TaskCodeGenLLVM::TaskCodeGenLLVM(Kernel *kernel,
@@ -332,7 +334,7 @@ TaskCodeGenLLVM::TaskCodeGenLLVM(Kernel *kernel,
     : LLVMModuleBuilder(
           module == nullptr ? get_llvm_program(kernel->program)
                                   ->get_llvm_context(kernel->arch)
-                                  ->clone_struct_module()
+                                  ->new_module("kernel")
                             : std::move(module),
           get_llvm_program(kernel->program)->get_llvm_context(kernel->arch)),
       kernel(kernel),
@@ -1706,10 +1708,11 @@ void TaskCodeGenLLVM::visit(GetChStmt *stmt) {
     auto offset = tlctx->get_constant(bit_offset);
     llvm_val[stmt] = create_bit_ptr(llvm_val[stmt->input_ptr], offset);
   } else {
-    auto ch = create_call(stmt->output_snode->get_ch_from_parent_func_name(),
-                          {builder->CreateBitCast(
-                              llvm_val[stmt->input_ptr],
-                              llvm::PointerType::getInt8PtrTy(*llvm_context))});
+    auto ch = call_struct_func(
+        stmt->output_snode->get_snode_tree_id(),
+        stmt->output_snode->get_ch_from_parent_func_name(),
+        builder->CreateBitCast(llvm_val[stmt->input_ptr],
+                               llvm::PointerType::getInt8PtrTy(*llvm_context)));
     llvm_val[stmt] = builder->CreateBitCast(
         ch, llvm::PointerType::get(StructCompilerLLVM::get_llvm_node_type(
                                        module.get(), stmt->output_snode),
@@ -1989,7 +1992,8 @@ void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt,
         create_entry_block_alloca(physical_coordinate_ty);
 
     auto refine =
-        get_runtime_function(leaf_block->refine_coordinates_func_name());
+        get_struct_function(leaf_block->refine_coordinates_func_name(),
+                            leaf_block->get_snode_tree_id());
     // A block corner is the global coordinate/index of the lower-left corner
     // cell within that block, and is the same for all the cells within that
     // block.
@@ -2068,8 +2072,8 @@ void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt,
     // needed to make final coordinates non-consecutive, since each thread will
     // process multiple coordinates via vectorization
     if (stmt->is_bit_vectorized) {
-      refine =
-          get_runtime_function(stmt->snode->refine_coordinates_func_name());
+      refine = get_struct_function(stmt->snode->refine_coordinates_func_name(),
+                                   stmt->snode->get_snode_tree_id());
       create_call(refine,
                   {new_coordinates, new_coordinates, tlctx->get_constant(0)});
     }
@@ -2156,7 +2160,6 @@ void TaskCodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt,
   auto struct_for_func = get_runtime_function("parallel_struct_for");
 
   if (arch_is_gpu(current_arch())) {
-    tlctx->add_struct_for_func(module.get(), stmt->tls_size);
     struct_for_func = llvm::cast<llvm::Function>(
         module
             ->getOrInsertFunction(
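// ---------------------------------------------------------------------------
// Sketch (editor's addition, not part of the patch): get_struct_function()
// and call_struct_func() are introduced elsewhere in this PR, so the body
// below is an assumption inferred from the call sites above and from the
// pattern in refine_coordinates_test.cpp further down: look the function up
// in the per-tree struct module, redeclare it in the kernel module, and
// record the tree id so link_compiled_tasks() later links in the right
// struct module.
//
// llvm::Function *TaskCodeGenLLVM::get_struct_function(const std::string &name,
//                                                      int tree_id) {
//   used_tree_ids.insert(tree_id);  // consumed by link_compiled_tasks()
//   auto *f = tlctx->get_struct_function(name, tree_id);
//   return llvm::cast<llvm::Function>(
//       module
//           ->getOrInsertFunction(f->getName(), f->getFunctionType(),
//                                 f->getAttributes())
//           .getCallee());
// }
// ---------------------------------------------------------------------------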
diff --git a/taichi/codegen/llvm/llvm_codegen_utils.h b/taichi/codegen/llvm/llvm_codegen_utils.h
index c21c0295e24d9..ec22ed9cdde14 100644
--- a/taichi/codegen/llvm/llvm_codegen_utils.h
+++ b/taichi/codegen/llvm/llvm_codegen_utils.h
@@ -87,35 +87,19 @@ class LLVMModuleBuilder {
   }
 
   llvm::Type *get_runtime_type(const std::string &name) {
-#ifdef TI_LLVM_15
-    auto ty = llvm::StructType::getTypeByName(module->getContext(),
-                                              ("struct." + name));
-#else
-    auto ty = module->getTypeByName("struct." + name);
-#endif
-    if (!ty) {
-      TI_ERROR("LLVMRuntime type {} not found.", name);
-    }
-    return ty;
+    return tlctx->get_runtime_type(name);
   }
 
   llvm::Function *get_runtime_function(const std::string &name) {
-    auto f = module->getFunction(name);
+    auto f = tlctx->get_runtime_function(name);
     if (!f) {
       TI_ERROR("LLVMRuntime function {} not found.", name);
     }
-#ifdef TI_LLVM_15
-    f->removeFnAttr(llvm::Attribute::OptimizeNone);
-    f->removeFnAttr(llvm::Attribute::NoInline);
-    f->addFnAttr(llvm::Attribute::AlwaysInline);
-#else
-    f->removeAttribute(llvm::AttributeList::FunctionIndex,
-                       llvm::Attribute::OptimizeNone);
-    f->removeAttribute(llvm::AttributeList::FunctionIndex,
-                       llvm::Attribute::NoInline);
-    f->addAttribute(llvm::AttributeList::FunctionIndex,
-                    llvm::Attribute::AlwaysInline);
-#endif
+    f = llvm::cast<llvm::Function>(
+        module
+            ->getOrInsertFunction(name, f->getFunctionType(),
+                                  f->getAttributes())
+            .getCallee());
     return f;
   }
diff --git a/taichi/codegen/llvm/llvm_compiled_data.h b/taichi/codegen/llvm/llvm_compiled_data.h
index 0cf52571f1358..59f9a33c3bb42 100644
--- a/taichi/codegen/llvm/llvm_compiled_data.h
+++ b/taichi/codegen/llvm/llvm_compiled_data.h
@@ -1,6 +1,7 @@
 #pragma once
 
 #include <memory>
+#include <unordered_set>
 
 #include "llvm/IR/Module.h"
diff --git a/taichi/codegen/llvm/struct_llvm.cpp b/taichi/codegen/llvm/struct_llvm.cpp
index 907342858dfa8..62b986541e060 100644
--- a/taichi/codegen/llvm/struct_llvm.cpp
+++ b/taichi/codegen/llvm/struct_llvm.cpp
@@ -292,7 +292,7 @@ void StructCompilerLLVM::run(SNode &root) {
   auto node_type = get_llvm_node_type(module.get(), &root);
   root_size = tlctx_->get_type_size(node_type);
 
-  tlctx_->set_struct_module(module);
+  tlctx_->add_struct_module(std::move(module), root.get_snode_tree_id());
 }
 
 llvm::Type *StructCompilerLLVM::get_stub(llvm::Module *module,
@@ -336,7 +336,6 @@ llvm::Type *StructCompilerLLVM::get_llvm_element_type(llvm::Module *module,
 
 llvm::Function *StructCompilerLLVM::create_function(llvm::FunctionType *ft,
                                                     std::string func_name) {
-  tlctx_->add_function_to_snode_tree(snode_tree_id_, func_name);
   return llvm::Function::Create(ft, llvm::Function::ExternalLinkage, func_name,
                                 *module);
 }
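// ---------------------------------------------------------------------------
// Sketch (editor's addition, not part of the patch): the LLVMCompiledData
// fields this patch relies on, reconstructed from the call sites in
// link_compiled_tasks() and the tests; members not referenced in this diff
// are omitted, and the field order is an assumption.
//
// struct LLVMCompiledData {
//   std::vector<OffloadedTask> tasks;             // name/grid_dim/block_dim
//   std::unique_ptr<llvm::Module> module;         // per-task, later linked
//   std::unordered_set<int> used_tree_ids;        // SNode trees referenced
//   std::unordered_set<int> struct_for_tls_sizes; // TLS sizes for struct-fors
// };
// ---------------------------------------------------------------------------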
diff --git a/taichi/codegen/wasm/codegen_wasm.cpp b/taichi/codegen/wasm/codegen_wasm.cpp
index 250179d65d1bd..bd3104ecb8173 100644
--- a/taichi/codegen/wasm/codegen_wasm.cpp
+++ b/taichi/codegen/wasm/codegen_wasm.cpp
@@ -9,6 +9,7 @@
 #include "taichi/ir/statements.h"
 #include "taichi/util/statistics.h"
 #include "taichi/util/file_sequence_writer.h"
+#include "taichi/runtime/program_impls/llvm/llvm_program.h"
 
 namespace taichi {
 namespace lang {
@@ -243,10 +244,10 @@ class TaskCodeGenWASM : public TaskCodeGenLLVM {
 
 FunctionType KernelCodeGenWASM::compile_to_function() {
   TI_AUTO_PROF
-  TaskCodeGenWASM gen(kernel, ir);
-  auto res = gen.run_compilation();
-  gen.tlctx->add_module(std::move(res.module));
-  auto kernel_symbol = gen.tlctx->lookup_function_pointer(res.tasks[0].name);
+  auto linked = std::move(compile_kernel_to_module()[0]);
+  auto *tlctx = get_llvm_program(prog)->get_llvm_context(kernel->arch);
+  tlctx->create_jit_module(std::move(linked.module));
+  auto kernel_symbol = tlctx->lookup_function_pointer(linked.tasks[0].name);
   return [=](RuntimeContext &context) {
     TI_TRACE("Launching Taichi Kernel Function");
     auto func = (int32(*)(void *))kernel_symbol;
@@ -257,6 +258,7 @@ FunctionType KernelCodeGenWASM::compile_to_function() {
 LLVMCompiledData KernelCodeGenWASM::compile_task(
     std::unique_ptr<llvm::Module> &&module,
     OffloadedStmt *stmt) {
+  kernel->offload_to_executable(ir);
   bool init_flag = module == nullptr;
   std::vector<OffloadedTask> name_list;
   auto gen = std::make_unique<TaskCodeGenWASM>(kernel, ir, std::move(module));
@@ -278,5 +280,20 @@ LLVMCompiledData KernelCodeGenWASM::compile_task(
 
   return {name_list, std::move(gen->module), {}, {}};
 }
+
+std::vector<LLVMCompiledData> KernelCodeGenWASM::compile_kernel_to_module() {
+  auto *tlctx = get_llvm_program(prog)->get_llvm_context(kernel->arch);
+  if (!kernel->lowered()) {
+    kernel->lower(/*to_executable=*/false);
+  }
+  auto res = compile_task();
+  std::vector<std::unique_ptr<LLVMCompiledData>> data;
+  data.push_back(std::make_unique<LLVMCompiledData>(std::move(res)));
+  auto linked = tlctx->link_compiled_tasks(std::move(data));
+  std::vector<LLVMCompiledData> ret;
+  ret.push_back(std::move(*linked));
+  return ret;
+}
+
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/codegen/wasm/codegen_wasm.h b/taichi/codegen/wasm/codegen_wasm.h
index fde841012314f..91ec5aaf6a648 100644
--- a/taichi/codegen/wasm/codegen_wasm.h
+++ b/taichi/codegen/wasm/codegen_wasm.h
@@ -23,6 +23,8 @@ class KernelCodeGenWASM : public KernelCodeGen {
   LLVMCompiledData compile_task(
       std::unique_ptr<llvm::Module> &&module = nullptr,
       OffloadedStmt *stmt = nullptr) override;  // AOT Module Gen
+
+  std::vector<LLVMCompiledData> compile_kernel_to_module() override;
 #endif
 };
diff --git a/taichi/runtime/cpu/aot_module_builder_impl.cpp b/taichi/runtime/cpu/aot_module_builder_impl.cpp
index c5577cd43337b..daf89298f3d76 100644
--- a/taichi/runtime/cpu/aot_module_builder_impl.cpp
+++ b/taichi/runtime/cpu/aot_module_builder_impl.cpp
@@ -10,8 +10,8 @@ namespace lang {
 namespace cpu {
 
 LLVMCompiledData AotModuleBuilderImpl::compile_kernel(Kernel *kernel) {
-  auto cgen = KernelCodeGenCPU::make_codegen_llvm(kernel, /*ir=*/nullptr);
-  return cgen->run_compilation();
+  auto cgen = KernelCodeGenCPU(kernel);
+  return std::move(cgen.compile_kernel_to_module()[0]);
 }
 
 }  // namespace cpu
diff --git a/taichi/runtime/cuda/aot_module_builder_impl.cpp b/taichi/runtime/cuda/aot_module_builder_impl.cpp
index 2a40066f3471f..0d7431d9874ad 100644
--- a/taichi/runtime/cuda/aot_module_builder_impl.cpp
+++ b/taichi/runtime/cuda/aot_module_builder_impl.cpp
@@ -10,8 +10,8 @@ namespace lang {
 namespace cuda {
 
 LLVMCompiledData AotModuleBuilderImpl::compile_kernel(Kernel *kernel) {
-  auto cgen = KernelCodeGenCUDA::make_codegen_llvm(kernel, /*ir=*/nullptr);
-  return cgen->run_compilation();
+  auto cgen = KernelCodeGenCUDA(kernel);
+  return std::move(cgen.compile_kernel_to_module()[0]);
 }
 
 }  // namespace cuda
diff --git a/taichi/runtime/llvm/llvm_context.cpp b/taichi/runtime/llvm/llvm_context.cpp
index c1a1c52a3454d..31c24b1305038 100644
--- a/taichi/runtime/llvm/llvm_context.cpp
+++ b/taichi/runtime/llvm/llvm_context.cpp
@@ -112,6 +112,13 @@ TaichiLLVMContext::TaichiLLVMContext(CompileConfig *config, Arch arch)
 #endif
   }
   jit = JITSession::create(this, config, arch);
+
+  linking_context_data = std::make_unique<ThreadLocalData>(
+      std::make_unique<llvm::orc::ThreadSafeContext>(
+          std::make_unique<llvm::LLVMContext>()));
+  linking_context_data->runtime_module = clone_module_to_context(
+      get_this_thread_runtime_module(), linking_context_data->llvm_context);
+
   TI_TRACE("Taichi llvm context created.");
 }
 
@@ -503,26 +510,18 @@ void TaichiLLVMContext::link_module_with_cuda_libdevice(
     TI_ERROR("CUDA libdevice linking failure.");
   }
 
-  // Make sure all libdevice functions are linked, and set their linkage to
-  // internal
+  // Make sure all libdevice functions are linked
   for (auto func_name : libdevice_function_names) {
     auto func = module->getFunction(func_name);
     if (!func) {
       TI_INFO("Function {} not found", func_name);
-    } else
-      func->setLinkage(llvm::Function::InternalLinkage);
+    }
   }
 }
 
-std::unique_ptr<llvm::Module> TaichiLLVMContext::clone_struct_module() {
-  TI_AUTO_PROF
-  auto struct_module = get_this_thread_struct_module();
-  TI_ASSERT(struct_module);
-  return llvm::CloneModule(*struct_module);
-}
-
-void TaichiLLVMContext::set_struct_module(
-    const std::unique_ptr<llvm::Module> &module) {
+void TaichiLLVMContext::add_struct_module(std::unique_ptr<llvm::Module> module,
+                                          int tree_id) {
+  TI_AUTO_PROF;
   TI_ASSERT(std::this_thread::get_id() == main_thread_id_);
   auto this_thread_data = get_this_thread_data();
   TI_ASSERT(module);
@@ -530,16 +529,19 @@ void TaichiLLVMContext::set_struct_module(
     module->print(llvm::errs(), nullptr);
     TI_ERROR("module broken");
   }
-  // TODO: Move this after ``if (!arch_is_cpu(arch))``.
-  this_thread_data->struct_module = llvm::CloneModule(*module);
+
+  linking_context_data->struct_modules[tree_id] =
+      clone_module_to_context(module.get(), linking_context_data->llvm_context);
+
   for (auto &[id, data] : per_thread_data_) {
     if (id == std::this_thread::get_id()) {
       continue;
     }
-    TI_ASSERT(!data->runtime_module);
-    data->struct_module = clone_module_to_context(
-        this_thread_data->struct_module.get(), data->llvm_context);
+    data->struct_modules[tree_id] =
+        clone_module_to_context(module.get(), data->llvm_context);
   }
+
+  this_thread_data->struct_modules[tree_id] = std::move(module);
 }
 
 template <typename T>
 llvm::Value *TaichiLLVMContext::get_constant(DataType dt, T t) {
@@ -661,7 +663,8 @@ llvm::DataLayout TaichiLLVMContext::get_data_layout() {
   return jit->get_data_layout();
 }
 
-JITModule *TaichiLLVMContext::add_module(std::unique_ptr<llvm::Module> module) {
+JITModule *TaichiLLVMContext::create_jit_module(
+    std::unique_ptr<llvm::Module> module) {
   return jit->add_module(std::move(module));
 }
 
@@ -755,15 +758,6 @@ TaichiLLVMContext::get_this_thread_thread_safe_context() {
   return data->thread_safe_llvm_context.get();
 }
 
-llvm::Module *TaichiLLVMContext::get_this_thread_struct_module() {
-  ThreadLocalData *data = get_this_thread_data();
-  if (!data->struct_module) {
-    data->struct_module = clone_module_to_this_thread_context(
-        main_thread_data_->struct_module.get());
-  }
-  return data->struct_module.get();
-}
-
 template llvm::Value *TaichiLLVMContext::get_constant(float32 t);
 template llvm::Value *TaichiLLVMContext::get_constant(float64 t);
 
@@ -823,24 +817,14 @@ void TaichiLLVMContext::update_runtime_jit_module(
     return starts_with(func_name, "runtime_") ||
           starts_with(func_name, "LLVMRuntime_");
   });
-  runtime_jit_module = add_module(std::move(module));
+  runtime_jit_module = create_jit_module(std::move(module));
 }
 
-void TaichiLLVMContext::delete_functions_of_snode_tree(int id) {
-  if (!snode_tree_funcs_.count(id)) {
-    return;
+void TaichiLLVMContext::delete_snode_tree(int id) {
+  TI_ASSERT(linking_context_data->struct_modules.erase(id));
+  for (auto &[thread_id, data] : per_thread_data_) {
+    TI_ASSERT(data->struct_modules.erase(id));
   }
-  llvm::Module *module = get_this_thread_struct_module();
-  for (auto str : snode_tree_funcs_[id]) {
-    auto *func = module->getFunction(str);
-    func->eraseFromParent();
-  }
-  snode_tree_funcs_.erase(id);
-  set_struct_module(get_this_thread_data()->struct_module);
-}
-
-void TaichiLLVMContext::add_function_to_snode_tree(int id, std::string func) {
-  snode_tree_funcs_[id].push_back(func);
 }
 
 void TaichiLLVMContext::fetch_this_thread_struct_module() {
@@ -901,11 +885,52 @@ TaichiLLVMContext::ThreadLocalData::ThreadLocalData(
 
 TaichiLLVMContext::ThreadLocalData::~ThreadLocalData() {
   runtime_module.reset();
-  struct_module.reset();
   struct_modules.clear();
   thread_safe_llvm_context.reset();
 }
 
+std::unique_ptr<LLVMCompiledData> TaichiLLVMContext::link_compiled_tasks(
+    std::vector<std::unique_ptr<LLVMCompiledData>> data_list) {
+  auto linked = std::make_unique<LLVMCompiledData>();
+  std::unordered_set<int> used_tree_ids;
+  std::unordered_set<int> tls_sizes;
+  std::unordered_set<std::string> offloaded_names;
+  auto mod = new_module("kernel", linking_context_data->llvm_context);
+  llvm::Linker linker(*mod);
+  for (auto &datum : data_list) {
+    for (auto tree_id : datum->used_tree_ids) {
+      used_tree_ids.insert(tree_id);
+    }
+    for (auto tls_size : datum->struct_for_tls_sizes) {
+      tls_sizes.insert(tls_size);
+    }
+    for (auto &task : datum->tasks) {
+      offloaded_names.insert(task.name);
+      linked->tasks.push_back(std::move(task));
+    }
+    linker.linkInModule(clone_module_to_context(
+        datum->module.get(), linking_context_data->llvm_context));
+  }
+  for (auto tree_id : used_tree_ids) {
+    linker.linkInModule(
+        llvm::CloneModule(*linking_context_data->struct_modules[tree_id]),
+        llvm::Linker::LinkOnlyNeeded | llvm::Linker::OverrideFromSrc);
+  }
+  auto runtime_module =
+      llvm::CloneModule(*linking_context_data->runtime_module);
+  for (auto tls_size : tls_sizes) {
+    add_struct_for_func(runtime_module.get(), tls_size);
+  }
+  linker.linkInModule(
+      std::move(runtime_module),
+      llvm::Linker::LinkOnlyNeeded | llvm::Linker::OverrideFromSrc);
+  eliminate_unused_functions(mod.get(), [&](std::string func_name) -> bool {
+    return offloaded_names.count(func_name);
+  });
+  linked->module = std::move(mod);
+  return linked;
+}
+
 void TaichiLLVMContext::add_struct_for_func(llvm::Module *module,
                                             int tls_size) {
   // Note that on CUDA local array allocation must have a compile-time
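// ---------------------------------------------------------------------------
// Sketch (editor's addition, not part of the patch): a self-contained
// illustration of the llvm::Linker flags used by link_compiled_tasks().
// LinkOnlyNeeded pulls in only the definitions the destination module already
// references, so unused SNode helpers and runtime functions are dropped;
// OverrideFromSrc lets an incoming definition replace an existing one.

#include <memory>

#include "llvm/IR/Module.h"
#include "llvm/Linker/Linker.h"

static bool link_only_needed(llvm::Module &dst,
                             std::unique_ptr<llvm::Module> src) {
  // Both modules must live in the same LLVMContext; that is why the patch
  // clones every per-task module into linking_context_data->llvm_context
  // before constructing the linker.
  llvm::Linker linker(dst);
  // linkInModule() returns true on error.
  return linker.linkInModule(
      std::move(src),
      llvm::Linker::LinkOnlyNeeded | llvm::Linker::OverrideFromSrc);
}
// ---------------------------------------------------------------------------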
diff --git a/taichi/runtime/llvm/llvm_context.h b/taichi/runtime/llvm/llvm_context.h
index 6eab588f704e9..a4dda7ce846eb 100644
--- a/taichi/runtime/llvm/llvm_context.h
+++ b/taichi/runtime/llvm/llvm_context.h
@@ -13,6 +13,7 @@
 #include "taichi/runtime/llvm/llvm_fwd.h"
 #include "taichi/ir/snode.h"
 #include "taichi/jit/jit_session.h"
+#include "taichi/codegen/llvm/llvm_compiled_data.h"
 
 namespace taichi {
 namespace lang {
@@ -33,8 +34,6 @@ class TaichiLLVMContext {
     std::unordered_map<int, std::unique_ptr<llvm::Module>> struct_modules;
     ThreadLocalData(std::unique_ptr<llvm::orc::ThreadSafeContext> ctx);
     ~ThreadLocalData();
-    std::unique_ptr<llvm::Module> struct_module{nullptr};  // TODO: To be
-                                                           // deleted
   };
 
   CompileConfig *config_;
@@ -43,6 +42,8 @@ class TaichiLLVMContext {
   // main_thread is defined to be the thread that runs the initializer
   JITModule *runtime_jit_module{nullptr};
 
+  std::unique_ptr<ThreadLocalData> linking_context_data{nullptr};
+
   TaichiLLVMContext(CompileConfig *config, Arch arch);
 
   virtual ~TaichiLLVMContext();
@@ -60,19 +61,12 @@ class TaichiLLVMContext {
    */
   void init_runtime_jit_module();
 
-  /**
-   * Clones the LLVM module containing the JIT compiled SNode structs.
-   *
-   * @return The cloned module.
-   */
-  std::unique_ptr<llvm::Module> clone_struct_module();
-
   /**
    * Updates the LLVM module of the JIT compiled SNode structs.
    *
   * @param module Module containing the JIT compiled SNode structs.
   */
-  void set_struct_module(const std::unique_ptr<llvm::Module> &module);
+  void add_struct_module(std::unique_ptr<llvm::Module> module, int tree_id);
 
   /**
    * Clones the LLVM module compiled from llvm/runtime.cpp
@@ -83,7 +77,7 @@ class TaichiLLVMContext {
 
   std::unique_ptr<llvm::Module> module_from_file(const std::string &file);
 
-  JITModule *add_module(std::unique_ptr<llvm::Module> module);
+  JITModule *create_jit_module(std::unique_ptr<llvm::Module> module);
 
   virtual void *lookup_function_pointer(const std::string &name) {
     return jit->lookup(name);
   }
@@ -101,8 +95,6 @@ class TaichiLLVMContext {
 
   llvm::Type *get_data_type(DataType dt);
 
-  llvm::Module *get_this_thread_struct_module();
-
   template <typename T>
   llvm::Type *get_data_type() {
     return TaichiLLVMContext::get_data_type(taichi::lang::get_data_type<T>());
   }
@@ -144,14 +136,15 @@ class TaichiLLVMContext {
       std::string name,
       llvm::LLVMContext *context = nullptr);
 
-  void add_function_to_snode_tree(int id, std::string func);
-
-  void delete_functions_of_snode_tree(int id);
+  void delete_snode_tree(int id);
 
   void add_struct_for_func(llvm::Module *module, int tls_size);
 
   static std::string get_struct_for_func_name(int tls_size);
 
+  std::unique_ptr<LLVMCompiledData> link_compiled_tasks(
+      std::vector<std::unique_ptr<LLVMCompiledData>> data_list);
+
  private:
   std::unique_ptr<llvm::Module> clone_module_to_context(
       llvm::Module *module,
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index 7e625f3e4677e..78bbc4d1cf4b0 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -620,8 +620,7 @@ void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
 }
 
 void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {
-  get_llvm_context(config_->arch)
-      ->delete_functions_of_snode_tree(snode_tree->id());
+  get_llvm_context(config_->arch)->delete_snode_tree(snode_tree->id());
   snode_tree_buffer_manager_->destroy(snode_tree);
 }
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.cpp b/taichi/runtime/program_impls/llvm/llvm_program.cpp
index 2f1eb2e451f61..b058e545fe5db 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.cpp
+++ b/taichi/runtime/program_impls/llvm/llvm_program.cpp
@@ -42,35 +42,24 @@ FunctionType LlvmProgramImpl::compile(Kernel *kernel,
   return codegen->compile_to_function();
 }
 
-std::unique_ptr<llvm::Module>
-LlvmProgramImpl::clone_struct_compiler_initial_context(
-    bool has_multiple_snode_trees,
-    TaichiLLVMContext *tlctx) {
-  if (has_multiple_snode_trees) {
-    return tlctx->clone_struct_module();
-  }
-  return tlctx->clone_runtime_module();
-}
-
 std::unique_ptr<StructCompiler> LlvmProgramImpl::compile_snode_tree_types_impl(
     SNodeTree *tree) {
   auto *const root = tree->root();
-  const bool has_multiple_snode_trees = (num_snode_trees_processed_ > 0);
   std::unique_ptr<StructCompiler> struct_compiler{nullptr};
   if (arch_is_cpu(config->arch)) {
-    auto host_module = clone_struct_compiler_initial_context(
-        has_multiple_snode_trees, runtime_exec_->llvm_context_host_.get());
+    auto host_module =
+        runtime_exec_->llvm_context_host_.get()->new_module("struct");
     struct_compiler = std::make_unique<StructCompilerLLVM>(
         host_arch(), this, std::move(host_module), tree->id());
   } else if (config->arch == Arch::dx12) {
-    auto device_module = clone_struct_compiler_initial_context(
-        has_multiple_snode_trees, runtime_exec_->llvm_context_device_.get());
+    auto device_module =
+        runtime_exec_->llvm_context_device_.get()->new_module("struct");
     struct_compiler = std::make_unique<StructCompilerLLVM>(
         Arch::dx12, this, std::move(device_module), tree->id());
   } else {
     TI_ASSERT(config->arch == Arch::cuda);
-    auto device_module = clone_struct_compiler_initial_context(
-        has_multiple_snode_trees, runtime_exec_->llvm_context_device_.get());
+    auto device_module =
+        runtime_exec_->llvm_context_device_.get()->new_module("struct");
     struct_compiler = std::make_unique<StructCompilerLLVM>(
         Arch::cuda, this, std::move(device_module), tree->id());
   }
diff --git a/taichi/runtime/program_impls/llvm/llvm_program.h b/taichi/runtime/program_impls/llvm/llvm_program.h
index c84bea18766dc..2da386bc631fa 100644
--- a/taichi/runtime/program_impls/llvm/llvm_program.h
+++ b/taichi/runtime/program_impls/llvm/llvm_program.h
@@ -66,10 +66,6 @@ class LlvmProgramImpl : public ProgramImpl {
   }
 
  private:
-  std::unique_ptr<llvm::Module> clone_struct_compiler_initial_context(
-      bool has_multiple_snode_trees,
-      TaichiLLVMContext *tlctx);
-
   std::unique_ptr<StructCompiler> compile_snode_tree_types_impl(
       SNodeTree *tree);
diff --git a/taichi/runtime/wasm/aot_module_builder_impl.cpp b/taichi/runtime/wasm/aot_module_builder_impl.cpp
index a4900798df4f2..c0bcacfb357e9 100644
--- a/taichi/runtime/wasm/aot_module_builder_impl.cpp
+++ b/taichi/runtime/wasm/aot_module_builder_impl.cpp
@@ -4,6 +4,7 @@
 #include <fstream>
 
 #include "taichi/util/file_sequence_writer.h"
+#include "llvm/Linker/Linker.h"
 
 namespace taichi {
 namespace lang {
@@ -36,11 +37,13 @@ void AotModuleBuilderImpl::dump(const std::string &output_dir,
 void AotModuleBuilderImpl::add_per_backend(const std::string &identifier,
                                            Kernel *kernel) {
   auto module_info =
-      KernelCodeGenWASM(kernel, nullptr).compile_task(std::move(module_));
-  module_ = std::move(module_info.module);
-
-  for (auto &task : module_info.tasks)
-    name_list_.push_back(task.name);
+      KernelCodeGenWASM(kernel, nullptr).compile_kernel_to_module();
+  if (module_) {
+    llvm::Linker::linkModules(*module_, std::move(module_info[0].module),
+                              llvm::Linker::OverrideFromSrc);
+  } else {
+    module_ = std::move(module_info[0].module);
+  }
 }
 
 void AotModuleBuilderImpl::add_field_per_backend(const std::string &identifier,
diff --git a/tests/cpp/codegen/refine_coordinates_test.cpp b/tests/cpp/codegen/refine_coordinates_test.cpp
index 33399a604d5e5..02425dce4e4d4 100644
--- a/tests/cpp/codegen/refine_coordinates_test.cpp
+++ b/tests/cpp/codegen/refine_coordinates_test.cpp
@@ -2,6 +2,7 @@
 #include "gtest/gtest.h"
 
 #include <memory>
+#include <unordered_set>
 
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Type.h"
@@ -32,14 +33,21 @@ class InvokeRefineCoordinatesBuilder : public LLVMModuleBuilder {
   static FuncType build(const SNode *snode, TaichiLLVMContext *tlctx) {
     InvokeRefineCoordinatesBuilder mb{tlctx};
     mb.run_jit(snode);
-    tlctx->add_module(std::move(mb.module));
-    auto *fn = tlctx->lookup_function_pointer(kFuncName);
+    LLVMCompiledData data;
+    data.module = std::move(mb.module);
+    data.used_tree_ids = std::move(mb.used_snode_tree_ids);
+    data.tasks.emplace_back(kFuncName);
+    std::vector<std::unique_ptr<LLVMCompiledData>> data_list;
+    data_list.push_back(std::make_unique<LLVMCompiledData>(std::move(data)));
+    auto linked_data = tlctx->link_compiled_tasks(std::move(data_list));
+    auto *jit = tlctx->create_jit_module(std::move(linked_data->module));
+    auto *fn = jit->lookup_function(kFuncName);
     return reinterpret_cast<FuncType>(fn);
   }
 
  private:
   InvokeRefineCoordinatesBuilder(TaichiLLVMContext *tlctx)
-      : LLVMModuleBuilder(tlctx->clone_struct_module(), tlctx) {
+      : LLVMModuleBuilder(tlctx->new_module("kernel"), tlctx) {
     this->llvm_context = this->tlctx->get_this_thread_context();
     this->builder = std::make_unique<llvm::IRBuilder<>>(*llvm_context);
   }
@@ -75,8 +83,15 @@ class InvokeRefineCoordinatesBuilder : public LLVMModuleBuilder {
     RuntimeObject parent_coords{kLLVMPhysicalCoordinatesName, this,
                                 builder.get()};
     parent_coords.set("val", index0, parent_coords_first_component);
-    auto *refine_fn =
-        get_runtime_function(snode->refine_coordinates_func_name());
+    auto *refine_fn_struct = tlctx->get_struct_function(
+        snode->refine_coordinates_func_name(), snode->get_snode_tree_id());
+    auto *refine_fn = llvm::cast<llvm::Function>(
+        module
+            ->getOrInsertFunction(refine_fn_struct->getName(),
+                                  refine_fn_struct->getFunctionType(),
+                                  refine_fn_struct->getAttributes())
+            .getCallee());
+    used_snode_tree_ids.insert(snode->get_snode_tree_id());
     RuntimeObject child_coords{kLLVMPhysicalCoordinatesName, this,
                                builder.get()};
     builder->CreateCall(refine_fn,
@@ -86,6 +101,8 @@
     llvm::verifyFunction(*func);
   }
+
+  std::unordered_set<int> used_snode_tree_ids;
 };
 
 struct BitsRange {
@@ -119,9 +136,9 @@ class RefineCoordinatesTest : public ::testing::Test {
     auto &leaf_snode = dense_snode_->insert_children(SNodeType::place);
     leaf_snode.dt = PrimitiveType::f32;
 
-    auto sc = std::make_unique<StructCompilerLLVM>(
-        arch_, &config_, tlctx_, tlctx_->clone_runtime_module(),
-        /*snode_tree_id=*/0);
+    auto sc = std::make_unique<StructCompilerLLVM>(arch_, &config_, tlctx_,
                                                    tlctx_->new_module("struct"),
                                                    /*snode_tree_id=*/0);
     sc->run(*root_snode_);
   }
diff --git a/tests/cpp/llvm/llvm_offline_cache_test.cpp b/tests/cpp/llvm/llvm_offline_cache_test.cpp
index 268aad42180ad..ef622db1e6414 100644
--- a/tests/cpp/llvm/llvm_offline_cache_test.cpp
+++ b/tests/cpp/llvm/llvm_offline_cache_test.cpp
@@ -128,9 +128,10 @@ TEST_P(LlvmOfflineCacheTest, ReadWrite) {
     ASSERT_NE(kcache.compiled_data_list[0].module, nullptr);
     kcache.compiled_data_list[0].module->dump();
 
-    tlctx_->add_module(std::move(kcache.compiled_data_list[0].module));
+    auto jit_module = tlctx_->create_jit_module(
+        std::move(kcache.compiled_data_list[0].module));
     using FuncType = int (*)(int, int);
-    FuncType my_add = (FuncType)tlctx_->lookup_function_pointer(kTaskName);
+    FuncType my_add = (FuncType)jit_module->lookup_function(kTaskName);
     const auto res = my_add(40, 2);
     EXPECT_EQ(res, 42);
   };
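// ---------------------------------------------------------------------------
// Sketch (editor's addition, not part of the patch): the JIT-side pattern the
// two C++ tests above now share. `tlctx` is a TaichiLLVMContext *; producing
// the initial LLVMCompiledData `data` is elided.
//
//   std::vector<std::unique_ptr<LLVMCompiledData>> list;
//   list.push_back(std::make_unique<LLVMCompiledData>(std::move(data)));
//   auto linked = tlctx->link_compiled_tasks(std::move(list));
//   auto *jit = tlctx->create_jit_module(std::move(linked->module));
//   auto *fn = jit->lookup_function(task_name);  // raw symbol pointer
// ---------------------------------------------------------------------------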
diff --git a/tests/python/test_offline_cache.py b/tests/python/test_offline_cache.py
index f6c6abcec4bf6..1a0f13dfae522 100644
--- a/tests/python/test_offline_cache.py
+++ b/tests/python/test_offline_cache.py
@@ -42,7 +42,7 @@ def cache_files_size(path):
 def expected_num_cache_files(arch, num_offloads: List[int] = None) -> int:
     if not num_offloads:
         return 0
-    result = sum(num_offloads)
+    result = sum(num_offloads) if arch in [ti.vulkan] else len(num_offloads)
     if arch in [ti.cpu, ti.cuda]:
         result += 2  # metadata.{json, tcb}
     elif arch in [ti.vulkan]:
@@ -310,11 +310,13 @@ def compute_y():
     ti.init(arch=curr_arch,
             enable_fallback=False,
             **current_thread_ext_options())
-    assert added_files(curr_arch) == expected_num_cache_files(curr_arch, [8])
+    assert added_files(curr_arch) == expected_num_cache_files(
+        curr_arch, [1] * 8)
     helper()
 
     ti.reset()
-    assert added_files(curr_arch) == expected_num_cache_files(curr_arch, [8])
+    assert added_files(curr_arch) == expected_num_cache_files(
+        curr_arch, [1] * 8)
@@ -392,11 +394,13 @@ def helper():
     ti.init(arch=curr_arch,
             enable_fallback=False,
             **current_thread_ext_options())
-    assert added_files(curr_arch) == expected_num_cache_files(curr_arch, [4])
+    assert added_files(curr_arch) == expected_num_cache_files(
+        curr_arch, [1] * 4)
     helper()
 
     ti.reset()
-    assert added_files(curr_arch) == expected_num_cache_files(curr_arch, [4])
+    assert added_files(curr_arch) == expected_num_cache_files(
+        curr_arch, [1] * 4)
 
 
 @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache)
@@ -491,7 +495,9 @@ def run_simple_kernels(max_size):
     only_init(max_size)
     for kernel, args, get_res, num_offloads in simple_kernels_to_test:
         assert kernel(*args) == test_utils.approx(get_res(*args))
-        sleep(1)  # make sure the kernels are not used in the same second
+        if curr_arch in [ti.vulkan]:
+            sleep(
+                1)  # make sure the kernels are not used in the same second
 
     kernel_count = len(simple_kernels_to_test)
     count_of_cache_file = cache_files_cnt(curr_arch)
@@ -515,16 +521,14 @@ def added_files(arch):
     only_init(size_of_cache_files)
     ti.reset()
 
-    rem = 0
+    rem = []
     if policy in ['never', 'version']:
-        rem = expected_num_cache_files(
-            curr_arch, [kern[3] for kern in simple_kernels_to_test])
+        rem = [kern[3] for kern in simple_kernels_to_test]
     else:
         lo = -min(kernel_count - int(factor * kernel_count), kernel_count)
         lo = kernel_count if lo == 0 else lo
-        rem = expected_num_cache_files(
-            curr_arch, [kern[3] for kern in simple_kernels_to_test[lo:]])
-    assert added_files(curr_arch) == rem
+        rem = [kern[3] for kern in simple_kernels_to_test[lo:]]
+    assert added_files(curr_arch) == expected_num_cache_files(curr_arch, rem)
 
 
 # FIXME: Change to `supported_archs_offline_cache` after fixing bugs of real-function on gpu