From c12fb2b8ec40610db470803051736e2137ffd3cd Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Wed, 8 Jun 2022 14:00:18 +0800 Subject: [PATCH 1/3] [aot] [llvm] Implemented FieldCacheData and refactored initialize_llvm_runtime_snodes() --- taichi/llvm/llvm_offline_cache.h | 31 +++++- taichi/llvm/llvm_program.cpp | 177 ++++++++++++++++++------------- taichi/llvm/llvm_program.h | 6 +- 3 files changed, 139 insertions(+), 75 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index c82837e42521e..fe9eca8f8ba96 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -42,7 +42,36 @@ struct LlvmOfflineCache { TI_IO_DEF(kernel_key, args, offloaded_task_list); }; - std::unordered_map kernels; + struct FieldCacheData { + struct SNodeCacheData { + int id; + int type; + size_t cell_size_bytes; + size_t chunk_size; + + TI_IO_DEF(id, type, cell_size_bytes, chunk_size); + }; + + int tree_id; + size_t root_size; + std::vector snode_metas; + + TI_IO_DEF(tree_id, root_size, snode_metas); + + // TODO(zhanlue) + // Serialize/Deserialize the llvm::Module from StructCompiler + // At runtime, make sure loaded Field-Modules and Kernel-Modules are linked + // altogether. 
+ }; + + // TODO(zhanlue): we need a better identifier for each FieldCacheData + // (SNodeTree) Given that snode_tree_id is not continuous, it is ridiculous to + // ask the users to remember each of the snode_tree_ids + // ** Find a way to name each SNodeTree ** + std::unordered_map fields; // key = snode_tree_id + + std::unordered_map + kernels; // key = kernel_name TI_IO_DEF(kernels); }; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 65bc1a75e12e1..e922a9ebe8026 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -158,95 +158,105 @@ LlvmProgramImpl::clone_struct_compiler_initial_context( return tlctx->clone_runtime_module(); } -void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, - StructCompiler *scomp, - uint64 *result_buffer) { - TaichiLLVMContext *tlctx = nullptr; - if (config->arch == Arch::cuda) { +void LlvmProgramImpl::initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer); +TaichiLLVMContext *tlctx = nullptr; +if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - tlctx = llvm_context_device_.get(); + tlctx = llvm_context_device_.get(); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif - } else { - tlctx = llvm_context_host_.get(); - } +} else { + tlctx = llvm_context_host_.get(); +} - auto *const runtime_jit = tlctx->runtime_jit_module; - // By the time this creator is called, "this" is already destroyed. - // Therefore it is necessary to capture members by values. 
- const auto snodes = scomp->snodes; - const int root_id = tree->root()->id; - - TI_TRACE("Allocating data structure of size {} bytes", scomp->root_size); - std::size_t rounded_size = - taichi::iroundup(scomp->root_size, taichi_page_size); - - Ptr root_buffer = snode_tree_buffer_manager_->allocate( - runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree->id(), - result_buffer); - if (config->arch == Arch::cuda) { +auto *const runtime_jit = tlctx -> runtime_jit_module; +// By the time this creator is called, "this" is already destroyed. +// Therefore it is necessary to capture members by values. +size_t root_size = field_cache_data.root_size; +const auto snode_metas = field_cache_data.snode_metas; +const int root_id = field_cache_data.tree_id; + +TI_TRACE("Allocating data structure of size {} bytes", root_size); +std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); + +Ptr root_buffer = snode_tree_buffer_manager_->allocate(runtime_jit, + llvm_runtime_, + rounded_size, + taichi_page_size, + root_id, + result_buffer); +if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); + CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif - } else { - std::memset(root_buffer, 0, rounded_size); - } +} else { + std::memset(root_buffer, 0, rounded_size); +} - DeviceAllocation alloc{kDeviceNullAllocation}; +DeviceAllocation alloc{kDeviceNullAllocation}; - if (config->arch == Arch::cuda) { +if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - alloc = cuda_device()->import_memory(root_buffer, rounded_size); + alloc = cuda_device()->import_memory(root_buffer, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif - } else { - alloc = cpu_device()->import_memory(root_buffer, rounded_size); - } +} else { + alloc = cpu_device()->import_memory(root_buffer, rounded_size); +} - 
snode_tree_allocs_[tree->id()] = alloc; +snode_tree_allocs_[tree->id()] = alloc; - bool all_dense = config->demote_dense_struct_fors; - for (int i = 0; i < (int)snodes.size(); i++) { - if (snodes[i]->type != SNodeType::dense && - snodes[i]->type != SNodeType::place && - snodes[i]->type != SNodeType::root) { - all_dense = false; - break; - } +bool all_dense = config->demote_dense_struct_fors; +for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i]->type != SNodeType::dense && + snode_metas[i]->type != SNodeType::place && + snode_metas[i]->type != SNodeType::root) { + all_dense = false; + break; } +} - runtime_jit->call( - "runtime_initialize_snodes", llvm_runtime_, scomp->root_size, root_id, - (int)snodes.size(), tree->id(), rounded_size, root_buffer, all_dense); - - for (int i = 0; i < (int)snodes.size(); i++) { - if (is_gc_able(snodes[i]->type)) { - const auto snode_id = snodes[i]->id; - std::size_t node_size; - auto element_size = snodes[i]->cell_size_bytes; - if (snodes[i]->type == SNodeType::pointer) { - // pointer. Allocators are for single elements - node_size = element_size; - } else { - // dynamic. 
Allocators are for the chunks - node_size = sizeof(void *) + element_size * snodes[i]->chunk_size; - } - TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, - node_size); - auto rt = llvm_runtime_; - runtime_jit->call( - "runtime_NodeAllocator_initialize", rt, snode_id, node_size); - TI_TRACE("Allocating ambient element for snode {} (node size {})", - snode_id, node_size); - runtime_jit->call("runtime_allocate_ambient", rt, snode_id, - node_size); +runtime_jit->call( + "runtime_initialize_snodes", + llvm_runtime_, + root_size, + root_id, + (int)snode_metas.size(), + root_id, + rounded_size, + root_buffer, + all_dense); + +for (size_t i = 0; i < snode_metas.size(); i++) { + if (is_gc_able(snode_metas[i]->type)) { + const auto snode_id = snode_metas[i].id; + std::size_t node_size; + auto element_size = snode_metas[i].cell_size_bytes; + if (snode_metas[i].type == SNodeType::pointer) { + // pointer. Allocators are for single elements + node_size = element_size; + } else { + // dynamic. 
Allocators are for the chunks + node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; } + TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, + node_size); + auto rt = llvm_runtime_; + runtime_jit->call( + "runtime_NodeAllocator_initialize", rt, snode_id, node_size); + TI_TRACE("Allocating ambient element for snode {} (node size {})", snode_id, + node_size); + runtime_jit->call("runtime_allocate_ambient", rt, snode_id, + node_size); } } +} std::unique_ptr LlvmProgramImpl::compile_snode_tree_types_impl( SNodeTree *tree) { @@ -275,10 +285,35 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { compile_snode_tree_types_impl(tree); } +static LlvmOfflineCache::FieldCacheData construct_filed_cache_data( + const SNodeTree &tree, + const StructCompiler &struct_compiler) { + TI_ASSERT(tree.id == tree.root()->id); + + LlvmOfflineCache::FieldCacheData ret; + ret.tree_id = tree.id; + ret.root_size = struct_compiler.root_size; + + const auto &snodes = struct_compiler.snodes; + for (size_t i = 0; i < snodes.size(); i++) { + LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data; + snode_cache_data.id = snodes[i]->id; + snode_cache_data.type = snodes[i]->type; + snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes; + snode_cache_data.chunk_size = snodes[i]->chunk_size; + + ret.snode_metas.emplace_back(std::move(snode_cache_data)); + } + + return ret; +} + void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) { auto struct_compiler = compile_snode_tree_types_impl(tree); - initialize_llvm_runtime_snodes(tree, struct_compiler.get(), result_buffer); + + auto field_cache_data = construct_filed_cache_data(*tree, *struct_compiler); + initialize_llvm_runtime_snodes(field_cache_data, result_buffer); } uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 *result_buffer) { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index c9029bbcd85f0..69378ee660bf1 100644 
--- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -132,9 +132,9 @@ class LlvmProgramImpl : public ProgramImpl { /** * Initializes the SNodes for LLVM based backends. */ - void initialize_llvm_runtime_snodes(const SNodeTree *tree, - StructCompiler *scomp, - uint64 *result_buffer); + void initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer); uint64 fetch_result_uint64(int i, uint64 *result_buffer); From e3f1ab8ac70798f8b14e5e4750c9ea43c3544b74 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Wed, 8 Jun 2022 14:15:47 +0800 Subject: [PATCH 2/3] Addressed compilation errors --- taichi/llvm/llvm_offline_cache.h | 3 +- taichi/llvm/llvm_program.cpp | 152 +++++++++++++++---------------- 2 files changed, 73 insertions(+), 82 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index fe9eca8f8ba96..91386c2bffb31 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -45,7 +45,7 @@ struct LlvmOfflineCache { struct FieldCacheData { struct SNodeCacheData { int id; - int type; + SNodeType type; size_t cell_size_bytes; size_t chunk_size; @@ -53,6 +53,7 @@ struct LlvmOfflineCache { }; int tree_id; + int root_id; size_t root_size; std::vector snode_metas; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index e922a9ebe8026..eea60dad165f7 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -160,103 +160,94 @@ LlvmProgramImpl::clone_struct_compiler_initial_context( void LlvmProgramImpl::initialize_llvm_runtime_snodes( const LlvmOfflineCache::FieldCacheData &field_cache_data, - uint64 *result_buffer); -TaichiLLVMContext *tlctx = nullptr; -if (config->arch == Arch::cuda) { + uint64 *result_buffer) { + TaichiLLVMContext *tlctx = nullptr; + if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - tlctx = llvm_context_device_.get(); + tlctx = llvm_context_device_.get(); #else - 
TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif -} else { - tlctx = llvm_context_host_.get(); -} + } else { + tlctx = llvm_context_host_.get(); + } -auto *const runtime_jit = tlctx -> runtime_jit_module; -// By the time this creator is called, "this" is already destroyed. -// Therefore it is necessary to capture members by values. -size_t root_size = field_cache_data.root_size; -const auto snode_metas = field_cache_data.snode_metas; -const int root_id = field_cache_data.tree_id; - -TI_TRACE("Allocating data structure of size {} bytes", root_size); -std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); - -Ptr root_buffer = snode_tree_buffer_manager_->allocate(runtime_jit, - llvm_runtime_, - rounded_size, - taichi_page_size, - root_id, - result_buffer); -if (config->arch == Arch::cuda) { + auto *const runtime_jit = tlctx->runtime_jit_module; + // By the time this creator is called, "this" is already destroyed. + // Therefore it is necessary to capture members by values. 
+ size_t root_size = field_cache_data.root_size; + const auto snode_metas = field_cache_data.snode_metas; + const int tree_id = field_cache_data.tree_id; + const int root_id = field_cache_data.root_id; + + TI_TRACE("Allocating data structure of size {} bytes", root_size); + std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); + + Ptr root_buffer = snode_tree_buffer_manager_->allocate( + runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id, + result_buffer); + if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); + CUDADriver::get_instance().memset(root_buffer, 0, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif -} else { - std::memset(root_buffer, 0, rounded_size); -} + } else { + std::memset(root_buffer, 0, rounded_size); + } -DeviceAllocation alloc{kDeviceNullAllocation}; + DeviceAllocation alloc{kDeviceNullAllocation}; -if (config->arch == Arch::cuda) { + if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) - alloc = cuda_device()->import_memory(root_buffer, rounded_size); + alloc = cuda_device()->import_memory(root_buffer, rounded_size); #else - TI_NOT_IMPLEMENTED + TI_NOT_IMPLEMENTED #endif -} else { - alloc = cpu_device()->import_memory(root_buffer, rounded_size); -} + } else { + alloc = cpu_device()->import_memory(root_buffer, rounded_size); + } -snode_tree_allocs_[tree->id()] = alloc; + snode_tree_allocs_[tree_id] = alloc; -bool all_dense = config->demote_dense_struct_fors; -for (size_t i = 0; i < snode_metas.size(); i++) { - if (snode_metas[i]->type != SNodeType::dense && - snode_metas[i]->type != SNodeType::place && - snode_metas[i]->type != SNodeType::root) { - all_dense = false; - break; + bool all_dense = config->demote_dense_struct_fors; + for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i].type != SNodeType::dense && + snode_metas[i].type != SNodeType::place && + snode_metas[i].type != 
SNodeType::root) { + all_dense = false; + break; + } } -} -runtime_jit->call( - "runtime_initialize_snodes", - llvm_runtime_, - root_size, - root_id, - (int)snode_metas.size(), - root_id, - rounded_size, - root_buffer, - all_dense); - -for (size_t i = 0; i < snode_metas.size(); i++) { - if (is_gc_able(snode_metas[i]->type)) { - const auto snode_id = snode_metas[i].id; - std::size_t node_size; - auto element_size = snode_metas[i].cell_size_bytes; - if (snode_metas[i].type == SNodeType::pointer) { - // pointer. Allocators are for single elements - node_size = element_size; - } else { - // dynamic. Allocators are for the chunks - node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; + runtime_jit->call( + "runtime_initialize_snodes", llvm_runtime_, root_size, root_id, + (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense); + + for (size_t i = 0; i < snode_metas.size(); i++) { + if (is_gc_able(snode_metas[i].type)) { + const auto snode_id = snode_metas[i].id; + std::size_t node_size; + auto element_size = snode_metas[i].cell_size_bytes; + if (snode_metas[i].type == SNodeType::pointer) { + // pointer. Allocators are for single elements + node_size = element_size; + } else { + // dynamic. 
Allocators are for the chunks + node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; + } + TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, + node_size); + auto rt = llvm_runtime_; + runtime_jit->call( + "runtime_NodeAllocator_initialize", rt, snode_id, node_size); + TI_TRACE("Allocating ambient element for snode {} (node size {})", + snode_id, node_size); + runtime_jit->call("runtime_allocate_ambient", rt, snode_id, + node_size); } - TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, - node_size); - auto rt = llvm_runtime_; - runtime_jit->call( - "runtime_NodeAllocator_initialize", rt, snode_id, node_size); - TI_TRACE("Allocating ambient element for snode {} (node size {})", snode_id, - node_size); - runtime_jit->call("runtime_allocate_ambient", rt, snode_id, - node_size); } } -} std::unique_ptr LlvmProgramImpl::compile_snode_tree_types_impl( SNodeTree *tree) { @@ -288,10 +279,9 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { static LlvmOfflineCache::FieldCacheData construct_filed_cache_data( const SNodeTree &tree, const StructCompiler &struct_compiler) { - TI_ASSERT(tree.id == tree.root()->id); - LlvmOfflineCache::FieldCacheData ret; - ret.tree_id = tree.id; + ret.tree_id = tree.id(); + ret.root_id = tree.root()->id; ret.root_size = struct_compiler.root_size; const auto &snodes = struct_compiler.snodes; From cf72ff8c7bd7eab6203b51bdb0107d7a69453d73 Mon Sep 17 00:00:00 2001 From: jim19930609 Date: Wed, 8 Jun 2022 16:06:02 +0800 Subject: [PATCH 3/3] [aot] [llvm] LLVM AOT Field #1: Adjust serialization/deserialization logics for FieldCacheData --- taichi/llvm/llvm_offline_cache.cpp | 14 +++++++++++++ taichi/llvm/llvm_offline_cache.h | 32 ++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index 542f02aebc2c7..92994fe9aa2f0 100644 --- a/taichi/llvm/llvm_offline_cache.cpp 
+++ b/taichi/llvm/llvm_offline_cache.cpp @@ -51,6 +51,20 @@ LlvmOfflineCacheFileReader::LlvmOfflineCacheFileReader( : path_(path), data_(std::move(data)), format_(format) { } +bool LlvmOfflineCacheFileReader::get_field_cache( + LlvmOfflineCache::FieldCacheData &res, + int snode_tree_id) { + auto itr = data_.fields.find(snode_tree_id); + if (itr == data_.fields.end()) { + TI_DEBUG("Cannot find field with snode_tree_id={}", snode_tree_id); + return false; + } + + const auto &loaded_field_cache = itr->second; + res = loaded_field_cache; // copy assign + return true; +} + bool LlvmOfflineCacheFileReader::get_kernel_cache( LlvmOfflineCache::KernelCacheData &res, const std::string &key, diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index 91386c2bffb31..1193365fe30e3 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -59,10 +59,31 @@ struct LlvmOfflineCache { TI_IO_DEF(tree_id, root_size, snode_metas); - // TODO(zhanlue) - // Serialize/Deserialize the llvm::Module from StructCompiler - // At runtime, make sure loaded Field-Modules and Kernel-Modules are linked - // altogether. + // TODO(zhanlue): refactor llvm::Modules + // + // struct_module will eventually get cloned into each kernel_module, + // so there's no need to serialize it here. + // + // We have three different types of llvm::Module + // 1. runtime_module: contains runtime functions. + // 2. struct_module: contains compiled SNodeTree in llvm::Type. + // 3. kernel_modules: contains compiled kernel codes. + // + // The way those modules work rely on a recursive clone mechanism: + // runtime_module = load("runtime.bc") + // struct_module = clone(runtime_module) + compiled-SNodeTree + // kernel_module = clone(struct_module) + compiled-Kernel + // + // As a result, every kernel_module contains a copy of struct_module + + // runtime_module. 
+ // + // This recursive clone mechanism is super fragile, + // which potentially causes inconsistency between modules if not handled + // properly. + // + // Let's turn to use llvm::link to bind the modules, + // and make runtime_module, struct_module, kernel_module independent of each + // other }; // TODO(zhanlue): we need a better identifier for each FieldCacheData @@ -83,6 +104,9 @@ class LlvmOfflineCacheFileReader { const std::string &key, llvm::LLVMContext &llvm_ctx); + bool get_field_cache(LlvmOfflineCache::FieldCacheData &res, + int snode_tree_id); + static std::unique_ptr make( const std::string &path, LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL);