From a9f2905b16c7581e48b0ba46dc5f7918665f7baf Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Thu, 15 Sep 2022 09:54:37 +0800 Subject: [PATCH 1/7] [aot] Guard C-API interfaces with try-catch (#6060) --- c_api/include/taichi/taichi_core.h | 1 + c_api/src/taichi_core_impl.cpp | 113 ++++++++++++++++++++++++----- c_api/src/taichi_core_impl.h | 14 ++++ c_api/src/taichi_opengl_impl.cpp | 2 + c_api/src/taichi_vulkan_impl.cpp | 36 +++++++-- c_api/taichi.json | 3 +- 6 files changed, 146 insertions(+), 23 deletions(-) diff --git a/c_api/include/taichi/taichi_core.h b/c_api/include/taichi/taichi_core.h index 4b23cceb33a19..69a2fefae793b 100644 --- a/c_api/include/taichi/taichi_core.h +++ b/c_api/include/taichi/taichi_core.h @@ -56,6 +56,7 @@ typedef enum TiError { TI_ERROR_ARGUMENT_OUT_OF_RANGE = -6, TI_ERROR_ARGUMENT_NOT_FOUND = -7, TI_ERROR_INVALID_INTEROP = -8, + TI_ERROR_INVALID_STATE = -9, TI_ERROR_MAX_ENUM = 0xffffffff, } TiError; diff --git a/c_api/src/taichi_core_impl.cpp b/c_api/src/taichi_core_impl.cpp index 4002af00376f1..303a8eb26450b 100644 --- a/c_api/src/taichi_core_impl.cpp +++ b/c_api/src/taichi_core_impl.cpp @@ -35,6 +35,8 @@ const char *describe_error(TiError error) { return "argument not found"; case TI_ERROR_INVALID_INTEROP: return "invalid interop"; + case TI_ERROR_INVALID_STATE: + return "invalid state"; default: return "unknown error"; } @@ -104,6 +106,8 @@ Runtime &Event::runtime() { // ----------------------------------------------------------------------------- TiError ti_get_last_error(uint64_t message_size, char *message) { + TiError out = TI_ERROR_INVALID_STATE; + TI_CAPI_TRY_CATCH_BEGIN(); // Emit message only if the output buffer is property provided. if (message_size > 0 && message != nullptr) { size_t n = thread_error_cache.message.size(); @@ -113,11 +117,14 @@ TiError ti_get_last_error(uint64_t message_size, char *message) { std::memcpy(message, thread_error_cache.message.data(), n); message[n] = '\0'; } - return thread_error_cache.error; + out = thread_error_cache.error; + TI_CAPI_TRY_CATCH_END(); + return out; } // C-API errors MUST be set via this interface. No matter from internal or // external procedures. 
void ti_set_last_error(TiError error, const char *message) { + TI_CAPI_TRY_CATCH_BEGIN(); if (error < TI_ERROR_SUCCESS) { TI_WARN("C-API error: ({}) {}", describe_error(error), message); if (message != nullptr) { @@ -130,32 +137,40 @@ void ti_set_last_error(TiError error, const char *message) { thread_error_cache.error = TI_ERROR_SUCCESS; thread_error_cache.message.clear(); } + TI_CAPI_TRY_CATCH_END(); } TiRuntime ti_create_runtime(TiArch arch) { + TiRuntime out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); switch (arch) { #ifdef TI_WITH_VULKAN case TI_ARCH_VULKAN: { - return (TiRuntime)(static_cast(new VulkanRuntimeOwned)); + out = (TiRuntime)(static_cast(new VulkanRuntimeOwned)); + break; } #endif // TI_WITH_VULKAN #ifdef TI_WITH_OPENGL case TI_ARCH_OPENGL: { - return (TiRuntime)(static_cast(new OpenglRuntime)); + out = (TiRuntime)(static_cast(new OpenglRuntime)); + break; } #endif // TI_WITH_OPENGL #ifdef TI_WITH_LLVM case TI_ARCH_X64: { - return (TiRuntime)(static_cast( + out = (TiRuntime)(static_cast( new capi::LlvmRuntime(taichi::Arch::x64))); + break; } case TI_ARCH_ARM64: { - return (TiRuntime)(static_cast( + out = (TiRuntime)(static_cast( new capi::LlvmRuntime(taichi::Arch::arm64))); + break; } case TI_ARCH_CUDA: { - return (TiRuntime)(static_cast( + out = (TiRuntime)(static_cast( new capi::LlvmRuntime(taichi::Arch::cuda))); + break; } #endif // TI_WITH_LLVM default: { @@ -163,15 +178,20 @@ TiRuntime ti_create_runtime(TiArch arch) { return TI_NULL_HANDLE; } } - return TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_destroy_runtime(TiRuntime runtime) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); delete (Runtime *)runtime; + TI_CAPI_TRY_CATCH_END(); } TiMemory ti_allocate_memory(TiRuntime runtime, const TiMemoryAllocateInfo *create_info) { + TiMemory out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(create_info); @@ -196,35 +216,46 @@ TiMemory ti_allocate_memory(TiRuntime runtime, params.export_sharing = create_info->export_sharing; params.usage = usage; - TiMemory devmem = ((Runtime *)runtime)->allocate_memory(params); - return devmem; + out = ((Runtime *)runtime)->allocate_memory(params); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_free_memory(TiRuntime runtime, TiMemory devmem) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(devmem); Runtime *runtime2 = (Runtime *)runtime; runtime2->free_memory(devmem); + TI_CAPI_TRY_CATCH_END(); } void *ti_map_memory(TiRuntime runtime, TiMemory devmem) { + void *out = nullptr; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(devmem); Runtime *runtime2 = (Runtime *)runtime; - return runtime2->get().map(devmem2devalloc(*runtime2, devmem)); + out = runtime2->get().map(devmem2devalloc(*runtime2, devmem)); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_unmap_memory(TiRuntime runtime, TiMemory devmem) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(devmem); Runtime *runtime2 = (Runtime *)runtime; runtime2->get().unmap(devmem2devalloc(*runtime2, devmem)); + TI_CAPI_TRY_CATCH_END(); } TiImage ti_allocate_image(TiRuntime runtime, const TiImageAllocateInfo *allocate_info) { + TiImage out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(allocate_info); @@ -275,43 +306,58 @@ TiImage ti_allocate_image(TiRuntime runtime, params.export_sharing = false; params.usage = usage; - TiImage 
devimg = ((Runtime *)runtime)->allocate_image(params); - return devimg; + out = ((Runtime *)runtime)->allocate_image(params); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_free_image(TiRuntime runtime, TiImage image) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(image); ((Runtime *)runtime)->free_image(image); + TI_CAPI_TRY_CATCH_END(); } TiSampler ti_create_sampler(TiRuntime runtime, const TiSamplerCreateInfo *create_info) { + TiSampler out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_NOT_SUPPORTED(ti_create_sampler); - return TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_destroy_sampler(TiRuntime runtime, TiSampler sampler) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_NOT_SUPPORTED(ti_destroy_sampler); + TI_CAPI_TRY_CATCH_END(); } TiEvent ti_create_event(TiRuntime runtime) { + TiEvent out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); Runtime *runtime2 = (Runtime *)runtime; std::unique_ptr event = runtime2->get().create_event(); Event *event2 = new Event(*runtime2, std::move(event)); - return (TiEvent)event2; + out = (TiEvent)event2; + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_destroy_event(TiEvent event) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(event); delete (Event *)event; + TI_CAPI_TRY_CATCH_END(); } void ti_copy_memory_device_to_device(TiRuntime runtime, const TiMemorySlice *dst_memory, const TiMemorySlice *src_memory) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(dst_memory); TI_CAPI_ARGUMENT_NULL(dst_memory->memory); @@ -325,11 +371,13 @@ void ti_copy_memory_device_to_device(TiRuntime runtime, auto src = devmem2devalloc(*runtime2, src_memory->memory) .get_ptr(src_memory->offset); runtime2->buffer_copy(dst, src, dst_memory->size); + TI_CAPI_TRY_CATCH_END(); } void ti_copy_texture_device_to_device(TiRuntime runtime, const TiImageSlice *dst_texture, const TiImageSlice *src_texture) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(dst_texture); TI_CAPI_ARGUMENT_NULL(dst_texture->image); @@ -353,10 +401,12 @@ void ti_copy_texture_device_to_device(TiRuntime runtime, params.height = dst_texture->extent.height; params.depth = dst_texture->extent.depth; runtime2->copy_image(dst, src, params); + TI_CAPI_TRY_CATCH_END(); } void ti_transition_texture(TiRuntime runtime, TiImage texture, TiImageLayout layout) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(texture); @@ -376,9 +426,12 @@ void ti_transition_texture(TiRuntime runtime, } runtime2->transition_image(image, layout2); + TI_CAPI_TRY_CATCH_END(); } TiAotModule ti_load_aot_module(TiRuntime runtime, const char *module_path) { + TiAotModule out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(module_path); @@ -388,15 +441,21 @@ TiAotModule ti_load_aot_module(TiRuntime runtime, const char *module_path) { ti_set_last_error(TI_ERROR_CORRUPTED_DATA, module_path); return TI_NULL_HANDLE; } - return aot_module; + out = aot_module; + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_destroy_aot_module(TiAotModule aot_module) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(aot_module); delete (AotModule *)aot_module; + TI_CAPI_TRY_CATCH_END(); } TiKernel ti_get_aot_module_kernel(TiAotModule aot_module, const char *name) { + TiKernel out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(aot_module); TI_CAPI_ARGUMENT_NULL_RV(name); @@ 
-408,11 +467,15 @@ TiKernel ti_get_aot_module_kernel(TiAotModule aot_module, const char *name) { return TI_NULL_HANDLE; } - return (TiKernel)kernel; + out = (TiKernel)kernel; + TI_CAPI_TRY_CATCH_END(); + return out; } TiComputeGraph ti_get_aot_module_compute_graph(TiAotModule aot_module, const char *name) { + TiComputeGraph out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(aot_module); TI_CAPI_ARGUMENT_NULL_RV(name); @@ -424,13 +487,16 @@ TiComputeGraph ti_get_aot_module_compute_graph(TiAotModule aot_module, return TI_NULL_HANDLE; } - return (TiComputeGraph)cgraph; + out = (TiComputeGraph)cgraph; + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_launch_kernel(TiRuntime runtime, TiKernel kernel, uint32_t arg_count, const TiArgument *args) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(kernel); if (arg_count > 0) { @@ -482,12 +548,14 @@ void ti_launch_kernel(TiRuntime runtime, } } ((taichi::lang::aot::Kernel *)kernel)->launch(&runtime_context); + TI_CAPI_TRY_CATCH_END(); } void ti_launch_compute_graph(TiRuntime runtime, TiComputeGraph compute_graph, uint32_t arg_count, const TiNamedArgument *args) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(compute_graph); if (arg_count > 0) { @@ -615,36 +683,47 @@ void ti_launch_compute_graph(TiRuntime runtime, } } ((taichi::lang::aot::CompiledGraph *)compute_graph)->run(arg_map); + TI_CAPI_TRY_CATCH_END(); } void ti_signal_event(TiRuntime runtime, TiEvent event) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(event); ((Runtime *)runtime)->signal_event(&((Event *)event)->get()); + TI_CAPI_TRY_CATCH_END(); } void ti_reset_event(TiRuntime runtime, TiEvent event) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(event); ((Runtime *)runtime)->reset_event(&((Event *)event)->get()); + TI_CAPI_TRY_CATCH_END(); } void ti_wait_event(TiRuntime runtime, TiEvent event) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(event); ((Runtime *)runtime)->wait_event(&((Event *)event)->get()); + TI_CAPI_TRY_CATCH_END(); } void ti_submit(TiRuntime runtime) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); ((Runtime *)runtime)->submit(); + TI_CAPI_TRY_CATCH_END(); } void ti_wait(TiRuntime runtime) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); ((Runtime *)runtime)->wait(); + TI_CAPI_TRY_CATCH_END(); } diff --git a/c_api/src/taichi_core_impl.h b/c_api/src/taichi_core_impl.h index bc96cc95ba61f..13967b9bfb21d 100644 --- a/c_api/src/taichi_core_impl.h +++ b/c_api/src/taichi_core_impl.h @@ -1,4 +1,5 @@ #pragma once +#include #include "taichi/taichi_core.h" #include "taichi/aot/module_loader.h" #include "taichi/rhi/device.h" @@ -54,6 +55,19 @@ return TI_NULL_HANDLE; \ } +#define TI_CAPI_TRY_CATCH_BEGIN() try { +#define TI_CAPI_TRY_CATCH_END() \ + } \ + catch (const std::exception &e) { \ + ti_set_last_error(TI_ERROR_INVALID_STATE, e.what()); \ + } \ + catch (const std::string &e) { \ + ti_set_last_error(TI_ERROR_INVALID_STATE, e.c_str()); \ + } \ + catch (...) 
{ \ + ti_set_last_error(TI_ERROR_INVALID_STATE, "c++ exception"); \ + } + class Runtime; class Context; class AotModule; diff --git a/c_api/src/taichi_opengl_impl.cpp b/c_api/src/taichi_opengl_impl.cpp index 62aee96e30180..a3087c027e97a 100644 --- a/c_api/src/taichi_opengl_impl.cpp +++ b/c_api/src/taichi_opengl_impl.cpp @@ -18,6 +18,7 @@ taichi::lang::gfx::GfxRuntime &OpenglRuntime::get_gfx_runtime() { void ti_export_opengl_memory(TiRuntime runtime, TiMemory memory, TiOpenglMemoryInteropInfo *interop_info) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(memory); TI_CAPI_ARGUMENT_NULL(interop_info); @@ -27,4 +28,5 @@ void ti_export_opengl_memory(TiRuntime runtime, taichi::lang::DeviceAllocation devalloc = devmem2devalloc(*runtime2, memory); interop_info->buffer = devalloc.alloc_id; interop_info->size = runtime2->get_gl().get_devalloc_size(devalloc); + TI_CAPI_TRY_CATCH_END(); } diff --git a/c_api/src/taichi_vulkan_impl.cpp b/c_api/src/taichi_vulkan_impl.cpp index 4081bcf5d1ad3..bf458b9601e6f 100644 --- a/c_api/src/taichi_vulkan_impl.cpp +++ b/c_api/src/taichi_vulkan_impl.cpp @@ -121,6 +121,8 @@ TiRuntime ti_create_vulkan_runtime_ext(uint32_t api_version, uint32_t instance_extension_count, const char **device_extensions, uint32_t device_extension_count) { + TiRuntime out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); if (api_version < VK_API_VERSION_1_0) { ti_set_last_error(TI_ERROR_ARGUMENT_OUT_OF_RANGE, "api_version<1.0"); return TI_NULL_HANDLE; @@ -146,10 +148,14 @@ TiRuntime ti_create_vulkan_runtime_ext(uint32_t api_version, params.additional_device_extensions.push_back(device_extensions[i]); } params.surface_creator = nullptr; - return (TiRuntime) static_cast(new VulkanRuntimeOwned(params)); + out = (TiRuntime) static_cast(new VulkanRuntimeOwned(params)); + TI_CAPI_TRY_CATCH_END(); + return out; } TiRuntime ti_import_vulkan_runtime( const TiVulkanRuntimeInteropInfo *interop_info) { + TiRuntime out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(interop_info); TI_CAPI_ARGUMENT_NULL_RV(interop_info->instance); TI_CAPI_ARGUMENT_NULL_RV(interop_info->physical_device); @@ -164,11 +170,14 @@ TiRuntime ti_import_vulkan_runtime( params.graphics_queue = interop_info->graphics_queue; params.graphics_queue_family_index = interop_info->graphics_queue_family_index; - return (TiRuntime) static_cast( + out = (TiRuntime) static_cast( new VulkanRuntimeImported(interop_info->api_version, params)); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_export_vulkan_runtime(TiRuntime runtime, TiVulkanRuntimeInteropInfo *interop_info) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(interop_info); @@ -186,11 +195,14 @@ void ti_export_vulkan_runtime(TiRuntime runtime, interop_info->graphics_queue = vk_device.graphics_queue(); interop_info->graphics_queue_family_index = vk_device.graphics_queue_family_index(); + TI_CAPI_TRY_CATCH_END(); } TiMemory ti_import_vulkan_memory( TiRuntime runtime, const TiVulkanMemoryInteropInfo *interop_info) { + TiMemory out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(interop_info); TI_CAPI_ARGUMENT_NULL_RV(interop_info->buffer); @@ -204,7 +216,9 @@ TiMemory ti_import_vulkan_memory( vkapi::create_buffer(vk_runtime.vk_device(), interop_info->buffer, interop_info->size, interop_info->usage); taichi::lang::DeviceAllocation devalloc = vk_runtime.import_vkbuffer(buffer); - return devalloc2devmem(*runtime2, devalloc); + out = 
devalloc2devmem(*runtime2, devalloc); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_export_vulkan_memory(TiRuntime runtime, TiMemory memory, @@ -225,6 +239,8 @@ TiImage ti_import_vulkan_image(TiRuntime runtime, const TiVulkanImageInteropInfo *interop_info, VkImageViewType view_type, VkImageLayout layout) { + TiImage out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(interop_info); TI_CAPI_ARGUMENT_NULL_RV(interop_info->image); @@ -268,12 +284,15 @@ TiImage ti_import_vulkan_image(TiRuntime runtime, taichi::lang::DeviceAllocation image2 = vk_runtime.import_vk_image(image, image_view, layout); - return devalloc2devimg(*runtime2, image2); + out = devalloc2devimg(*runtime2, image2); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_export_vulkan_image(TiRuntime runtime, TiImage image, TiVulkanImageInteropInfo *interop_info) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(image); TI_CAPI_ARGUMENT_NULL(interop_info); @@ -294,10 +313,13 @@ void ti_export_vulkan_image(TiRuntime runtime, interop_info->sample_count = VK_SAMPLE_COUNT_1_BIT; interop_info->tiling = VK_IMAGE_TILING_OPTIMAL; interop_info->usage = image2->usage; + TI_CAPI_TRY_CATCH_END(); } TiEvent ti_import_vulkan_event(TiRuntime runtime, const TiVulkanEventInteropInfo *interop_info) { + TiEvent out = TI_NULL_HANDLE; + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL_RV(runtime); TI_CAPI_ARGUMENT_NULL_RV(interop_info); TI_CAPI_ARGUMENT_NULL_RV(interop_info->event); @@ -313,11 +335,14 @@ TiEvent ti_import_vulkan_event(TiRuntime runtime, std::unique_ptr event2( new taichi::lang::vulkan::VulkanDeviceEvent(std::move(event))); - return (TiEvent) new Event(*runtime2, std::move(event2)); + out = (TiEvent) new Event(*runtime2, std::move(event2)); + TI_CAPI_TRY_CATCH_END(); + return out; } void ti_export_vulkan_event(TiRuntime runtime, TiEvent event, TiVulkanEventInteropInfo *interop_info) { + TI_CAPI_TRY_CATCH_BEGIN(); TI_CAPI_ARGUMENT_NULL(runtime); TI_CAPI_ARGUMENT_NULL(event); TI_CAPI_ARGUMENT_NULL(interop_info); @@ -326,4 +351,5 @@ void ti_export_vulkan_event(TiRuntime runtime, auto event2 = (taichi::lang::vulkan::VulkanDeviceEvent *)(&((Event *)event)->get()); interop_info->event = event2->vkapi_ref->event; + TI_CAPI_TRY_CATCH_END(); } diff --git a/c_api/taichi.json b/c_api/taichi.json index d352b122cdf9b..dd5fa4d73aa07 100644 --- a/c_api/taichi.json +++ b/c_api/taichi.json @@ -93,7 +93,8 @@ "argument_null": -5, "argument_out_of_range": -6, "argument_not_found": -7, - "invalid_interop": -8 + "invalid_interop": -8, + "invalid_state": -9 } }, { From e7bdbff27d832ad65d0dd0c1d35efe0f61c3c3ca Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Wed, 14 Sep 2022 18:59:13 -0700 Subject: [PATCH 2/7] [dx12] Drop code for llvm passes which prepare for DXIL generation. (#5998) 2 passes are added for DXIL generation. TaichiIntrinsicLower will translate taichi intrinsic like thread_idx into the form DirectX backend expected. TaichiRuntimeContextLower will translate the TaichiRuntimeContext parameter for kernel into Buffers/ConstantBuffers. TaichiRuntimeContextLower is empty now. It is added after inline so optimizations reduce the load/store on temp ptr. And it is easier to know a store is on the TaichiRuntimeContext. 
Related issue = #5276 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/dx12/CMakeLists.txt | 2 + .../dx12/dx12_global_optimize_module.cpp | 19 +++ taichi/codegen/dx12/dx12_llvm_passes.h | 24 ++++ taichi/codegen/dx12/dx12_lower_intrinsic.cpp | 121 ++++++++++++++++++ .../dx12/dx12_lower_runtime_context.cpp | 49 +++++++ 5 files changed, 215 insertions(+) create mode 100644 taichi/codegen/dx12/dx12_lower_intrinsic.cpp create mode 100644 taichi/codegen/dx12/dx12_lower_runtime_context.cpp diff --git a/taichi/codegen/dx12/CMakeLists.txt b/taichi/codegen/dx12/CMakeLists.txt index 24dca8d27238b..a22c3e52d91e3 100644 --- a/taichi/codegen/dx12/CMakeLists.txt +++ b/taichi/codegen/dx12/CMakeLists.txt @@ -5,6 +5,8 @@ target_sources(dx12_codegen PRIVATE codegen_dx12.cpp dx12_global_optimize_module.cpp + dx12_lower_intrinsic.cpp + dx12_lower_runtime_context.cpp ) target_include_directories(dx12_codegen diff --git a/taichi/codegen/dx12/dx12_global_optimize_module.cpp b/taichi/codegen/dx12/dx12_global_optimize_module.cpp index 94a9d7c003328..316a295d5aea6 100644 --- a/taichi/codegen/dx12/dx12_global_optimize_module.cpp +++ b/taichi/codegen/dx12/dx12_global_optimize_module.cpp @@ -8,6 +8,7 @@ #include "taichi/util/file_sequence_writer.h" #include "taichi/runtime/llvm/llvm_context.h" +#include "dx12_llvm_passes.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/IR/Function.h" @@ -38,6 +39,8 @@ namespace taichi { namespace lang { namespace directx12 { +const char *NumWorkGroupsCBName = "num_work_groups.cbuf"; + const llvm::StringRef ShaderAttrKindStr = "hlsl.shader"; void mark_function_as_cs_entry(::llvm::Function *F) { @@ -53,6 +56,16 @@ void set_num_threads(llvm::Function *F, unsigned x, unsigned y, unsigned z) { F->addFnAttr(NumThreadsAttrKindStr, Str); } +GlobalVariable *createGlobalVariableForResource(Module &M, + const char *Name, + llvm::Type *Ty) { + auto *GV = new GlobalVariable(M, Ty, /*isConstant*/ false, + GlobalValue::LinkageTypes::ExternalLinkage, + /*Initializer*/ nullptr, Name); + GV->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::None); + return GV; +} + std::vector global_optimize_module(llvm::Module *module, CompileConfig &config) { TI_AUTO_PROF @@ -104,6 +117,9 @@ std::vector global_optimize_module(llvm::Module *module, module->setDataLayout(target_machine->createDataLayout()); + // Lower taichi intrinsic first. + module_pass_manager.add(createTaichiIntrinsicLowerPass(&config)); + module_pass_manager.add(createTargetTransformInfoWrapperPass( target_machine->getTargetIRAnalysis())); function_pass_manager.add(createTargetTransformInfoWrapperPass( @@ -119,6 +135,9 @@ std::vector global_optimize_module(llvm::Module *module, b.populateFunctionPassManager(function_pass_manager); b.populateModulePassManager(module_pass_manager); + // Add passes after inline. + module_pass_manager.add(createTaichiRuntimeContextLowerPass()); + llvm::SmallString<256> str; llvm::raw_svector_ostream OS(str); // Write DXIL container to OS. 
diff --git a/taichi/codegen/dx12/dx12_llvm_passes.h b/taichi/codegen/dx12/dx12_llvm_passes.h index c07896abba1a3..2821cead303d3 100644 --- a/taichi/codegen/dx12/dx12_llvm_passes.h +++ b/taichi/codegen/dx12/dx12_llvm_passes.h @@ -7,6 +7,8 @@ namespace llvm { class Function; class Module; +class Type; +class GlobalVariable; } // namespace llvm namespace taichi { @@ -18,6 +20,9 @@ namespace directx12 { void mark_function_as_cs_entry(llvm::Function *); bool is_cs_entry(llvm::Function *); void set_num_threads(llvm::Function *, unsigned x, unsigned y, unsigned z); +llvm::GlobalVariable *createGlobalVariableForResource(llvm::Module &M, + const char *Name, + llvm::Type *Ty); std::vector global_optimize_module(llvm::Module *module, CompileConfig &config); @@ -27,3 +32,22 @@ extern const char *NumWorkGroupsCBName; } // namespace directx12 } // namespace lang } // namespace taichi + +namespace llvm { +class ModulePass; +class PassRegistry; +class Function; + +/// Initializer for DXIL-prepare +void initializeTaichiRuntimeContextLowerPass(PassRegistry &); + +/// Pass to convert modules into DXIL-compatable modules +ModulePass *createTaichiRuntimeContextLowerPass(); + +/// Initializer for taichi intrinsic lower. +void initializeTaichiIntrinsicLowerPass(PassRegistry &); + +/// Pass to lower taichi intrinsic into DXIL intrinsic. +ModulePass *createTaichiIntrinsicLowerPass(taichi::lang::CompileConfig *config); + +} // namespace llvm diff --git a/taichi/codegen/dx12/dx12_lower_intrinsic.cpp b/taichi/codegen/dx12/dx12_lower_intrinsic.cpp new file mode 100644 index 0000000000000..2a694ca04af49 --- /dev/null +++ b/taichi/codegen/dx12/dx12_lower_intrinsic.cpp @@ -0,0 +1,121 @@ + +#include "dx12_llvm_passes.h" +#include "llvm/Pass.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Instructions.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/IR/IntrinsicsDirectX.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#include "taichi/program/compile_config.h" +#include "taichi/runtime/llvm/llvm_context.h" + +using namespace llvm; +using namespace taichi::lang::directx12; + +#define DEBUG_TYPE "dxil-taichi-intrinsic-lower" + +namespace { + +class TaichiIntrinsicLower : public ModulePass { + public: + bool runOnModule(Module &M) override { + auto &Ctx = M.getContext(); + // patch intrinsic + auto patch_intrinsic = [&](std::string name, Intrinsic::ID intrin, + bool ret = true, + std::vector types = {}, + std::vector extra_args = {}) { + auto func = M.getFunction(name); + if (!func) { + return; + } + func->deleteBody(); + auto bb = llvm::BasicBlock::Create(Ctx, "entry", func); + IRBuilder<> builder(Ctx); + builder.SetInsertPoint(bb); + std::vector args; + for (auto &arg : func->args()) + args.push_back(&arg); + args.insert(args.end(), extra_args.begin(), extra_args.end()); + if (ret) { + builder.CreateRet(builder.CreateIntrinsic(intrin, types, args)); + } else { + builder.CreateIntrinsic(intrin, types, args); + builder.CreateRetVoid(); + } + func->setLinkage(GlobalValue::LinkageTypes::InternalLinkage); + taichi::lang::TaichiLLVMContext::mark_inline(func); + }; + + llvm::IRBuilder<> B(Ctx); + Value *i32Zero = B.getInt32(0); + + auto patch_intrinsic_to_const = [&](std::string name, Constant *C, + Type *Ty) { + auto func = M.getFunction(name); + if (!func) { + return; + } + func->deleteBody(); + auto bb = llvm::BasicBlock::Create(Ctx, "entry", func); + IRBuilder<> B(Ctx); + B.SetInsertPoint(bb); + Value *V = C; + if (V->getType()->isPointerTy()) + V = 
B.CreateLoad(Ty, C); + B.CreateRet(V); + func->setLinkage(GlobalValue::LinkageTypes::InternalLinkage); + taichi::lang::TaichiLLVMContext::mark_inline(func); + }; + // group thread id. + patch_intrinsic("thread_idx", Intrinsic::dx_thread_id_in_group, true, {}, + {i32Zero}); + // group idx. + patch_intrinsic("block_idx", Intrinsic::dx_group_id, true, {}, {i32Zero}); + // Group Size + unsigned group_size = 64; + if (config) + group_size = config->default_gpu_block_dim; + + auto *I32Ty = B.getInt32Ty(); + Constant *block_dim = B.getInt32(group_size); + patch_intrinsic_to_const("block_dim", block_dim, I32Ty); + // Num work groups will be in a special CBuffer. + // TaichiRuntimeContextLower pass will place the CBuffer to special binding + // space. + Type *TyNumWorkGroups = FixedVectorType::get(I32Ty, 3); + Constant *CBNumWorkGroups = createGlobalVariableForResource( + M, NumWorkGroupsCBName, TyNumWorkGroups); + + Constant *NumWorkGroupX = cast( + B.CreateConstGEP2_32(TyNumWorkGroups, CBNumWorkGroups, 0, 0)); + patch_intrinsic_to_const("grid_dim", NumWorkGroupX, I32Ty); + return true; + } + + TaichiIntrinsicLower(taichi::lang::CompileConfig *config = nullptr) + : ModulePass(ID), config(config) { + initializeTaichiIntrinsicLowerPass(*PassRegistry::getPassRegistry()); + } + + static char ID; // Pass identification. + private: + taichi::lang::CompileConfig *config; +}; +char TaichiIntrinsicLower::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(TaichiIntrinsicLower, + DEBUG_TYPE, + "Lower taichi intrinsic", + false, + false) + +llvm::ModulePass *llvm::createTaichiIntrinsicLowerPass( + taichi::lang::CompileConfig *config) { + return new TaichiIntrinsicLower(config); +} diff --git a/taichi/codegen/dx12/dx12_lower_runtime_context.cpp b/taichi/codegen/dx12/dx12_lower_runtime_context.cpp new file mode 100644 index 0000000000000..26884557e2659 --- /dev/null +++ b/taichi/codegen/dx12/dx12_lower_runtime_context.cpp @@ -0,0 +1,49 @@ + + +#include "dx12_llvm_passes.h" + +#include "llvm/Pass.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/IR/Instructions.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" + +#include "taichi/program/compile_config.h" +#include "taichi/runtime/llvm/llvm_context.h" + +using namespace llvm; +using namespace taichi::lang::directx12; + +#define DEBUG_TYPE "dxil-taichi-runtime-context-lower" + +namespace { + +class TaichiRuntimeContextLower : public ModulePass { + public: + bool runOnModule(Module &M) override { + // TODO: lower taichi RuntimeContext into DXIL resources. + return true; + } + + TaichiRuntimeContextLower() : ModulePass(ID) { + initializeTaichiRuntimeContextLowerPass(*PassRegistry::getPassRegistry()); + } + + static char ID; // Pass identification. + private: +}; +char TaichiRuntimeContextLower::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS(TaichiRuntimeContextLower, + DEBUG_TYPE, + "Lower taichi RuntimeContext", + false, + false) + +llvm::ModulePass *llvm::createTaichiRuntimeContextLowerPass() { + return new TaichiRuntimeContextLower(); +} From 9d27b94ec04943528c4028da4f50fa6dfc25d000 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Thu, 15 Sep 2022 10:36:52 +0800 Subject: [PATCH 3/7] [ir] MatrixField refactor 3/n: Add MatrixFieldExpression (#6010) Related issue = #5959 This PR adds definition of `MatrixFieldExpression`, which serves as the representation of a matrix field in C++. Currently only `dynamic_index_stride` has been completely moved there. 
Others will get migrated in future PRs. Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/matrix.py | 14 ++++++++++---- python/taichi/lang/mesh.py | 3 ++- taichi/analysis/gen_offline_cache_key.cpp | 7 +++++++ taichi/inc/expressions.inc.h | 1 + taichi/ir/expr.cpp | 6 ++++++ taichi/ir/expr.h | 4 ++++ taichi/ir/expression.h | 5 +---- taichi/ir/expression_printer.h | 11 +++++++++++ taichi/ir/frontend_ir.h | 18 ++++++++++++++++++ taichi/python/export_lang.cpp | 16 ++++++++++++++++ tests/python/test_matrix.py | 18 +++++++++--------- 11 files changed, 85 insertions(+), 18 deletions(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 8dffe6f0634dc..3fcddd7dfe703 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1538,7 +1538,7 @@ def __init__(self, field, indices): for e in field._get_field_members() ], ndim=getattr(field, "ndim", 2)) - self._impl.dynamic_index_stride = field.dynamic_index_stride + self._impl.dynamic_index_stride = field._get_dynamic_index_stride() class MatrixField(Field): @@ -1557,7 +1557,8 @@ def __init__(self, _vars, n, m, ndim=2): self.n = n self.m = m self.ndim = ndim - self.dynamic_index_stride = None + self.ptr = ti_python_core.expr_matrix_field( + [var.ptr for var in self.vars], [n, m][:ndim]) def get_scalar_field(self, *indices): """Creates a ScalarField using a specific field member. @@ -1573,12 +1574,17 @@ def get_scalar_field(self, *indices): j = 0 if len(indices) == 1 else indices[1] return ScalarField(self.vars[i * self.m + j]) + def _get_dynamic_index_stride(self): + if self.ptr.get_dynamic_indexable(): + return self.ptr.get_dynamic_index_stride() + return None + def _calc_dynamic_index_stride(self): # Algorithm: https://github.com/taichi-dev/taichi/issues/3810 paths = [ScalarField(var).snode._path_from_root() for var in self.vars] num_members = len(paths) if num_members == 1: - self.dynamic_index_stride = 0 + self.ptr.set_dynamic_index_stride(0) return length = len(paths[0]) if any( @@ -1602,7 +1608,7 @@ def _calc_dynamic_index_stride(self): if stride != paths[i][depth_below_lca]._offset_bytes_in_parent_cell \ - paths[i - 1][depth_below_lca]._offset_bytes_in_parent_cell: return - self.dynamic_index_stride = stride + self.ptr.set_dynamic_index_stride(stride) def fill(self, val): """Fills this matrix field with specified values. 
diff --git a/python/taichi/lang/mesh.py b/python/taichi/lang/mesh.py index 2005d217ccff3..db0e23c629c4f 100644 --- a/python/taichi/lang/mesh.py +++ b/python/taichi/lang/mesh.py @@ -67,7 +67,8 @@ def __init__(self, field: MatrixField, mesh_ptr: _ti_core.MeshPtr, self.grad = field.grad self.n = field.n self.m = field.m - self.dynamic_index_stride = field.dynamic_index_stride + self.ndim = field.ndim + self.ptr = field.ptr self.mesh_ptr = mesh_ptr self.element_type = element_type diff --git a/taichi/analysis/gen_offline_cache_key.cpp b/taichi/analysis/gen_offline_cache_key.cpp index eccfd0049a261..5c1dfbef622f6 100644 --- a/taichi/analysis/gen_offline_cache_key.cpp +++ b/taichi/analysis/gen_offline_cache_key.cpp @@ -160,6 +160,13 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { emit(expr->adjoint_checkbit); } + void visit(MatrixFieldExpression *expr) override { + emit(ExprOpCode::MatrixFieldExpression); + emit(expr->fields); + emit(expr->element_shape); + emit(expr->dynamic_index_stride); + } + void visit(IndexExpression *expr) override { emit(ExprOpCode::IndexExpression); emit(expr->var); diff --git a/taichi/inc/expressions.inc.h b/taichi/inc/expressions.inc.h index 0ae81430f83ae..b311e6fb5a441 100644 --- a/taichi/inc/expressions.inc.h +++ b/taichi/inc/expressions.inc.h @@ -6,6 +6,7 @@ PER_EXPRESSION(TernaryOpExpression) PER_EXPRESSION(InternalFuncCallExpression) PER_EXPRESSION(ExternalTensorExpression) PER_EXPRESSION(FieldExpression) +PER_EXPRESSION(MatrixFieldExpression) PER_EXPRESSION(IndexExpression) PER_EXPRESSION(MatrixExpression) PER_EXPRESSION(StrideExpression) diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp index e36a1a50ad46a..3955bd752e1ad 100644 --- a/taichi/ir/expr.cpp +++ b/taichi/ir/expr.cpp @@ -119,4 +119,10 @@ Expr expr_field(Expr id_expr, DataType dt) { std::make_shared(dt, id_expr.cast()->id)); return ret; } + +Expr expr_matrix_field(const std::vector &fields, + const std::vector &element_shape) { + return Expr::make(fields, element_shape); +} + TLANG_NAMESPACE_END diff --git a/taichi/ir/expr.h b/taichi/ir/expr.h index 7bc4d8f3a8ce9..ba84faecd09a9 100644 --- a/taichi/ir/expr.h +++ b/taichi/ir/expr.h @@ -145,4 +145,8 @@ Expr assume_range(const Expr &expr, const Expr &base, int low, int high); Expr loop_unique(const Expr &input, const std::vector &covers); Expr expr_field(Expr id_expr, DataType dt); + +Expr expr_matrix_field(const std::vector &fields, + const std::vector &element_shape); + TLANG_NAMESPACE_END diff --git a/taichi/ir/expression.h b/taichi/ir/expression.h index e4d9ed08f6ad9..1f400e5b24374 100644 --- a/taichi/ir/expression.h +++ b/taichi/ir/expression.h @@ -39,10 +39,7 @@ class Expression { stmt = nullptr; } - virtual void type_check(CompileConfig *config) { - // TODO: make it pure virtual after type_check for all expressions are - // implemented - } + virtual void type_check(CompileConfig *config) = 0; virtual void accept(ExpressionVisitor *visitor) = 0; diff --git a/taichi/ir/expression_printer.h b/taichi/ir/expression_printer.h index 3f4520d443d81..c0f595efc0f95 100644 --- a/taichi/ir/expression_printer.h +++ b/taichi/ir/expression_printer.h @@ -110,6 +110,17 @@ class ExpressionHumanFriendlyPrinter : public ExpressionPrinter { } } + void visit(MatrixFieldExpression *expr) override { + emit('['); + emit_vector(expr->fields); + emit("] ("); + emit_vector(expr->element_shape); + if (expr->dynamic_index_stride) { + emit(", dynamic_index_stride = ", expr->dynamic_index_stride); + } + emit(')'); + } + void visit(MatrixExpression *expr) 
override { emit('['); emit_vector(expr->elements); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index fddad4040d22c..cadf3663b78fb 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -521,6 +521,24 @@ class FieldExpression : public Expression { TI_DEFINE_ACCEPT_FOR_EXPRESSION }; +class MatrixFieldExpression : public Expression { + public: + std::vector fields; + std::vector element_shape; + bool dynamic_indexable{false}; + int dynamic_index_stride{0}; + + MatrixFieldExpression(const std::vector &fields, + const std::vector &element_shape) + : fields(fields), element_shape(element_shape) { + } + + void type_check(CompileConfig *config) override { + } + + TI_DEFINE_ACCEPT_FOR_EXPRESSION +}; + /** * Creating a local matrix; * lowered from ti.Matrix with real_matrix=True diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index ee32dba9ca17c..c3d58b362e52e 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -754,6 +754,20 @@ void export_lang(py::module &m) { .def("set_adjoint", &Expr::set_adjoint) .def("set_adjoint_checkbit", &Expr::set_adjoint_checkbit) .def("set_dual", &Expr::set_dual) + .def("set_dynamic_index_stride", + [&](Expr *expr, int dynamic_index_stride) { + auto matrix_field = expr->cast(); + matrix_field->dynamic_indexable = true; + matrix_field->dynamic_index_stride = dynamic_index_stride; + }) + .def("get_dynamic_indexable", + [&](Expr *expr) -> bool { + return expr->cast()->dynamic_indexable; + }) + .def("get_dynamic_index_stride", + [&](Expr *expr) -> int { + return expr->cast()->dynamic_index_stride; + }) .def( "get_dt", [&](Expr *expr) -> const Type * { @@ -838,6 +852,8 @@ void export_lang(py::module &m) { m.def("expr_field", expr_field); + m.def("expr_matrix_field", expr_matrix_field); + #define DEFINE_EXPRESSION_OP(x) m.def("expr_" #x, expr_##x); DEFINE_EXPRESSION_OP(neg) diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index 6069261419b74..1ecd1b6591117 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -402,10 +402,10 @@ def test_matrix_field_dynamic_index_stride(): @ti.kernel def check_stride(): for i in range(128): - assert ti.get_addr(y, i) - ti.get_addr(x, - i) == v.dynamic_index_stride - assert ti.get_addr(z, i) - ti.get_addr(y, - i) == v.dynamic_index_stride + assert ti.get_addr(y, i) - ti.get_addr( + x, i) == v._get_dynamic_index_stride() + assert ti.get_addr(z, i) - ti.get_addr( + y, i) == v._get_dynamic_index_stride() check_stride() @@ -431,7 +431,7 @@ def test_matrix_field_dynamic_index_different_path_length(): ti.root.dense(ti.i, 2).dense(ti.i, 4).place(y) impl.get_runtime().materialize() - assert v.dynamic_index_stride is None + assert v._get_dynamic_index_stride() is None @test_utils.test(arch=[ti.cpu, ti.cuda]) @@ -444,7 +444,7 @@ def test_matrix_field_dynamic_index_not_pure_dense(): ti.root.dense(ti.i, 2).dense(ti.i, 4).place(y) impl.get_runtime().materialize() - assert v.dynamic_index_stride is None + assert v._get_dynamic_index_stride() is None @test_utils.test(arch=[ti.cpu, ti.cuda]) @@ -459,7 +459,7 @@ def test_matrix_field_dynamic_index_different_cell_size_bytes(): ti.root.dense(ti.i, 8).place(y) impl.get_runtime().materialize() - assert v.dynamic_index_stride is None + assert v._get_dynamic_index_stride() is None @test_utils.test(arch=[ti.cpu, ti.cuda]) @@ -475,7 +475,7 @@ def test_matrix_field_dynamic_index_different_offset_bytes_in_parent_cell(): ti.root.dense(ti.i, 8).place(y, temp_b) 
impl.get_runtime().materialize() - assert v.dynamic_index_stride is None + assert v._get_dynamic_index_stride() is None @test_utils.test(arch=[ti.cpu, ti.cuda]) @@ -490,7 +490,7 @@ def test_matrix_field_dynamic_index_different_stride(): ti.root.dense(ti.i, 8).place(x, y, temp, z) impl.get_runtime().materialize() - assert v.dynamic_index_stride is None + assert v._get_dynamic_index_stride() is None @test_utils.test(arch=[ti.cpu, ti.cuda], dynamic_index=True) From 9a35e049e891a4642e237f39eb70b546ba1083bb Mon Sep 17 00:00:00 2001 From: Mike He Date: Wed, 14 Sep 2022 22:53:17 -0400 Subject: [PATCH 4/7] [Lang] Fixes matrix-vector multiplication (#6014) Related issue = fix #5988 --- python/taichi/lang/matrix.py | 12 ++++++++++++ tests/python/test_matrix.py | 20 +++++++++++++++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 3fcddd7dfe703..48a6013e37a9e 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -447,6 +447,11 @@ def __init__(self, is_matrix = isinstance(arr[0], Iterable) and not is_vector(self) initializer = _make_entries_initializer(is_matrix) self.ndim = 2 if is_matrix else 1 + if not is_matrix and isinstance(arr[0], Iterable): + flattened = [] + for row in arr: + flattened += row + arr = flattened if in_python_scope() or is_ref: mat = initializer.pyscope_or_ref(arr) @@ -553,6 +558,11 @@ def __matmul__(self, other): """ assert isinstance(other, Matrix), "rhs of `@` is not a matrix / vector" + if is_vector(self) and not is_vector(other): + # left multiplication + assert self.n == other.m, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" + return other.transpose() @ self + # right multiplication assert self.m == other.n, f"Dimension mismatch between shapes ({self.n}, {self.m}), ({other.n}, {other.m})" entries = [] for i in range(self.n): @@ -562,6 +572,8 @@ def __matmul__(self, other): for k in range(1, other.n): acc = acc + self(i, k) * other(k, j) entries[i].append(acc) + if is_vector(other) and other.m == 1: + return Vector(entries) return Matrix(entries) # host access & python scope operation diff --git a/tests/python/test_matrix.py b/tests/python/test_matrix.py index 1ecd1b6591117..4e3ad8d688319 100644 --- a/tests/python/test_matrix.py +++ b/tests/python/test_matrix.py @@ -115,7 +115,7 @@ def func(t: ti.i32): m += ti.Matrix([[3, 4], [5, t]]) print(m @ v) print(r.x, r.y, r.z, r.w) - s = w.transpose() @ m + s = w @ m print(s) print(m) @@ -701,6 +701,24 @@ def bar(): bar() +@test_utils.test(arch=get_host_arch_list(), debug=True) +def test_matrix_vector_multiplication(): + mat = ti.math.mat3(1) + vec = ti.math.vec3(3) + r = mat @ vec + for i in range(3): + assert r[i] == 9 + + @ti.kernel + def foo(): + mat = ti.math.mat3(1) + vec = ti.math.vec3(3) + r = mat @ vec + assert r[0] == r[1] == r[2] == 9 + + foo() + + @test_utils.test(arch=[ti.cuda, ti.cpu], real_matrix=True) def test_local_matrix_read(): From 72804ae992212ddee5357bae8b27d6e3f2148831 Mon Sep 17 00:00:00 2001 From: Chang Yu Date: Thu, 15 Sep 2022 14:22:28 +0800 Subject: [PATCH 5/7] [Mesh] [bug] Fix nested mesh for (#6062) Related issue = #3608 --- python/taichi/lang/ast/ast_transformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 6fac2c6418e0c..07600d02bfe32 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -1110,7 
+1110,7 @@ def build_nested_mesh_for(ctx, node): loop_var = expr.Expr(ctx.ast_builder.make_id_expr('')) ctx.create_variable(loop_name, loop_var) begin = expr.Expr(0) - end = node.iter.ptr.size + end = ti_ops.cast(node.iter.ptr.size, primitive_types.i32) ctx.ast_builder.begin_frontend_range_for(loop_var.ptr, begin.ptr, end.ptr) entry_expr = _ti_core.get_relation_access( From 574b05e70f31e0ae5bc314d1fa7c2fe8991643b8 Mon Sep 17 00:00:00 2001 From: Proton Date: Thu, 15 Sep 2022 18:17:52 +0800 Subject: [PATCH 6/7] [ci] Upgrade conda cudatoolkit version to 11.3 (#6070) Related issue = # Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/release.yml | 4 ++-- .github/workflows/testing.yml | 6 +++--- ci/Dockerfile.ubuntu.18.04 | 18 +++++++++--------- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index ba6b5128dfa31..0f2cbc90a7640 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -87,7 +87,7 @@ jobs: mkdir -m777 shared docker create --user dev --name taichi_build --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e DISPLAY -e PY -e GPU_BUILD -e TAICHI_CMAKE_ARGS -e PROJECT_NAME \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.2 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build:/home/dev/ docker start -a taichi_build @@ -111,7 +111,7 @@ jobs: run: | docker create --user dev --name taichi_test --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e DISPLAY -e PY -e GPU_TEST \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.2 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/unix_test.sh docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh docker cp .github/workflows/scripts/common-utils.sh taichi_test:/home/dev/common-utils.sh diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 95262572addf9..4aa46e4944852 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -405,7 +405,7 @@ jobs: --gpus 'all,"capabilities=graphics,utility,display,video,compute"' \ -v /tmp/.X11-unix:/tmp/.X11-unix \ -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh # A tarball is needed because sccache needs some permissions that only the file owner has. # 1000 is the uid and gid of user "dev" in the container. 
@@ -444,7 +444,7 @@ jobs: -e TI_LITE_TEST \ -e TI_TEST_OFFLINE_CACHE \ -e DISPLAY -e PY -e GPU_TEST -e TI_WANTED_ARCHS -e TI_RUN_RELEASE_TESTS \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/unix_test.sh docker cp .github/workflows/scripts/unix_test.sh taichi_test:/home/dev/unix_test.sh docker cp .github/workflows/scripts/common-utils.sh taichi_test:/home/dev/common-utils.sh @@ -671,7 +671,7 @@ jobs: docker run --user dev --name taichi_build_host \ $DOCKER_RUN_ARGS \ -v $TAICHI_WHEEL_DIR:/home/dev/taichi/dist \ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.3 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.4 \ /home/dev/taichi/.github/workflows/scripts/unix-build-v2.sh env: TAICHI_CMAKE_ARGS: >- diff --git a/ci/Dockerfile.ubuntu.18.04 b/ci/Dockerfile.ubuntu.18.04 index 07aef59d22d6d..f991f0390461a 100644 --- a/ci/Dockerfile.ubuntu.18.04 +++ b/ci/Dockerfile.ubuntu.18.04 @@ -93,17 +93,17 @@ RUN wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh & bash Miniconda3-latest-Linux-x86_64.sh -p /home/dev/miniconda -b ENV PATH="/home/dev/miniconda/bin:$PATH" -# Set up multi-python environment -RUN conda init bash -RUN conda create -n py36 python=3.6 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py37 python=3.7 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py38 python=3.8 pytorch cudatoolkit=10.2 -c pytorch -y -RUN conda create -n py39 python=3.9 pytorch cudatoolkit=10.2 -c pytorch -y -# TODO add torch to 3.10 when supported -RUN conda create -n py310 python=3.10 -y - # Remove mesa EGL driver, which interferes with the propritary NVIDIA drivers RUN rm -f /usr/lib/x86_64-linux-gnu/libEGL_mesa* WORKDIR /home/dev ENV LANG="C.UTF-8" + +# Set up multi-python environment +RUN conda init bash +RUN conda create -n py36 python=3.6 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py37 python=3.7 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py38 python=3.8 pytorch cudatoolkit=11.3 -c pytorch -y +RUN conda create -n py39 python=3.9 pytorch cudatoolkit=11.3 -c pytorch -y +# TODO add torch to 3.10 when supported +RUN conda create -n py310 python=3.10 -y From 4d94b31aed9758a167cf7918a5b04f3b8884d08d Mon Sep 17 00:00:00 2001 From: Xiang Li Date: Thu, 15 Sep 2022 03:43:01 -0700 Subject: [PATCH 7/7] [ci] [dx12] Enable dx12 build for windows cpu ci. (#6069) Fix build fail and enable dx12 build for windows cpu ci to make sure it compiles. 
Related issue = #5276 --- .github/workflows/scripts/win_build_test_cpu.ps1 | 1 + taichi/codegen/dx12/codegen_dx12.cpp | 4 ++-- taichi/codegen/dx12/codegen_dx12.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 index 0b1a4cd5e7faa..4aa90582517ed 100644 --- a/.github/workflows/scripts/win_build_test_cpu.ps1 +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -66,6 +66,7 @@ if (!$llvmVer.CompareTo("10")) { } else { $env:TAICHI_CMAKE_ARGS += " -DLLVM_AS_EXECUTABLE=C:\\taichi_llvm_15\\bin\\llvm-as.exe -DTI_WITH_VULKAN:BOOL=OFF" $env:TAICHI_CMAKE_ARGS += " -DTI_LLVM_15:BOOL=ON" + $env:TAICHI_CMAKE_ARGS += " -DTI_WITH_DX12:BOOL=ON" } diff --git a/taichi/codegen/dx12/codegen_dx12.cpp b/taichi/codegen/dx12/codegen_dx12.cpp index 4be95a53f7b1b..bee1d3c27a6e8 100644 --- a/taichi/codegen/dx12/codegen_dx12.cpp +++ b/taichi/codegen/dx12/codegen_dx12.cpp @@ -227,7 +227,7 @@ class TaskCodeGenLLVMDX12 : public TaskCodeGenLLVM { #ifdef TI_WITH_LLVM static std::vector generate_dxil_from_llvm( - LLVMCompiledData &compiled_data, + LLVMCompiledTask &compiled_data, taichi::lang::Kernel *kernel) { // generate dxil from llvm ir. auto offloaded_local = compiled_data.tasks; @@ -286,7 +286,7 @@ KernelCodeGenDX12::CompileResult KernelCodeGenDX12::compile() { return Result; } -LLVMCompiledData KernelCodeGenDX12::compile_task( +LLVMCompiledTask KernelCodeGenDX12::compile_task( std::unique_ptr &&module, OffloadedStmt *stmt) { TaskCodeGenLLVMDX12 gen(kernel, stmt); diff --git a/taichi/codegen/dx12/codegen_dx12.h b/taichi/codegen/dx12/codegen_dx12.h index 1b9e920e71873..5d352231a1e6b 100644 --- a/taichi/codegen/dx12/codegen_dx12.h +++ b/taichi/codegen/dx12/codegen_dx12.h @@ -22,7 +22,7 @@ class KernelCodeGenDX12 : public KernelCodeGen { }; CompileResult compile(); #ifdef TI_WITH_LLVM - LLVMCompiledData compile_task( + LLVMCompiledTask compile_task( std::unique_ptr &&module = nullptr, OffloadedStmt *stmt = nullptr) override; #endif