diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake
index a2e555d22b44d..aceaae0428c57 100644
--- a/cmake/TaichiCXXFlags.cmake
+++ b/cmake/TaichiCXXFlags.cmake
@@ -20,7 +20,8 @@ endif ()
 # Do not enable lto for APPLE since it made linking extremely slow.
 if (WIN32)
     if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto=thin")
+        set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -flto=thin")
+        set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -flto=thin")
     endif()
 endif()
 
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake
index 1bcafcf015530..c9584aa66f7df 100644
--- a/cmake/TaichiCore.cmake
+++ b/cmake/TaichiCore.cmake
@@ -372,24 +372,21 @@ add_subdirectory(external/SPIRV-Tools)
 # https://github.com/KhronosGroup/SPIRV-Tools/issues/1569#issuecomment-390250792
 target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE SPIRV-Tools-opt ${SPIRV_TOOLS})
 
+target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Headers/include)
+target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Reflect)
+
+add_subdirectory(taichi/runtime/gfx)
+target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE gfx_runtime)
+
+# Vulkan Device API
 if (TI_WITH_VULKAN)
     include_directories(SYSTEM external/Vulkan-Headers/include)
 
     include_directories(SYSTEM external/volk)
 
-    target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Headers/include)
-    target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Reflect)
-
     # By specifying SYSTEM, we suppressed the warnings from third-party headers.
     target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMemoryAllocator/include)
 
-    if (LINUX)
-        # shaderc requires pthread
-        set(THREADS_PREFER_PTHREAD_FLAG ON)
-        find_package(Threads REQUIRED)
-        target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE Threads::Threads)
-    endif()
-
     if (APPLE)
         find_library(MOLTEN_VK libMoltenVK.dylib PATHS $HOMEBREW_CELLAR/molten-vk $VULKAN_SDK REQUIRED)
         configure_file(${MOLTEN_VK} ${CMAKE_BINARY_DIR}/libMoltenVK.dylib COPYONLY)
@@ -398,9 +395,6 @@ if (TI_WITH_VULKAN)
             install(FILES ${CMAKE_BINARY_DIR}/libMoltenVK.dylib DESTINATION ${INSTALL_LIB_DIR}/runtime)
         endif()
     endif()
-
-    add_subdirectory(taichi/runtime/vulkan)
-    target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE vulkan_runtime)
 endif ()
 
 
diff --git a/taichi/aot/module_loader.cpp b/taichi/aot/module_loader.cpp
index 908cc60db02c1..6526dfbc57969 100644
--- a/taichi/aot/module_loader.cpp
+++ b/taichi/aot/module_loader.cpp
@@ -1,6 +1,6 @@
 #include "taichi/aot/module_loader.h"
 
-#include "taichi/backends/vulkan/aot_module_loader_impl.h"
+#include "taichi/runtime/gfx/aot_module_loader_impl.h"
 #include "taichi/backends/metal/aot_module_loader_impl.h"
 
 namespace taichi {
@@ -32,19 +32,18 @@ Kernel *KernelTemplate::get_kernel(
 std::unique_ptr<Module> Module::load(Arch arch, std::any mod_params) {
   if (arch == Arch::vulkan) {
 #ifdef TI_WITH_VULKAN
-    return vulkan::make_aot_module(mod_params);
-#else
-    TI_NOT_IMPLEMENTED
+    return gfx::make_aot_module(mod_params, arch);
+#endif
+  } else if (arch == Arch::dx11) {
+#ifdef TI_WITH_DX11
+    return gfx::make_aot_module(mod_params, arch);
 #endif
   } else if (arch == Arch::metal) {
 #ifdef TI_WITH_METAL
     return metal::make_aot_module(mod_params);
-#else
-    TI_NOT_IMPLEMENTED
 #endif
-  } else {
-    TI_NOT_IMPLEMENTED;
   }
+  TI_NOT_IMPLEMENTED;
 }
 
 Kernel *Module::get_kernel(const std::string &name) {
diff --git a/taichi/backends/dx/dx_device.cpp b/taichi/backends/dx/dx_device.cpp
index 436a40c221aee..27746b68e0f1f 100644
--- a/taichi/backends/dx/dx_device.cpp
+++ b/taichi/backends/dx/dx_device.cpp
@@ -22,7 +22,7 @@ void dump_buffer(ID3D11Device *device,
 
 void check_dx_error(HRESULT hr, const char *msg) {
   if (!SUCCEEDED(hr)) {
-    TI_ERROR("Error in {}: {}", msg, hr);
+    TI_ERROR("Error in {}: {:x}", msg, uint32_t(hr));
   }
 }
 
@@ -593,16 +593,21 @@ DeviceAllocation Dx11Device::allocate_memory(const AllocParams &params) {
 
 void Dx11Device::dealloc_memory(DeviceAllocation handle) {
   uint32_t alloc_id = handle.alloc_id;
-  if (alloc_id_to_buffer_.count(alloc_id) == 0)
-    return;
+  if (alloc_id_to_buffer_.find(alloc_id) == alloc_id_to_buffer_.end())
+    TI_ERROR("Invalid handle, possible double free?");
   ID3D11Buffer *buf = alloc_id_to_buffer_[alloc_id];
   buf->Release();
   alloc_id_to_buffer_.erase(alloc_id);
   ID3D11UnorderedAccessView *uav = alloc_id_to_uav_[alloc_id];
   uav->Release();
-  ID3D11Buffer *cpucopy = alloc_id_to_cpucopy_[alloc_id];
-  if (cpucopy)
-    cpucopy->Release();
+  if (alloc_id_to_cpucopy_.find(alloc_id) != alloc_id_to_cpucopy_.end()) {
+    alloc_id_to_cpucopy_[alloc_id]->Release();
+    alloc_id_to_cpucopy_.erase(alloc_id);
+  }
+  if (alloc_id_to_cb_copy_.find(alloc_id) != alloc_id_to_cb_copy_.end()) {
+    alloc_id_to_cb_copy_[alloc_id]->Release();
+    alloc_id_to_cb_copy_.erase(alloc_id);
+  }
   alloc_id_to_uav_.erase(alloc_id);
 }
 
@@ -724,10 +729,9 @@ ID3D11UnorderedAccessView *Dx11Device::alloc_id_to_uav(uint32_t alloc_id) {
 }
 
 ID3D11Buffer *Dx11Device::create_or_get_cb_buffer(uint32_t alloc_id) {
-  if (alloc_id_to_cb_copy_.count(alloc_id) > 0) {
+  if (alloc_id_to_cb_copy_.find(alloc_id) != alloc_id_to_cb_copy_.end()) {
     return alloc_id_to_cb_copy_[alloc_id];
   }
-  assert(alloc_id_to_buffer_.count(alloc_id) > 0);
   ID3D11Buffer *buf = alloc_id_to_buffer_[alloc_id];
   ID3D11Buffer *cb_buf;
   HRESULT hr = create_constant_buffer_copy(device_, buf, &cb_buf);
diff --git a/taichi/backends/dx/dx_program.cpp b/taichi/backends/dx/dx_program.cpp
index 1b76bdeffde9d..f41617536d3eb 100644
--- a/taichi/backends/dx/dx_program.cpp
+++ b/taichi/backends/dx/dx_program.cpp
@@ -3,18 +3,20 @@
 #include "taichi/backends/dx/dx_program.h"
 
 #include "taichi/backends/dx/dx_device.h"
-#include "taichi/backends/vulkan/snode_tree_manager.h"
+#include "taichi/runtime/gfx/aot_module_builder_impl.h"
+#include "taichi/runtime/gfx/snode_tree_manager.h"
+#include "taichi/runtime/gfx/aot_module_loader_impl.h"
 
 namespace taichi {
 namespace lang {
 namespace directx11 {
 
 FunctionType compile_to_executable(Kernel *kernel,
-                                   vulkan::VkRuntime *runtime,
-                                   vulkan::SNodeTreeManager *snode_tree_mgr) {
+                                   gfx::GfxRuntime *runtime,
+                                   gfx::SNodeTreeManager *snode_tree_mgr) {
   auto handle = runtime->register_taichi_kernel(
-      std::move(vulkan::run_codegen(kernel, runtime->get_ti_device(),
-                                    snode_tree_mgr->get_compiled_structs())));
+      std::move(gfx::run_codegen(kernel, runtime->get_ti_device(),
+                                 snode_tree_mgr->get_compiled_structs())));
   return [runtime, handle](RuntimeContext &ctx) {
     runtime->launch_kernel(handle, &ctx);
   };
@@ -40,28 +42,52 @@ void Dx11ProgramImpl::materialize_runtime(MemoryPool *memory_pool,
 
   device_ = std::make_shared<directx11::Dx11Device>();
 
-  vulkan::VkRuntime::Params params;
+  gfx::GfxRuntime::Params params;
   params.host_result_buffer = *result_buffer_ptr;
   params.device = device_.get();
-  runtime_ = std::make_unique<vulkan::VkRuntime>(std::move(params));
-  snode_tree_mgr_ = std::make_unique<vulkan::SNodeTreeManager>(runtime_.get());
+  runtime_ = std::make_unique<gfx::GfxRuntime>(std::move(params));
+  snode_tree_mgr_ = std::make_unique<gfx::SNodeTreeManager>(runtime_.get());
 }
 
-void Dx11ProgramImpl::synchronize() {
-  TI_NOT_IMPLEMENTED;
+void Dx11ProgramImpl::compile_snode_tree_types(SNodeTree *tree) {
+  if (runtime_) {
+    snode_tree_mgr_->materialize_snode_tree(tree);
+  } else {
+    gfx::CompiledSNodeStructs compiled_structs =
+        gfx::compile_snode_structs(*tree->root());
+    aot_compiled_snode_structs_.push_back(compiled_structs);
+  }
 }
 
 void Dx11ProgramImpl::materialize_snode_tree(SNodeTree *tree,
-                                             uint64 *result_buffer_ptr) {
+                                             uint64 *result_buffer) {
   snode_tree_mgr_->materialize_snode_tree(tree);
 }
 
 std::unique_ptr<AotModuleBuilder> Dx11ProgramImpl::make_aot_module_builder() {
-  return nullptr;
+  if (runtime_) {
+    return std::make_unique<gfx::AotModuleBuilderImpl>(
+        snode_tree_mgr_->get_compiled_structs(), Arch::dx11);
+  } else {
+    return std::make_unique<gfx::AotModuleBuilderImpl>(
+        aot_compiled_snode_structs_, Arch::dx11);
+  }
 }
 
-void Dx11ProgramImpl::destroy_snode_tree(SNodeTree *snode_tree) {
-  TI_NOT_IMPLEMENTED;
+DeviceAllocation Dx11ProgramImpl::allocate_memory_ndarray(
+    std::size_t alloc_size,
+    uint64 *result_buffer) {
+  return get_compute_device()->allocate_memory(
+      {alloc_size, /*host_write=*/false, /*host_read=*/false,
+       /*export_sharing=*/false});
+}
+
+std::unique_ptr<aot::Kernel> Dx11ProgramImpl::make_aot_kernel(Kernel &kernel) {
+  spirv::lower(&kernel);
+  std::vector<gfx::CompiledSNodeStructs> compiled_structs;
+  gfx::GfxRuntime::RegisterParams kparams =
+      gfx::run_codegen(&kernel, get_compute_device(), compiled_structs);
+  return std::make_unique<gfx::KernelImpl>(runtime_.get(), std::move(kparams));
 }
 
 }  // namespace lang
diff --git a/taichi/backends/dx/dx_program.h b/taichi/backends/dx/dx_program.h
index 603ee63abc3c4..1ae5522e563ba 100644
--- a/taichi/backends/dx/dx_program.h
+++ b/taichi/backends/dx/dx_program.h
@@ -3,8 +3,8 @@
 #ifdef TI_WITH_DX11
 
 #include "taichi/backends/dx/dx_device.h"
-#include "taichi/runtime/vulkan/runtime.h"
-#include "taichi/backends/vulkan/snode_tree_manager.h"
+#include "taichi/runtime/gfx/runtime.h"
+#include "taichi/runtime/gfx/snode_tree_manager.h"
 #include "taichi/program/program_impl.h"
 
 namespace taichi {
@@ -13,26 +13,59 @@ namespace lang {
 class Dx11ProgramImpl : public ProgramImpl {
  public:
   Dx11ProgramImpl(CompileConfig &config);
-
   FunctionType compile(Kernel *kernel, OffloadedStmt *offloaded) override;
+
   std::size_t get_snode_num_dynamically_allocated(
       SNode *snode,
       uint64 *result_buffer) override {
-    return 0;
+    return 0;  // TODO: support sparse
   }
-  std::unique_ptr<AotModuleBuilder> make_aot_module_builder();
+
+  void compile_snode_tree_types(SNodeTree *tree) override;
+
   void materialize_runtime(MemoryPool *memory_pool,
                            KernelProfilerBase *profiler,
                            uint64 **result_buffer_ptr) override;
-  virtual void materialize_snode_tree(SNodeTree *tree,
-                                      uint64 *result_buffer_ptr) override;
-  virtual void destroy_snode_tree(SNodeTree *snode_tree) override;
-  void synchronize() override;
+
+  void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override;
+
+  void synchronize() override {
+    runtime_->synchronize();
+  }
+
+  StreamSemaphore flush() override {
+    return runtime_->flush();
+  }
+
+  std::unique_ptr<AotModuleBuilder> make_aot_module_builder() override;
+
+  void destroy_snode_tree(SNodeTree *snode_tree) override {
+    TI_ASSERT(snode_tree_mgr_ != nullptr);
+    snode_tree_mgr_->destroy_snode_tree(snode_tree);
+  }
+
+  DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size,
+                                           uint64 *result_buffer) override;
+
+  Device *get_compute_device() override {
+    return device_.get();
+  }
+
+  Device *get_graphics_device() override {
+    return device_.get();
+  }
+
+  DevicePtr get_snode_tree_device_ptr(int tree_id) override {
+    return snode_tree_mgr_->get_snode_tree_device_ptr(tree_id);
+  }
+
+  std::unique_ptr<aot::Kernel> make_aot_kernel(Kernel &kernel) override;
 
  private:
   std::shared_ptr<directx11::Dx11Device> device_{nullptr};
-  std::unique_ptr<vulkan::VkRuntime> runtime_{nullptr};
-  std::unique_ptr<vulkan::SNodeTreeManager> snode_tree_mgr_{nullptr};
+  std::unique_ptr<gfx::GfxRuntime> runtime_{nullptr};
+  std::unique_ptr<gfx::SNodeTreeManager> snode_tree_mgr_{nullptr};
+  std::vector<spirv::CompiledSNodeStructs> aot_compiled_snode_structs_;
 };
 
 }  // namespace lang
diff --git a/taichi/backends/opengl/opengl_device.cpp b/taichi/backends/opengl/opengl_device.cpp
index b399cbb12e9ab..0da96f76d3685 100644
--- a/taichi/backends/opengl/opengl_device.cpp
+++ b/taichi/backends/opengl/opengl_device.cpp
@@ -197,7 +197,7 @@ void GLResourceBinder::rw_buffer(uint32_t set,
                                  DeviceAllocation alloc) {
   TI_ASSERT_INFO(set == 0, "OpenGL only supports set = 0, requested set = {}",
                  set);
-  binding_map_[binding] = alloc.alloc_id;
+  ssbo_binding_map_[binding] = alloc.alloc_id;
 }
 
 void GLResourceBinder::buffer(uint32_t set,
@@ -211,7 +211,9 @@ void GLResourceBinder::buffer(uint32_t set,
 void GLResourceBinder::buffer(uint32_t set,
                               uint32_t binding,
                               DeviceAllocation alloc) {
-  rw_buffer(set, binding, alloc);
+  TI_ASSERT_INFO(set == 0, "OpenGL only supports set = 0, requested set = {}",
+                 set);
+  ubo_binding_map_[binding] = alloc.alloc_id;
 }
 
 void GLResourceBinder::image(uint32_t set,
@@ -295,10 +297,17 @@ void GLCommandList::bind_pipeline(Pipeline *p) {
 
 void GLCommandList::bind_resources(ResourceBinder *_binder) {
   GLResourceBinder *binder = static_cast<GLResourceBinder *>(_binder);
-  for (auto &[binding, buffer] : binder->binding_map()) {
+  for (auto &[binding, buffer] : binder->ssbo_binding_map()) {
+    auto cmd = std::make_unique<CmdBindBufferToIndex>();
+    cmd->buffer = buffer;
+    cmd->index = binding;
+    recorded_commands_.push_back(std::move(cmd));
+  }
+  for (auto &[binding, buffer] : binder->ubo_binding_map()) {
     auto cmd = std::make_unique<CmdBindBufferToIndex>();
     cmd->buffer = buffer;
     cmd->index = binding;
+    cmd->target = GL_UNIFORM_BUFFER;
     recorded_commands_.push_back(std::move(cmd));
   }
 }
@@ -682,7 +691,7 @@ void GLCommandList::CmdBindPipeline::execute() {
 }
 
 void GLCommandList::CmdBindBufferToIndex::execute() {
-  glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, buffer);
+  glBindBufferBase(target, index, buffer);
   check_opengl_error("glBindBufferBase");
 }
 
diff --git a/taichi/backends/opengl/opengl_device.h b/taichi/backends/opengl/opengl_device.h
index 99eb1cba77eb5..7ab1e8c6dc185 100644
--- a/taichi/backends/opengl/opengl_device.h
+++ b/taichi/backends/opengl/opengl_device.h
@@ -53,12 +53,17 @@ class GLResourceBinder : public ResourceBinder {
   // index_width = 2 -> uint16 index
   void index_buffer(DevicePtr ptr, size_t index_width) override;
 
-  const std::unordered_map<uint32_t, GLuint> &binding_map() {
-    return binding_map_;
+  const std::unordered_map<uint32_t, GLuint> &ssbo_binding_map() {
+    return ssbo_binding_map_;
+  }
+
+  const std::unordered_map<uint32_t, GLuint> &ubo_binding_map() {
+    return ubo_binding_map_;
   }
 
  private:
-  std::unordered_map<uint32_t, GLuint> binding_map_;
+  std::unordered_map<uint32_t, GLuint> ssbo_binding_map_;
+  std::unordered_map<uint32_t, GLuint> ubo_binding_map_;
 };
 
 class GLPipeline : public Pipeline {
@@ -141,6 +146,7 @@ class GLCommandList : public CommandList {
   struct CmdBindBufferToIndex : public Cmd {
     GLuint buffer{0};
     GLuint index{0};
+    GLenum target{GL_SHADER_STORAGE_BUFFER};
     void execute() override;
   };
 
diff --git a/taichi/backends/vulkan/aot_module_loader_impl.h b/taichi/backends/vulkan/aot_module_loader_impl.h
deleted file mode 100644
index 16230e1411306..0000000000000
--- a/taichi/backends/vulkan/aot_module_loader_impl.h
+++ /dev/null
@@ -1,27 +0,0 @@
-#pragma once
-
-#include <any>
-#include <string>
-#include <vector>
-
-#include "taichi/backends/vulkan/aot_utils.h"
-#include "taichi/runtime/vulkan/runtime.h"
-#include "taichi/codegen/spirv/kernel_utils.h"
-#include "taichi/aot/module_builder.h"
-#include "taichi/aot/module_loader.h"
-#include "taichi/backends/vulkan/aot_module_builder_impl.h"
-#include "taichi/backends/vulkan/vulkan_graph_data.h"
-
-namespace taichi {
-namespace lang {
-namespace vulkan {
-struct TI_DLL_EXPORT AotModuleParams {
-  std::string module_path;
-  VkRuntime *runtime{nullptr};
-};
-
-TI_DLL_EXPORT std::unique_ptr<aot::Module> make_aot_module(std::any mod_params);
-
-}  // namespace vulkan
-}  // namespace lang
-}  // namespace taichi
diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp
index cddb446ae469f..d7cac58be11b8 100644
--- a/taichi/backends/vulkan/vulkan_program.cpp
+++ b/taichi/backends/vulkan/vulkan_program.cpp
@@ -1,8 +1,8 @@
 #include "taichi/backends/vulkan/vulkan_program.h"
 
-#include "taichi/backends/vulkan/aot_module_builder_impl.h"
-#include "taichi/backends/vulkan/snode_tree_manager.h"
-#include "taichi/backends/vulkan/aot_module_loader_impl.h"
+#include "taichi/runtime/gfx/aot_module_builder_impl.h"
+#include "taichi/runtime/gfx/snode_tree_manager.h"
+#include "taichi/runtime/gfx/aot_module_loader_impl.h"
 
 #if !defined(ANDROID) && !defined(TI_EMSCRIPTENED)
 #include "GLFW/glfw3.h"
@@ -69,11 +69,11 @@ VulkanProgramImpl::VulkanProgramImpl(CompileConfig &config)
 }
 
 FunctionType compile_to_executable(Kernel *kernel,
-                                   VkRuntime *runtime,
-                                   SNodeTreeManager *snode_tree_mgr) {
+                                   gfx::GfxRuntime *runtime,
+                                   gfx::SNodeTreeManager *snode_tree_mgr) {
   auto handle = runtime->register_taichi_kernel(
-      run_codegen(kernel, runtime->get_ti_device(),
-                  snode_tree_mgr->get_compiled_structs()));
+      gfx::run_codegen(kernel, runtime->get_ti_device(),
+                       snode_tree_mgr->get_compiled_structs()));
   return [runtime, handle](RuntimeContext &ctx) {
     runtime->launch_kernel(handle, &ctx);
   };
@@ -144,20 +144,20 @@ void VulkanProgramImpl::materialize_runtime(MemoryPool *memory_pool,
 
   embedded_device_ = std::make_unique<VulkanDeviceCreator>(evd_params);
 
-  vulkan::VkRuntime::Params params;
+  gfx::GfxRuntime::Params params;
   params.host_result_buffer = *result_buffer_ptr;
   params.device = embedded_device_->device();
-  vulkan_runtime_ = std::make_unique<vulkan::VkRuntime>(std::move(params));
+  vulkan_runtime_ = std::make_unique<gfx::GfxRuntime>(std::move(params));
   snode_tree_mgr_ =
-      std::make_unique<vulkan::SNodeTreeManager>(vulkan_runtime_.get());
+      std::make_unique<gfx::SNodeTreeManager>(vulkan_runtime_.get());
 }
 
 void VulkanProgramImpl::compile_snode_tree_types(SNodeTree *tree) {
   if (vulkan_runtime_) {
     snode_tree_mgr_->materialize_snode_tree(tree);
   } else {
-    CompiledSNodeStructs compiled_structs =
-        vulkan::compile_snode_structs(*tree->root());
+    gfx::CompiledSNodeStructs compiled_structs =
+        gfx::compile_snode_structs(*tree->root());
     aot_compiled_snode_structs_.push_back(compiled_structs);
   }
 }
@@ -169,10 +169,11 @@ void VulkanProgramImpl::materialize_snode_tree(SNodeTree *tree,
 
 std::unique_ptr<AotModuleBuilder> VulkanProgramImpl::make_aot_module_builder() {
   if (vulkan_runtime_) {
-    return std::make_unique<AotModuleBuilderImpl>(
-        snode_tree_mgr_->get_compiled_structs());
+    return std::make_unique<gfx::AotModuleBuilderImpl>(
+        snode_tree_mgr_->get_compiled_structs(), Arch::vulkan);
   } else {
-    return std::make_unique<AotModuleBuilderImpl>(aot_compiled_snode_structs_);
+    return std::make_unique<gfx::AotModuleBuilderImpl>(
+        aot_compiled_snode_structs_, Arch::vulkan);
   }
 }
 
@@ -187,11 +188,11 @@ DeviceAllocation VulkanProgramImpl::allocate_memory_ndarray(
 std::unique_ptr<aot::Kernel> VulkanProgramImpl::make_aot_kernel(
     Kernel &kernel) {
   spirv::lower(&kernel);
-  std::vector<CompiledSNodeStructs> compiled_structs;
-  VkRuntime::RegisterParams kparams =
-      run_codegen(&kernel, get_compute_device(), compiled_structs);
-  return std::make_unique<KernelImpl>(vulkan_runtime_.get(),
-                                      std::move(kparams));
+  std::vector<gfx::CompiledSNodeStructs> compiled_structs;
+  gfx::GfxRuntime::RegisterParams kparams =
+      gfx::run_codegen(&kernel, get_compute_device(), compiled_structs);
+  return std::make_unique<gfx::KernelImpl>(vulkan_runtime_.get(),
+                                           std::move(kparams));
 }
 
 VulkanProgramImpl::~VulkanProgramImpl() {
diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h
index b8d7a820fb7be..830910ebcc858 100644
--- a/taichi/backends/vulkan/vulkan_program.h
+++ b/taichi/backends/vulkan/vulkan_program.h
@@ -6,8 +6,8 @@
 #include "taichi/backends/vulkan/vulkan_device_creator.h"
 #include "taichi/backends/vulkan/vulkan_utils.h"
 #include "taichi/backends/vulkan/vulkan_loader.h"
-#include "taichi/runtime/vulkan/runtime.h"
-#include "taichi/backends/vulkan/snode_tree_manager.h"
+#include "taichi/runtime/gfx/runtime.h"
+#include "taichi/runtime/gfx/snode_tree_manager.h"
 #include "taichi/backends/vulkan/vulkan_device.h"
 #include "vk_mem_alloc.h"
 
@@ -35,7 +35,7 @@ class VulkanProgramImpl : public ProgramImpl {
   std::size_t get_snode_num_dynamically_allocated(
       SNode *snode,
       uint64 *result_buffer) override {
-    return 0;  // TODO: support sparse in vulkan
+    return 0;  // TODO: support sparse
   }
 
   void compile_snode_tree_types(SNodeTree *tree) override;
@@ -56,7 +56,7 @@ class VulkanProgramImpl : public ProgramImpl {
 
   std::unique_ptr<AotModuleBuilder> make_aot_module_builder() override;
 
-  virtual void destroy_snode_tree(SNodeTree *snode_tree) override {
+  void destroy_snode_tree(SNodeTree *snode_tree) override {
     TI_ASSERT(snode_tree_mgr_ != nullptr);
     snode_tree_mgr_->destroy_snode_tree(snode_tree);
   }
@@ -88,8 +88,8 @@ class VulkanProgramImpl : public ProgramImpl {
 
  private:
   std::unique_ptr<vulkan::VulkanDeviceCreator> embedded_device_{nullptr};
-  std::unique_ptr<vulkan::VkRuntime> vulkan_runtime_{nullptr};
-  std::unique_ptr<vulkan::SNodeTreeManager> snode_tree_mgr_{nullptr};
+  std::unique_ptr<gfx::GfxRuntime> vulkan_runtime_{nullptr};
+  std::unique_ptr<gfx::SNodeTreeManager> snode_tree_mgr_{nullptr};
   std::vector<spirv::CompiledSNodeStructs> aot_compiled_snode_structs_;
 };
 }  // namespace lang
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 05ae5f128715d..0f02447d2b15e 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -193,7 +193,8 @@ void Program::materialize_runtime() {
 }
 
 void Program::destroy_snode_tree(SNodeTree *snode_tree) {
-  TI_ASSERT(arch_uses_llvm(config.arch) || config.arch == Arch::vulkan);
+  TI_ASSERT(arch_uses_llvm(config.arch) || config.arch == Arch::vulkan ||
+            config.arch == Arch::dx11);
   program_impl_->destroy_snode_tree(snode_tree);
   free_snode_tree_ids_.push(snode_tree->id());
 }
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index 2ede85517e551..44e1f88c0e72c 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -5,7 +5,7 @@
 
 #include "taichi/backends/metal/api.h"
 #include "taichi/runtime/opengl/opengl_api.h"
-#include "taichi/runtime/vulkan/runtime.h"
+#include "taichi/runtime/gfx/runtime.h"
 #include "taichi/backends/dx/dx_api.h"
 #include "taichi/common/core.h"
 #include "taichi/common/interface.h"
diff --git a/taichi/runtime/gfx/CMakeLists.txt b/taichi/runtime/gfx/CMakeLists.txt
new file mode 100644
index 0000000000000..bc5bcdf03b5fb
--- /dev/null
+++ b/taichi/runtime/gfx/CMakeLists.txt
@@ -0,0 +1,16 @@
+# ./taichi/runtime/gfx/CMakeLists.txt
+
+add_library(gfx_runtime)
+target_sources(gfx_runtime
+  PRIVATE
+    runtime.cpp
+    snode_tree_manager.cpp
+    aot_module_builder_impl.cpp
+    aot_module_loader_impl.cpp
+  )
+target_include_directories(gfx_runtime
+  PRIVATE
+    ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include
+    ${PROJECT_SOURCE_DIR}/external/eigen
+    ${PROJECT_SOURCE_DIR}/external/FP16/include
+  )
diff --git a/taichi/backends/vulkan/vulkan_graph_data.h b/taichi/runtime/gfx/aot_graph_data.h
similarity index 55%
rename from taichi/backends/vulkan/vulkan_graph_data.h
rename to taichi/runtime/gfx/aot_graph_data.h
index 6fa3cafc1e3e0..b9d8b4315ab65 100644
--- a/taichi/backends/vulkan/vulkan_graph_data.h
+++ b/taichi/runtime/gfx/aot_graph_data.h
@@ -1,12 +1,12 @@
 #pragma once
-#include "taichi/runtime/vulkan/runtime.h"
+#include "taichi/runtime/gfx/runtime.h"
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 class KernelImpl : public aot::Kernel {
  public:
-  explicit KernelImpl(VkRuntime *runtime, VkRuntime::RegisterParams &&params)
+  explicit KernelImpl(GfxRuntime *runtime, GfxRuntime::RegisterParams &&params)
       : runtime_(runtime), params_(std::move(params)) {
     handle_ = runtime_->register_taichi_kernel(params_);
   }
@@ -15,15 +15,15 @@ class KernelImpl : public aot::Kernel {
     runtime_->launch_kernel(handle_, ctx);
   }
 
-  const VkRuntime::RegisterParams &params() {
+  const GfxRuntime::RegisterParams &params() {
     return params_;
   }
 
  private:
-  VkRuntime *const runtime_;
-  VkRuntime::KernelHandle handle_;
-  const VkRuntime::RegisterParams params_;
+  GfxRuntime *const runtime_;
+  GfxRuntime::KernelHandle handle_;
+  const GfxRuntime::RegisterParams params_;
 };
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/aot_module_builder_impl.cpp b/taichi/runtime/gfx/aot_module_builder_impl.cpp
similarity index 93%
rename from taichi/backends/vulkan/aot_module_builder_impl.cpp
rename to taichi/runtime/gfx/aot_module_builder_impl.cpp
index ed03800098bd1..48e3b7964de03 100644
--- a/taichi/backends/vulkan/aot_module_builder_impl.cpp
+++ b/taichi/runtime/gfx/aot_module_builder_impl.cpp
@@ -1,15 +1,15 @@
-#include "taichi/backends/vulkan/aot_module_builder_impl.h"
+#include "taichi/runtime/gfx/aot_module_builder_impl.h"
 
 #include <fstream>
 #include <type_traits>
 
 #include "taichi/aot/module_data.h"
 #include "taichi/codegen/spirv/spirv_codegen.h"
-#include "taichi/backends/vulkan/vulkan_graph_data.h"
+#include "taichi/runtime/gfx/aot_graph_data.h"
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
 namespace {
 class AotDataConverter {
@@ -66,7 +66,6 @@ class AotDataConverter {
     aot::CompiledOffloadedTask res{};
     res.type = offloaded_task_type_name(in.task_type);
     res.name = in.name;
-    // TODO: update range_hint after ndarray is supported on vulkan.
     if (in.range_for_attribs && in.range_for_attribs->const_begin &&
         in.range_for_attribs->const_end) {
       res.range_hint = std::to_string(in.range_for_attribs->end -
@@ -98,9 +97,11 @@ class AotDataConverter {
 
 }  // namespace
 AotModuleBuilderImpl::AotModuleBuilderImpl(
-    const std::vector<CompiledSNodeStructs> &compiled_structs)
-    : compiled_structs_(compiled_structs) {
-  aot_target_device_ = std::make_unique<aot::TargetDevice>(Arch::vulkan);
+    const std::vector<CompiledSNodeStructs> &compiled_structs,
+    Arch device_api_backend)
+    : compiled_structs_(compiled_structs),
+      device_api_backend_(device_api_backend) {
+  aot_target_device_ = std::make_unique<aot::TargetDevice>(device_api_backend_);
   if (!compiled_structs.empty()) {
     ti_aot_data_.root_buffer_size = compiled_structs[0].root_size;
   }
@@ -120,7 +121,7 @@ std::string AotModuleBuilderImpl::write_spv_file(
 void AotModuleBuilderImpl::dump(const std::string &output_dir,
                                 const std::string &filename) const {
   TI_WARN_IF(!filename.empty(),
-             "Filename prefix is ignored on vulkan backend.");
+             "Filename prefix is ignored on Unified Device API backends.");
   const std::string bin_path = fmt::format("{}/metadata.tcb", output_dir);
   write_to_binary_file(ti_aot_data_, bin_path);
 
@@ -199,6 +200,6 @@ void AotModuleBuilderImpl::add_per_backend_tmpl(const std::string &identifier,
   ti_aot_data_.spirv_codes.push_back(compiled.task_spirv_source_codes);
 }
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/aot_module_builder_impl.h b/taichi/runtime/gfx/aot_module_builder_impl.h
similarity index 85%
rename from taichi/backends/vulkan/aot_module_builder_impl.h
rename to taichi/runtime/gfx/aot_module_builder_impl.h
index 40dc4157c06f4..a6eeaaa3af143 100644
--- a/taichi/backends/vulkan/aot_module_builder_impl.h
+++ b/taichi/runtime/gfx/aot_module_builder_impl.h
@@ -4,19 +4,20 @@
 #include <vector>
 
 #include "taichi/aot/module_builder.h"
-#include "taichi/backends/vulkan/aot_utils.h"
-#include "taichi/runtime/vulkan/runtime.h"
+#include "taichi/runtime/gfx/aot_utils.h"
+#include "taichi/runtime/gfx/runtime.h"
 #include "taichi/codegen/spirv/snode_struct_compiler.h"
 #include "taichi/codegen/spirv/kernel_utils.h"
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
 class AotModuleBuilderImpl : public AotModuleBuilder {
  public:
   explicit AotModuleBuilderImpl(
-      const std::vector<CompiledSNodeStructs> &compiled_structs);
+      const std::vector<CompiledSNodeStructs> &compiled_structs,
+      Arch device_api_backend);
 
   void dump(const std::string &output_dir,
             const std::string &filename) const override;
@@ -45,8 +46,10 @@ class AotModuleBuilderImpl : public AotModuleBuilder {
   const std::vector<CompiledSNodeStructs> &compiled_structs_;
   TaichiAotData ti_aot_data_;
   std::unique_ptr<Device> aot_target_device_;
+
+  Arch device_api_backend_;
 };
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/aot_module_loader_impl.cpp b/taichi/runtime/gfx/aot_module_loader_impl.cpp
similarity index 85%
rename from taichi/backends/vulkan/aot_module_loader_impl.cpp
rename to taichi/runtime/gfx/aot_module_loader_impl.cpp
index 4ea34de89fc7c..497636b5b7c2c 100644
--- a/taichi/backends/vulkan/aot_module_loader_impl.cpp
+++ b/taichi/runtime/gfx/aot_module_loader_impl.cpp
@@ -1,30 +1,30 @@
-#include "taichi/backends/vulkan/aot_module_loader_impl.h"
+#include "taichi/runtime/gfx/aot_module_loader_impl.h"
 
 #include <fstream>
 #include <type_traits>
 
-#include "taichi/runtime/vulkan/runtime.h"
+#include "taichi/runtime/gfx/runtime.h"
 #include "taichi/aot/graph_data.h"
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 namespace {
 class FieldImpl : public aot::Field {
  public:
-  explicit FieldImpl(VkRuntime *runtime, const aot::CompiledFieldData &field)
+  explicit FieldImpl(GfxRuntime *runtime, const aot::CompiledFieldData &field)
       : runtime_(runtime), field_(field) {
   }
 
  private:
-  VkRuntime *const runtime_;
+  GfxRuntime *const runtime_;
   aot::CompiledFieldData field_;
 };
 
 class AotModuleImpl : public aot::Module {
  public:
-  explicit AotModuleImpl(const AotModuleParams &params)
-      : runtime_(params.runtime) {
+  explicit AotModuleImpl(const AotModuleParams &params, Arch device_api_backend)
+      : runtime_(params.runtime), device_api_backend_(device_api_backend) {
     const std::string bin_path =
         fmt::format("{}/metadata.tcb", params.module_path);
     read_from_binary_file(ti_aot_data_, bin_path);
@@ -63,7 +63,7 @@ class AotModuleImpl : public aot::Module {
 
   // Module metadata
   Arch arch() const override {
-    return Arch::vulkan;
+    return device_api_backend_;
   }
   uint64_t version() const override {
     TI_NOT_IMPLEMENTED;
@@ -82,7 +82,7 @@ class AotModuleImpl : public aot::Module {
   }
 
   bool get_kernel_params_by_name(const std::string &name,
-                                 VkRuntime::RegisterParams &kernel) {
+                                 GfxRuntime::RegisterParams &kernel) {
     for (int i = 0; i < ti_aot_data_.kernels.size(); ++i) {
       // Offloaded task names encode more than the name of the function, but for
       // AOT, only use the name of the function which should be the first part
@@ -102,7 +102,7 @@ class AotModuleImpl : public aot::Module {
 
   std::unique_ptr<aot::Kernel> make_new_kernel(
       const std::string &name) override {
-    VkRuntime::RegisterParams kparams;
+    GfxRuntime::RegisterParams kparams;
     if (!get_kernel_params_by_name(name, kparams)) {
       TI_DEBUG("Failed to load kernel {}", name);
       return nullptr;
@@ -139,16 +139,18 @@ class AotModuleImpl : public aot::Module {
   }
 
   TaichiAotData ti_aot_data_;
-  VkRuntime *runtime_{nullptr};
+  GfxRuntime *runtime_{nullptr};
+  Arch device_api_backend_;
 };
 
 }  // namespace
 
-std::unique_ptr<aot::Module> make_aot_module(std::any mod_params) {
+std::unique_ptr<aot::Module> make_aot_module(std::any mod_params,
+                                             Arch device_api_backend) {
   AotModuleParams params = std::any_cast<AotModuleParams &>(mod_params);
-  return std::make_unique<AotModuleImpl>(params);
+  return std::make_unique<AotModuleImpl>(params, device_api_backend);
 }
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/runtime/gfx/aot_module_loader_impl.h b/taichi/runtime/gfx/aot_module_loader_impl.h
new file mode 100644
index 0000000000000..6fec0a063bf24
--- /dev/null
+++ b/taichi/runtime/gfx/aot_module_loader_impl.h
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <any>
+#include <string>
+#include <vector>
+
+#include "taichi/runtime/gfx/aot_utils.h"
+#include "taichi/runtime/gfx/runtime.h"
+#include "taichi/runtime/gfx/aot_module_builder_impl.h"
+#include "taichi/runtime/gfx/aot_graph_data.h"
+#include "taichi/codegen/spirv/kernel_utils.h"
+#include "taichi/aot/module_builder.h"
+#include "taichi/aot/module_loader.h"
+
+namespace taichi {
+namespace lang {
+namespace gfx {
+
+struct TI_DLL_EXPORT AotModuleParams {
+  std::string module_path;
+  GfxRuntime *runtime{nullptr};
+};
+
+TI_DLL_EXPORT std::unique_ptr<aot::Module> make_aot_module(
+    std::any mod_params,
+    Arch device_api_backend);
+
+}  // namespace gfx
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/backends/vulkan/aot_utils.h b/taichi/runtime/gfx/aot_utils.h
similarity index 84%
rename from taichi/backends/vulkan/aot_utils.h
rename to taichi/runtime/gfx/aot_utils.h
index 5c00d4023efa8..e8c1f5b0ea150 100644
--- a/taichi/backends/vulkan/aot_utils.h
+++ b/taichi/runtime/gfx/aot_utils.h
@@ -7,10 +7,10 @@
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
 /**
- * AOT module data for the vulkan backend.
+ * AOT module data for the Unified Device API backend.
  */
 struct TaichiAotData {
   //   BufferMetaData metadata;
@@ -22,6 +22,6 @@ struct TaichiAotData {
   TI_IO_DEF(kernels, fields, root_buffer_size);
 };
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/runtime/vulkan/runtime.cpp b/taichi/runtime/gfx/runtime.cpp
similarity index 94%
rename from taichi/runtime/vulkan/runtime.cpp
rename to taichi/runtime/gfx/runtime.cpp
index 3a4d199824a13..d47e19c71ce46 100644
--- a/taichi/runtime/vulkan/runtime.cpp
+++ b/taichi/runtime/gfx/runtime.cpp
@@ -1,4 +1,4 @@
-#include "taichi/runtime/vulkan/runtime.h"
+#include "taichi/runtime/gfx/runtime.h"
 #include "taichi/program/program.h"
 
 #include <chrono>
@@ -18,28 +18,9 @@
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
 namespace {
-class StopWatch {
- public:
-  StopWatch() : begin_(std::chrono::system_clock::now()) {
-  }
-
-  int get_micros() {
-    typedef std::chrono::duration<float> fsec;
-
-    auto now = std::chrono::system_clock::now();
-
-    fsec fs = now - begin_;
-    begin_ = now;
-    auto d = std::chrono::duration_cast<std::chrono::microseconds>(fs);
-    return d.count();
-  }
-
- private:
-  std::chrono::time_point<std::chrono::system_clock> begin_;
-};
 
 class HostDeviceContextBlitter {
  public:
@@ -123,7 +104,7 @@ class HostDeviceContextBlitter {
             break;
           }
         }
-        TI_ERROR("Vulkan does not support arg type={}",
+        TI_ERROR("Device does not support arg type={}",
                  PrimitiveType::get(arg.dtype).to_string());
       } while (0);
     }
@@ -227,7 +208,7 @@ class HostDeviceContextBlitter {
             continue;
           }
         }
-        TI_ERROR("Vulkan does not support return value type={}",
+        TI_ERROR("Device does not support return value type={}",
                  data_type_name(PrimitiveType::get(ret.dtype)));
       }
     }
@@ -268,7 +249,7 @@ constexpr size_t kGtmpBufferSize = 1024 * 1024;
 constexpr size_t kListGenBufferSize = 32 << 20;
 
 // Info for launching a compiled Taichi kernel, which consists of a series of
-// Vulkan pipelines.
+// Unified Device API pipelines.
 
 CompiledTaichiKernel::CompiledTaichiKernel(const Params &ti_params)
     : ti_kernel_attribs_(*ti_params.ti_kernel_attribs),
@@ -372,14 +353,14 @@ void CompiledTaichiKernel::generate_command_list(
   }
 }
 
-VkRuntime::VkRuntime(const Params &params)
+GfxRuntime::GfxRuntime(const Params &params)
     : device_(params.device), host_result_buffer_(params.host_result_buffer) {
   TI_ASSERT(host_result_buffer_ != nullptr);
   current_cmdlist_pending_since_ = high_res_clock::now();
   init_nonroot_buffers();
 }
 
-VkRuntime::~VkRuntime() {
+GfxRuntime::~GfxRuntime() {
   synchronize();
   {
     decltype(ti_kernels_) tmp;
@@ -388,8 +369,8 @@ VkRuntime::~VkRuntime() {
   global_tmps_buffer_.reset();
 }
 
-VkRuntime::KernelHandle VkRuntime::register_taichi_kernel(
-    VkRuntime::RegisterParams reg_params) {
+GfxRuntime::KernelHandle GfxRuntime::register_taichi_kernel(
+    GfxRuntime::RegisterParams reg_params) {
   CompiledTaichiKernel::Params params;
   params.ti_kernel_attribs = &(reg_params.kernel_attribs);
   params.num_snode_trees = reg_params.num_snode_trees;
@@ -414,7 +395,7 @@ VkRuntime::KernelHandle VkRuntime::register_taichi_kernel(
   return res;
 }
 
-void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
+void GfxRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
   auto *ti_kernel = ti_kernels_[handle.id_].get();
 
   std::unique_ptr<DeviceAllocationGuard> args_buffer{nullptr},
@@ -553,13 +534,13 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
   }
 }
 
-void VkRuntime::synchronize() {
+void GfxRuntime::synchronize() {
   flush();
   device_->wait_idle();
   ctx_buffers_.clear();
 }
 
-StreamSemaphore VkRuntime::flush() {
+StreamSemaphore GfxRuntime::flush() {
   StreamSemaphore sema;
   if (current_cmdlist_) {
     sema = device_->get_compute_stream()->submit(current_cmdlist_.get());
@@ -572,11 +553,11 @@ StreamSemaphore VkRuntime::flush() {
   return sema;
 }
 
-Device *VkRuntime::get_ti_device() const {
+Device *GfxRuntime::get_ti_device() const {
   return device_;
 }
 
-void VkRuntime::init_nonroot_buffers() {
+void GfxRuntime::init_nonroot_buffers() {
   global_tmps_buffer_ = device_->allocate_memory_unique(
       {kGtmpBufferSize,
        /*host_write=*/false, /*host_read=*/false,
@@ -598,7 +579,7 @@ void VkRuntime::init_nonroot_buffers() {
   stream->submit_synced(cmdlist.get());
 }
 
-void VkRuntime::add_root_buffer(size_t root_buffer_size) {
+void GfxRuntime::add_root_buffer(size_t root_buffer_size) {
   if (root_buffer_size == 0) {
     root_buffer_size = 4;  // there might be empty roots
   }
@@ -617,14 +598,14 @@ void VkRuntime::add_root_buffer(size_t root_buffer_size) {
   root_buffers_size_map_[root_buffers_.back().get()] = root_buffer_size;
 }
 
-DeviceAllocation *VkRuntime::get_root_buffer(int id) const {
+DeviceAllocation *GfxRuntime::get_root_buffer(int id) const {
   if (id >= root_buffers_.size()) {
     TI_ERROR("root buffer id {} not found", id);
   }
   return root_buffers_[id].get();
 }
 
-size_t VkRuntime::get_root_buffer_size(int id) const {
+size_t GfxRuntime::get_root_buffer_size(int id) const {
   auto it = root_buffers_size_map_.find(root_buffers_[id].get());
   if (id >= root_buffers_.size() || it == root_buffers_size_map_.end()) {
     TI_ERROR("root buffer id {} not found", id);
@@ -632,7 +613,7 @@ size_t VkRuntime::get_root_buffer_size(int id) const {
   return it->second;
 }
 
-VkRuntime::RegisterParams run_codegen(
+GfxRuntime::RegisterParams run_codegen(
     Kernel *kernel,
     Device *device,
     const std::vector<CompiledSNodeStructs> &compiled_structs) {
@@ -647,12 +628,12 @@ VkRuntime::RegisterParams run_codegen(
   params.enable_spv_opt =
       kernel->program->config.external_optimization_level > 0;
   spirv::KernelCodegen codegen(params);
-  VkRuntime::RegisterParams res;
+  GfxRuntime::RegisterParams res;
   codegen.run(res.kernel_attribs, res.task_spirv_source_codes);
   res.num_snode_trees = compiled_structs.size();
   return res;
 }
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/runtime/vulkan/runtime.h b/taichi/runtime/gfx/runtime.h
similarity index 93%
rename from taichi/runtime/vulkan/runtime.h
rename to taichi/runtime/gfx/runtime.h
index c4f356783bcea..cbe1a48d758d1 100644
--- a/taichi/runtime/vulkan/runtime.h
+++ b/taichi/runtime/gfx/runtime.h
@@ -14,7 +14,7 @@
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
 using namespace taichi::lang::spirv;
 
@@ -73,20 +73,20 @@ class CompiledTaichiKernel {
   std::vector<std::unique_ptr<Pipeline>> pipelines_;
 };
 
-class TI_DLL_EXPORT VkRuntime {
+class TI_DLL_EXPORT GfxRuntime {
  public:
   struct Params {
     uint64_t *host_result_buffer{nullptr};
     Device *device{nullptr};
   };
 
-  explicit VkRuntime(const Params &params);
+  explicit GfxRuntime(const Params &params);
   // To make Pimpl + std::unique_ptr work
-  ~VkRuntime();
+  ~GfxRuntime();
 
   class KernelHandle {
    private:
-    friend class VkRuntime;
+    friend class GfxRuntime;
     int id_ = -1;
   };
 
@@ -113,7 +113,7 @@ class TI_DLL_EXPORT VkRuntime {
   size_t get_root_buffer_size(int id) const;
 
  private:
-  friend class taichi::lang::vulkan::SNodeTreeManager;
+  friend class taichi::lang::gfx::SNodeTreeManager;
 
   void init_nonroot_buffers();
 
@@ -135,11 +135,11 @@ class TI_DLL_EXPORT VkRuntime {
   std::unordered_map<DeviceAllocation *, size_t> root_buffers_size_map_;
 };
 
-VkRuntime::RegisterParams run_codegen(
+GfxRuntime::RegisterParams run_codegen(
     Kernel *kernel,
     Device *device,
     const std::vector<CompiledSNodeStructs> &compiled_structs);
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/snode_tree_manager.cpp b/taichi/runtime/gfx/snode_tree_manager.cpp
similarity index 73%
rename from taichi/backends/vulkan/snode_tree_manager.cpp
rename to taichi/runtime/gfx/snode_tree_manager.cpp
index b7d4816ae42d9..0ace8e561304d 100644
--- a/taichi/backends/vulkan/snode_tree_manager.cpp
+++ b/taichi/runtime/gfx/snode_tree_manager.cpp
@@ -1,17 +1,17 @@
-#include "taichi/backends/vulkan/snode_tree_manager.h"
+#include "taichi/runtime/gfx/snode_tree_manager.h"
 
-#include "taichi/runtime/vulkan/runtime.h"
+#include "taichi/runtime/gfx/runtime.h"
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
-SNodeTreeManager::SNodeTreeManager(VkRuntime *rtm) : runtime_(rtm) {
+SNodeTreeManager::SNodeTreeManager(GfxRuntime *rtm) : runtime_(rtm) {
 }
 
 void SNodeTreeManager::materialize_snode_tree(SNodeTree *tree) {
   auto *const root = tree->root();
-  CompiledSNodeStructs compiled_structs = vulkan::compile_snode_structs(*root);
+  CompiledSNodeStructs compiled_structs = compile_snode_structs(*root);
   runtime_->add_root_buffer(compiled_structs.root_size);
   compiled_snode_structs_.push_back(compiled_structs);
 }
@@ -33,6 +33,6 @@ DevicePtr SNodeTreeManager::get_snode_tree_device_ptr(int tree_id) {
   return runtime_->root_buffers_[tree_id]->get_ptr();
 }
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/backends/vulkan/snode_tree_manager.h b/taichi/runtime/gfx/snode_tree_manager.h
similarity index 79%
rename from taichi/backends/vulkan/snode_tree_manager.h
rename to taichi/runtime/gfx/snode_tree_manager.h
index d946c308632b9..3add69e6c4a41 100644
--- a/taichi/backends/vulkan/snode_tree_manager.h
+++ b/taichi/runtime/gfx/snode_tree_manager.h
@@ -8,12 +8,12 @@
 
 namespace taichi {
 namespace lang {
-namespace vulkan {
+namespace gfx {
 
-class VkRuntime;
+class GfxRuntime;
 
 /**
- * @brief Manages the SNodeTrees for the Vulkan backend.
+ * @brief Manages the SNodeTrees for the underlying backend.
  *
  */
 class SNodeTreeManager {
@@ -21,7 +21,7 @@ class SNodeTreeManager {
   using CompiledSNodeStructs = taichi::lang::spirv::CompiledSNodeStructs;
 
  public:
-  explicit SNodeTreeManager(VkRuntime *rtm);
+  explicit SNodeTreeManager(GfxRuntime *rtm);
 
   const std::vector<CompiledSNodeStructs> &get_compiled_structs() const {
     return compiled_snode_structs_;
@@ -34,10 +34,10 @@ class SNodeTreeManager {
   DevicePtr get_snode_tree_device_ptr(int tree_id);
 
  private:
-  VkRuntime *const runtime_;
+  GfxRuntime *const runtime_;
   std::vector<CompiledSNodeStructs> compiled_snode_structs_;
 };
 
-}  // namespace vulkan
+}  // namespace gfx
 }  // namespace lang
 }  // namespace taichi
diff --git a/taichi/runtime/opengl/opengl_api.cpp b/taichi/runtime/opengl/opengl_api.cpp
index d4c0c93667592..daa6144e398bd 100644
--- a/taichi/runtime/opengl/opengl_api.cpp
+++ b/taichi/runtime/opengl/opengl_api.cpp
@@ -461,12 +461,12 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
   for (const auto &task : program_.tasks) {
     auto binder = compiled_pipeline_[i]->resource_binder();
     auto &core_bufs = runtime->impl->core_bufs;
-    binder->buffer(0, static_cast<int>(GLBufId::Runtime), core_bufs.runtime);
+    binder->rw_buffer(0, static_cast<int>(GLBufId::Runtime), core_bufs.runtime);
     if (program_.used.buf_data)
-      binder->buffer(0, static_cast<int>(GLBufId::Root), core_bufs.root);
-    binder->buffer(0, static_cast<int>(GLBufId::Gtmp), core_bufs.gtmp);
+      binder->rw_buffer(0, static_cast<int>(GLBufId::Root), core_bufs.root);
+    binder->rw_buffer(0, static_cast<int>(GLBufId::Gtmp), core_bufs.gtmp);
     if (program_.args_buf_size || program_.ret_buf_size)
-      binder->buffer(0, static_cast<int>(GLBufId::Args), *args_buf_);
+      binder->rw_buffer(0, static_cast<int>(GLBufId::Args), *args_buf_);
     // TODO: properly assert and throw if we bind more than allowed SSBOs.
     //       On most devices this number is 8. But I need to look up how
     //       to query this information so currently this is thrown from OpenGl.
@@ -475,9 +475,9 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
         DeviceAllocation *ptr =
             static_cast<DeviceAllocation *>((void *)ctx.args[arg_id]);
 
-        binder->buffer(0, bind_id, *ptr);
+        binder->rw_buffer(0, bind_id, *ptr);
       } else {
-        binder->buffer(0, bind_id, ext_arr_bufs_[arg_id]);
+        binder->rw_buffer(0, bind_id, ext_arr_bufs_[arg_id]);
       }
     }
 
diff --git a/taichi/runtime/vulkan/CMakeLists.txt b/taichi/runtime/vulkan/CMakeLists.txt
deleted file mode 100644
index 00ecee7a09caf..0000000000000
--- a/taichi/runtime/vulkan/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-# ./taichi/runtime/vulkan/CMakeLists.txt
-
-add_library(vulkan_runtime)
-target_sources(vulkan_runtime
-  PRIVATE
-    runtime.cpp
-  )
-target_include_directories(vulkan_runtime
-  PRIVATE
-    ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include
-    ${PROJECT_SOURCE_DIR}/external/eigen
-    ${PROJECT_SOURCE_DIR}/external/FP16/include
-  )
diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp
index e0c0211fb195a..06d36a7515edb 100644
--- a/tests/cpp/aot/aot_save_load_test.cpp
+++ b/tests/cpp/aot/aot_save_load_test.cpp
@@ -7,8 +7,8 @@
 #include "tests/cpp/program/test_program.h"
 #include "taichi/aot/graph_data.h"
 #include "taichi/program/graph_builder.h"
+#include "taichi/runtime/gfx/aot_module_loader_impl.h"
 #ifdef TI_WITH_VULKAN
-#include "taichi/backends/vulkan/aot_module_loader_impl.h"
 #include "taichi/backends/device.h"
 #include "taichi/backends/vulkan/vulkan_device.h"
 #include "taichi/backends/vulkan/vulkan_device_creator.h"
@@ -121,7 +121,7 @@ using namespace lang;
 
 #ifdef TI_WITH_VULKAN
 [[maybe_unused]] static void write_devalloc(
-    taichi::lang::vulkan::VkRuntime *vulkan_runtime,
+    taichi::lang::gfx::GfxRuntime *vulkan_runtime,
     taichi::lang::DeviceAllocation &alloc,
     const void *data,
     size_t size) {
@@ -132,7 +132,7 @@ using namespace lang;
 }
 
 [[maybe_unused]] static void load_devalloc(
-    taichi::lang::vulkan::VkRuntime *vulkan_runtime,
+    taichi::lang::gfx::GfxRuntime *vulkan_runtime,
     taichi::lang::DeviceAllocation &alloc,
     void *data,
     size_t size) {
@@ -169,14 +169,14 @@ TEST(AotSaveLoad, Vulkan) {
       std::make_unique<taichi::lang::vulkan::VulkanDeviceCreator>(evd_params);
 
   // Create Vulkan runtime
-  vulkan::VkRuntime::Params params;
+  gfx::GfxRuntime::Params params;
   params.host_result_buffer = result_buffer;
   params.device = embedded_device->device();
   auto vulkan_runtime =
-      std::make_unique<taichi::lang::vulkan::VkRuntime>(std::move(params));
+      std::make_unique<taichi::lang::gfx::GfxRuntime>(std::move(params));
 
   // Run AOT module loader
-  vulkan::AotModuleParams mod_params;
+  gfx::AotModuleParams mod_params;
   mod_params.module_path = ".";
   mod_params.runtime = vulkan_runtime.get();
 
@@ -242,14 +242,14 @@ TEST(AotSaveLoad, VulkanNdarray) {
       std::make_unique<taichi::lang::vulkan::VulkanDeviceCreator>(evd_params);
 
   // Create Vulkan runtime
-  vulkan::VkRuntime::Params params;
+  gfx::GfxRuntime::Params params;
   params.host_result_buffer = result_buffer;
   params.device = embedded_device->device();
   auto vulkan_runtime =
-      std::make_unique<taichi::lang::vulkan::VkRuntime>(std::move(params));
+      std::make_unique<taichi::lang::gfx::GfxRuntime>(std::move(params));
 
   // Run AOT module loader
-  vulkan::AotModuleParams mod_params;
+  gfx::AotModuleParams mod_params;
   mod_params.module_path = ".";
   mod_params.runtime = vulkan_runtime.get();
 
@@ -352,14 +352,14 @@ TEST(AotLoadGraph, Vulkan) {
       static_cast<taichi::lang::vulkan::VulkanDevice *>(
           embedded_device->device());
   // Create Vulkan runtime
-  vulkan::VkRuntime::Params params;
+  gfx::GfxRuntime::Params params;
   params.host_result_buffer = result_buffer;
   params.device = device_;
   auto vulkan_runtime =
-      std::make_unique<taichi::lang::vulkan::VkRuntime>(std::move(params));
+      std::make_unique<taichi::lang::gfx::GfxRuntime>(std::move(params));
 
   // Run AOT module loader
-  vulkan::AotModuleParams mod_params;
+  gfx::AotModuleParams mod_params;
   mod_params.module_path = ".";
   mod_params.runtime = vulkan_runtime.get();
 
diff --git a/tests/cpp/backends/dx11_device_test.cpp b/tests/cpp/backends/dx11_device_test.cpp
index a8584170ee75b..d35b62a015585 100644
--- a/tests/cpp/backends/dx11_device_test.cpp
+++ b/tests/cpp/backends/dx11_device_test.cpp
@@ -127,8 +127,8 @@ TEST(Dx11ProgramTest, MaterializeRuntimeTest) {
   /*
   This test needs allocate_memory because of the call stack here:
   Dx11ProgramImpl::materialize_runtime
-  - VkRuntime::VkRuntime
-     - VkRuntime::init_buffers
+  - GfxRuntime::GfxRuntime
+     - GfxRuntime::init_buffers
         - Dx11Device::allocate_memory_unique
         - Dx11Device::get_compute_stream
         - Dx11Stream::new_command_list
diff --git a/tests/python/test_ad_basics.py b/tests/python/test_ad_basics.py
index a870e643a6d9f..7d4191760baaa 100644
--- a/tests/python/test_ad_basics.py
+++ b/tests/python/test_ad_basics.py
@@ -90,7 +90,7 @@ def test_poly(tifunc):
     (lambda x: ti.asin(x), lambda x: np.arcsin(x)),
 ])
 @if_has_autograd
-@test_utils.test(exclude=[ti.vulkan])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11])
 def test_trigonometric(tifunc, npfunc):
     grad_test(tifunc, npfunc)
 
@@ -319,7 +319,7 @@ def work():
     assert 'RandStmt not supported' in e.value.args[0]
 
 
-@test_utils.test(exclude=[ti.cc, ti.vulkan, ti.opengl])
+@test_utils.test(exclude=[ti.cc, ti.vulkan, ti.opengl, ti.dx11])
 def test_ad_frac():
     @ti.func
     def frac(x):
diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py
index f93e780494b4d..fbde26b3a740e 100644
--- a/tests/python/test_clear_all_gradients.py
+++ b/tests/python/test_clear_all_gradients.py
@@ -4,7 +4,7 @@
 from tests import test_utils
 
 
-@test_utils.test(exclude=[ti.vulkan])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11])
 def test_clear_all_gradients():
     x = ti.field(ti.f32)
     y = ti.field(ti.f32)
diff --git a/tests/python/test_element_wise.py b/tests/python/test_element_wise.py
index 642d7cb606e7f..cecbccc9fb6b6 100644
--- a/tests/python/test_element_wise.py
+++ b/tests/python/test_element_wise.py
@@ -11,7 +11,7 @@ def _c_mod(a, b):
 
 @pytest.mark.parametrize('lhs_is_mat,rhs_is_mat', [(True, True), (True, False),
                                                    (False, True)])
-@test_utils.test(fast_math=False, exclude=[ti.vulkan])
+@test_utils.test(fast_math=False, exclude=[ti.vulkan, ti.dx11])
 def test_binary_f(lhs_is_mat, rhs_is_mat):
     x = ti.Matrix.field(3, 2, ti.f32, 16)
     if lhs_is_mat:
diff --git a/tests/python/test_f16.py b/tests/python/test_f16.py
index 2cf9728b65b7d..b27526546ad6e 100644
--- a/tests/python/test_f16.py
+++ b/tests/python/test_f16.py
@@ -102,7 +102,7 @@ def init():
 
 
 @pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.')
-@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan)
+@test_utils.test(arch=archs_support_f16, exclude=[ti.vulkan, ti.dx11])
 def test_to_paddle():
     import paddle
     n = 16
@@ -123,7 +123,7 @@ def init():
 
 
 @pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.')
-@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan)
+@test_utils.test(arch=archs_support_f16, exclude=[ti.vulkan, ti.dx11])
 def test_from_paddle():
     import paddle
     n = 16
diff --git a/tests/python/test_fields_builder.py b/tests/python/test_fields_builder.py
index 590fc2986e27a..888f8773be12b 100644
--- a/tests/python/test_fields_builder.py
+++ b/tests/python/test_fields_builder.py
@@ -38,7 +38,7 @@ def assign_field_multiple():
         assert x[i] == i
 
 
-@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal])
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.dx11, ti.metal])
 def test_fields_builder_dense():
     shape = 5
     fb1 = ti.FieldsBuilder()
@@ -142,7 +142,7 @@ def assign_field_multiple_struct_for():
 # See https://docs.taichi-lang.org/docs/type#primitive-types for more details.
 @pytest.mark.parametrize('test_1d_size', [1, 10, 100])
 @pytest.mark.parametrize('field_type', [ti.f32, ti.i32])
-@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal])
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.dx11, ti.metal])
 def test_fields_builder_destroy(test_1d_size, field_type):
     def test_for_single_destroy_multi_fields():
         fb = ti.FieldsBuilder()
@@ -180,7 +180,7 @@ def test_for_raise_destroy_twice():
             c.destroy()
 
 
-@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan])
+@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.dx11])
 def test_field_initialize_zero():
     fb0 = ti.FieldsBuilder()
     a = ti.field(ti.i32)
diff --git a/tests/python/test_loop_grad.py b/tests/python/test_loop_grad.py
index ab209613430a2..11482fb39384c 100644
--- a/tests/python/test_loop_grad.py
+++ b/tests/python/test_loop_grad.py
@@ -2,7 +2,7 @@
 from tests import test_utils
 
 
-@test_utils.test(exclude=[ti.vulkan])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11])
 def test_loop_grad():
     x = ti.field(ti.f32)
 
@@ -32,7 +32,7 @@ def func():
             assert x.grad[k, i] == 2**(m - 1 - i)
 
 
-@test_utils.test(exclude=[ti.vulkan])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11])
 def test_loop_grad_complex():
     return  # This case is not supported yet
     x = ti.field(ti.f32)
diff --git a/tests/python/test_print.py b/tests/python/test_print.py
index 3e62d19199378..e9f0943123755 100644
--- a/tests/python/test_print.py
+++ b/tests/python/test_print.py
@@ -23,7 +23,8 @@ def func():
 
 # TODO: As described by @k-ye above, what we want to ensure
 #       is that, the content shows on console is *correct*.
-@test_utils.test(exclude=[ti.vulkan])  # TODO(changyu): enable ti.vulkan
+@test_utils.test(exclude=[ti.vulkan,
+                          ti.dx11])  # TODO(changyu): enable ti.vulkan
 def test_multi_print():
     @ti.kernel
     def func(x: ti.i32, y: ti.f32):
@@ -33,7 +34,8 @@ def func(x: ti.i32, y: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.vulkan])  # TODO(changyu): enable ti.vulkan
+@test_utils.test(exclude=[ti.vulkan,
+                          ti.dx11])  # TODO(changyu): enable ti.vulkan
 def test_print_string():
     @ti.kernel
     def func(x: ti.i32, y: ti.f32):
@@ -45,7 +47,8 @@ def func(x: ti.i32, y: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.vulkan])  # TODO(changyu): enable ti.vulkan
+@test_utils.test(exclude=[ti.vulkan,
+                          ti.dx11])  # TODO(changyu): enable ti.vulkan
 def test_print_matrix():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=())
     y = ti.Vector.field(3, dtype=ti.f32, shape=3)
@@ -61,7 +64,8 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.vulkan])  # TODO(changyu): enable ti.vulkan
+@test_utils.test(exclude=[ti.vulkan,
+                          ti.dx11])  # TODO(changyu): enable ti.vulkan
 def test_print_sep_end():
     @ti.kernel
     def func():
@@ -81,7 +85,8 @@ def func():
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.vulkan])  # TODO(changyu): enable ti.vulkan
+@test_utils.test(exclude=[ti.vulkan,
+                          ti.dx11])  # TODO(changyu): enable ti.vulkan
 def test_print_multiple_threads():
     x = ti.field(dtype=ti.f32, shape=(128, ))
 
@@ -97,7 +102,8 @@ def func(k: ti.f32):
     ti.sync()
 
 
-@test_utils.test(exclude=[ti.vulkan])  # TODO(changyu): enable ti.vulkan
+@test_utils.test(exclude=[ti.vulkan,
+                          ti.dx11])  # TODO(changyu): enable ti.vulkan
 def test_print_list():
     x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3))
     y = ti.Vector.field(3, dtype=ti.f32, shape=())
diff --git a/tests/python/test_reduction.py b/tests/python/test_reduction.py
index 88e7f93d097f5..a1e62a741d769 100644
--- a/tests/python/test_reduction.py
+++ b/tests/python/test_reduction.py
@@ -33,8 +33,9 @@
 
 def _test_reduction_single(dtype, criterion, op):
     N = 1024 * 1024
-    if (ti.lang.impl.current_cfg().arch == ti.opengl or
-            ti.lang.impl.current_cfg().arch == ti.vulkan) and dtype == ti.f32:
+    if (ti.lang.impl.current_cfg().arch == ti.opengl
+            or ti.lang.impl.current_cfg().arch == ti.vulkan
+            or ti.lang.impl.current_cfg().arch == ti.dx11) and dtype == ti.f32:
         # OpenGL/Vulkan are not capable of such large number in its float32...
         N = 1024 * 16
 
diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py
index e6f7b240f91f6..e499f41a190b7 100644
--- a/tests/python/test_struct.py
+++ b/tests/python/test_struct.py
@@ -61,7 +61,7 @@ def test_linear_nested_aos():
         assert y[i] == i + 123
 
 
-@test_utils.test(exclude=[ti.vulkan])
+@test_utils.test(exclude=[ti.vulkan, ti.dx11])
 def test_2d_nested():
     x = ti.field(ti.i32)
 
diff --git a/tests/python/test_torch_io.py b/tests/python/test_torch_io.py
index 1f05bfdcb1f3e..0c96605da431a 100644
--- a/tests/python/test_torch_io.py
+++ b/tests/python/test_torch_io.py
@@ -11,7 +11,7 @@
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io_devices():
     n = 32
     x = ti.field(dtype=ti.i32, shape=n)
@@ -48,7 +48,7 @@ def store(y: ti.types.ndarray()):
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io():
     n = 32
 
@@ -88,7 +88,7 @@ def backward(ctx, outp_grad):
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io_2d():
     n = 32
 
@@ -112,7 +112,7 @@ def forward(ctx, inp):
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io_3d():
     n = 16
 
@@ -138,7 +138,7 @@ def forward(ctx, inp):
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io_simple():
     n = 32
 
@@ -165,7 +165,7 @@ def test_io_simple():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io_zeros():
     mat = ti.Matrix.field(2, 6, dtype=ti.f32, shape=(), needs_grad=True)
     zeros = torch.zeros((2, 6))
@@ -179,7 +179,7 @@ def test_io_zeros():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_io_struct():
     n = 16
     x1 = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, ))
@@ -199,7 +199,7 @@ def test_io_struct():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_fused_kernels():
     n = 12
     X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n))
@@ -211,7 +211,7 @@ def test_fused_kernels():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_device():
     n = 12
     X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n))
@@ -222,7 +222,7 @@ def test_device():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_shape_matrix():
     n = 12
     x = ti.Matrix.field(3, 2, ti.f32, shape=(n, n))
@@ -242,7 +242,7 @@ def test_shape_matrix():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_shape_vector():
     n = 12
     x = ti.Vector.field(3, ti.f32, shape=(n, n))
@@ -261,7 +261,7 @@ def test_shape_vector():
 
 
 @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.')
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_torch_zero():
     @ti.kernel
     def test_torch(arr: ti.types.ndarray()):
diff --git a/tests/python/test_types.py b/tests/python/test_types.py
index 2d1cb785136be..893112ce546c8 100644
--- a/tests/python/test_types.py
+++ b/tests/python/test_types.py
@@ -20,13 +20,14 @@ def func(value: dt):
 
 
 @pytest.mark.parametrize('dt', _TI_TYPES)
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_type_assign_argument(dt):
     _test_type_assign_argument(dt)
 
 
 @pytest.mark.parametrize('dt', _TI_64_TYPES)
-@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11],
+                 require=ti.extension.data64)
 def test_type_assign_argument64(dt):
     _test_type_assign_argument(dt)
 
@@ -52,13 +53,14 @@ def func():
 
 
 @pytest.mark.parametrize('dt', _TI_TYPES)
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_type_operator(dt):
     _test_type_operator(dt)
 
 
 @pytest.mark.parametrize('dt', _TI_64_TYPES)
-@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11],
+                 require=ti.extension.data64)
 def test_type_operator64(dt):
     _test_type_operator(dt)
 
@@ -77,13 +79,14 @@ def func(i: ti.i32, j: ti.i32):
 
 
 @pytest.mark.parametrize('dt', _TI_TYPES)
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_type_field(dt):
     _test_type_field(dt)
 
 
 @pytest.mark.parametrize('dt', _TI_64_TYPES)
-@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11],
+                 require=ti.extension.data64)
 def test_type_field64(dt):
     _test_type_field(dt)
 
@@ -119,7 +122,7 @@ def func():
     (ti.i32, 32),
     (ti.u32, 32),
 ])
-@test_utils.test(exclude=[ti.opengl, ti.vulkan])
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11])
 def test_overflow(dt, n):
     _test_overflow(dt, n)
 
@@ -128,7 +131,8 @@ def test_overflow(dt, n):
     (ti.i64, 64),
     (ti.u64, 64),
 ])
-@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64)
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11],
+                 require=ti.extension.data64)
 def test_overflow64(dt, n):
     _test_overflow(dt, n)
 
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 9f7d9babb0aa7..c42d230cb886a 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -7,7 +7,7 @@
 
 import pytest
 from taichi._lib import core as _ti_core
-from taichi.lang import cc, cpu, cuda, gpu, metal, opengl, vulkan
+from taichi.lang import cc, cpu, cuda, dx11, gpu, metal, opengl, vulkan
 from taichi.lang.misc import is_arch_supported
 
 import taichi as ti