From 42d4087c4711a1a6dfb58214c266144b3e4037ba Mon Sep 17 00:00:00 2001
From: Xiang Li <python3kgae@outlook.com>
Date: Fri, 7 Oct 2022 19:19:38 -0700
Subject: [PATCH] [dx12] Add ti.dx12. (#6174)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 python/taichi/lang/misc.py                    | 20 +++++++----
 taichi/codegen/codegen.cpp                    |  9 +++++
 taichi/python/export_misc.cpp                 |  9 +++++
 taichi/runtime/llvm/llvm_runtime_executor.cpp |  2 +-
 tests/cpp/aot/llvm/kernel_aot_test.cpp        | 17 ++++++++++
 .../aot/python_scripts/kernel_aot_test1.py    |  2 ++
 tests/python/test_api.py                      | 33 ++++++++++---------
 tests/test_config.json                        |  4 +++
 8 files changed, 72 insertions(+), 24 deletions(-)
diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index 7938c09b3fcc0..53956ce9ebc2c 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -149,9 +149,14 @@
 """
 # ----------------------
 
-gpu = [cuda, metal, vulkan, opengl, dx11]
+dx12 = _ti_core.dx12
+"""The DX12 backend.
+"""
+# ----------------------
+
+gpu = [cuda, metal, vulkan, opengl, dx11, dx12]
 """A list of GPU backends supported on the current system.
-Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11'.
+Currently contains 'cuda', 'metal', 'opengl', 'vulkan', 'dx11', 'dx12'.
 
 When this is used, Taichi automatically picks the matching GPU backend. If no
 GPU is detected, Taichi falls back to the CPU backend.
@@ -726,6 +731,7 @@ def is_arch_supported(arch, use_gles=False):
         cc: _ti_core.with_cc,
         vulkan: _ti_core.with_vulkan,
         dx11: _ti_core.with_dx11,
+        dx12: _ti_core.with_dx12,
         wasm: lambda: True,
         cpu: lambda: True,
     }
@@ -765,9 +771,9 @@ def get_compute_stream_device_time_elapsed_us() -> float:
 
 __all__ = [
     'i', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'j', 'jk', 'jkl', 'jl',
-    'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'wasm', 'arm64', 'cc', 'cpu',
-    'cuda', 'gpu', 'metal', 'opengl', 'vulkan', 'extension', 'loop_config',
-    'global_thread_idx', 'assume_in_range', 'block_local', 'cache_read_only',
-    'init', 'mesh_local', 'no_activate', 'reset', 'mesh_patch_idx',
-    'get_compute_stream_device_time_elapsed_us'
+    'k', 'kl', 'l', 'x86_64', 'x64', 'dx11', 'dx12', 'wasm', 'arm64', 'cc',
+    'cpu', 'cuda', 'gpu', 'metal', 'opengl', 'vulkan', 'extension',
+    'loop_config', 'global_thread_idx', 'assume_in_range', 'block_local',
+    'cache_read_only', 'init', 'mesh_local', 'no_activate', 'reset',
+    'mesh_patch_idx', 'get_compute_stream_device_time_elapsed_us'
 ]
diff --git a/taichi/codegen/codegen.cpp b/taichi/codegen/codegen.cpp
index 674cdd18d21ad..7a617a3928643 100644
--- a/taichi/codegen/codegen.cpp
+++ b/taichi/codegen/codegen.cpp
@@ -12,6 +12,9 @@
 #if defined(TI_WITH_CUDA)
 #include "taichi/codegen/cuda/codegen_cuda.h"
 #endif
+#if defined(TI_WITH_DX12)
+#include "taichi/codegen/dx12/codegen_dx12.h"
+#endif
 #include "taichi/system/timer.h"
 #include "taichi/ir/analysis.h"
 #include "taichi/ir/transforms.h"
@@ -47,6 +50,12 @@ std::unique_ptr<KernelCodeGen> KernelCodeGen::create(Arch arch,
     return std::make_unique<KernelCodeGenCUDA>(kernel, stmt);
 #else
     TI_NOT_IMPLEMENTED
+#endif
+  } else if (arch == Arch::dx12) {
+#if defined(TI_WITH_DX12)
+    return std::make_unique<KernelCodeGenDX12>(kernel, stmt);
+#else
+    TI_NOT_IMPLEMENTED
 #endif
   } else {
     TI_NOT_IMPLEMENTED
diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp
index 37ca97f06b45c..3e08a5df69b95 100644
--- a/taichi/python/export_misc.cpp
+++ b/taichi/python/export_misc.cpp
@@ -32,6 +32,10 @@
 #include "taichi/rhi/opengl/opengl_api.h"
 #endif
 
+#ifdef TI_WITH_DX12
+#include "taichi/rhi/dx12/dx12_api.h"
+#endif
+
 #ifdef TI_WITH_CC
 namespace taichi::lang::cccp {
 extern bool is_c_backend_available();
@@ -163,6 +167,11 @@ void export_misc(py::module &m) {
 #else
   m.def("with_dx11", []() { return false; });
 #endif
+#ifdef TI_WITH_DX12
+  m.def("with_dx12", taichi::lang::directx12::is_dx12_api_available);
+#else
+  m.def("with_dx12", []() { return false; });
+#endif
 
 #ifdef TI_WITH_CC
   m.def("with_cc", taichi::lang::cccp::is_c_backend_available);
diff --git a/taichi/runtime/llvm/llvm_runtime_executor.cpp b/taichi/runtime/llvm/llvm_runtime_executor.cpp
index a22a869eafd9d..6cc2a7876bc36 100644
--- a/taichi/runtime/llvm/llvm_runtime_executor.cpp
+++ b/taichi/runtime/llvm/llvm_runtime_executor.cpp
@@ -120,7 +120,7 @@ LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
     llvm_context_device_ =
         std::make_unique<TaichiLLVMContext>(config_, Arch::dx12);
     // FIXME: add dx12 JIT.
-    // llvm_context_device_->init_runtime_jit_module();
+    llvm_context_device_->init_runtime_jit_module();
   }
 #endif
 
diff --git a/tests/cpp/aot/llvm/kernel_aot_test.cpp b/tests/cpp/aot/llvm/kernel_aot_test.cpp
index bb177dfd37aa9..ba026a39ab5e2 100644
--- a/tests/cpp/aot/llvm/kernel_aot_test.cpp
+++ b/tests/cpp/aot/llvm/kernel_aot_test.cpp
@@ -5,6 +5,7 @@
 #include "taichi/system/memory_pool.h"
 #include "taichi/runtime/cpu/aot_module_loader_impl.h"
 #include "taichi/runtime/cuda/aot_module_loader_impl.h"
+#include "taichi/runtime/dx12/aot_module_loader_impl.h"
 #include "taichi/rhi/cuda/cuda_driver.h"
 #include "taichi/platform/cuda/detect_cuda.h"
 
@@ -101,4 +102,20 @@ TEST(LlvmAotTest, CudaKernel) {
   }
 }
 
+#ifdef TI_WITH_DX12
+// Loads the AOT module at $TAICHI_AOT_FOLDER_PATH; expects kernel "run".
+TEST(LlvmAotTest, DX12Kernel) {
+  directx12::AotModuleParams aot_params;
+  const auto folder_dir = getenv("TAICHI_AOT_FOLDER_PATH");
+  // getenv() yields null when the variable is unset; fail fast in that case.
+  ASSERT_TRUE(folder_dir != nullptr);
+  aot_params.module_path = std::string(folder_dir);
+  // FIXME: add executor.
+  auto mod = directx12::make_aot_module(aot_params, Arch::dx12);
+  auto *k_run = mod->get_kernel("run");
+  EXPECT_TRUE(k_run);
+  // FIXME: launch the kernel and check result.
+}
+#endif
+
 }  // namespace taichi::lang
diff --git a/tests/cpp/aot/python_scripts/kernel_aot_test1.py b/tests/cpp/aot/python_scripts/kernel_aot_test1.py
index b38130802acdc..f3822708f4a78 100644
--- a/tests/cpp/aot/python_scripts/kernel_aot_test1.py
+++ b/tests/cpp/aot/python_scripts/kernel_aot_test1.py
@@ -38,5 +38,7 @@ def run(base: int, arr: ti.types.ndarray()):
         compile_kernel_aot_test1(arch=ti.vulkan)
     elif args.arch == "opengl":
         compile_kernel_aot_test1(arch=ti.opengl)
+    elif args.arch == "dx12":
+        compile_kernel_aot_test1(arch=ti.dx12)
     else:
         assert False
diff --git a/tests/python/test_api.py b/tests/python/test_api.py
index 7f90cb5831770..dab6a7f927e04 100644
--- a/tests/python/test_api.py
+++ b/tests/python/test_api.py
@@ -72,22 +72,23 @@ def _get_expected_matrix_apis():
     'atomic_sub', 'atomic_xor', 'axes', 'bit_cast', 'bit_shr', 'block_local',
     'cache_read_only', 'cast', 'cc', 'ceil', 'cos', 'cpu', 'cuda',
     'data_oriented', 'dataclass', 'deactivate', 'deactivate_all_snodes',
-    'dx11', 'eig', 'exp', 'experimental', 'extension', 'f16', 'f32', 'f64',
-    'field', 'float16', 'float32', 'float64', 'floor', 'func', 'get_addr',
-    'get_compute_stream_device_time_elapsed_us', 'global_thread_idx', 'gpu',
-    'graph', 'grouped', 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8', 'ij',
-    'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init', 'int16', 'int32', 'int64',
-    'int8', 'is_active', 'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k',
-    'kernel', 'kl', 'l', 'lang', 'length', 'linalg', 'log', 'loop_config',
-    'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray',
-    'ndrange', 'no_activate', 'one', 'opengl', 'polar_decompose', 'pow',
-    'profiler', 'randn', 'random', 'raw_div', 'raw_mod', 'ref',
-    'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round', 'rsqrt', 'select',
-    'set_logging_level', 'simt', 'sin', 'solve', 'sparse_matrix_builder',
-    'sqrt', 'static', 'static_assert', 'static_print', 'stop_grad', 'svd',
-    'swizzle_generator', 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools',
-    'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64',
-    'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero'
+    'dx11', 'dx12', 'eig', 'exp', 'experimental', 'extension', 'f16', 'f32',
+    'f64', 'field', 'float16', 'float32', 'float64', 'floor', 'func',
+    'get_addr', 'get_compute_stream_device_time_elapsed_us',
+    'global_thread_idx', 'gpu', 'graph', 'grouped', 'hex_to_rgb', 'i', 'i16',
+    'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', 'il', 'init',
+    'int16', 'int32', 'int64', 'int8', 'is_active', 'is_logging_effective',
+    'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length',
+    'linalg', 'log', 'loop_config', 'math', 'max', 'mesh_local',
+    'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange', 'no_activate',
+    'one', 'opengl', 'polar_decompose', 'pow', 'profiler', 'randn', 'random',
+    'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex',
+    'root', 'round', 'rsqrt', 'select', 'set_logging_level', 'simt', 'sin',
+    'solve', 'sparse_matrix_builder', 'sqrt', 'static', 'static_assert',
+    'static_print', 'stop_grad', 'svd', 'swizzle_generator', 'sym_eig', 'sync',
+    'tan', 'tanh', 'template', 'tools', 'types', 'u16', 'u32', 'u64', 'u8',
+    'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan', 'wasm', 'x64',
+    'x86_64', 'zero'
 ]
 user_api[ti.ad] = [
     'FwdMode', 'Tape', 'clear_all_gradients', 'grad_for', 'grad_replaced',
diff --git a/tests/test_config.json b/tests/test_config.json
index 14e4cace901f5..7694b932613df 100644
--- a/tests/test_config.json
+++ b/tests/test_config.json
@@ -8,6 +8,10 @@
         ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"],
         "--arch=cuda"
     ],
+    "LlvmAotTest.DX12Kernel": [
+        ["cpp", "aot", "python_scripts", "kernel_aot_test1.py"],
+        "--arch=dx12"
+    ],
     "LlvmAotTest.CpuField": [
         ["cpp", "aot", "python_scripts", "field_aot_test.py"],
         "--arch=cpu"