From e346e9f87b1fcf1151a7135248fd1981d3f477f8 Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 27 Apr 2022 09:07:11 +0800 Subject: [PATCH 001/176] [Build] Switch to scikit-build as the build backend (#4624) * switch to skbuild * Switch the build system to scikit-build * include bc and libmolten * find llvm runtime bc * fix bc files installation * install bc after compile * Add more message * Auto Format * fix findpython * Kickstart CI * add empty line * add missing dependency * fix python args * start CI * Fix clang tidy run * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: Taichi Gardener Co-authored-by: Ailing Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/workflows/scripts/check_clang_tidy.sh | 7 +- .github/workflows/scripts/win_build.ps1 | 1 - .gitignore | 1 + CMakeLists.txt | 2 +- MANIFEST.in | 5 - cmake/PythonNumpyPybind11.cmake | 99 +------- cmake/TaichiCore.cmake | 15 +- pyproject.toml | 6 +- requirements_dev.txt | 3 + scripts/run_clang_tidy.py | 15 +- setup.py | 231 ++++-------------- 11 files changed, 103 insertions(+), 282 deletions(-) diff --git a/.github/workflows/scripts/check_clang_tidy.sh b/.github/workflows/scripts/check_clang_tidy.sh index d9db1c9a3433f..4155421d86716 100755 --- a/.github/workflows/scripts/check_clang_tidy.sh +++ b/.github/workflows/scripts/check_clang_tidy.sh @@ -5,8 +5,5 @@ CI_SETUP_CMAKE_ARGS=$1 cd taichi python3 -m pip install -r requirements_dev.txt -rm -rf build && mkdir build && cd build -cmake $CI_SETUP_CMAKE_ARGS .. - -cd .. 
-python3 ./scripts/run_clang_tidy.py $PWD/taichi -clang-tidy-binary clang-tidy-10 -checks=-*,performance-inefficient-string-concatenation,readability-identifier-naming -header-filter=$PWD/taichi -p $PWD/build -j2 +export CI_SETUP_CMAKE_ARGS +python3 ./scripts/run_clang_tidy.py $PWD/taichi -clang-tidy-binary clang-tidy-10 -checks=-*,performance-inefficient-string-concatenation,readability-identifier-naming -header-filter=$PWD/taichi -j2 diff --git a/.github/workflows/scripts/win_build.ps1 b/.github/workflows/scripts/win_build.ps1 index e58c179ee6952..86ad4243742e5 100644 --- a/.github/workflows/scripts/win_build.ps1 +++ b/.github/workflows/scripts/win_build.ps1 @@ -76,7 +76,6 @@ python -m venv venv . venv\Scripts\activate.ps1 python -m pip install wheel python -m pip install -r requirements_dev.txt -python -m pip install -r requirements_test.txt if (-not $?) { exit 1 } WriteInfo("Building Taichi") $env:TAICHI_CMAKE_ARGS += " -DCLANG_EXECUTABLE=$libsDir\\taichi_clang\\bin\\clang++.exe" diff --git a/.gitignore b/.gitignore index fd39d08f9acea..958272e9a7b4d 100644 --- a/.gitignore +++ b/.gitignore @@ -85,3 +85,4 @@ _build !docs/**/*.json imgui.ini /venv/ +/_skbuild/ diff --git a/CMakeLists.txt b/CMakeLists.txt index 17db545ac8194..7387e8b648b56 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,7 +49,6 @@ else () endif () set(CMAKE_RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") find_program(CCACHE_PROGRAM ccache) if(CCACHE_PROGRAM) @@ -157,6 +156,7 @@ foreach(arch IN LISTS HOST_ARCH CUDA_ARCH) WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm" ) add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}") + install(FILES "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm/runtime_${arch}.bc" DESTINATION ${CMAKE_INSTALL_PREFIX}/python/taichi/_lib/runtime) endforeach() configure_file(taichi/common/version.h.in ${CMAKE_SOURCE_DIR}/taichi/common/version.h) diff --git 
a/MANIFEST.in b/MANIFEST.in index eff0f4f8f88c7..3c1e64aa11c27 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,8 +1,3 @@ -include MANIFEST.in -include version.txt -include python/*.txt -include python/*.py -include *.cfg include python/taichi/*.md recursive-include python/taichi/assets * recursive-include python/taichi/examples *.py diff --git a/cmake/PythonNumpyPybind11.cmake b/cmake/PythonNumpyPybind11.cmake index 311630dba74a8..65a231e04f64b 100644 --- a/cmake/PythonNumpyPybind11.cmake +++ b/cmake/PythonNumpyPybind11.cmake @@ -1,93 +1,16 @@ # Python, numpy, and pybind11 +execute_process(COMMAND ${PYTHON_EXECUTABLE} -m pybind11 --cmake + OUTPUT_VARIABLE pybind11_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) +execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy;print(numpy.get_include())" + OUTPUT_VARIABLE NUMPY_INCLUDE_DIR OUTPUT_STRIP_TRAILING_WHITESPACE) -if (PYTHON_EXECUTABLE) - message("Using ${PYTHON_EXECUTABLE} as python executable.") -else () - if (WIN32) - message("Using 'python' as python interpreter.") - set(PYTHON_EXECUTABLE python) - else () - message("Using 'python3' as python interpreter.") - set(PYTHON_EXECUTABLE python3) - endif() -endif () +message("-- Python: Using ${PYTHON_EXECUTABLE} as the interpreter") +message(" version: ${PYTHON_VERSION_STRING}") +message(" include: ${PYTHON_INCLUDE_DIR}") +message(" library: ${PYTHON_LIBRARY}") +message(" numpy include: ${NUMPY_INCLUDE_DIR}") -if (WIN32) - execute_process(COMMAND where ${PYTHON_EXECUTABLE} - OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATHS) - if (${PYTHON_EXECUTABLE_PATHS}) - string(FIND ${PYTHON_EXECUTABLE_PATHS} "\n" _LINE_BREAK_LOC) - string(SUBSTRING ${PYTHON_EXECUTABLE_PATHS} 0 ${_LINE_BREAK_LOC} PYTHON_EXECUTABLE_PATH) - else () - set(PYTHON_EXECUTABLE_PATH ${PYTHON_EXECUTABLE}) - endif () -else () - execute_process(COMMAND which ${PYTHON_EXECUTABLE} - OUTPUT_VARIABLE PYTHON_EXECUTABLE_PATH) -endif() -execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys;\ - from distutils import 
sysconfig;\ - sys.stdout.write(sysconfig.get_python_version())" - OUTPUT_VARIABLE PYTHON_VERSION) -execute_process(COMMAND ${PYTHON_EXECUTABLE} --version) -execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys;\ - from distutils import sysconfig;\ - sys.stdout.write(\ - (sysconfig.get_config_var('INCLUDEPY')\ - if sysconfig.get_config_var('INCLUDEDIR') is not None else None)\ - or sysconfig.get_python_inc())" - OUTPUT_VARIABLE PYTHON_INCLUDE_DIRS) -execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys;\ - from distutils import sysconfig;\ - sys.stdout.write((sysconfig.get_config_var('LIBDIR') or sysconfig.get_python_lib()).replace('\\\\','/'))" - OUTPUT_VARIABLE PYTHON_LIBRARY_DIR) +include_directories(${NUMPY_INCLUDE_DIR}) -execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys;\ - sys.stdout.write(str(sys.version_info[1]))" - OUTPUT_VARIABLE PYTHON_MINOR_VERSION) - - -if (WIN32) - execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys;sys.stdout.write(sys.base_prefix.replace('\\\\', '/'))" - OUTPUT_VARIABLE PYTHON_BASE_PREFIX) - link_directories(${PYTHON_BASE_PREFIX}/libs) - set(PYTHON_LIBRARIES ${PYTHON_BASE_PREFIX}/libs/python3.lib) - set(PYTHON_LIBRARIES ${PYTHON_BASE_PREFIX}/libs/python3${PYTHON_MINOR_VERSION}.lib) -else() - find_library(PYTHON_LIBRARY NAMES python${PYTHON_VERSION} python${PYTHON_VERSION}m PATHS ${PYTHON_LIBRARY_DIR} - NO_DEFAULT_PATH NO_SYSTEM_ENVIRONMENT_PATH PATH_SUFFIXES x86_64-linux-gnu) - set(PYTHON_LIBRARIES ${PYTHON_LIBRARY}) -endif() - - -include_directories(${PYTHON_INCLUDE_DIRS}) -message(" version: ${PYTHON_VERSION}") -message(" include: ${PYTHON_INCLUDE_DIRS}") -message(" library: ${PYTHON_LIBRARIES}") - -execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import numpy.distutils, sys;\ - sys.stdout.write(':'.join(numpy.distutils.misc_util.get_numpy_include_dirs()))" - OUTPUT_VARIABLE PYTHON_NUMPY_INCLUDE_DIR) - -message(" numpy include: ${PYTHON_NUMPY_INCLUDE_DIR}") 
-include_directories(${PYTHON_NUMPY_INCLUDE_DIR}) - -execute_process(COMMAND ${PYTHON_EXECUTABLE} -c - "import sys; import pybind11; sys.stdout.write(pybind11.get_include() + ';' + pybind11.get_include(True))" - OUTPUT_VARIABLE PYBIND11_INCLUDE_DIR - RESULT_VARIABLE PYBIND11_IMPORT_RET) -if (NOT PYBIND11_IMPORT_RET) - # returns zero if success - message(" pybind11 include: ${PYBIND11_INCLUDE_DIR}") -else () - message(FATAL_ERROR "Cannot import pybind11. Please install. ([sudo] pip3 install --user pybind11)") -endif () - -include_directories(${PYBIND11_INCLUDE_DIR}) +find_package(pybind11 CONFIG REQUIRED) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 9bc218d9cc9ea..687130d9e19d5 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -16,6 +16,7 @@ option(TI_EMSCRIPTENED "Build using emscripten" OFF) # projects. set(CMAKE_CXX_VISIBILITY_PRESET hidden) set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) +set(INSTALL_LIB_DIR ${CMAKE_INSTALL_PREFIX}/python/taichi/_lib) if(ANDROID) set(TI_WITH_VULKAN ON) @@ -384,6 +385,9 @@ if (TI_WITH_VULKAN) find_library(MOLTEN_VK libMoltenVK.dylib PATHS $HOMEBREW_CELLAR/molten-vk $VULKAN_SDK REQUIRED) configure_file(${MOLTEN_VK} ${CMAKE_BINARY_DIR}/libMoltenVK.dylib COPYONLY) message(STATUS "MoltenVK library ${MOLTEN_VK}") + if (EXISTS ${CMAKE_BINARY_DIR}/libMoltenVK.dylib) + install(FILES ${CMAKE_BINARY_DIR}/libMoltenVK.dylib DESTINATION ${INSTALL_LIB_DIR}/runtime) + endif() endif() endif () @@ -437,7 +441,7 @@ if(NOT TI_EMSCRIPTENED) # Cannot compile Python source code with Android, but TI_EXPORT_CORE should be set and # Android should only use the isolated library ignoring those source code. 
if (NOT ANDROID) - add_library(${CORE_WITH_PYBIND_LIBRARY_NAME} SHARED ${TAICHI_PYBIND_SOURCE}) + pybind11_add_module(${CORE_WITH_PYBIND_LIBRARY_NAME} ${TAICHI_PYBIND_SOURCE}) else() add_library(${CORE_WITH_PYBIND_LIBRARY_NAME} SHARED) endif () @@ -459,6 +463,10 @@ if(NOT TI_EMSCRIPTENED) set_target_properties(${CORE_WITH_PYBIND_LIBRARY_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/runtimes") endif () + + install(TARGETS ${CORE_WITH_PYBIND_LIBRARY_NAME} + RUNTIME DESTINATION ${INSTALL_LIB_DIR}/core + LIBRARY DESTINATION ${INSTALL_LIB_DIR}/core) endif() if(TI_EMSCRIPTENED) @@ -487,3 +495,8 @@ endif() target_link_libraries(${CORE_LIBRARY_NAME} imgui) endif() + +if (NOT APPLE) + install(FILES ${CMAKE_SOURCE_DIR}/external/cuda_libdevice/slim_libdevice.10.bc + DESTINATION ${INSTALL_LIB_DIR}/runtime) +endif() diff --git a/pyproject.toml b/pyproject.toml index c4a857f234460..f5d511fad16dc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,9 @@ [build-system] -requires = ["setuptools", "wheel", "numpy", "pybind11", "cmake"] +requires = [ + "setuptools", "wheel", + "numpy", "pybind11", "cmake", + "scikit-build", "ninja; platform_system != 'Windows'", +] build-backend = "setuptools.build_meta" [tool.pytest.ini_options] diff --git a/requirements_dev.txt b/requirements_dev.txt index e8849dc3cb1aa..2cc165f61719d 100644 --- a/requirements_dev.txt +++ b/requirements_dev.txt @@ -14,3 +14,6 @@ twine wheel astunparse pre-commit +scikit-build +numpy +ninja; platform_system != 'Windows' diff --git a/scripts/run_clang_tidy.py b/scripts/run_clang_tidy.py index 229d91bfdf4d0..4e0abeb45fcc4 100644 --- a/scripts/run_clang_tidy.py +++ b/scripts/run_clang_tidy.py @@ -77,6 +77,18 @@ def make_absolute(f, directory): return os.path.normpath(os.path.join(directory, f)) +def cmake_configure(source_path='.'): + import shlex + + from skbuild.cmaker import CMaker + from skbuild.constants import CMAKE_BUILD_DIR + + cmaker = CMaker() + cmake_args = 
shlex.split(os.getenv('CI_SETUP_CMAKE_ARGS', '')) + cmaker.configure(cmake_args) + return CMAKE_BUILD_DIR() + + def get_tidy_invocation(f, clang_tidy_binary, checks, tmpdir, build_path, header_filter, extra_arg, extra_arg_before, quiet, config): @@ -265,8 +277,7 @@ def main(): if args.build_path is not None: build_path = args.build_path else: - # Find our database - build_path = find_compilation_database(db_path) + build_path = cmake_configure('.') try: invocation = [args.clang_tidy_binary, '-list-checks'] diff --git a/setup.py b/setup.py index 2110a34697234..a776e2d18d26f 100644 --- a/setup.py +++ b/setup.py @@ -8,17 +8,17 @@ import glob import multiprocessing import os -import platform import shutil import subprocess import sys from distutils.command.clean import clean from distutils.dir_util import remove_tree -from setuptools import Extension, find_packages, setup -from setuptools.command.build_ext import build_ext -from setuptools.command.build_py import build_py -from setuptools.command.egg_info import egg_info +from setuptools import find_packages +from skbuild import setup +from skbuild.command.egg_info import egg_info + +root_dir = os.path.dirname(os.path.abspath(__file__)) classifiers = [ 'Development Status :: 2 - Pre-Alpha', @@ -58,43 +58,11 @@ def get_version(): # Our python package root dir is python/ package_dir = 'python' -root_dir = os.path.abspath(os.path.dirname(__file__)) - - -def get_python_executable(): - return sys.executable.replace('\\', '/') - - -def get_os_name(): - name = platform.platform() - # in python 3.8, platform.platform() uses mac_ver() on macOS - # it will return 'macOS-XXXX' instead of 'Darwin-XXXX' - if name.lower().startswith('darwin') or name.lower().startswith('macos'): - return 'osx' - elif name.lower().startswith('windows'): - return 'win' - elif name.lower().startswith('linux'): - return 'linux' - elif 'bsd' in name.lower(): - return 'unix' - assert False, "Unknown platform name %s" % name - def remove_tmp(taichi_dir): 
shutil.rmtree(os.path.join(taichi_dir, 'assets'), ignore_errors=True) -def remove_files_with_extension(dir_name, extension): - for file in os.listdir(dir_name): - if file.endswith(extension): - os.remove(os.path.join(dir_name, file)) - - -class CMakeExtension(Extension): - def __init__(self, name): - Extension.__init__(self, name, sources=[]) - - class EggInfo(egg_info): def run(self): taichi_dir = os.path.join(package_dir, 'taichi') @@ -105,148 +73,15 @@ def run(self): egg_info.run(self) -# python setup.py build runs the following commands in order: -# python setup.py build_py -# python setup.py build_ext -class BuildPy(build_py): - def run(self): - build_py.run(self) - taichi_dir = os.path.join(package_dir, 'taichi') - remove_tmp(taichi_dir) - - -class CMakeBuild(build_ext): - def parse_cmake_args_from_env(self): - # Source: TAICHI_CMAKE_ARGS=... python setup.py ... - import shlex - cmake_args = os.getenv('TAICHI_CMAKE_ARGS', '') - return shlex.split(cmake_args.strip()) - - def run(self): - try: - subprocess.check_call(['cmake', '--version']) - except OSError: - raise RuntimeError( - "CMake must be installed to build the following extensions: " + - ", ".join(e.name for e in self.extensions)) - - # CMakeLists.txt is in the same directory as this setup.py file - cmake_list_dir = root_dir - self.build_temp = os.path.join(cmake_list_dir, 'build') - - build_directory = os.path.abspath(self.build_temp) - - cmake_args = self.parse_cmake_args_from_env() - - cmake_args += [ - f'-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={build_directory}', - f'-DPYTHON_EXECUTABLE={get_python_executable()}', - f'-DTI_VERSION_MAJOR={TI_VERSION_MAJOR}', - f'-DTI_VERSION_MINOR={TI_VERSION_MINOR}', - f'-DTI_VERSION_PATCH={TI_VERSION_PATCH}', - ] - - emscriptened = os.getenv('TI_EMSCRIPTENED', '0') in ('1', 'ON') - if emscriptened: - cmake_args += ['-DTI_EMSCRIPTENED=ON'] - - if shutil.which('ninja'): - cmake_args += ['-GNinja'] - - cfg = 'Release' - if (os.getenv('DEBUG', '0') in ('1', 'ON')): - cfg = 
'Debug' - elif (os.getenv('RELWITHDEBINFO', '0') in ('1', 'ON')): - cfg = 'RelWithDebInfo' - elif (os.getenv('MINSIZEREL', '0') in ('1', 'ON')): - cfg = 'MinSizeRel' - - build_args = ['--config', cfg] - - cmake_args += ['-DCMAKE_BUILD_TYPE=' + cfg] - - # Assuming Makefiles - if get_os_name() != 'win': - num_threads = os.getenv('BUILD_NUM_THREADS', - multiprocessing.cpu_count()) - build_args += ['--', f'-j{num_threads}'] - - self.build_args = build_args - - env = os.environ.copy() - os.makedirs(self.build_temp, exist_ok=True) - - print('-' * 10, 'Running CMake prepare', '-' * 40) - print(' '.join(['cmake', cmake_list_dir] + cmake_args)) - subprocess.check_call(['cmake', cmake_list_dir] + cmake_args, - cwd=self.build_temp, - env=env) - - print('-' * 10, 'Building extensions', '-' * 40) - cmake_cmd = ['cmake', '--build', '.'] + self.build_args - subprocess.check_call(cmake_cmd, cwd=self.build_temp) - - self.prepare_package() - - def prepare_package(self): - # We need to make sure these additional files are ready for - # - develop mode: must exist in local python/taichi/lib/ folder - # - install mode: must exist in self.build_lib/taichi/lib - base_dir = package_dir if self.inplace else self.build_lib - taichi_lib_dir = os.path.join(base_dir, 'taichi', '_lib') - - runtime_dir = os.path.join(taichi_lib_dir, "runtime") - core_dir = os.path.join(taichi_lib_dir, "core") - os.makedirs(runtime_dir, exist_ok=True) - os.makedirs(core_dir, exist_ok=True) - - if (get_os_name() == 'linux' or get_os_name() == 'unix' - or get_os_name() == 'osx'): - remove_files_with_extension(core_dir, ".so") - else: - remove_files_with_extension(core_dir, ".pyd") - if get_os_name() == 'osx': - remove_files_with_extension(runtime_dir, ".dylib") - remove_files_with_extension(runtime_dir, ".bc") - - if get_os_name() == 'linux' or get_os_name() == 'unix': - self.copy_file(os.path.join(self.build_temp, 'libtaichi_core.so'), - os.path.join(core_dir, 'taichi_core.so')) - elif get_os_name() == 'osx': - 
self.copy_file( - os.path.join(self.build_temp, 'libtaichi_core.dylib'), - os.path.join(core_dir, 'taichi_core.so')) - moltenvk_path = os.path.join(self.build_temp, 'libMoltenVK.dylib') - if os.path.exists(moltenvk_path): - self.copy_file(moltenvk_path, - os.path.join(runtime_dir, 'libMoltenVK.dylib')) - else: - self.copy_file('runtimes/taichi_core.dll', - os.path.join(core_dir, 'taichi_core.pyd')) - - if get_os_name() != 'osx': - libdevice_path = 'external/cuda_libdevice/slim_libdevice.10.bc' - print("copying libdevice:", libdevice_path) - assert os.path.exists(libdevice_path) - self.copy_file(libdevice_path, - os.path.join(runtime_dir, 'slim_libdevice.10.bc')) - - llvm_runtime_dir = 'taichi/runtime/llvm' - for f in os.listdir(llvm_runtime_dir): - if f.startswith('runtime_') and f.endswith('.bc'): - print(f"Fetching runtime file {f} to {taichi_lib_dir} folder") - self.copy_file(os.path.join(llvm_runtime_dir, f), runtime_dir) - - class Clean(clean): def run(self): super().run() - self.build_temp = os.path.join(root_dir, 'build') + self.build_temp = os.path.join(root_dir, '_skbuild') if os.path.exists(self.build_temp): remove_tree(self.build_temp, dry_run=self.dry_run) generated_folders = ('bin', 'dist', 'python/taichi/assets', - 'python/taichi/_lib/runtime', - 'python/taichi.egg-info') + 'python/taichi/_lib/runtime', 'taichi.egg-info', + 'python/taichi.egg-info', 'build') for d in generated_folders: if os.path.exists(d): remove_tree(d, dry_run=self.dry_run) @@ -263,6 +98,45 @@ def run(self): os.remove(f) +def get_cmake_args(): + import shlex + + num_threads = os.getenv('BUILD_NUM_THREADS', multiprocessing.cpu_count()) + cmake_args = shlex.split(os.getenv('TAICHI_CMAKE_ARGS', '').strip()) + + if (os.getenv('DEBUG', '0') in ('1', 'ON')): + cfg = 'Debug' + elif (os.getenv('RELWITHDEBINFO', '0') in ('1', 'ON')): + cfg = 'RelWithDebInfo' + elif (os.getenv('MINSIZEREL', '0') in ('1', 'ON')): + cfg = 'MinSizeRel' + else: + cfg = None + if cfg: + sys.argv[2:2] = 
['--build-type', cfg] + + cmake_args += [ + f'-DTI_VERSION_MAJOR={TI_VERSION_MAJOR}', + f'-DTI_VERSION_MINOR={TI_VERSION_MINOR}', + f'-DTI_VERSION_PATCH={TI_VERSION_PATCH}', + ] + emscriptened = os.getenv('TI_EMSCRIPTENED', '0') in ('1', 'ON') + if emscriptened: + cmake_args += ['-DTI_EMSCRIPTENED=ON'] + + if sys.platform != 'win32': + os.environ['SKBUILD_BUILD_OPTIONS'] = f'-j{num_threads}' + return cmake_args + + +def exclude_paths(manifest_files): + return [ + f for f in manifest_files + if f.endswith(('.so', 'pyd', + '.bc')) or os.path.basename(f) == 'libMoltenVK.dylib' + ] + + setup(name=project_name, packages=packages, package_dir={"": package_dir}, @@ -287,9 +161,10 @@ def run(self): ], }, classifiers=classifiers, - ext_modules=[CMakeExtension('taichi_core')], - cmdclass=dict(egg_info=EggInfo, - build_py=BuildPy, - build_ext=CMakeBuild, - clean=Clean), + cmake_args=get_cmake_args(), + cmake_process_manifest_hook=exclude_paths, + cmdclass={ + 'egg_info': EggInfo, + 'clean': Clean + }, has_ext_modules=lambda: True) From 5bb28e25c22669b6f0da950557c5b171ce19dadd Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 27 Apr 2022 12:49:07 +0800 Subject: [PATCH 002/176] [build] Install export core library to build dir (#4866) --- cmake/TaichiExportCore.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/TaichiExportCore.cmake b/cmake/TaichiExportCore.cmake index c7fc0335948bf..5ffaa960a963e 100644 --- a/cmake/TaichiExportCore.cmake +++ b/cmake/TaichiExportCore.cmake @@ -4,3 +4,4 @@ set(TAICHI_EXPORT_CORE_NAME taichi_export_core) add_library(${TAICHI_EXPORT_CORE_NAME} SHARED) target_link_libraries(${TAICHI_EXPORT_CORE_NAME} taichi_isolated_core) +install(TARGETS ${TAICHI_EXPORT_CORE_NAME} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/build) From 5d20a1d7a796418175009399606477a32ddc5f82 Mon Sep 17 00:00:00 2001 From: Taichi Gardener <62079278+taichi-gardener@users.noreply.github.com> Date: Wed, 27 Apr 2022 13:47:38 +0800 Subject: [PATCH 003/176] [misc] Bump version 
to v1.0.2 (#4867) --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index b18d46540b351..570c796513fb7 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v1.0.1 +v1.0.2 From 92f8464f23c9561c28df1950cf960978dd7e3b97 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 27 Apr 2022 15:45:19 +0800 Subject: [PATCH 004/176] [Bug] Remove redundant AllocStmt when lowering FrontendWhileStmt (#4870) --- taichi/transforms/lower_ast.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index 5562ec9bb9188..5141724e19deb 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -156,7 +156,6 @@ class LowerAST : public IRVisitor { stmts->insert( std::make_unique(new_while->mask, cond_stmt), fctx.stmts.size()); - stmt->insert_before_me(std::make_unique(PrimitiveType::i32)); auto &&const_stmt = std::make_unique(TypedConstant((int32)0xFFFFFFFF)); auto const_stmt_ptr = const_stmt.get(); From 5bb3b0ee758dc6062089b8441010e08356c1005a Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Wed, 27 Apr 2022 18:41:44 +0800 Subject: [PATCH 005/176] [build] [bug] Fix a bug of skbuild that loses the root package_dir (#4875) --- setup.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/setup.py b/setup.py index a776e2d18d26f..53752a499083f 100644 --- a/setup.py +++ b/setup.py @@ -64,6 +64,12 @@ def remove_tmp(taichi_dir): class EggInfo(egg_info): + def finalize_options(self, *args, **kwargs): + if '' not in self.distribution.package_dir: + # XXX: skbuild loses the root package dir + self.distribution.package_dir[''] = package_dir + return super().finalize_options(*args, **kwargs) + def run(self): taichi_dir = os.path.join(package_dir, 'taichi') remove_tmp(taichi_dir) From 477c996e84f4d08e70674e367982f1f394623efb Mon Sep 17 00:00:00 2001 From: Ailing Date: Wed, 27 Apr 2022 21:49:43 +0800 Subject: [PATCH 006/176] [ci] Add 
libtaichi_export_core build for desktop in CI (#4871) --- .github/workflows/scripts/unix_build.sh | 19 ++++++--- .github/workflows/testing.yml | 54 +++++++++++++++++++++++++ cmake/TaichiExportCore.cmake | 1 - 3 files changed, 68 insertions(+), 6 deletions(-) diff --git a/.github/workflows/scripts/unix_build.sh b/.github/workflows/scripts/unix_build.sh index a4c1f5679662d..0ad894d176fbd 100755 --- a/.github/workflows/scripts/unix_build.sh +++ b/.github/workflows/scripts/unix_build.sh @@ -49,7 +49,7 @@ setup_python() { python3 -m pip install -r requirements_dev.txt } -build() { +build_taichi_wheel() { git fetch origin master PROJECT_TAGS="" EXTRA_ARGS="" @@ -69,12 +69,21 @@ build() { sccache -s } +build_libtaichi_export() { + git fetch origin master + python3 setup.py build_ext +} + setup_sccache setup_python -build -cat "$SCCACHE_ERROR_LOG" || true -NUM_WHL=$(ls dist/*.whl | wc -l) -if [ $NUM_WHL -ne 1 ]; then echo "ERROR: created more than 1 whl." && exit 1; fi + +if [ "$EXPORT_CORE" == "1" ]; then + build_libtaichi_export +else + build_taichi_wheel + NUM_WHL=$(ls dist/*.whl | wc -l) + if [ $NUM_WHL -ne 1 ]; then echo "ERROR: created more than 1 whl." 
&& exit 1; fi +fi chmod -R 777 "$SCCACHE_DIR" rm -f python/CHANGELOG.md diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index ccc2b08d2ffc7..00edaaa750b9f 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -415,3 +415,57 @@ jobs: PY: ${{ matrix.python }} PLATFORM: 'm1' TI_CI: 1 + + build_libtaichi_export: + name: Build libtaichi_export.so(GPU) + needs: check_files + runs-on: [self-hosted, cuda, vulkan, cn] + timeout-minutes: 60 + strategy: + matrix: + include: + - os: ubuntu-latest + python: py39 + with_cc: ON + permissions: + packages: read + contents: read + steps: + - uses: actions/checkout@v2 + with: + submodules: "recursive" + + - name: Get sccache cache + uses: actions/cache@v2 + with: + path: sccache_cache + key: sccache-linux-${{matrix.with_cc}}-${{ github.sha }} + restore-keys: | + sccache-linux-${{matrix.with_cc}}- + + - name: Build For Desktop + run: | + if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then + exit 0 + fi + docker create --user dev --name taichi_build_desktop --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ + -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY -e EXPORT_CORE\ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.2.1 \ + /home/dev/taichi/.github/workflows/scripts/unix_build.sh + # A tarball is needed because sccache needs some permissions that only the file owner has. + # 1000 is the uid and gid of user "dev" in the container. + # If the uid or gid of the user inside the docker changes, please change the uid and gid in the following line. 
+ tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx --owner 1000 --group 1000 | docker cp - taichi_build_desktop:/home/dev/ + docker start -a taichi_build_desktop + env: + PY: ${{ matrix.python }} + GPU_BUILD: ON + PROJECT_NAME: taichi + TAICHI_CMAKE_ARGS: -DTI_WITH_VULKAN:BOOL=ON -DTI_WITH_CUDA:BOOL=OFF -DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_LLVM:BOOL=OFF -DCMAKE_C_COMPILER_LAUNCHER=sccache -DCMAKE_CXX_COMPILER_LAUNCHER=sccache -DTI_EXPORT_CORE:BOOL=ON + EXPORT_CORE: 1 + DISPLAY: :1 + + - name: clean docker container + if: always() + run: | + docker rm taichi_build_desktop -f diff --git a/cmake/TaichiExportCore.cmake b/cmake/TaichiExportCore.cmake index 5ffaa960a963e..c7fc0335948bf 100644 --- a/cmake/TaichiExportCore.cmake +++ b/cmake/TaichiExportCore.cmake @@ -4,4 +4,3 @@ set(TAICHI_EXPORT_CORE_NAME taichi_export_core) add_library(${TAICHI_EXPORT_CORE_NAME} SHARED) target_link_libraries(${TAICHI_EXPORT_CORE_NAME} taichi_isolated_core) -install(TARGETS ${TAICHI_EXPORT_CORE_NAME} DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/build) From 6e055f0c28c1092af9402e25b1c03325e7458f62 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Wed, 27 Apr 2022 23:53:55 +0800 Subject: [PATCH 007/176] [Build] [refactor] Define runtime build target (#4838) * Move LLVM Cmake to its own dir * Suppress warning from submodules * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use current source dir * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Separate Vulkan runtime files from codegen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CMakeLists.txt | 11 +---------- cmake/TaichiCore.cmake | 13 ++++++++----- taichi/backends/dx/dx_program.h | 2 +- .../backends/vulkan/aot_module_builder_impl.h | 2 +- 
.../vulkan/aot_module_loader_impl.cpp | 2 +- .../backends/vulkan/aot_module_loader_impl.h | 2 +- taichi/backends/vulkan/snode_tree_manager.cpp | 2 +- taichi/backends/vulkan/vulkan_program.h | 2 +- taichi/python/export_misc.cpp | 2 +- taichi/runtime/llvm/CMakeLists.txt | 19 +++++++++++++++++++ taichi/runtime/vulkan/CMakeLists.txt | 13 +++++++++++++ .../{backends => runtime}/vulkan/runtime.cpp | 2 +- taichi/{backends => runtime}/vulkan/runtime.h | 0 13 files changed, 49 insertions(+), 23 deletions(-) create mode 100644 taichi/runtime/llvm/CMakeLists.txt create mode 100644 taichi/runtime/vulkan/CMakeLists.txt rename taichi/{backends => runtime}/vulkan/runtime.cpp (99%) rename taichi/{backends => runtime}/vulkan/runtime.h (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7387e8b648b56..75d71cb604609 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -148,16 +148,7 @@ if (${CLANG_VERSION_MAJOR} VERSION_GREATER ${CLANG_HIGHEST_VERSION}) endif() endif() -# Build llvm-runtime for host arch and cuda (if available) -foreach(arch IN LISTS HOST_ARCH CUDA_ARCH) - add_custom_target( - "generate_llvm_runtime_${arch}" - COMMAND ${CLANG_EXECUTABLE} ${CLANG_OSX_FLAGS} -c runtime.cpp -o "runtime_${arch}.bc" -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR}; - WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm" - ) - add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}") - install(FILES "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm/runtime_${arch}.bc" DESTINATION ${CMAKE_INSTALL_PREFIX}/python/taichi/_lib/runtime) -endforeach() +add_subdirectory(taichi/runtime/llvm) configure_file(taichi/common/version.h.in ${CMAKE_SOURCE_DIR}/taichi/common/version.h) configure_file(taichi/common/commit_hash.h.in ${CMAKE_SOURCE_DIR}/taichi/common/commit_hash.h) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 687130d9e19d5..8b740b5e74f82 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -16,6 +16,9 @@ 
option(TI_EMSCRIPTENED "Build using emscripten" OFF) # projects. set(CMAKE_CXX_VISIBILITY_PRESET hidden) set(CMAKE_VISIBILITY_INLINES_HIDDEN ON) +# Suppress warnings from submodules introduced by the above symbol visibility change +set(CMAKE_POLICY_DEFAULT_CMP0063 NEW) +set(CMAKE_POLICY_DEFAULT_CMP0077 NEW) set(INSTALL_LIB_DIR ${CMAKE_INSTALL_PREFIX}/python/taichi/_lib) if(ANDROID) @@ -140,10 +143,6 @@ file(GLOB TAICHI_OPENGL_REQUIRED_SOURCE "taichi/backends/opengl/codegen_opengl.*" "taichi/backends/opengl/struct_opengl.*" ) -file(GLOB TAICHI_VULKAN_REQUIRED_SOURCE - "taichi/backends/vulkan/runtime.h" - "taichi/backends/vulkan/runtime.cpp" -) list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_BACKEND_SOURCE}) @@ -198,7 +197,7 @@ if (TI_WITH_VULKAN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_VULKAN") list(APPEND TAICHI_CORE_SOURCE ${TAICHI_VULKAN_SOURCE}) endif() -list(APPEND TAICHI_CORE_SOURCE ${TAICHI_VULKAN_REQUIRED_SOURCE}) + if (TI_WITH_DX11) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_DX11") @@ -389,8 +388,12 @@ if (TI_WITH_VULKAN) install(FILES ${CMAKE_BINARY_DIR}/libMoltenVK.dylib DESTINATION ${INSTALL_LIB_DIR}/runtime) endif() endif() + + add_subdirectory(taichi/runtime/vulkan) + target_link_libraries(${CORE_LIBRARY_NAME} vulkan_runtime) endif () + # Optional dependencies if (APPLE) diff --git a/taichi/backends/dx/dx_program.h b/taichi/backends/dx/dx_program.h index 4974139147b0c..493cb5c62c69a 100644 --- a/taichi/backends/dx/dx_program.h +++ b/taichi/backends/dx/dx_program.h @@ -3,7 +3,7 @@ #ifdef TI_WITH_DX11 #include "taichi/backends/dx/dx_device.h" -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" #include "taichi/backends/vulkan/snode_tree_manager.h" #include "taichi/program/program_impl.h" diff --git a/taichi/backends/vulkan/aot_module_builder_impl.h b/taichi/backends/vulkan/aot_module_builder_impl.h index bbd6b40e4df48..0accfcc203343 100644 --- a/taichi/backends/vulkan/aot_module_builder_impl.h +++ 
b/taichi/backends/vulkan/aot_module_builder_impl.h @@ -5,7 +5,7 @@ #include "taichi/aot/module_builder.h" #include "taichi/backends/vulkan/aot_utils.h" -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" #include "taichi/codegen/spirv/snode_struct_compiler.h" #include "taichi/codegen/spirv/kernel_utils.h" diff --git a/taichi/backends/vulkan/aot_module_loader_impl.cpp b/taichi/backends/vulkan/aot_module_loader_impl.cpp index a773f71f37def..5f87eb4fc9ca8 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.cpp +++ b/taichi/backends/vulkan/aot_module_loader_impl.cpp @@ -3,7 +3,7 @@ #include #include -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" namespace taichi { namespace lang { diff --git a/taichi/backends/vulkan/aot_module_loader_impl.h b/taichi/backends/vulkan/aot_module_loader_impl.h index 7d32d991f2e89..b188281cb749d 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.h +++ b/taichi/backends/vulkan/aot_module_loader_impl.h @@ -5,7 +5,7 @@ #include #include "taichi/backends/vulkan/aot_utils.h" -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" #include "taichi/codegen/spirv/kernel_utils.h" #include "taichi/aot/module_loader.h" diff --git a/taichi/backends/vulkan/snode_tree_manager.cpp b/taichi/backends/vulkan/snode_tree_manager.cpp index 0bfb6d2f01edd..b7d4816ae42d9 100644 --- a/taichi/backends/vulkan/snode_tree_manager.cpp +++ b/taichi/backends/vulkan/snode_tree_manager.cpp @@ -1,6 +1,6 @@ #include "taichi/backends/vulkan/snode_tree_manager.h" -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" namespace taichi { namespace lang { diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h index a94f2abbb2ba7..b3b33348c525f 100644 --- a/taichi/backends/vulkan/vulkan_program.h +++ b/taichi/backends/vulkan/vulkan_program.h @@ -6,7 +6,7 @@ #include 
"taichi/backends/vulkan/vulkan_device_creator.h" #include "taichi/backends/vulkan/vulkan_utils.h" #include "taichi/backends/vulkan/vulkan_loader.h" -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" #include "taichi/backends/vulkan/snode_tree_manager.h" #include "taichi/backends/vulkan/vulkan_device.h" #include "vk_mem_alloc.h" diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index e75df0b88d856..a1abb389ab782 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -5,7 +5,7 @@ #include "taichi/backends/metal/api.h" #include "taichi/backends/opengl/opengl_api.h" -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" #include "taichi/backends/dx/dx_api.h" #include "taichi/common/core.h" #include "taichi/common/interface.h" diff --git a/taichi/runtime/llvm/CMakeLists.txt b/taichi/runtime/llvm/CMakeLists.txt new file mode 100644 index 0000000000000..fdb5c723c2fb7 --- /dev/null +++ b/taichi/runtime/llvm/CMakeLists.txt @@ -0,0 +1,19 @@ +# ./taichi/runtime/llvm/CMakeLists.txt + +function(COMPILE_LLVM_RUNTIME rtm_arch) + message(STATUS "Compiling LLVM byte code file for arch ${rtm_arch}") + # Keep this for now, as .bc need to be generated. 
+ add_custom_target( + "generate_llvm_runtime_${rtm_arch}" + COMMAND ${CLANG_EXECUTABLE} ${CLANG_OSX_FLAGS} -c runtime.cpp -o "runtime_${rtm_arch}.bc" -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${rtm_arch}" -I ${PROJECT_SOURCE_DIR}; + # TODO, it's better to avoid polluting the source dir, keep in build + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + ) + add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${rtm_arch}") + install(FILES "${CMAKE_SOURCE_DIR}/taichi/runtime/llvm/runtime_${rtm_arch}.bc" DESTINATION ${CMAKE_INSTALL_PREFIX}/python/taichi/_lib/runtime) +endfunction() + +# Build llvm-runtime for host arch and cuda (if available) +foreach(arch IN LISTS HOST_ARCH CUDA_ARCH) + compile_llvm_runtime(${arch}) +endforeach() diff --git a/taichi/runtime/vulkan/CMakeLists.txt b/taichi/runtime/vulkan/CMakeLists.txt new file mode 100644 index 0000000000000..00ecee7a09caf --- /dev/null +++ b/taichi/runtime/vulkan/CMakeLists.txt @@ -0,0 +1,13 @@ +# ./taichi/runtime/vulkan/CMakeLists.txt + +add_library(vulkan_runtime) +target_sources(vulkan_runtime + PRIVATE + runtime.cpp + ) +target_include_directories(vulkan_runtime + PRIVATE + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/FP16/include + ) diff --git a/taichi/backends/vulkan/runtime.cpp b/taichi/runtime/vulkan/runtime.cpp similarity index 99% rename from taichi/backends/vulkan/runtime.cpp rename to taichi/runtime/vulkan/runtime.cpp index 7da6b181719b6..82478b05bf427 100644 --- a/taichi/backends/vulkan/runtime.cpp +++ b/taichi/runtime/vulkan/runtime.cpp @@ -1,4 +1,4 @@ -#include "taichi/backends/vulkan/runtime.h" +#include "taichi/runtime/vulkan/runtime.h" #include "taichi/program/program.h" #include diff --git a/taichi/backends/vulkan/runtime.h b/taichi/runtime/vulkan/runtime.h similarity index 100% rename from taichi/backends/vulkan/runtime.h rename to taichi/runtime/vulkan/runtime.h From 0abd24b0d29a7bd5e93195b6ebd9616b05fbaea9
Mon Sep 17 00:00:00 2001 From: Ailing Date: Thu, 28 Apr 2022 10:29:54 +0800 Subject: [PATCH 008/176] [Doc] Add limitation about TLS optimization (#4877) * [Doc] Add limitation about TLS optimization * Add link to reduction sum benchmark * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: Haidong Lan Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/lang/articles/advanced/performance.md | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/lang/articles/advanced/performance.md b/docs/lang/articles/advanced/performance.md index 4e69cb533a2b5..011670ed380c3 100644 --- a/docs/lang/articles/advanced/performance.md +++ b/docs/lang/articles/advanced/performance.md @@ -153,14 +153,18 @@ Additionally, the last atomic add to the global memory `s[None]` is optimized us CUDA's warp-level intrinsics, further reducing the number of required atomic adds. Currently, Taichi supports TLS optimization for these reduction operators: `add`, -`sub`, `min` and `max`. [Here](https://github.com/taichi-dev/taichi/pull/2956) is -a benchmark comparison when running a global max reduction on a 1-D Taichi field +`sub`, `min` and `max` on **0D** scalar/vector/matrix `ti.field`s. It is not yet +supported on `ti.ndarray`s. [Here](https://github.com/taichi-dev/taichi/pull/2956) +is a benchmark comparison when running a global max reduction on a 1-D Taichi field of 8M floats on an Nvidia GeForce RTX 3090 card: * TLS disabled: 5.2 x 1e3 us * TLS enabled: 5.7 x 1e1 us -TLS has led to an approximately 100x speedup. +TLS has led to an approximately 100x speedup. We also show that TLS reduction sum +achieves comparable performance with CUDA implementations, see +[benchmark](https://github.com/taichi-dev/taichi_benchmark/tree/main/reduce_sum) for +details. 
### Block Local Storage (BLS) From a9bd5c91b6b587c6f621b1bbfc9e1c1d6cc5e165 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Thu, 28 Apr 2022 11:11:36 +0800 Subject: [PATCH 009/176] [ci] Use the updated docker image for libtaichi_export_core (#4881) --- .github/workflows/testing.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 00edaaa750b9f..a34de5b8581fb 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -450,7 +450,7 @@ jobs: fi docker create --user dev --name taichi_build_desktop --gpus all -v /tmp/.X11-unix:/tmp/.X11-unix \ -e PY -e GPU_BUILD -e PROJECT_NAME -e TAICHI_CMAKE_ARGS -e DISPLAY -e EXPORT_CORE\ - registry.taichigraphics.com/taichidev-ubuntu18.04:v0.2.1 \ + registry.taichigraphics.com/taichidev-ubuntu18.04:v0.3.0 \ /home/dev/taichi/.github/workflows/scripts/unix_build.sh # A tarball is needed because sccache needs some permissions that only the file owner has. # 1000 is the uid and gid of user "dev" in the container. 
From 3291be64dd29103b01d005915278d1b455444dc1 Mon Sep 17 00:00:00 2001 From: PGZXB <420254146@qq.com> Date: Thu, 28 Apr 2022 11:14:40 +0800 Subject: [PATCH 010/176] [refactor] Add ASTSerializer and use it to generate offline-cache-key (#4863) * Add ASTSerializer, using it to generate offline-cache-key * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/analysis/gen_offline_cache_key.cpp | 441 ++++++++++++++++++++++ taichi/analysis/offline_cache_util.cpp | 5 +- taichi/analysis/offline_cache_util.h | 5 +- taichi/inc/frontend_statements.inc.h | 15 + taichi/inc/statements.inc.h | 16 +- taichi/ir/expression_printer.h | 8 +- taichi/ir/transforms.h | 1 - taichi/transforms/ir_printer.cpp | 6 - 8 files changed, 468 insertions(+), 29 deletions(-) create mode 100644 taichi/analysis/gen_offline_cache_key.cpp create mode 100644 taichi/inc/frontend_statements.inc.h diff --git a/taichi/analysis/gen_offline_cache_key.cpp b/taichi/analysis/gen_offline_cache_key.cpp new file mode 100644 index 0000000000000..e2a336c6b6252 --- /dev/null +++ b/taichi/analysis/gen_offline_cache_key.cpp @@ -0,0 +1,441 @@ +#include +#include "taichi/analysis/offline_cache_util.h" +#include "taichi/common/logging.h" +#include "taichi/ir/expr.h" +#include "taichi/ir/expression_printer.h" +#include "taichi/ir/frontend_ir.h" +#include "taichi/ir/ir.h" +#include "taichi/ir/mesh.h" +#include "taichi/ir/type.h" +#include "taichi/program/function.h" +#include "taichi/program/program.h" + +namespace taichi { +namespace lang { + +namespace { + +enum class StmtOpCode : std::uint8_t { + EnterBlock, + ExitBlock, +#define PER_STATEMENT(x) x, +#include "taichi/inc/frontend_statements.inc.h" +#undef PER_STATEMENT +}; + +enum class ForLoopType : std::uint8_t { + RangeFor, + StructFor, + MeshFor, +}; + +enum class ExternalFuncType : std::uint8_t { + SO, + ASM, + BC, +}; + 
+class ASTSerializer : public IRVisitor { + public: + ASTSerializer(Program *prog, + ExpressionPrinter *expr_printer, + std::ostream *os) + : prog_(prog), os_(os), expr_printer_(expr_printer) { + this->allow_undefined_visitor = true; + expr_printer_->set_ostream(os); + } + + void set_ostream(std::ostream *os) { + this->os_ = os; + } + + std::ostream *get_ostream() { + return this->os_; + } + + void visit(Block *block) override { + emit(StmtOpCode::EnterBlock); + emit(static_cast(block->statements.size())); + for (auto &stmt : block->statements) { + stmt->accept(this); + } + emit(StmtOpCode::ExitBlock); + } + + void visit(FrontendExprStmt *stmt) override { + emit(StmtOpCode::FrontendExprStmt); + emit(stmt->val); + } + + void visit(FrontendBreakStmt *stmt) override { + emit(StmtOpCode::FrontendBreakStmt); + } + + void visit(FrontendContinueStmt *stmt) override { + emit(StmtOpCode::FrontendContinueStmt); + } + + void visit(FrontendAssignStmt *stmt) override { + emit(StmtOpCode::FrontendAssignStmt); + emit(stmt->lhs); + emit(stmt->rhs); + } + + void visit(FrontendAllocaStmt *stmt) override { + emit(StmtOpCode::FrontendAllocaStmt); + emit(stmt->ret_type); + emit(stmt->ident); + } + + void visit(FrontendAssertStmt *stmt) override { + emit(StmtOpCode::FrontendAssertStmt); + emit(stmt->cond); + } + + void visit(FrontendSNodeOpStmt *stmt) override { + emit(StmtOpCode::FrontendSNodeOpStmt); + emit(stmt->op_type); + emit(stmt->snode); + std::size_t count = stmt->indices.size(); + if (stmt->val.expr) + ++count; + emit(count); + for (const auto &i : stmt->indices.exprs) { + emit(i); + } + if (stmt->val.expr) { + emit(stmt->val); + } + } + + void visit(FrontendIfStmt *stmt) override { + emit(StmtOpCode::FrontendIfStmt); + emit(stmt->condition); + std::uint8_t branch_count = 0; + if (stmt->true_statements) { + ++branch_count; + } + if (stmt->false_statements) { + ++branch_count; + } + emit(branch_count); + if (stmt->true_statements) { + emit(stmt->true_statements.get()); + } + if 
(stmt->false_statements) { + emit(stmt->false_statements.get()); + } + } + + void visit(FrontendPrintStmt *stmt) override { + emit(StmtOpCode::FrontendPrintStmt); + emit(static_cast(stmt->contents.size())); + for (const auto &c : stmt->contents) { + emit(static_cast(c.index())); + if (std::holds_alternative(c)) { + emit(std::get(c).expr); + } else { + const auto &str = std::get(c); + emit(str); + } + } + } + + void visit(FrontendFuncDefStmt *stmt) override { + emit(StmtOpCode::FrontendFuncDefStmt); + emit(stmt->body.get()); + } + + void visit(FrontendWhileStmt *stmt) override { + emit(StmtOpCode::FrontendWhileStmt); + emit(stmt->cond); + emit(stmt->body.get()); + } + + void visit(FrontendForStmt *stmt) override { + emit(StmtOpCode::FrontendForStmt); + if (stmt->is_ranged()) { + emit(ForLoopType::RangeFor); + emit(stmt->loop_var_id); + emit(stmt->begin); + emit(stmt->end); + } else if (stmt->mesh_for) { + emit(ForLoopType::MeshFor); + emit(stmt->element_type); + emit(stmt->mesh); + } else { + emit(ForLoopType::StructFor); + emit(stmt->loop_var_id); + if (stmt->global_var.is()) { + emit(stmt->global_var.cast()->snode); + } else { + emit(stmt->global_var); + } + } + emit(stmt->bit_vectorize); + emit(stmt->num_cpu_threads); + emit(stmt->strictly_serialized); + emit(stmt->mem_access_opt); + emit(stmt->block_dim); + emit(stmt->body.get()); + } + + void visit(FrontendReturnStmt *stmt) override { + emit(StmtOpCode::FrontendReturnStmt); + emit(stmt->ret_type); + emit(stmt->values.exprs); + } + + void visit(FrontendExternalFuncStmt *stmt) override { + // Note: The result of serializing FrontendExternalFuncStmt is not parsable + // now + emit(StmtOpCode::FrontendExternalFuncStmt); + if (stmt->so_func != nullptr) { + emit(ExternalFuncType::SO); + } else if (!stmt->asm_source.empty()) { + emit(ExternalFuncType::ASM); + emit(stmt->asm_source); + } else { + emit(ExternalFuncType::BC); + emit(stmt->bc_filename); + emit(stmt->bc_funcname); + } + emit(stmt->args); + 
emit(stmt->outputs); + } + + static void run(Program *prog, IRNode *ast, std::ostream *os) { + // Temporary: using ExpressionOfflineCacheKeyGenerator, which will be + // refactored + ExpressionOfflineCacheKeyGenerator generator(prog); + ASTSerializer serializer(prog, &generator, os); + ast->accept(&serializer); + serializer.emit_dependencies(); + } + + private: + void emit_dependencies() { + // Serialize dependent real-func recursively + std::ostringstream temp_oss; + auto *curr_os = this->get_ostream(); + this->set_ostream(&temp_oss); + expr_printer_->set_ostream(&temp_oss); + std::size_t last_size{0}; + do { + last_size = real_funcs_.size(); + for (auto &[func, visited] : real_funcs_) { + if (!visited) { + visited = true; + func->ir->accept(this); // Maybe add new func + } + } + } while (real_funcs_.size() > last_size); + this->set_ostream(curr_os); + expr_printer_->set_ostream(curr_os); + emit(static_cast(real_funcs_.size())); + emit(&temp_oss); + + // Serialize snode_trees(Temporary: using offline-cache-key of SNode) + // Note: The result of serializing snode_tree_roots_ is not parsable now + emit(static_cast(snode_tree_roots_.size())); + for (auto *snode : snode_tree_roots_) { + auto key = get_hashed_offline_cache_key_of_snode(snode); + emit_bytes(key.c_str(), key.size()); + } + + // Dump string-pool + emit(static_cast(string_pool_.size())); + emit_bytes(string_pool_.data(), string_pool_.size()); + } + + template + void emit_pod(const T &val) { + static_assert(std::is_pod::value); + TI_ASSERT(os_); + os_->write((const char *)&val, sizeof(T)); + } + + void emit_bytes(const char *bytes, std::size_t len) { + TI_ASSERT(os_); + os_->write(bytes, len); + } + + template + void emit(const std::unordered_map &map) { + emit(static_cast(map.size())); + for (const auto &[k, v] : map) { + emit(k); + emit(v); + } + } + + template + void emit(const std::pair &pair) { + emit(pair.first); + emit(pair.second); + } + + template + void emit(const std::map &map) { + 
emit(static_cast(map.size())); + for (const auto &[k, v] : map) { + emit(k); + emit(v); + } + } + + void emit(std::ostream *os) { + TI_ASSERT(os_ && os); + *os_ << os->rdbuf(); + } + + void emit(const std::string &str) { + std::size_t size = str.size(); + std::size_t offset = string_pool_.size(); + string_pool_.insert(string_pool_.end(), str.begin(), str.end()); + emit(size); + emit(offset); + } + + void emit(SNodeOpType type) { + emit_pod(type); + } + + void emit(SNode *snode) { + TI_ASSERT(snode); + TI_ASSERT(prog_); + emit(static_cast(snode->get_snode_tree_id())); + emit(static_cast(snode->id)); + auto *root = prog_->get_snode_root(snode->get_snode_tree_id()); + snode_tree_roots_.insert(root); + } + + void emit(mesh::MeshElementType type) { + emit_pod(type); + } + + void emit(mesh::MeshRelationType type) { + emit_pod(type); + } + + void emit(mesh::ConvType type) { + emit_pod(type); + } + + void emit(const mesh::MeshLocalRelation &r) { + emit(r.fixed); + emit(r.value); + emit(r.patch_offset); + emit(r.offset); + } + + void emit(mesh::Mesh *mesh) { + emit(mesh->num_patches); + emit(mesh->num_elements); + emit(mesh->patch_max_element_num); + emit(mesh->owned_offset); + emit(mesh->total_offset); + emit(mesh->index_mapping); + emit(mesh->relations); + } + + void emit(const Identifier &ident) { + emit(ident.id); + } + + void emit(const std::vector &identifiers) { + emit(static_cast(identifiers.size())); + for (const auto &id : identifiers) { + emit(id); + } + } + + void emit(PrimitiveTypeID type_id) { + emit_pod(type_id); + } + + void emit(const DataType &type) { + if (auto *p = type->cast()) { + emit(p->type); + } else { + TI_NOT_IMPLEMENTED; + } + } + + void emit(StmtOpCode code) { + emit_pod(code); + } + + void emit(IRNode *ir) { + TI_ASSERT(ir); + ir->accept(this); + } + + void emit(const Expr &expr) { + TI_ASSERT(expr_printer_); + expr.expr->accept(expr_printer_); + } + + void emit(const std::vector &exprs) { + emit(static_cast(exprs.size())); + for (const auto 
&e : exprs) { + emit(e); + } + } + + void emit(std::size_t size) { + emit_pod(size); + } + + void emit(std::uint8_t u8) { + emit_pod(u8); + } + + void emit(int i) { + emit_pod(i); + } + + void emit(bool v) { + emit_pod(v); + } + + void emit(ForLoopType type) { + emit_pod(type); + } + + void emit(SNodeAccessFlag flag) { + emit_pod(flag); + } + + void emit(const MemoryAccessOptions &mem_access_options) { + auto all_options = mem_access_options.get_all(); + emit(static_cast(all_options.size())); + for (const auto &[snode, options] : all_options) { + emit(snode); + emit(static_cast(options.size())); + for (auto e : options) { + emit(e); + } + } + } + + void emit(ExternalFuncType type) { + emit_pod(type); + } + + Program *prog_{nullptr}; + std::ostream *os_{nullptr}; + ExpressionPrinter *expr_printer_{nullptr}; + std::unordered_set snode_tree_roots_; + std::unordered_map real_funcs_; + std::vector string_pool_; +}; + +} // namespace + +void gen_offline_cache_key(Program *prog, IRNode *ast, std::ostream *os) { + ASTSerializer::run(prog, ast, os); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index c73ef3440389c..140a3915df67a 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -152,8 +152,9 @@ std::string get_hashed_offline_cache_key(CompileConfig *config, Kernel *kernel) { std::string kernel_ast_string; if (kernel) { - irpass::gen_offline_cache_key(kernel->program, kernel->ir.get(), - &kernel_ast_string); + std::ostringstream oss; + gen_offline_cache_key(kernel->program, kernel->ir.get(), &oss); + kernel_ast_string = oss.str(); } std::vector compile_config_key; diff --git a/taichi/analysis/offline_cache_util.h b/taichi/analysis/offline_cache_util.h index 03ca580d96ef5..314024763bbc9 100644 --- a/taichi/analysis/offline_cache_util.h +++ b/taichi/analysis/offline_cache_util.h @@ -6,11 +6,14 @@ namespace taichi { namespace lang { 
struct CompileConfig; -class Kernel; +class Program; +class IRNode; class SNode; +class Kernel; std::string get_hashed_offline_cache_key_of_snode(SNode *snode); std::string get_hashed_offline_cache_key(CompileConfig *config, Kernel *kernel); +void gen_offline_cache_key(Program *prog, IRNode *ast, std::ostream *os); } // namespace lang } // namespace taichi diff --git a/taichi/inc/frontend_statements.inc.h b/taichi/inc/frontend_statements.inc.h new file mode 100644 index 0000000000000..cf61ef62e9eaa --- /dev/null +++ b/taichi/inc/frontend_statements.inc.h @@ -0,0 +1,15 @@ +PER_STATEMENT(FrontendExternalFuncStmt) +PER_STATEMENT(FrontendExprStmt) +PER_STATEMENT(FrontendIfStmt) +PER_STATEMENT(FrontendForStmt) +PER_STATEMENT(FrontendPrintStmt) +PER_STATEMENT(FrontendWhileStmt) +PER_STATEMENT(FrontendBreakStmt) +PER_STATEMENT(FrontendContinueStmt) +PER_STATEMENT(FrontendAllocaStmt) +PER_STATEMENT(FrontendAssignStmt) +PER_STATEMENT(FrontendEvalStmt) +PER_STATEMENT(FrontendSNodeOpStmt) // activate, deactivate, append, clear +PER_STATEMENT(FrontendAssertStmt) +PER_STATEMENT(FrontendFuncDefStmt) +PER_STATEMENT(FrontendReturnStmt) diff --git a/taichi/inc/statements.inc.h b/taichi/inc/statements.inc.h index e82ba95ca407a..b26a942860a8a 100644 --- a/taichi/inc/statements.inc.h +++ b/taichi/inc/statements.inc.h @@ -1,19 +1,5 @@ // Frontend statements -PER_STATEMENT(FrontendExternalFuncStmt) -PER_STATEMENT(FrontendExprStmt) -PER_STATEMENT(FrontendIfStmt) -PER_STATEMENT(FrontendForStmt) -PER_STATEMENT(FrontendPrintStmt) -PER_STATEMENT(FrontendWhileStmt) -PER_STATEMENT(FrontendBreakStmt) -PER_STATEMENT(FrontendContinueStmt) -PER_STATEMENT(FrontendAllocaStmt) -PER_STATEMENT(FrontendAssignStmt) -PER_STATEMENT(FrontendEvalStmt) -PER_STATEMENT(FrontendSNodeOpStmt) // activate, deactivate, append, clear -PER_STATEMENT(FrontendAssertStmt) -PER_STATEMENT(FrontendFuncDefStmt) -PER_STATEMENT(FrontendReturnStmt) +#include "frontend_statements.inc.h" // Middle-end statement diff --git 
a/taichi/ir/expression_printer.h b/taichi/ir/expression_printer.h index b2215db252611..7d1b463896f61 100644 --- a/taichi/ir/expression_printer.h +++ b/taichi/ir/expression_printer.h @@ -18,9 +18,8 @@ class ExpressionPrinter : public ExpressionVisitor { os_ = os; } - std::ostream &get_ostream() { - TI_ASSERT(os_); - return *os_; + std::ostream *get_ostream() { + return os_; } private: @@ -227,7 +226,8 @@ class ExpressionHumanFriendlyPrinter : public ExpressionPrinter { protected: template void emit(Args &&...args) { - (this->get_ostream() << ... << std::forward(args)); + TI_ASSERT(this->get_ostream()); + (*this->get_ostream() << ... << std::forward(args)); } template diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h index 09716b746eecf..a3b601b915f03 100644 --- a/taichi/ir/transforms.h +++ b/taichi/ir/transforms.h @@ -47,7 +47,6 @@ void full_simplify(IRNode *root, const CompileConfig &config, const FullSimplifyPass::Args &args); void print(IRNode *root, std::string *output = nullptr); -void gen_offline_cache_key(Program *program, IRNode *root, std::string *output); void frontend_type_check(IRNode *root); void lower_ast(IRNode *root); void type_check(IRNode *root, const CompileConfig &config); diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp index 69ce6c4195e71..e04a4b9a1cfbc 100644 --- a/taichi/transforms/ir_printer.cpp +++ b/taichi/transforms/ir_printer.cpp @@ -805,12 +805,6 @@ void print(IRNode *root, std::string *output) { return IRPrinter::run(&expr_printer, root, output); } -void gen_offline_cache_key(Program *prog, IRNode *root, std::string *output) { - irpass::re_id(root); - ExpressionOfflineCacheKeyGenerator cache_key_generator(prog); - return IRPrinter::run(&cache_key_generator, root, output); -} - } // namespace irpass TLANG_NAMESPACE_END From 850c8c70fc28b9dd94df63bb553ba371967e96de Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Thu, 28 Apr 2022 11:34:37 +0800 Subject: [PATCH 011/176] [build] Change the library 
output dir for export core (#4880) * Change the library output dir for export core * limit the change to the target --- cmake/TaichiExportCore.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/TaichiExportCore.cmake b/cmake/TaichiExportCore.cmake index c7fc0335948bf..463a028711e8a 100644 --- a/cmake/TaichiExportCore.cmake +++ b/cmake/TaichiExportCore.cmake @@ -4,3 +4,5 @@ set(TAICHI_EXPORT_CORE_NAME taichi_export_core) add_library(${TAICHI_EXPORT_CORE_NAME} SHARED) target_link_libraries(${TAICHI_EXPORT_CORE_NAME} taichi_isolated_core) +set_target_properties(${TAICHI_EXPORT_CORE_NAME} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") From eae8339a5cb4b3b6e782189a89efb31b083ee7ae Mon Sep 17 00:00:00 2001 From: Bob Cao Date: Wed, 27 Apr 2022 21:01:45 -0700 Subject: [PATCH 012/176] [vulkan] Device API explicit semaphores (#4852) * Device API explicit semaphores * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Destroy the semaphore before the context * Fix type warnings * fix nits * return nullptr for devices that don't need semaphores * test out no semaphores between same queue * Use native command list instead of emulated for dx11 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove the in-queue semaphore * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use flush instead of sync in places * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix possible null semaphore Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/ui/staging_buffer.py | 2 +- taichi/aot/module_loader.h | 3 + taichi/backends/cpu/cpu_device.h | 11 +-
taichi/backends/cuda/cuda_device.h | 11 +- taichi/backends/device.h | 26 ++- taichi/backends/dx/dx_device.cpp | 176 ++++++++---------- taichi/backends/dx/dx_device.h | 78 ++------ taichi/backends/metal/device.cpp | 11 +- taichi/backends/opengl/opengl_device.cpp | 25 ++- taichi/backends/opengl/opengl_device.h | 14 +- taichi/backends/vulkan/vulkan_device.cpp | 145 +++++++++------ taichi/backends/vulkan/vulkan_device.h | 38 +++- .../backends/vulkan/vulkan_device_creator.cpp | 13 +- taichi/backends/vulkan/vulkan_program.h | 4 + taichi/program/program.cpp | 4 + taichi/program/program.h | 2 + taichi/program/program_impl.h | 5 + taichi/runtime/vulkan/runtime.cpp | 26 ++- taichi/runtime/vulkan/runtime.h | 2 + taichi/ui/backends/vulkan/renderable.cpp | 2 +- .../backends/vulkan/renderables/set_image.cpp | 9 +- taichi/ui/backends/vulkan/renderer.cpp | 22 ++- taichi/ui/backends/vulkan/renderer.h | 4 + taichi/ui/backends/vulkan/window.cpp | 3 +- 24 files changed, 375 insertions(+), 261 deletions(-) diff --git a/python/taichi/ui/staging_buffer.py b/python/taichi/ui/staging_buffer.py index 7056ed9007cae..1613bebcb0135 100644 --- a/python/taichi/ui/staging_buffer.py +++ b/python/taichi/ui/staging_buffer.py @@ -100,7 +100,7 @@ def copy_image_u8_to_u8(src: ti.template(), dst: ti.template(), num_components: ti.template()): for i, j in src: for k in ti.static(range(num_components)): - dst[i, j][k] = src[i, j][k] + dst[i, j][k] = ti.cast(src[i, j][k], ti.u8) if num_components < 4: # alpha channel dst[i, j][3] = u8(255) diff --git a/taichi/aot/module_loader.h b/taichi/aot/module_loader.h index 80f9232f3c232..0551152cfae4e 100644 --- a/taichi/aot/module_loader.h +++ b/taichi/aot/module_loader.h @@ -166,6 +166,9 @@ class TargetDevice : public Device { Stream *get_compute_stream() override { TI_NOT_IMPLEMENTED; } + void wait_idle() override { + TI_NOT_IMPLEMENTED; + } }; } // namespace aot diff --git a/taichi/backends/cpu/cpu_device.h b/taichi/backends/cpu/cpu_device.h index 
5d5ccfd5e5ff1..90ecff83814ae 100644 --- a/taichi/backends/cpu/cpu_device.h +++ b/taichi/backends/cpu/cpu_device.h @@ -69,8 +69,13 @@ class CpuStream : public Stream { ~CpuStream() override{}; std::unique_ptr new_command_list() override{TI_NOT_IMPLEMENTED}; - void submit(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED}; - void submit_synced(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED}; + StreamSemaphore submit(CommandList *cmdlist, + const std::vector &wait_semaphores = + {}) override{TI_NOT_IMPLEMENTED}; + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override{ + TI_NOT_IMPLEMENTED}; void command_sync() override{TI_NOT_IMPLEMENTED}; }; @@ -111,6 +116,8 @@ class CpuDevice : public LlvmDevice { Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED}; + void wait_idle() override{TI_NOT_IMPLEMENTED}; + private: std::vector allocations_; std::unordered_map> diff --git a/taichi/backends/cuda/cuda_device.h b/taichi/backends/cuda/cuda_device.h index 039c17b012061..c38fde46d53ec 100644 --- a/taichi/backends/cuda/cuda_device.h +++ b/taichi/backends/cuda/cuda_device.h @@ -69,8 +69,13 @@ class CudaStream : public Stream { ~CudaStream() override{}; std::unique_ptr new_command_list() override{TI_NOT_IMPLEMENTED}; - void submit(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED}; - void submit_synced(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED}; + StreamSemaphore submit(CommandList *cmdlist, + const std::vector &wait_semaphores = + {}) override{TI_NOT_IMPLEMENTED}; + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override{ + TI_NOT_IMPLEMENTED}; void command_sync() override{TI_NOT_IMPLEMENTED}; }; @@ -123,6 +128,8 @@ class CudaDevice : public LlvmDevice { Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED}; + void wait_idle() override{TI_NOT_IMPLEMENTED}; + private: std::vector allocations_; void validate_device_alloc(const DeviceAllocation alloc) { diff 
--git a/taichi/backends/device.h b/taichi/backends/device.h index 647634d5cf580..098a2a9645baf 100644 --- a/taichi/backends/device.h +++ b/taichi/backends/device.h @@ -397,13 +397,26 @@ inline bool operator&(AllocUsage a, AllocUsage b) { return static_cast(a) & static_cast(b); } +class StreamSemaphoreObject { + public: + virtual ~StreamSemaphoreObject() { + } +}; + +using StreamSemaphore = std::shared_ptr; + class Stream { public: - virtual ~Stream(){}; + virtual ~Stream() { + } virtual std::unique_ptr new_command_list() = 0; - virtual void submit(CommandList *cmdlist) = 0; - virtual void submit_synced(CommandList *cmdlist) = 0; + virtual StreamSemaphore submit( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) = 0; + virtual StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) = 0; virtual void command_sync() = 0; }; @@ -457,6 +470,9 @@ class Device { // Each thraed will acquire its own stream virtual Stream *get_compute_stream() = 0; + // Wait for all tasks to complete (task from all streams) + virtual void wait_idle() = 0; + // Mapping can fail and will return nullptr virtual void *map_range(DevicePtr ptr, uint64_t size) = 0; virtual void *map(DeviceAllocation alloc) = 0; @@ -498,8 +514,10 @@ class Surface { virtual ~Surface() { } + virtual StreamSemaphore acquire_next_image() = 0; virtual DeviceAllocation get_target_image() = 0; - virtual void present_image() = 0; + virtual void present_image( + const std::vector &wait_semaphores = {}) = 0; virtual std::pair get_size() = 0; virtual int get_image_count() = 0; virtual BufferFormat image_format() = 0; diff --git a/taichi/backends/dx/dx_device.cpp b/taichi/backends/dx/dx_device.cpp index abb8364c5617c..436a40c221aee 100644 --- a/taichi/backends/dx/dx_device.cpp +++ b/taichi/backends/dx/dx_device.cpp @@ -79,17 +79,25 @@ Dx11ResourceBinder::~Dx11ResourceBinder() { } Dx11CommandList::Dx11CommandList(Dx11Device *ti_device) : device_(ti_device) { + HRESULT 
hr; + hr = device_->d3d11_device()->CreateDeferredContext(0, + &d3d11_deferred_context_); + check_dx_error(hr, "create deferred context"); } Dx11CommandList::~Dx11CommandList() { + for (ID3D11Buffer *cb : used_spv_workgroup_cb) { + cb->Release(); + } + if (d3d11_command_list_) { + d3d11_command_list_->Release(); + } + d3d11_deferred_context_->Release(); } void Dx11CommandList::bind_pipeline(Pipeline *p) { Dx11Pipeline *pipeline = static_cast(p); - std::unique_ptr cmd = - std::make_unique(this); - cmd->compute_shader_ = pipeline->get_program(); - recorded_commands_.push_back(std::move(cmd)); + d3d11_deferred_context_->CSSetShader(pipeline->get_program(), nullptr, 0); } void Dx11CommandList::bind_resources(ResourceBinder *binder_) { @@ -97,22 +105,28 @@ void Dx11CommandList::bind_resources(ResourceBinder *binder_) { // UAV for (auto &[binding, alloc_id] : binder->uav_binding_to_alloc_id()) { - std::unique_ptr cmd = - std::make_unique(this); ID3D11UnorderedAccessView *uav = device_->alloc_id_to_uav(alloc_id); - cmd->binding = binding; - cmd->uav = uav; - recorded_commands_.push_back(std::move(cmd)); + d3d11_deferred_context_->CSSetUnorderedAccessViews(binding, 1, &uav, + nullptr); } // CBV for (auto &[binding, alloc_id] : binder->cb_binding_to_alloc_id()) { - std::unique_ptr cmd = - std::make_unique(this); - cmd->binding = binding; - cmd->cb_buffer = device_->create_or_get_cb_buffer(alloc_id); - cmd->buffer = device_->alloc_id_to_buffer(alloc_id); - recorded_commands_.push_back(std::move(cmd)); + auto cb_buffer = device_->create_or_get_cb_buffer(alloc_id); + auto buffer = device_->alloc_id_to_buffer(alloc_id); + + D3D11_BUFFER_DESC desc; + buffer->GetDesc(&desc); + D3D11_BOX box{}; + box.left = 0; + box.right = desc.ByteWidth; + box.top = 0; + box.bottom = 1; // 1 past the end! 
+ box.front = 0; + box.back = 1; + d3d11_deferred_context_->CopySubresourceRegion(cb_buffer, 0, 0, 0, 0, + buffer, 0, &box); + d3d11_deferred_context_->CSSetConstantBuffers(binding, 1, &cb_buffer); cb_slot_watermark_ = std::max(cb_slot_watermark_, int(binding)); } @@ -140,68 +154,26 @@ void Dx11CommandList::buffer_copy(DevicePtr dst, DevicePtr src, size_t size) { } void Dx11CommandList::buffer_fill(DevicePtr ptr, size_t size, uint32_t data) { - std::unique_ptr cmd = - std::make_unique(this); ID3D11Buffer *buf = device_->alloc_id_to_buffer(ptr.alloc_id); ID3D11UnorderedAccessView *uav = device_->alloc_id_to_uav(ptr.alloc_id); - cmd->uav = uav; - D3D11_BUFFER_DESC desc; - buf->GetDesc(&desc); - cmd->size = desc.ByteWidth; - recorded_commands_.push_back(std::move(cmd)); -} -void Dx11CommandList::CmdBufferFill::execute() { - ID3D11DeviceContext *context = cmdlist_->device_->d3d11_context(); const UINT values[4] = {data, data, data, data}; - context->ClearUnorderedAccessViewUint(uav, values); -} - -void Dx11CommandList::CmdBindPipeline::execute() { - ID3D11DeviceContext *context = cmdlist_->device_->d3d11_context(); - context->CSSetShader(compute_shader_, nullptr, 0); -} - -void Dx11CommandList::CmdBindUAVBufferToIndex::execute() { - cmdlist_->device_->d3d11_context()->CSSetUnorderedAccessViews(binding, 1, - &uav, nullptr); -} - -void Dx11CommandList::CmdBindConstantBufferToIndex::execute() { - D3D11_BUFFER_DESC desc; - buffer->GetDesc(&desc); - D3D11_BOX box{}; - box.left = 0; - box.right = desc.ByteWidth; - box.top = 0; - box.bottom = 1; // 1 past the end! 
- box.front = 0; - box.back = 1; - cmdlist_->device_->d3d11_context()->CopySubresourceRegion(cb_buffer, 0, 0, 0, - 0, buffer, 0, &box); - cmdlist_->device_->d3d11_context()->CSSetConstantBuffers(binding, 1, - &cb_buffer); -} - -void Dx11CommandList::CmdDispatch::execute() { - cmdlist_->device_->set_spirv_cross_numworkgroups(x, y, z, - spirv_cross_num_wg_cb_slot_); - cmdlist_->device_->d3d11_context()->Dispatch(x, y, z); + d3d11_deferred_context_->ClearUnorderedAccessViewUint(uav, values); } void Dx11CommandList::dispatch(uint32_t x, uint32_t y, uint32_t z) { - std::unique_ptr cmd = std::make_unique(this); - cmd->x = x; - cmd->y = y; - cmd->z = z; - // Set SPIRV_Cross_NumWorkgroups's CB slot based on the watermark - cmd->spirv_cross_num_wg_cb_slot_ = cb_slot_watermark_ + 1; + auto cb_slot = cb_slot_watermark_ + 1; + auto spirv_cross_numworkgroups_cb = + device_->set_spirv_cross_numworkgroups(x, y, z, cb_slot); + d3d11_deferred_context_->CSSetConstantBuffers(cb_slot, 1, + &spirv_cross_numworkgroups_cb); + used_spv_workgroup_cb.push_back(spirv_cross_numworkgroups_cb); // Reset watermark cb_slot_watermark_ = -1; - recorded_commands_.push_back(std::move(cmd)); + d3d11_deferred_context_->Dispatch(x, y, z); } void Dx11CommandList::begin_renderpass(int x0, @@ -260,19 +232,14 @@ void Dx11CommandList::image_to_buffer(DevicePtr dst_buf, } void Dx11CommandList::run_commands() { - for (const auto &cmd : recorded_commands_) { - cmd->execute(); + if (!d3d11_command_list_) { + HRESULT hr; + hr = + d3d11_deferred_context_->FinishCommandList(FALSE, &d3d11_command_list_); + check_dx_error(hr, "error finishing command list"); } -} -int Dx11CommandList::cb_count() { - int ret = 0; - for (const auto &cmd : recorded_commands_) { - if (dynamic_cast(cmd.get()) != nullptr) { - ret++; - } - } - return ret; + device_->d3d11_context()->ExecuteCommandList(d3d11_command_list_, TRUE); } namespace { @@ -739,6 +706,9 @@ void Dx11Device::image_to_buffer(DevicePtr dst_buf, TI_NOT_IMPLEMENTED; } +void 
Dx11Device::wait_idle() { +} + ID3D11Buffer *Dx11Device::alloc_id_to_buffer(uint32_t alloc_id) { return alloc_id_to_buffer_.at(alloc_id); } @@ -766,33 +736,35 @@ ID3D11Buffer *Dx11Device::create_or_get_cb_buffer(uint32_t alloc_id) { return cb_buf; } -void Dx11Device::set_spirv_cross_numworkgroups(uint32_t x, - uint32_t y, - uint32_t z, - int cb_slot) { - if (spirv_cross_numworkgroups_ == nullptr) { - ID3D11Buffer *temp; - create_raw_buffer(device_, 16, nullptr, &temp); - create_cpu_accessible_buffer_copy(device_, temp, - &spirv_cross_numworkgroups_); - temp->Release(); - } - if (spirv_cross_numworkgroups_cb_ == nullptr) { - create_constant_buffer_copy(device_, spirv_cross_numworkgroups_, - &spirv_cross_numworkgroups_cb_); - } +ID3D11Buffer *Dx11Device::set_spirv_cross_numworkgroups(uint32_t x, + uint32_t y, + uint32_t z, + int cb_slot) { + ID3D11Buffer *spirv_cross_numworkgroups; + ID3D11Buffer *temp; + create_raw_buffer(device_, 16, nullptr, &temp); + create_cpu_accessible_buffer_copy(device_, temp, &spirv_cross_numworkgroups); + temp->Release(); + + ID3D11Buffer *spirv_cross_numworkgroups_cb; + create_constant_buffer_copy(device_, spirv_cross_numworkgroups, + &spirv_cross_numworkgroups_cb); D3D11_MAPPED_SUBRESOURCE mapped; - context_->Map(spirv_cross_numworkgroups_, 0, D3D11_MAP_WRITE, 0, &mapped); + d3d11_context()->Map(spirv_cross_numworkgroups, 0, D3D11_MAP_WRITE, 0, + &mapped); uint32_t *u = reinterpret_cast(mapped.pData); u[0] = x; u[1] = y; u[2] = z; - context_->Unmap(spirv_cross_numworkgroups_, 0); + d3d11_context()->Unmap(spirv_cross_numworkgroups, 0); + + d3d11_context()->CopyResource(spirv_cross_numworkgroups_cb, + spirv_cross_numworkgroups); - context_->CopyResource(spirv_cross_numworkgroups_cb_, - spirv_cross_numworkgroups_); - context_->CSSetConstantBuffers(cb_slot, 1, &spirv_cross_numworkgroups_cb_); + spirv_cross_numworkgroups->Release(); + + return spirv_cross_numworkgroups_cb; } Dx11Stream::Dx11Stream(Dx11Device *device_) : device_(device_) { @@ 
-805,15 +777,23 @@ std::unique_ptr Dx11Stream::new_command_list() { return std::make_unique(device_); } -void Dx11Stream::submit(CommandList *cmdlist) { +StreamSemaphore Dx11Stream::submit( + CommandList *cmdlist, + const std::vector &wait_semaphores) { Dx11CommandList *dx_cmd_list = static_cast(cmdlist); dx_cmd_list->run_commands(); + + return nullptr; } // No difference for DX11 -void Dx11Stream::submit_synced(CommandList *cmdlist) { +StreamSemaphore Dx11Stream::submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores) { Dx11CommandList *dx_cmd_list = static_cast(cmdlist); dx_cmd_list->run_commands(); + + return nullptr; } void Dx11Stream::command_sync() { diff --git a/taichi/backends/dx/dx_device.h b/taichi/backends/dx/dx_device.h index e473a140e2370..9b90e7b75c08d 100644 --- a/taichi/backends/dx/dx_device.h +++ b/taichi/backends/dx/dx_device.h @@ -94,8 +94,12 @@ class Dx11Stream : public Stream { ~Dx11Stream() override; std::unique_ptr new_command_list() override; - void submit(CommandList *cmdlist) override; - void submit_synced(CommandList *cmdlist) override; + StreamSemaphore submit( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override; + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override; void command_sync() override; private: @@ -151,62 +155,13 @@ class Dx11CommandList : public CommandList { void run_commands(); private: - struct Cmd { - explicit Cmd(Dx11CommandList *cmdlist) : cmdlist_(cmdlist) { - } - virtual void execute() { - } - Dx11CommandList *cmdlist_; - }; - - struct CmdBufferFill : public Cmd { - explicit CmdBufferFill(Dx11CommandList *cmdlist) : Cmd(cmdlist) { - } - ID3D11UnorderedAccessView *uav{nullptr}; - size_t offset{0}, size{0}; - uint32_t data{0}; - void execute() override; - }; - - struct CmdBindPipeline : public Cmd { - explicit CmdBindPipeline(Dx11CommandList *cmdlist) : Cmd(cmdlist) { - } - ID3D11ComputeShader *compute_shader_{nullptr}; - 
void execute() override; - }; - - struct CmdBindUAVBufferToIndex : public Cmd { - explicit CmdBindUAVBufferToIndex(Dx11CommandList *cmdlist) : Cmd(cmdlist) { - } - ID3D11UnorderedAccessView *uav; // UAV of the buffer - uint32_t binding; // U register; UAV slot - void execute() override; - }; - - struct CmdBindConstantBufferToIndex : public Cmd { - explicit CmdBindConstantBufferToIndex(Dx11CommandList *cmdlist) - : Cmd(cmdlist) { - } - ID3D11Buffer *buffer; // Original buffer, can't be bound to CB slot - ID3D11Buffer *cb_buffer; // Constant buffer-version of buffer, for binding - // to CB slots - uint32_t binding; // CB register; constant buffer slot - void execute() override; - }; - - struct CmdDispatch : public Cmd { - explicit CmdDispatch(Dx11CommandList *cmdlist) : Cmd(cmdlist) { - } - uint32_t x{0}, y{0}, z{0}; - // Constant Buffer slot for SPIRV_Cross_NumWorkgroups - uint32_t spirv_cross_num_wg_cb_slot_{0}; - void execute() override; - }; - - std::vector> recorded_commands_; + ID3D11DeviceContext *d3d11_deferred_context_{nullptr}; + ID3D11CommandList *d3d11_command_list_{nullptr}; + + std::vector used_spv_workgroup_cb; + Dx11Device *device_; int cb_slot_watermark_{-1}; - int cb_count(); }; class Dx11Device : public GraphicsDevice { @@ -247,6 +202,7 @@ class Dx11Device : public GraphicsDevice { DeviceAllocation src_img, ImageLayout img_layout, const BufferImageCopyParams ¶ms) override; + void wait_idle() override; int live_dx11_object_count(); ID3D11DeviceContext *d3d11_context() { @@ -264,10 +220,10 @@ class Dx11Device : public GraphicsDevice { // cb_slot should be 1 after pre-occupied buffers // example: in the presence of args_t, cb_slot will be cb0 // in the absence of args_t, cb_slot will be cb0 - void set_spirv_cross_numworkgroups(uint32_t x, - uint32_t y, - uint32_t z, - int cb_slot); + ID3D11Buffer *set_spirv_cross_numworkgroups(uint32_t x, + uint32_t y, + uint32_t z, + int cb_slot); private: void create_dx11_device(); @@ -286,8 +242,6 @@ class 
Dx11Device : public GraphicsDevice { int alloc_serial_; Dx11Stream *stream_; - ID3D11Buffer *spirv_cross_numworkgroups_{}, *spirv_cross_numworkgroups_cb_{}; - // temporary debug use std::unordered_map mapped_; }; diff --git a/taichi/backends/metal/device.cpp b/taichi/backends/metal/device.cpp index e46808d7c2c0d..d24605eecc501 100644 --- a/taichi/backends/metal/device.cpp +++ b/taichi/backends/metal/device.cpp @@ -215,11 +215,15 @@ class StreamImpl : public Stream { return std::make_unique(std::move(cb), alloc_buf_mapper_); } - void submit(CommandList *cmdlist) override { + StreamSemaphore submit( + CommandList *cmdlist, + const std::vector &wait_semaphores) override { auto *cb = static_cast(cmdlist)->command_buffer(); commit_command_buffer(cb); } - void submit_synced(CommandList *cmdlist) override { + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores) override { auto *cb = static_cast(cmdlist)->command_buffer(); commit_command_buffer(cb); wait_until_completed(cb); @@ -340,6 +344,9 @@ class DeviceImpl : public Device, public AllocToMTLBufferMapper { return bm; } + void wait_idle() override { + } + private: struct AllocationInternal { std::unique_ptr buffer_mem{nullptr}; diff --git a/taichi/backends/opengl/opengl_device.cpp b/taichi/backends/opengl/opengl_device.cpp index 6794c53da666e..2eda864bab310 100644 --- a/taichi/backends/opengl/opengl_device.cpp +++ b/taichi/backends/opengl/opengl_device.cpp @@ -431,14 +431,24 @@ std::unique_ptr GLStream::new_command_list() { return std::make_unique(); } -void GLStream::submit(CommandList *_cmdlist) { +StreamSemaphore GLStream::submit( + CommandList *_cmdlist, + const std::vector &wait_semaphores) { GLCommandList *cmdlist = static_cast(_cmdlist); cmdlist->run_commands(); + + // OpenGL is fully serial + return nullptr; } -void GLStream::submit_synced(CommandList *cmdlist) { +StreamSemaphore GLStream::submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores) { 
submit(cmdlist); glFinish(); + + // OpenGL is fully serial + return nullptr; } void GLStream::command_sync() { glFinish(); @@ -559,6 +569,9 @@ Stream *GLDevice::get_graphics_stream() { return nullptr; } +void GLDevice::wait_idle() { +} + std::unique_ptr GLDevice::create_surface(const SurfaceConfig &config) { TI_NOT_IMPLEMENTED; return nullptr; @@ -634,12 +647,18 @@ GLSurface::~GLSurface() { TI_NOT_IMPLEMENTED; } +StreamSemaphore GLSurface::acquire_next_image() { + TI_NOT_IMPLEMENTED; + return nullptr; +} + DeviceAllocation GLSurface::get_target_image() { TI_NOT_IMPLEMENTED; return kDeviceNullAllocation; } -void GLSurface::present_image() { +void GLSurface::present_image( + const std::vector &wait_semaphores) { TI_NOT_IMPLEMENTED; } diff --git a/taichi/backends/opengl/opengl_device.h b/taichi/backends/opengl/opengl_device.h index b32b848930650..99eb1cba77eb5 100644 --- a/taichi/backends/opengl/opengl_device.h +++ b/taichi/backends/opengl/opengl_device.h @@ -198,8 +198,12 @@ class GLStream : public Stream { ~GLStream() override; std::unique_ptr new_command_list() override; - void submit(CommandList *cmdlist) override; - void submit_synced(CommandList *cmdlist) override; + StreamSemaphore submit( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override; + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override; void command_sync() override; }; @@ -237,6 +241,8 @@ class GLDevice : public GraphicsDevice { Stream *get_graphics_stream() override; + void wait_idle() override; + std::unique_ptr create_surface(const SurfaceConfig &config) override; DeviceAllocation create_image(const ImageParams ¶ms) override; void destroy_image(DeviceAllocation handle) override; @@ -272,8 +278,10 @@ class GLSurface : public Surface { public: ~GLSurface() override; + StreamSemaphore acquire_next_image() override; DeviceAllocation get_target_image() override; - void present_image() override; + void present_image( + const 
std::vector &wait_semaphores = {}) override; std::pair get_size() override; BufferFormat image_format() override; void resize(uint32_t width, uint32_t height) override; diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index 406eaab429ce0..f627f2b01fb7b 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -1298,6 +1298,18 @@ DeviceAllocation VulkanDevice::allocate_memory(const AllocParams ¶ms) { if (params.usage & AllocUsage::Index) { buffer_info.usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT; } + buffer_info.sharingMode = VK_SHARING_MODE_CONCURRENT; + + uint32_t queue_family_indices[] = {compute_queue_family_index_, + graphics_queue_family_index_}; + + if (compute_queue_family_index_ == graphics_queue_family_index_) { + buffer_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + } else { + buffer_info.sharingMode = VK_SHARING_MODE_CONCURRENT; + buffer_info.queueFamilyIndexCount = 2; + buffer_info.pQueueFamilyIndices = queue_family_indices; + } VkExternalMemoryBufferCreateInfo external_mem_buffer_create_info = {}; external_mem_buffer_create_info.sType = @@ -1489,6 +1501,15 @@ Stream *VulkanDevice::get_graphics_stream() { } } +void VulkanDevice::wait_idle() { + for (auto &[tid, stream] : compute_stream_) { + stream->command_sync(); + } + for (auto &[tid, stream] : graphics_stream_) { + stream->command_sync(); + } +} + std::unique_ptr VulkanStream::new_command_list() { vkapi::IVkCommandBuffer buffer = vkapi::allocate_command_buffer(command_pool_); @@ -1496,7 +1517,9 @@ std::unique_ptr VulkanStream::new_command_list() { return std::make_unique(&device_, this, buffer); } -void VulkanStream::submit(CommandList *cmdlist_) { +StreamSemaphore VulkanStream::submit( + CommandList *cmdlist_, + const std::vector &wait_semaphores) { VulkanCommandList *cmdlist = static_cast(cmdlist_); vkapi::IVkCommandBuffer buffer = cmdlist->finalize(); @@ -1507,67 +1530,66 @@ void 
VulkanStream::submit(CommandList *cmdlist_) { } */ - VkPipelineStageFlags stage_flag{VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; - VkSubmitInfo submit_info{}; submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; submit_info.commandBufferCount = 1; submit_info.pCommandBuffers = &buffer->buffer; - if (last_semaphore_) { - submit_info.waitSemaphoreCount = 1; - submit_info.pWaitSemaphores = &last_semaphore_->semaphore; - submit_info.pWaitDstStageMask = &stage_flag; + std::vector vk_wait_semaphores; + std::vector vk_wait_stages; + + for (const StreamSemaphore &sema_ : wait_semaphores) { + auto sema = std::static_pointer_cast(sema_); + vk_wait_semaphores.push_back(sema->vkapi_ref->semaphore); + vk_wait_stages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT); + buffer->refs.push_back(sema->vkapi_ref); } + submit_info.pWaitSemaphores = vk_wait_semaphores.data(); + submit_info.waitSemaphoreCount = vk_wait_semaphores.size(); + submit_info.pWaitDstStageMask = vk_wait_stages.data(); + auto semaphore = vkapi::create_semaphore(buffer->device, 0); - last_semaphore_ = semaphore; buffer->refs.push_back(semaphore); submit_info.signalSemaphoreCount = 1; submit_info.pSignalSemaphores = &semaphore->semaphore; - submitted_cmdbuffers_.push_back(buffer); - - BAIL_ON_VK_BAD_RESULT(vkQueueSubmit(queue_, /*submitCount=*/1, &submit_info, - /*fence=*/VK_NULL_HANDLE), - "failed to submit command buffer"); -} - -void VulkanStream::submit_synced(CommandList *cmdlist) { - vkapi::IVkCommandBuffer buffer = - static_cast(cmdlist)->finalize(); + auto fence = vkapi::create_fence(buffer->device, 0); - VkSubmitInfo submit_info{}; - submit_info.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO; - submit_info.commandBufferCount = 1; - submit_info.pCommandBuffers = &buffer->buffer; - - VkPipelineStageFlags stage_flag{VK_PIPELINE_STAGE_ALL_COMMANDS_BIT}; + // Resource tracking, check previously submitted commands + // FIXME: Figure out why it doesn't work + /* + std::remove_if(submitted_cmdbuffers_.begin(), 
submitted_cmdbuffers_.end(), + [&](const TrackedCmdbuf &tracked) { + // If fence is signaled, cmdbuf has completed + VkResult res = + vkGetFenceStatus(buffer->device, tracked.fence->fence); + return res == VK_SUCCESS; + }); + */ - if (last_semaphore_) { - submit_info.waitSemaphoreCount = 1; - submit_info.pWaitSemaphores = &last_semaphore_->semaphore; - submit_info.pWaitDstStageMask = &stage_flag; - } + submitted_cmdbuffers_.push_back(TrackedCmdbuf{fence, buffer}); BAIL_ON_VK_BAD_RESULT(vkQueueSubmit(queue_, /*submitCount=*/1, &submit_info, - /*fence=*/cmd_sync_fence_->fence), + /*fence=*/fence->fence), "failed to submit command buffer"); - vkWaitForFences(device_.vk_device(), 1, &cmd_sync_fence_->fence, true, - UINT64_MAX); - vkResetFences(device_.vk_device(), 1, &cmd_sync_fence_->fence); + return std::make_shared(semaphore); +} - submitted_cmdbuffers_.clear(); - last_semaphore_ = nullptr; +StreamSemaphore VulkanStream::submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores) { + auto sema = submit(cmdlist, wait_semaphores); + command_sync(); + return sema; } void VulkanStream::command_sync() { vkQueueWaitIdle(queue_); submitted_cmdbuffers_.clear(); - last_semaphore_ = nullptr; } std::unique_ptr VulkanDevice::create_raster_pipeline( @@ -1714,7 +1736,17 @@ DeviceAllocation VulkanDevice::create_image(const ImageParams ¶ms) { image_info.usage |= VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT; } image_info.samples = VK_SAMPLE_COUNT_1_BIT; - image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + + uint32_t queue_family_indices[] = {compute_queue_family_index_, + graphics_queue_family_index_}; + + if (compute_queue_family_index_ == graphics_queue_family_index_) { + image_info.sharingMode = VK_SHARING_MODE_EXCLUSIVE; + } else { + image_info.sharingMode = VK_SHARING_MODE_CONCURRENT; + image_info.queueFamilyIndexCount = 2; + image_info.pQueueFamilyIndices = queue_family_indices; + } alloc.format = image_info.format; @@ -2088,12 +2120,7 @@ 
VulkanSurface::VulkanSurface(VulkanDevice *device, const SurfaceConfig &config) create_swap_chain(); - VkSemaphoreCreateInfo sema_create_info; - sema_create_info.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO; - sema_create_info.pNext = nullptr; - sema_create_info.flags = 0; - vkCreateSemaphore(device->vk_device(), &sema_create_info, - kNoVkAllocCallbacks, &image_available_); + image_available_ = vkapi::create_semaphore(device->vk_device(), 0); } else { ImageParams params = {ImageDimension::d2D, BufferFormat::rgba8, @@ -2246,7 +2273,7 @@ int VulkanSurface::get_image_count() { VulkanSurface::~VulkanSurface() { if (config_.window_handle) { destroy_swap_chain(); - vkDestroySemaphore(device_->vk_device(), image_available_, nullptr); + image_available_ = nullptr; vkDestroySurfaceKHR(device_->vk_instance(), surface_, nullptr); } else { for (auto &img : swapchain_images_) { @@ -2278,14 +2305,19 @@ std::pair VulkanSurface::get_size() { return std::make_pair(width, height); } -DeviceAllocation VulkanSurface::get_target_image() { +StreamSemaphore VulkanSurface::acquire_next_image() { if (!config_.window_handle) { image_index_ = (image_index_ + 1) % swapchain_images_.size(); + return nullptr; } else { vkAcquireNextImageKHR(device_->vk_device(), swapchain_, UINT64_MAX, - image_available_, VK_NULL_HANDLE, &image_index_); + image_available_->semaphore, VK_NULL_HANDLE, + &image_index_); + return std::make_shared(image_available_); } +} +DeviceAllocation VulkanSurface::get_target_image() { return swapchain_images_[image_index_]; } @@ -2293,20 +2325,27 @@ BufferFormat VulkanSurface::image_format() { return image_format_; } -void VulkanSurface::present_image() { - // TODO: In the future tie the wait semaphores. 
- // Currently we should just halt and wait on host before present - vkDeviceWaitIdle(device_->vk_device()); +void VulkanSurface::present_image( + const std::vector &wait_semaphores) { + std::vector vk_wait_semaphores; + + for (const StreamSemaphore &sema_ : wait_semaphores) { + auto sema = std::static_pointer_cast(sema_); + vk_wait_semaphores.push_back(sema->vkapi_ref->semaphore); + } + VkPresentInfoKHR presentInfo{}; presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; presentInfo.waitSemaphoreCount = 1; - presentInfo.pWaitSemaphores = &image_available_; - presentInfo.swapchainCount = 1; + presentInfo.pWaitSemaphores = vk_wait_semaphores.data(); + presentInfo.swapchainCount = vk_wait_semaphores.size(); presentInfo.pSwapchains = &swapchain_; presentInfo.pImageIndices = &image_index_; presentInfo.pResults = nullptr; vkQueuePresentKHR(device_->graphics_queue(), &presentInfo); + + device_->wait_idle(); } DeviceAllocation VulkanSurface::get_image_data() { @@ -2380,8 +2419,6 @@ VulkanStream::VulkanStream(VulkanDevice &device, command_pool_ = vkapi::create_command_pool( device_.vk_device(), VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT, queue_family_index); - - cmd_sync_fence_ = vkapi::create_fence(device_.vk_device(), 0); } VulkanStream::~VulkanStream() { diff --git a/taichi/backends/vulkan/vulkan_device.h b/taichi/backends/vulkan/vulkan_device.h index 9f232f8f60065..3fd3a3a75ce10 100644 --- a/taichi/backends/vulkan/vulkan_device.h +++ b/taichi/backends/vulkan/vulkan_device.h @@ -428,9 +428,11 @@ class VulkanSurface : public Surface { VulkanSurface(VulkanDevice *device, const SurfaceConfig &config); ~VulkanSurface(); + StreamSemaphore acquire_next_image() override; DeviceAllocation get_target_image() override; - void present_image() override; + void present_image( + const std::vector &wait_semaphores = {}) override; std::pair get_size() override; int get_image_count() override; BufferFormat image_format() override; @@ -447,7 +449,7 @@ class VulkanSurface : public 
Surface { VulkanDevice *device_; VkSurfaceKHR surface_; VkSwapchainKHR swapchain_; - VkSemaphore image_available_; + vkapi::IVkSemaphore image_available_; #ifdef ANDROID ANativeWindow *window_; #elif !defined(TI_EMSCRIPTENED) @@ -472,6 +474,16 @@ struct DescPool { } }; +class VulkanStreamSemaphoreObject : public StreamSemaphoreObject { + public: + VulkanStreamSemaphoreObject(vkapi::IVkSemaphore sema) : vkapi_ref(sema) { + } + ~VulkanStreamSemaphoreObject() { + } + + vkapi::IVkSemaphore vkapi_ref{nullptr}; +}; + class VulkanStream : public Stream { public: VulkanStream(VulkanDevice &device, @@ -480,25 +492,33 @@ class VulkanStream : public Stream { ~VulkanStream(); std::unique_ptr new_command_list() override; - void submit(CommandList *cmdlist) override; - void submit_synced(CommandList *cmdlist) override; + StreamSemaphore submit( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override; + StreamSemaphore submit_synced( + CommandList *cmdlist, + const std::vector &wait_semaphores = {}) override; void command_sync() override; private: + struct TrackedCmdbuf { + vkapi::IVkFence fence; + vkapi::IVkCommandBuffer buf; + }; + VulkanDevice &device_; VkQueue queue_; uint32_t queue_family_index_; - vkapi::IVkSemaphore last_semaphore_{nullptr}; - // Command pools are per-thread - vkapi::IVkFence cmd_sync_fence_; vkapi::IVkCommandPool command_pool_; - std::vector submitted_cmdbuffers_; + std::vector submitted_cmdbuffers_; }; class VulkanDevice : public GraphicsDevice { + friend VulkanSurface; + public: struct Params { VkInstance instance; @@ -535,6 +555,8 @@ class VulkanDevice : public GraphicsDevice { Stream *get_compute_stream() override; Stream *get_graphics_stream() override; + void wait_idle() override; + std::unique_ptr create_raster_pipeline( const std::vector &src, const RasterParams &raster_params, diff --git a/taichi/backends/vulkan/vulkan_device_creator.cpp b/taichi/backends/vulkan/vulkan_device_creator.cpp index 0253f9e5febfc..030398947b2d8 
100644 --- a/taichi/backends/vulkan/vulkan_device_creator.cpp +++ b/taichi/backends/vulkan/vulkan_device_creator.cpp @@ -117,11 +117,10 @@ VulkanQueueFamilyIndices find_queue_families(VkPhysicalDevice device, (~(VK_QUEUE_TRANSFER_BIT | VK_QUEUE_SPARSE_BINDING_BIT)); // first try and find a queue that has just the compute bit set - // FIXME: Actually create two queues (async compute & graphics if supported) for (int i = 0; i < (int)queue_family_count; ++i) { const VkQueueFlags masked_flags = kFlagMask & queue_families[i].queueFlags; if ((masked_flags & VK_QUEUE_COMPUTE_BIT) && - (masked_flags & VK_QUEUE_GRAPHICS_BIT)) { + !(masked_flags & VK_QUEUE_GRAPHICS_BIT)) { indices.compute_family = i; } if (masked_flags & VK_QUEUE_GRAPHICS_BIT) { @@ -139,6 +138,8 @@ VulkanQueueFamilyIndices find_queue_families(VkPhysicalDevice device, } if (indices.is_complete() && indices.is_complete_for_ui()) { + TI_INFO("Async compute queue {}, graphics queue {}", + indices.compute_family.value(), indices.graphics_family.value()); return indices; } } @@ -668,12 +669,8 @@ void VulkanDeviceCreator::create_logical_device() { } // F16 / I8 -#ifdef __APPLE__ - { -#else if (CHECK_VERSION(1, 2) || CHECK_EXTENSION(VK_KHR_SHADER_FLOAT16_INT8_EXTENSION_NAME)) { -#endif features2.pNext = &shader_f16_i8_feature; vkGetPhysicalDeviceFeatures2KHR(physical_device_, &features2); @@ -683,10 +680,6 @@ void VulkanDeviceCreator::create_logical_device() { if (shader_f16_i8_feature.shaderInt8) { ti_device_->set_cap(DeviceCapability::spirv_has_int8, true); } - if (portability_subset_enabled) { - // TODO: investigate why MoltenVK isn't reporting int8 caps. 
See #3252 - ti_device_->set_cap(DeviceCapability::spirv_has_int8, true); - } *pNextEnd = &shader_f16_i8_feature; pNextEnd = &shader_f16_i8_feature.pNext; } diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h index b3b33348c525f..febd0c658e703 100644 --- a/taichi/backends/vulkan/vulkan_program.h +++ b/taichi/backends/vulkan/vulkan_program.h @@ -54,6 +54,10 @@ class VulkanProgramImpl : public ProgramImpl { vulkan_runtime_->synchronize(); } + StreamSemaphore flush() override { + return vulkan_runtime_->flush(); + } + std::unique_ptr make_aot_module_builder() override; virtual void destroy_snode_tree(SNodeTree *snode_tree) override { diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 3dba5f3d09d20..9904ed9912d70 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -245,6 +245,10 @@ void Program::synchronize() { } } +StreamSemaphore Program::flush() { + return program_impl_->flush(); +} + void Program::async_flush() { if (!config.async_mode) { TI_WARN("No point calling async_flush() when async mode is disabled."); diff --git a/taichi/program/program.h b/taichi/program/program.h index 48178ba7409e4..8ceffa659c047 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -156,6 +156,8 @@ class TI_DLL_EXPORT Program { void synchronize(); + StreamSemaphore flush(); + // See AsyncEngine::flush(). // Only useful when async mode is enabled. void async_flush(); diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h index 6ba07b56c850f..cd9d9240600f4 100644 --- a/taichi/program/program_impl.h +++ b/taichi/program/program_impl.h @@ -60,6 +60,11 @@ class ProgramImpl { */ virtual void synchronize() = 0; + virtual StreamSemaphore flush() { + synchronize(); + return nullptr; + } + /** * Make a AotModulerBuilder, currently only supported by metal and wasm. 
*/ diff --git a/taichi/runtime/vulkan/runtime.cpp b/taichi/runtime/vulkan/runtime.cpp index 82478b05bf427..bc1fca35c7c76 100644 --- a/taichi/runtime/vulkan/runtime.cpp +++ b/taichi/runtime/vulkan/runtime.cpp @@ -139,7 +139,8 @@ class HostDeviceContextBlitter { bool device_to_host( CommandList *cmdlist, const std::unordered_map &ext_arrays, - const std::unordered_map &ext_arr_size) { + const std::unordered_map &ext_arr_size, + const std::vector &wait_semaphore) { if (ctx_attribs_->empty()) { return false; } @@ -520,9 +521,11 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { } // If we need to host sync, sync and remove in-flight references + std::vector wait_semaphore; + if (ctx_blitter) { if (ctx_blitter->device_to_host(current_cmdlist_.get(), any_arrays, - ext_array_size)) { + ext_array_size, wait_semaphore)) { current_cmdlist_ = nullptr; ctx_buffers_.clear(); } @@ -536,8 +539,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { auto duration = high_res_clock::now() - current_cmdlist_pending_since_; if (std::chrono::duration_cast(duration) .count() > max_pending_time) { - device_->get_compute_stream()->submit(current_cmdlist_.get()); - current_cmdlist_ = nullptr; + flush(); } } @@ -552,12 +554,22 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { } void VkRuntime::synchronize() { + flush(); + device_->wait_idle(); + ctx_buffers_.clear(); +} + +StreamSemaphore VkRuntime::flush() { + StreamSemaphore sema; if (current_cmdlist_) { - device_->get_compute_stream()->submit(current_cmdlist_.get()); + sema = device_->get_compute_stream()->submit(current_cmdlist_.get()); current_cmdlist_ = nullptr; + } else { + auto cmdlist = device_->get_compute_stream()->new_command_list(); + cmdlist->memory_barrier(); + sema = device_->get_compute_stream()->submit(cmdlist.get()); } - device_->get_compute_stream()->command_sync(); - ctx_buffers_.clear(); + return sema; } Device 
*VkRuntime::get_ti_device() const { diff --git a/taichi/runtime/vulkan/runtime.h b/taichi/runtime/vulkan/runtime.h index fecf9812ccb4f..c4f356783bcea 100644 --- a/taichi/runtime/vulkan/runtime.h +++ b/taichi/runtime/vulkan/runtime.h @@ -102,6 +102,8 @@ class TI_DLL_EXPORT VkRuntime { void synchronize(); + StreamSemaphore flush(); + Device *get_ti_device() const; void add_root_buffer(size_t root_buffer_size); diff --git a/taichi/ui/backends/vulkan/renderable.cpp b/taichi/ui/backends/vulkan/renderable.cpp index 626ea473decba..a17451899c941 100644 --- a/taichi/ui/backends/vulkan/renderable.cpp +++ b/taichi/ui/backends/vulkan/renderable.cpp @@ -45,7 +45,7 @@ void Renderable::update_data(const RenderableInfo &info) { // load AOT modules Program *prog = app_context_->prog(); if (prog) { - prog->synchronize(); + prog->flush(); } int num_vertices = info.vbo.shape[0]; diff --git a/taichi/ui/backends/vulkan/renderables/set_image.cpp b/taichi/ui/backends/vulkan/renderables/set_image.cpp index 27c8e0609f959..ea5da2b9ee977 100644 --- a/taichi/ui/backends/vulkan/renderables/set_image.cpp +++ b/taichi/ui/backends/vulkan/renderables/set_image.cpp @@ -31,8 +31,9 @@ void SetImage::update_data(const SetImageInfo &info) { // We might not have a current program if GGUI is used in external apps to // load AOT modules Program *prog = app_context_->prog(); + StreamSemaphore data_ready_sema{nullptr}; if (prog) { - prog->synchronize(); + data_ready_sema = prog->flush(); } const FieldInfo &img = info.img; @@ -96,7 +97,11 @@ void SetImage::update_data(const SetImageInfo &info) { cmd_list->image_transition(texture_, ImageLayout::transfer_dst, ImageLayout::shader_read); - stream->submit_synced(cmd_list.get()); + if (data_ready_sema) { + stream->submit(cmd_list.get(), {data_ready_sema}); + } else { + stream->submit(cmd_list.get()); + } } SetImage::SetImage(AppContext *app_context, VertexAttributes vbo_attrs) { diff --git a/taichi/ui/backends/vulkan/renderer.cpp 
b/taichi/ui/backends/vulkan/renderer.cpp index 1d937a4868c2d..2403775fbeb12 100644 --- a/taichi/ui/backends/vulkan/renderer.cpp +++ b/taichi/ui/backends/vulkan/renderer.cpp @@ -112,6 +112,7 @@ void Renderer::scene(Scene *scene) { } void Renderer::cleanup() { + render_complete_semaphore_ = nullptr; for (auto &renderable : renderables_) { renderable->cleanup(); } @@ -130,6 +131,7 @@ void Renderer::draw_frame(Gui *gui) { bool color_clear = true; std::vector clear_colors = {background_color_[0], background_color_[1], background_color_[2], 1}; + auto semaphore = swap_chain_.surface().acquire_next_image(); auto image = swap_chain_.surface().get_target_image(); auto depth_image = swap_chain_.depth_allocation(); cmd_list->begin_renderpass( @@ -155,7 +157,21 @@ void Renderer::draw_frame(Gui *gui) { gui->draw(cmd_list.get()); cmd_list->end_renderpass(); - stream->submit_synced(cmd_list.get()); + + std::vector wait_semaphores; + + if (app_context_.prog()) { + auto sema = app_context_.prog()->flush(); + if (sema) { + wait_semaphores.push_back(sema); + } + } + + if (semaphore) { + wait_semaphores.push_back(semaphore); + } + + render_complete_semaphore_ = stream->submit(cmd_list.get(), wait_semaphores); } const AppContext &Renderer::app_context() const { @@ -174,6 +190,10 @@ SwapChain &Renderer::swap_chain() { return swap_chain_; } +taichi::lang::StreamSemaphore Renderer::get_render_complete_semaphore() { + return std::move(render_complete_semaphore_); +} + } // namespace vulkan TI_UI_NAMESPACE_END diff --git a/taichi/ui/backends/vulkan/renderer.h b/taichi/ui/backends/vulkan/renderer.h index 53656592d1d86..25c866145b5df 100644 --- a/taichi/ui/backends/vulkan/renderer.h +++ b/taichi/ui/backends/vulkan/renderer.h @@ -71,12 +71,16 @@ class TI_DLL_EXPORT Renderer { const SwapChain &swap_chain() const; SwapChain &swap_chain(); + taichi::lang::StreamSemaphore get_render_complete_semaphore(); + private: glm::vec3 background_color_ = glm::vec3(0.f, 0.f, 0.f); std::vector> renderables_; 
int next_renderable_; + taichi::lang::StreamSemaphore render_complete_semaphore_{nullptr}; + SwapChain swap_chain_; AppContext app_context_; diff --git a/taichi/ui/backends/vulkan/window.cpp b/taichi/ui/backends/vulkan/window.cpp index 397a41a3ddb6e..b7a41c752f15b 100644 --- a/taichi/ui/backends/vulkan/window.cpp +++ b/taichi/ui/backends/vulkan/window.cpp @@ -80,7 +80,8 @@ void Window::draw_frame() { } void Window::present_frame() { - renderer_->swap_chain().surface().present_image(); + renderer_->swap_chain().surface().present_image( + {renderer_->get_render_complete_semaphore()}); } Window::~Window() { From d096f15a272833c557c769a551f975fd8a8b554c Mon Sep 17 00:00:00 2001 From: yekuang Date: Thu, 28 Apr 2022 12:57:12 +0800 Subject: [PATCH 013/176] [metal] Complete Device API (#4862) * [metal] Complete Device API * fix * fix --- taichi/backends/metal/api.cpp | 11 ++++++++ taichi/backends/metal/api.h | 7 +++++ taichi/backends/metal/device.cpp | 46 +++++++++++++++++++++++++------- taichi/backends/metal/device.h | 7 ++++- 4 files changed, 61 insertions(+), 10 deletions(-) diff --git a/taichi/backends/metal/api.cpp b/taichi/backends/metal/api.cpp index f5ee4ba6a86fd..23b7270ea5621 100644 --- a/taichi/backends/metal/api.cpp +++ b/taichi/backends/metal/api.cpp @@ -168,6 +168,17 @@ void fill_buffer(MTLBlitCommandEncoder *encoder, call(encoder, "fillBuffer:bufferrange:rangevalue:", buffer, range, value); } +void copy_from_buffer_to_buffer(MTLBlitCommandEncoder *encoder, + MTLBuffer *source_buffer, + size_t source_offset, + MTLBuffer *destination_buffer, + size_t destination_offset, + size_t size) { + call(encoder, "copyFromBuffer:sourceOffset:toBuffer:destinationOffset:size:", + source_buffer, source_offset, destination_buffer, destination_offset, + size); +} + #endif // TI_PLATFORM_OSX bool is_metal_api_available() { diff --git a/taichi/backends/metal/api.h b/taichi/backends/metal/api.h index 8d10431542fd8..7073184ae26c9 100644 --- a/taichi/backends/metal/api.h +++ 
b/taichi/backends/metal/api.h @@ -155,6 +155,13 @@ void fill_buffer(MTLBlitCommandEncoder *encoder, mac::TI_NSRange range, uint8_t value); +void copy_from_buffer_to_buffer(MTLBlitCommandEncoder *encoder, + MTLBuffer *source_buffer, + size_t source_offset, + MTLBuffer *destination_buffer, + size_t destination_offset, + size_t size); + size_t get_max_total_threads_per_threadgroup( MTLComputePipelineState *pipeline_state); #endif // TI_PLATFORM_OSX diff --git a/taichi/backends/metal/device.cpp b/taichi/backends/metal/device.cpp index d24605eecc501..aa75d61847625 100644 --- a/taichi/backends/metal/device.cpp +++ b/taichi/backends/metal/device.cpp @@ -18,6 +18,7 @@ class ResourceBinderImpl : public ResourceBinder { DeviceAllocationId alloc_id{0}; // Not sure if this info is necessary yet. // TODO: Make it an enum? + uint64_t offset{0}; [[maybe_unused]] bool is_constant{false}; }; using BindingMap = std::unordered_map; @@ -34,12 +35,12 @@ class ResourceBinderImpl : public ResourceBinder { uint32_t binding, DevicePtr ptr, size_t size) override { - TI_NOT_IMPLEMENTED; + bind_buffer(set, binding, ptr, ptr.offset, /*is_constant=*/false); } void rw_buffer(uint32_t set, uint32_t binding, DeviceAllocation alloc) override { - bind_buffer(set, binding, alloc, /*is_constant=*/false); + bind_buffer(set, binding, alloc, /*offset=*/0, /*is_constant=*/false); } // Constant buffers @@ -47,10 +48,10 @@ class ResourceBinderImpl : public ResourceBinder { uint32_t binding, DevicePtr ptr, size_t size) override { - TI_NOT_IMPLEMENTED; + bind_buffer(set, binding, ptr, ptr.offset, /*is_constant=*/false); } void buffer(uint32_t set, uint32_t binding, DeviceAllocation alloc) override { - bind_buffer(set, binding, alloc, /*is_constant=*/true); + bind_buffer(set, binding, alloc, /*offset=*/0, /*is_constant=*/true); } const BindingMap &binding_map() const { @@ -60,11 +61,12 @@ class ResourceBinderImpl : public ResourceBinder { private: void bind_buffer(uint32_t set, uint32_t binding, - 
DeviceAllocation alloc, + const DeviceAllocation &alloc, + uint64_t offset, bool is_constant) { TI_ASSERT(set == 0); TI_ASSERT(alloc.device == dev_); - binding_map_[binding] = {alloc.alloc_id, is_constant}; + binding_map_[binding] = {alloc.alloc_id, offset, is_constant}; } const Device *const dev_; @@ -136,9 +138,25 @@ class CommandListImpl : public CommandList { } void buffer_copy(DevicePtr dst, DevicePtr src, size_t size) override { + TI_ERROR_IF(dst.device != src.device, + "dst and src must be from the same MTLDevice"); + TI_ERROR_IF(inflight_compute_builder_.has_value(), "Inflight compute"); + auto *dst_buf = alloc_buf_mapper_->find(dst).buffer; + TI_ASSERT(dst_buf != nullptr); + auto *src_buf = alloc_buf_mapper_->find(src).buffer; + TI_ASSERT(src_buf != nullptr); + auto encoder = new_blit_command_encoder(command_buffer_.get()); + TI_ASSERT(encoder != nullptr); + if (!inflight_label_.empty()) { + metal::set_label(encoder.get(), inflight_label_); + } + copy_from_buffer_to_buffer(encoder.get(), src_buf, src.offset, dst_buf, + dst.offset, size); + finish_encoder(encoder.get()); } void buffer_fill(DevicePtr ptr, size_t size, uint32_t data) override { + TI_ERROR_IF(inflight_compute_builder_.has_value(), "Inflight compute"); if ((data & 0xff) != data) { // TODO: Maybe create a shader just for this filling purpose? 
TI_ERROR("Metal can only support 8-bit data for buffer_fill"); @@ -146,7 +164,9 @@ class CommandListImpl : public CommandList { } auto encoder = new_blit_command_encoder(command_buffer_.get()); TI_ASSERT(encoder != nullptr); - metal::set_label(encoder.get(), inflight_label_); + if (!inflight_label_.empty()) { + metal::set_label(encoder.get(), inflight_label_); + } auto *buf = alloc_buf_mapper_->find(ptr).buffer; TI_ASSERT(buf != nullptr); mac::TI_NSRange range; @@ -170,6 +190,11 @@ class CommandListImpl : public CommandList { auto ceil_div = [](uint32_t a, uint32_t b) -> uint32_t { return (a + b - 1) / b; }; + for (const auto &[idx, b] : builder.binding_map) { + auto *buf = alloc_buf_mapper_->find(b.alloc_id).buffer; + TI_ASSERT(buf != nullptr); + set_mtl_buffer(encoder.get(), buf, b.offset, idx); + } const auto num_blocks_x = ceil_div(grid_size.x, block_size.x); const auto num_blocks_y = ceil_div(grid_size.y, block_size.y); const auto num_blocks_z = ceil_div(grid_size.z, block_size.z); @@ -333,9 +358,9 @@ class DeviceImpl : public Device, public AllocToMTLBufferMapper { return stream_.get(); } - BufferAndMem find(DeviceAllocation alloc) const override { + BufferAndMem find(DeviceAllocationId alloc_id) const override { BufferAndMem bm; - auto itr = allocations_.find(alloc.alloc_id); + auto itr = allocations_.find(alloc_id); if (itr == allocations_.end()) { return bm; } @@ -343,6 +368,9 @@ class DeviceImpl : public Device, public AllocToMTLBufferMapper { bm.mem = itr->second.buffer_mem.get(); return bm; } + // Un-shadow the overload from the base class + // https://stackoverflow.com/a/34466458/12003165 + using AllocToMTLBufferMapper::find; void wait_idle() override { } diff --git a/taichi/backends/metal/device.h b/taichi/backends/metal/device.h index c6fd695a728ab..b4ad0ce904f6e 100644 --- a/taichi/backends/metal/device.h +++ b/taichi/backends/metal/device.h @@ -32,7 +32,12 @@ class AllocToMTLBufferMapper { MTLBuffer *buffer{nullptr}; BufferMemoryView *mem{nullptr}; 
}; - virtual BufferAndMem find(DeviceAllocation alloc) const = 0; + + virtual BufferAndMem find(DeviceAllocationId alloc_id) const = 0; + + BufferAndMem find(DeviceAllocation alloc) const { + return find(alloc.alloc_id); + } }; struct MakeDeviceResult { From ca81ba3dc4b7431638c007fd29b8ea6efce374c5 Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Thu, 28 Apr 2022 15:45:31 +0800 Subject: [PATCH 014/176] [Doc] Updated links that may break. (#4874) * Updated logo * Updated links that may break when the doc site has versions * Added information that numpy arrays and torch tensors can be passed as arguments * Fixed a broken link. --- .../lang/articles/contribution/doc_writing.md | 4 ++-- docs/lang/articles/faq.md | 24 ++++++++++++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/docs/lang/articles/contribution/doc_writing.md b/docs/lang/articles/contribution/doc_writing.md index 7750e796bd0d6..b24d0e241e46e 100644 --- a/docs/lang/articles/contribution/doc_writing.md +++ b/docs/lang/articles/contribution/doc_writing.md @@ -67,8 +67,8 @@ To link to another section within the same article, you would use `[Return to ## We follow the best practices suggested by [Docusaurus](https://docusaurus.io/docs/docs-markdown-features#referencing-other-documents) to cross-reference other documents, so to link to sections in other articles, please use the following relative-path based syntax, which is docs-versioning and IDE/GitHub friendly: -- `[Return to Contribution guidelines](./contributor_guide.md)`: [Return to Contribution guidelines](./contributor_guide.md) -- `[Return to Getting Started](/#fields)`: [Return to Getting Started](/#fields) +- `Return to [Contribution guidelines](./contributor_guide.md)`: Return to [Contribution guidelines](./contributor_guide.md) +- `Return to [Getting Started](../get-started/index.md#fields)`: Return to [Getting Started](../get-started/index.md#fields) ## 4. 
Centered text blocks diff --git a/docs/lang/articles/faq.md b/docs/lang/articles/faq.md index 6a24d30e6e09b..b087f22d65024 100755 --- a/docs/lang/articles/faq.md +++ b/docs/lang/articles/faq.md @@ -28,7 +28,29 @@ One feasible solution is `field.from_numpy(ti.tools.imread('filename.png'))`. ### Can Taichi interact with **other Python packages** such as `matplotlib`? -Yes, Taichi supports various popular Python packages. Please check out [Interacting with other Python packages](/#interacting-with-other-python-packages). +Yes, Taichi supports many popular Python packages. Taichi provides helper functions such as `from_numpy` and `to_numpy` to transfer data between Taichi fields and NumPy arrays, so that you can also use your favorite Python packages (e.g., `numpy`, `pytorch`, `matplotlib`) together with Taichi as below: + +```python +import taichi as ti +pixels = ti.field(ti.f32, (1024, 512)) +import numpy as np +arr = np.random.rand(1024, 512) +pixels.from_numpy(arr) # load numpy data into taichi fields +import matplotlib.pyplot as plt +arr = pixels.to_numpy() # store taichi data into numpy arrays +plt.imshow(arr) +plt.show() +import matplotlib.cm as cm +cmap = cm.get_cmap('magma') +gui = ti.GUI('Color map') +while gui.running: + render_pixels() + arr = pixels.to_numpy() + gui.set_image(cmap(arr)) + gui.show() +``` + +Besides, you can also pass numpy arrays or torch tensors into a Taichi kernel as arguments. See [Interacting with external arrays](./basic/external.md) for more details. ### How do I declare a field with a **dynamic length**? 
From e4eea27694b064c6cee9c2d669df836433d030ea Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 28 Apr 2022 16:27:31 +0800 Subject: [PATCH 015/176] [error] [lang] Improved error messages for illegal slicing or indexing to ti.field (#4873) * [bug] Improved error messages for ilegal slicing or indexing to ti.field * Fixed test failures * Addressed code-review comments --- python/taichi/lang/field.py | 16 ++++++++++++++-- tests/python/test_field.py | 19 +++++++++++++++++++ tests/python/test_numpy.py | 7 ------- 3 files changed, 33 insertions(+), 9 deletions(-) diff --git a/python/taichi/lang/field.py b/python/taichi/lang/field.py index 165913fc5821e..3c66469ab8b3a 100644 --- a/python/taichi/lang/field.py +++ b/python/taichi/lang/field.py @@ -205,7 +205,10 @@ def _pad_key(self, key): key = () if not isinstance(key, (tuple, list)): key = (key, ) - assert len(key) == len(self.shape) + + if len(key) != len(self.shape): + raise AssertionError("Slicing is not supported on ti.field") + return key + ((0, ) * (_ti_core.get_max_num_indices() - len(key))) def _initialize_host_accessors(self): @@ -289,7 +292,16 @@ def __setitem__(self, key, value): @python_scope def __getitem__(self, key): self._initialize_host_accessors() - return self.host_accessors[0].getter(*self._pad_key(key)) + # Check for potential slicing behaviour + # for instance: x[0, :] + padded_key = self._pad_key(key) + for key in padded_key: + if not isinstance(key, int): + raise TypeError( + f"Detected illegal element of type: {type(key)}. " + f"Please be aware that slicing a ti.field is not supported so far." 
+ ) + return self.host_accessors[0].getter(*padded_key) def __repr__(self): # make interactive shell happy, prevent materialization diff --git a/tests/python/test_field.py b/tests/python/test_field.py index 06af7ed63a0ac..b2659416e4b15 100644 --- a/tests/python/test_field.py +++ b/tests/python/test_field.py @@ -189,3 +189,22 @@ def test_field_shape_0(): ti._lib.core.TaichiRuntimeError, match="Every dimension of a Taichi field should be positive"): x = ti.field(dtype=ti.f32, shape=0) + + +@test_utils.test() +def test_index_mismatch(): + with pytest.raises(AssertionError, + match="Slicing is not supported on ti.field"): + val = ti.field(ti.i32, shape=(1, 2, 3)) + val[0, 0] = 1 + + +@test_utils.test() +def test_invalid_slicing(): + with pytest.raises( + TypeError, + match= + "Detected illegal element of type: .*?\. Please be aware that slicing a ti.field is not supported so far." + ): + val = ti.field(ti.i32, shape=(2, 2)) + val[0, :] diff --git a/tests/python/test_numpy.py b/tests/python/test_numpy.py index aab8cb50dcb4a..a207706f98196 100644 --- a/tests/python/test_numpy.py +++ b/tests/python/test_numpy.py @@ -180,13 +180,6 @@ def test_numpy(a: ti.types.ndarray(), b: ti.types.ndarray()): assert b[i] == d[i] -@test_utils.test() -def test_index_mismatch(): - with pytest.raises(AssertionError): - val = ti.field(ti.i32, shape=(1, 2, 3)) - val[0, 0] = 1 - - @test_utils.test() def test_numpy_zero(): @ti.kernel From 8a3a2d7a397567eceab9c83b3242b60a9ad82b35 Mon Sep 17 00:00:00 2001 From: yekuang Date: Thu, 28 Apr 2022 21:14:38 +0800 Subject: [PATCH 016/176] [metal] Migrate runtime's MTLBuffer allocation to unified device API (#4865) * wip * migrate all buffers --- taichi/backends/metal/kernel_manager.cpp | 132 +++++++++++++---------- 1 file changed, 73 insertions(+), 59 deletions(-) diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index 9999b613cd01b..96863e178b973 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ 
b/taichi/backends/metal/kernel_manager.cpp @@ -531,6 +531,12 @@ class HostMetalCtxBlitter { std::string kernel_name_; }; +struct DevAllocWithInternals { + DeviceAllocation handle; + BufferMemoryView *mem{nullptr}; + MTLBuffer *buffer{nullptr}; +}; + } // namespace class KernelManager::Impl { @@ -562,69 +568,64 @@ class KernelManager::Impl { devalloc_mapper_ = make_res.mapper; } - global_tmps_mem_ = std::make_unique( - taichi_global_tmp_buffer_size, mem_pool_); + global_tmps_idevalloc_ = make_idevalloc(taichi_global_tmp_buffer_size); ActionRecorder::get_instance().record( "allocate_global_tmp_buffer", {ActionArg("size_in_bytes", (int64)taichi_global_tmp_buffer_size)}); - global_tmps_buffer_ = new_mtl_buffer_no_copy( - device_.get(), global_tmps_mem_->ptr(), global_tmps_mem_->size()); - TI_ASSERT(global_tmps_buffer_ != nullptr); - const size_t mem_pool_bytes = (config_->device_memory_GB * 1024 * 1024 * 1024ULL); - runtime_mem_ = std::make_unique( - compiled_runtime_module_.runtime_size + mem_pool_bytes, mem_pool_); - runtime_buffer_ = new_mtl_buffer_no_copy(device_.get(), runtime_mem_->ptr(), - runtime_mem_->size()); + runtime_idevalloc_ = + make_idevalloc(compiled_runtime_module_.runtime_size + mem_pool_bytes); buffer_meta_data_.runtime_buffer_size = compiled_runtime_module_.runtime_size; TI_DEBUG( "Metal runtime buffer size: {} bytes (sizeof(Runtime)={} " "memory_pool={})", - runtime_mem_->size(), compiled_runtime_module_.runtime_size, + runtime_idevalloc_.mem->size(), compiled_runtime_module_.runtime_size, mem_pool_bytes); ActionRecorder::get_instance().record( "allocate_runtime_buffer", - {ActionArg("runtime_buffer_size_in_bytes", (int64)runtime_mem_->size()), + {ActionArg("runtime_buffer_size_in_bytes", + (int64)runtime_idevalloc_.mem->size()), ActionArg("runtime_size_in_bytes", (int64)compiled_runtime_module_.runtime_size), ActionArg("memory_pool_size", (int64)mem_pool_bytes)}); - TI_ASSERT_INFO( - runtime_buffer_ != nullptr, - "Failed to allocate Metal runtime 
buffer, requested {} bytes", - runtime_mem_->size()); - print_mem_ = std::make_unique( - shaders::kMetalPrintAssertBufferSize, mem_pool_); - print_buffer_ = new_mtl_buffer_no_copy(device_.get(), print_mem_->ptr(), - print_mem_->size()); - TI_ASSERT(print_buffer_ != nullptr); + TI_ASSERT_INFO(runtime_idevalloc_.buffer != nullptr, + "Failed to allocate Metal runtime buffer"); + print_assert_idevalloc_ = + make_idevalloc(shaders::kMetalPrintAssertBufferSize); + TI_ASSERT(print_assert_idevalloc_.buffer != nullptr); init_runtime_buffer(compiled_runtime_module_, params.config->random_seed); clear_print_assert_buffer(); } + ~Impl() { + for (auto &rb : root_buffers_) { + rhi_device_->dealloc_memory(rb.idevalloc.handle); + } + rhi_device_->dealloc_memory(global_tmps_idevalloc_.handle); + rhi_device_->dealloc_memory(runtime_idevalloc_.handle); + rhi_device_->dealloc_memory(print_assert_idevalloc_.handle); + } + void add_compiled_snode_tree(const CompiledStructs &compiled_tree) { SNodesRootBuffer rtbuf{}; rtbuf.desc = BufferDescriptor::root(compiled_tree.root_id); if (compiled_tree.root_size > 0) { - rtbuf.mem = std::make_unique(compiled_tree.root_size, - mem_pool_); - rtbuf.buffer = new_mtl_buffer_no_copy(device_.get(), rtbuf.mem->ptr(), - rtbuf.mem->size()); - - TI_ASSERT(rtbuf.buffer != nullptr); - buffer_meta_data_.root_buffer_size += rtbuf.mem->size(); + rtbuf.idevalloc = make_idevalloc(compiled_tree.root_size); + const auto buf_sz = rtbuf.idevalloc.mem->size(); + buffer_meta_data_.root_buffer_size += buf_sz; TI_DEBUG("Metal root={} buffer_size={} bytes", compiled_tree.root_id, - rtbuf.mem->size()); + buf_sz); ActionRecorder::get_instance().record( "allocate_root_buffer", {ActionArg("root_id={}", (int64)compiled_tree.root_id), - ActionArg("size_in_bytes", (int64)rtbuf.mem->size())}); + ActionArg("size_in_bytes", (int64)buf_sz)}); } init_snode_tree_sparse_runtime(compiled_tree); @@ -671,11 +672,12 @@ class KernelManager::Impl { InputBuffersMap input_buffers; for (auto &rb 
: root_buffers_) { - input_buffers[rb.desc] = rb.buffer.get(); + input_buffers[rb.desc] = rb.idevalloc.buffer; } - input_buffers[BufferDescriptor::global_tmps()] = global_tmps_buffer_.get(); - input_buffers[BufferDescriptor::runtime()] = runtime_buffer_.get(); - input_buffers[BufferDescriptor::print()] = print_buffer_.get(); + input_buffers[BufferDescriptor::global_tmps()] = + global_tmps_idevalloc_.buffer; + input_buffers[BufferDescriptor::runtime()] = runtime_idevalloc_.buffer; + input_buffers[BufferDescriptor::print()] = print_assert_idevalloc_.buffer; if (ctx_blitter) { ctx_blitter->host_to_metal(); @@ -698,7 +700,7 @@ class KernelManager::Impl { } if (used_print_assert) { clear_print_assert_buffer(); - buffers_to_blit.push_back(print_buffer_.get()); + buffers_to_blit.push_back(print_assert_idevalloc_.buffer); } blit_buffers_and_sync(buffers_to_blit); @@ -730,7 +732,7 @@ class KernelManager::Impl { std::size_t get_snode_num_dynamically_allocated(SNode *snode) { // TODO(k-ye): Have a generic way for querying these sparse runtime stats. mac::ScopedAutoreleasePool pool; - blit_buffers_and_sync({runtime_buffer_.get()}); + blit_buffers_and_sync({runtime_idevalloc_.buffer}); auto *sna = dev_runtime_mirror_.snode_allocators + snode->id; // WHY -1? 
// @@ -747,7 +749,7 @@ class KernelManager::Impl { private: void init_runtime_buffer(const CompiledRuntimeModule &rtm_module, int random_seed) { - char *addr = runtime_mem_->ptr(); + char *addr = runtime_idevalloc_.mem->ptr(); // init rand_seeds std::default_random_engine generator((unsigned int)random_seed); std::uniform_int_distribution distr( @@ -760,7 +762,7 @@ class KernelManager::Impl { TI_DEBUG("Initialized random seeds size={}", rtm_module.rand_seeds_size); using namespace shaders; - addr = runtime_mem_->ptr() + rtm_module.rand_seeds_size; + addr = runtime_idevalloc_.mem->ptr() + rtm_module.rand_seeds_size; const char *const addr_begin = addr; dev_runtime_mirror_.snode_metas = (SNodeMeta *)addr; size_t addr_offset = sizeof(SNodeMeta) * kMaxNumSNodes; @@ -958,14 +960,14 @@ class KernelManager::Impl { } void mark_runtime_buffer_modified() { - did_modify_range(runtime_buffer_.get(), /*location=*/0, - runtime_mem_->size()); + did_modify_range(runtime_idevalloc_.buffer, /*location=*/0, + runtime_idevalloc_.mem->size()); } void clear_print_assert_buffer() { - const auto sz = print_mem_->size(); - std::memset(print_mem_->ptr(), 0, sz); - did_modify_range(print_buffer_.get(), /*location=*/0, sz); + const auto sz = print_assert_idevalloc_.mem->size(); + std::memset(print_assert_idevalloc_.mem->ptr(), 0, sz); + did_modify_range(print_assert_idevalloc_.buffer, /*location=*/0, sz); } void blit_buffers_and_sync( @@ -997,8 +999,8 @@ class KernelManager::Impl { void check_assertion_failure() { // TODO: Copy this to program's result_buffer, and let the Taichi runtime // handle the assertion failures uniformly. - auto *asst_rec = - reinterpret_cast(print_mem_->ptr()); + auto *asst_rec = reinterpret_cast( + print_assert_idevalloc_.mem->ptr()); if (!asst_rec->flag) { return; } @@ -1025,8 +1027,9 @@ class KernelManager::Impl { // // check_assertion_failure(); <-- Code below is skipped... 
// flush_print_buffers(); - // memset(print_mem_->ptr(), 0, print_mem_->size()); - // did_modify_range(print_buffer_); + // memset(print_assert_idevalloc_.mem->ptr(), 0, + // print_assert_idevalloc_.mem->size()); + // did_modify_range(print_assert_idevalloc_.buffer); // // As a workaround, we put [didModifyRange:] before sync, where the program // is still executing normally. @@ -1036,7 +1039,7 @@ class KernelManager::Impl { void flush_print_buffers() { auto *pa = reinterpret_cast( - print_mem_->ptr() + shaders::kMetalAssertBufferSize); + print_assert_idevalloc_.mem->ptr() + shaders::kMetalAssertBufferSize); const int used_sz = std::min(pa->next, shaders::kMetalPrintMsgsMaxQueueSize); TI_TRACE("Print buffer used bytes: {}", used_sz); @@ -1095,14 +1098,29 @@ class KernelManager::Impl { template inline T load_global_tmp(int offset) const { - return *reinterpret_cast((const char *)global_tmps_mem_->ptr() + - offset); + return *reinterpret_cast( + (const char *)global_tmps_idevalloc_.mem->ptr() + offset); + } + + DevAllocWithInternals make_idevalloc(size_t size) { + Device::AllocParams params; + // host_read|write honestly don't matter at this point, because the Metal + // backend only uses .managed mode. 
+ params.host_read = false; + params.host_write = false; + params.size = size; + params.usage = AllocUsage::Storage; + DevAllocWithInternals res; + res.handle = rhi_device_->allocate_memory(params); + auto bm = devalloc_mapper_->find(res.handle); + res.buffer = bm.buffer; + res.mem = bm.mem; + return res; } struct SNodesRootBuffer { BufferDescriptor desc; - std::unique_ptr mem{nullptr}; - nsobj_unique_ptr buffer{nullptr}; + DevAllocWithInternals idevalloc; }; CompileConfig *const config_; @@ -1121,14 +1139,10 @@ class KernelManager::Impl { nsobj_unique_ptr cur_command_buffer_{nullptr}; std::size_t command_buffer_id_{0}; std::vector root_buffers_; - std::unique_ptr global_tmps_mem_{nullptr}; - nsobj_unique_ptr global_tmps_buffer_{nullptr}; - std::unique_ptr runtime_mem_{nullptr}; - nsobj_unique_ptr runtime_buffer_{nullptr}; + DevAllocWithInternals global_tmps_idevalloc_; + DevAllocWithInternals runtime_idevalloc_; + DevAllocWithInternals print_assert_idevalloc_; int last_snode_id_used_in_runtime_{-1}; - // TODO: Rename these to 'print_assert_{mem|buffer}_' - std::unique_ptr print_mem_{nullptr}; - nsobj_unique_ptr print_buffer_{nullptr}; std::unordered_map> compiled_taichi_kernels_; PrintStringTable print_strtable_; From c2a44d889f65536749b84988a32a286d5f94eef8 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Fri, 29 Apr 2022 09:20:25 +0800 Subject: [PATCH 017/176] [Build] [refactor] Use keywords instead of plain target_link_libraries CMake (#4864) * Move LLVM Cmake to its own dir * Suppress warning from submodules * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use current source dir * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Separate Vulkan runtime files from codegen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use keywords instead of plain target_link_libraries * [pre-commit.ci] auto fixes 
from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CMakeLists.txt | 2 +- cmake/TaichiCore.cmake | 60 +++++++++++++++++++++++------------- cmake/TaichiExamples.cmake | 2 +- cmake/TaichiExportCore.cmake | 2 +- cmake/TaichiTests.cmake | 4 +-- 5 files changed, 43 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 75d71cb604609..d12963aee94b7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # The Taichi Programming Language #********************************************************************* -cmake_minimum_required(VERSION 3.12) +cmake_minimum_required(VERSION 3.13) project(taichi) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 8b740b5e74f82..7017364300774 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -246,12 +246,15 @@ add_library(${CORE_LIBRARY_NAME} OBJECT ${TAICHI_CORE_SOURCE}) if (APPLE) # Ask OS X to minic Linux dynamic linking behavior - target_link_libraries(${CORE_LIBRARY_NAME} "-undefined dynamic_lookup") + set_target_properties(${CORE_LIBRARY_NAME} + PROPERTIES INTERFACE_LINK_LIBRARIES "-undefined dynamic_lookup" + ) endif() include_directories(${CMAKE_SOURCE_DIR}) include_directories(external/include) include_directories(external/spdlog/include) +include_directories(external/SPIRV-Tools/include) include_directories(external/PicoSHA2) if (TI_WITH_OPENGL) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/glad/include) @@ -272,7 +275,7 @@ if (TI_WITH_OPENGL OR TI_WITH_VULKAN AND NOT ANDROID AND NOT TI_EMSCRIPTENED) message("Building with GLFW") add_subdirectory(external/glfw) - target_link_libraries(${LIBRARY_NAME} glfw) + target_link_libraries(${LIBRARY_NAME} PRIVATE glfw) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/glfw/include) endif() @@ -314,16 +317,16 @@ if(TI_WITH_LLVM) ipo Analysis ) - 
target_link_libraries(${LIBRARY_NAME} ${llvm_libs}) + target_link_libraries(${LIBRARY_NAME} PRIVATE ${llvm_libs}) if (APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") llvm_map_components_to_libnames(llvm_aarch64_libs AArch64) - target_link_libraries(${LIBRARY_NAME} ${llvm_aarch64_libs}) + target_link_libraries(${LIBRARY_NAME} PRIVATE ${llvm_aarch64_libs}) endif() if (TI_WITH_CUDA) llvm_map_components_to_libnames(llvm_ptx_libs NVPTX) - target_link_libraries(${LIBRARY_NAME} ${llvm_ptx_libs}) + target_link_libraries(${LIBRARY_NAME} PRIVATE ${llvm_ptx_libs}) endif() endif() @@ -337,7 +340,7 @@ if (TI_WITH_CUDA_TOOLKIT) include_directories($ENV{CUDA_TOOLKIT_ROOT_DIR}/include) link_directories($ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64) #libraries for cuda kernel profiler CuptiToolkit - target_link_libraries(${CORE_LIBRARY_NAME} cupti nvperf_host) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE cupti nvperf_host) endif() else() message(STATUS "TI_WITH_CUDA_TOOLKIT = OFF") @@ -347,13 +350,13 @@ if (TI_WITH_OPENGL) set(SPIRV_CROSS_CLI false) add_subdirectory(external/SPIRV-Cross) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Cross) - target_link_libraries(${CORE_LIBRARY_NAME} spirv-cross-glsl spirv-cross-core) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE spirv-cross-glsl spirv-cross-core) endif() if (TI_WITH_DX11) set(SPIRV_CROSS_CLI false) #target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Cross) - target_link_libraries(${CORE_LIBRARY_NAME} spirv-cross-hlsl spirv-cross-core) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE spirv-cross-hlsl spirv-cross-core) endif() # SPIR-V codegen is always there, regardless of Vulkan @@ -362,7 +365,7 @@ set(SPIRV-Headers_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/external/SPIRV-Headers) add_subdirectory(external/SPIRV-Tools) # NOTE: SPIRV-Tools-opt must come before SPIRV-Tools # https://github.com/KhronosGroup/SPIRV-Tools/issues/1569#issuecomment-390250792 
-target_link_libraries(${CORE_LIBRARY_NAME} SPIRV-Tools-opt ${SPIRV_TOOLS}) +target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE SPIRV-Tools-opt ${SPIRV_TOOLS}) if (TI_WITH_VULKAN) include_directories(SYSTEM external/Vulkan-Headers/include) @@ -377,7 +380,7 @@ if (TI_WITH_VULKAN) # shaderc requires pthread set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) - target_link_libraries(${CORE_LIBRARY_NAME} Threads::Threads) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE Threads::Threads) endif() if (APPLE) @@ -390,44 +393,57 @@ if (TI_WITH_VULKAN) endif() add_subdirectory(taichi/runtime/vulkan) - target_link_libraries(${CORE_LIBRARY_NAME} vulkan_runtime) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE vulkan_runtime) endif () # Optional dependencies if (APPLE) - target_link_libraries(${CORE_LIBRARY_NAME} "-framework Cocoa -framework Metal") + find_library(COCOA Cocoa) + if (NOT COCOA) + message(FATAL_ERROR "Cocoa not found") + endif() + find_library(METAL Metal) + if (NOT METAL) + message(FATAL_ERROR "Metal not found") + endif() + target_link_libraries(${CORE_LIBRARY_NAME} + PRIVATE + ${COCOA} + ${METAL} + ) endif () if (NOT WIN32) # Android has a custom toolchain so pthread is not available and should # link against other libraries as well for logcat and internal features. 
if (ANDROID) - target_link_libraries(${CORE_LIBRARY_NAME} android log) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE android log) else() - target_link_libraries(${CORE_LIBRARY_NAME} pthread stdc++) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE pthread stdc++) endif() if (UNIX AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Linux") # OS X or BSD else() # Linux - target_link_libraries(${CORE_LIBRARY_NAME} stdc++fs X11) - target_link_libraries(${CORE_LIBRARY_NAME} -static-libgcc -static-libstdc++) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE stdc++fs X11) + + target_link_options(${CORE_LIBRARY_NAME} PRIVATE -static-libgcc -static-libstdc++) if (${CMAKE_HOST_SYSTEM_PROCESSOR} STREQUAL "x86_64") # Avoid glibc dependencies if (TI_WITH_VULKAN) - target_link_libraries(${CORE_LIBRARY_NAME} -Wl,--wrap=log2f) + target_link_options(${CORE_LIBRARY_NAME} PRIVATE -Wl,--wrap=log2f) else() # Enforce compatibility with manylinux2014 - target_link_libraries(${CORE_LIBRARY_NAME} -Wl,--wrap=log2f -Wl,--wrap=exp2 -Wl,--wrap=log2 -Wl,--wrap=logf -Wl,--wrap=powf -Wl,--wrap=exp -Wl,--wrap=log -Wl,--wrap=pow) + target_link_options(${CORE_LIBRARY_NAME} PRIVATE -Wl,--wrap=log2f -Wl,--wrap=exp2 -Wl,--wrap=log2 -Wl,--wrap=logf -Wl,--wrap=powf -Wl,--wrap=exp -Wl,--wrap=log -Wl,--wrap=pow) endif() endif() endif() else() # windows - target_link_libraries(${CORE_LIBRARY_NAME} Winmm) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE Winmm) endif () foreach (source IN LISTS TAICHI_CORE_SOURCE) @@ -455,7 +471,7 @@ if(NOT TI_EMSCRIPTENED) endif() # It is actually possible to link with an OBJECT library # https://cmake.org/cmake/help/v3.13/command/target_link_libraries.html?highlight=target_link_libraries#linking-object-libraries - target_link_libraries(${CORE_WITH_PYBIND_LIBRARY_NAME} PUBLIC ${CORE_LIBRARY_NAME}) + target_link_libraries(${CORE_WITH_PYBIND_LIBRARY_NAME} PRIVATE ${CORE_LIBRARY_NAME}) # These commands should apply to the DLL that is loaded from python, not the OBJECT 
library. if (MSVC) @@ -475,7 +491,7 @@ endif() if(TI_EMSCRIPTENED) set(CORE_WITH_EMBIND_LIBRARY_NAME taichi) add_executable(${CORE_WITH_EMBIND_LIBRARY_NAME} ${TAICHI_EMBIND_SOURCE}) - target_link_libraries(${CORE_WITH_EMBIND_LIBRARY_NAME} PUBLIC ${CORE_LIBRARY_NAME}) + target_link_libraries(${CORE_WITH_EMBIND_LIBRARY_NAME} PRIVATE ${CORE_LIBRARY_NAME}) target_compile_options(${CORE_WITH_EMBIND_LIBRARY_NAME} PRIVATE "-Oz") # target_compile_options(${CORE_LIBRARY_NAME} PRIVATE "-Oz") set_target_properties(${CORE_LIBRARY_NAME} PROPERTIES LINK_FLAGS "-s ERROR_ON_UNDEFINED_SYMBOLS=0 -s ASSERTIONS=1") @@ -495,7 +511,7 @@ else() include_directories(external/glfw/include) add_library(imgui ${IMGUI_DIR}/backends/imgui_impl_glfw.cpp ${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp ${IMGUI_DIR}/imgui.cpp ${IMGUI_DIR}/imgui_draw.cpp ${IMGUI_DIR}/imgui_tables.cpp ${IMGUI_DIR}/imgui_widgets.cpp) endif() - target_link_libraries(${CORE_LIBRARY_NAME} imgui) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE imgui) endif() diff --git a/cmake/TaichiExamples.cmake b/cmake/TaichiExamples.cmake index 742119013d623..3ab7d0084a1f3 100644 --- a/cmake/TaichiExamples.cmake +++ b/cmake/TaichiExamples.cmake @@ -25,6 +25,6 @@ if (WIN32) set_target_properties(${EXAMPLES_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${EXAMPLES_OUTPUT_DIR}) set_target_properties(${EXAMPLES_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${EXAMPLES_OUTPUT_DIR}) endif() -target_link_libraries(${EXAMPLES_NAME} taichi_isolated_core) +target_link_libraries(${EXAMPLES_NAME} PRIVATE taichi_isolated_core) endif() diff --git a/cmake/TaichiExportCore.cmake b/cmake/TaichiExportCore.cmake index 463a028711e8a..b4f646fb92e2b 100644 --- a/cmake/TaichiExportCore.cmake +++ b/cmake/TaichiExportCore.cmake @@ -3,6 +3,6 @@ cmake_minimum_required(VERSION 3.0) set(TAICHI_EXPORT_CORE_NAME taichi_export_core) add_library(${TAICHI_EXPORT_CORE_NAME} SHARED) -target_link_libraries(${TAICHI_EXPORT_CORE_NAME} taichi_isolated_core) 
+target_link_libraries(${TAICHI_EXPORT_CORE_NAME} PRIVATE taichi_isolated_core) set_target_properties(${TAICHI_EXPORT_CORE_NAME} PROPERTIES CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 2b6da207c82ff..5f6adb66d1c07 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -35,7 +35,7 @@ if (WIN32) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_MINSIZEREL ${TESTS_OUTPUT_DIR}) set_target_properties(${TESTS_NAME} PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${TESTS_OUTPUT_DIR}) endif() -target_link_libraries(${TESTS_NAME} taichi_isolated_core) -target_link_libraries(${TESTS_NAME} gtest_main) +target_link_libraries(${TESTS_NAME} PRIVATE taichi_isolated_core) +target_link_libraries(${TESTS_NAME} PRIVATE gtest_main) add_test(NAME ${TESTS_NAME} COMMAND ${TESTS_NAME}) From b9bb482e312eb9017dd5de4e3209aafebebcee7d Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 29 Apr 2022 11:27:53 +0800 Subject: [PATCH 018/176] [bug] Fixed type promotion rule for bit-shift operations (#4884) * [bug] Fixed type promotion rule for shift operations * removed debug info * Addressed review comments --- taichi/ir/frontend_ir.cpp | 5 +++ taichi/ir/stmt_op_types.h | 5 +++ taichi/transforms/type_check.cpp | 45 ++++++++++++++++++++++++- tests/cpp/ir/ir_type_promotion_test.cpp | 31 +++++++++++++++++ 4 files changed, 85 insertions(+), 1 deletion(-) create mode 100644 tests/cpp/ir/ir_type_promotion_test.cpp diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index bedd7f5fb5fd5..627ff3592cf26 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -197,6 +197,11 @@ void BinaryOpExpression::type_check(CompileConfig *config) { ret_type = PrimitiveType::i32; return; } + if (is_shift_op(type)) { + ret_type = lhs_type; + return; + } + if (type == BinaryOpType::truediv) { auto default_fp = config->default_fp; if (!is_real(lhs_type)) { 
diff --git a/taichi/ir/stmt_op_types.h b/taichi/ir/stmt_op_types.h index 8c43c2626fcec..5e04525930a3f 100644 --- a/taichi/ir/stmt_op_types.h +++ b/taichi/ir/stmt_op_types.h @@ -47,6 +47,11 @@ inline bool binary_is_logical(BinaryOpType t) { std::string binary_op_type_name(BinaryOpType type); +inline bool is_shift_op(BinaryOpType type) { + return type == BinaryOpType::bit_sar || type == BinaryOpType::bit_shl || + type == BinaryOpType::bit_shr; +} + inline bool is_comparison(BinaryOpType type) { return starts_with(binary_op_type_name(type), "cmp"); } diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index cf1d0b3080702..5eb1ba0e95804 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -244,6 +244,29 @@ class TypeCheck : public IRVisitor { return stmt; } + void insert_shift_op_assertion_before(Stmt *stmt, Stmt *lhs, Stmt *rhs) { + int rhs_limit = data_type_bits(lhs->ret_type); + auto const_stmt = + Stmt::make(TypedConstant(rhs->ret_type, rhs_limit)); + auto cond_stmt = + Stmt::make(BinaryOpType::cmp_le, rhs, const_stmt.get()); + + std::string msg = + "Detected overflow for bit_shift_op with rhs = %d, exceeding limit of " + "%d."; + std::vector args = {rhs, const_stmt.get()}; + auto assert_stmt = + Stmt::make(cond_stmt.get(), msg, std::move(args)); + + const_stmt->accept(this); + cond_stmt->accept(this); + assert_stmt->accept(this); + + stmt->insert_before_me(std::move(const_stmt)); + stmt->insert_before_me(std::move(cond_stmt)); + stmt->insert_before_me(std::move(assert_stmt)); + } + void cast(Stmt *&val, DataType dt) { auto cast_stmt = insert_type_cast_after(val, val, dt); val = cast_stmt; @@ -287,7 +310,27 @@ class TypeCheck : public IRVisitor { }; stmt->lhs = promote_custom_int_type(stmt, stmt->lhs); stmt->rhs = promote_custom_int_type(stmt, stmt->rhs); - auto ret_type = promoted_type(stmt->lhs->ret_type, stmt->rhs->ret_type); + + DataType ret_type; + if (is_shift_op(stmt->op_type)) { + // shift_ops 
does not follow the same type promotion rule as numerical + // ops numerical ops: u8 + i32 = i32 shift_ops: u8 << i32 = u8 + // (return dtype follows that of the lhs) + // + // In the above example, while truncating rhs(i32) to u8 risks an + // overflow, the runtime value of rhs is very likely less than 8 + // (otherwise meaningless). Nevertheless, we insert an AssertStmt here + // to warn user of this potential overflow. + ret_type = stmt->lhs->ret_type; + + // Insert AssertStmt + if (config_.debug) { + insert_shift_op_assertion_before(stmt, stmt->lhs, stmt->rhs); + } + } else { + ret_type = promoted_type(stmt->lhs->ret_type, stmt->rhs->ret_type); + } + if (ret_type != stmt->lhs->ret_type) { // promote lhs auto cast_stmt = insert_type_cast_before(stmt, stmt->lhs, ret_type); diff --git a/tests/cpp/ir/ir_type_promotion_test.cpp b/tests/cpp/ir/ir_type_promotion_test.cpp new file mode 100644 index 0000000000000..6f444763d7bd0 --- /dev/null +++ b/tests/cpp/ir/ir_type_promotion_test.cpp @@ -0,0 +1,31 @@ +#include "gtest/gtest.h" + +#include "taichi/ir/statements.h" +#include "taichi/ir/ir_builder.h" +#include "taichi/ir/transforms.h" +#include "tests/cpp/program/test_program.h" + +namespace taichi { +namespace lang { + +TEST(IRTypePromotionTest, ShiftOp) { + IRBuilder builder; + + // (u8)x << (i32)1 -> (u8)res + auto *lhs = builder.create_arg_load(0, get_data_type(), false); + auto *res = builder.create_shl(lhs, builder.get_int32(1)); + auto ir = builder.extract_ir(); + + ASSERT_TRUE(ir->is()); + auto *ir_block = ir->as(); + irpass::type_check(ir_block, CompileConfig()); + + EXPECT_TRUE(ir_block->statements.back()->is()); + auto *binary_stmt = ir_block->statements.back()->as(); + + auto ret_type = binary_stmt->ret_type; + EXPECT_TRUE(ret_type->is_primitive(PrimitiveTypeID::u8)); +} + +} // namespace lang +} // namespace taichi From de08a2734a1645d927f8a7316475374deb36caa0 Mon Sep 17 00:00:00 2001 From: yekuang Date: Fri, 29 Apr 2022 21:04:48 +0800 Subject: [PATCH 
019/176] [aot] [vulkan] Expose symbols for AOT (#4879) * [aot] [vulkan] Expose symbols for AOT * weird windows * hide to make win happy * fix --- taichi/backends/device.h | 2 +- taichi/backends/vulkan/vulkan_device.cpp | 37 +++++++++++++++--------- taichi/backends/vulkan/vulkan_device.h | 13 +++++---- taichi/backends/vulkan/vulkan_loader.h | 2 +- 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/taichi/backends/device.h b/taichi/backends/device.h index 098a2a9645baf..175919a6ae1fc 100644 --- a/taichi/backends/device.h +++ b/taichi/backends/device.h @@ -584,7 +584,7 @@ struct RasterParams { std::vector blending{}; }; -class GraphicsDevice : public Device { +class TI_DLL_EXPORT GraphicsDevice : public Device { public: virtual std::unique_ptr create_raster_pipeline( const std::vector &src, diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index f627f2b01fb7b..b171a79927a64 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -1224,6 +1224,15 @@ vkapi::IVkCommandBuffer VulkanCommandList::finalize() { return buffer_; } +struct VulkanDevice::ThreadLocalStreams { + unordered_map> map; +}; + +VulkanDevice::VulkanDevice() + : compute_streams_(std::make_unique()), + graphics_streams_(std::make_unique()) { +} + void VulkanDevice::init_vulkan_structs(Params ¶ms) { instance_ = params.instance; device_ = params.device; @@ -1479,33 +1488,33 @@ void VulkanDevice::memcpy_internal(DevicePtr dst, Stream *VulkanDevice::get_compute_stream() { auto tid = std::this_thread::get_id(); - auto iter = compute_stream_.find(tid); - if (iter == compute_stream_.end()) { - compute_stream_[tid] = std::make_unique( + auto &stream_map = compute_streams_->map; + auto iter = stream_map.find(tid); + if (iter == stream_map.end()) { + stream_map[tid] = std::make_unique( *this, compute_queue_, compute_queue_family_index_); - return compute_stream_.at(tid).get(); - } else { - return 
iter->second.get(); + return stream_map.at(tid).get(); } + return iter->second.get(); } Stream *VulkanDevice::get_graphics_stream() { auto tid = std::this_thread::get_id(); - auto iter = graphics_stream_.find(tid); - if (iter == graphics_stream_.end()) { - graphics_stream_[tid] = std::make_unique( + auto &stream_map = graphics_streams_->map; + auto iter = stream_map.find(tid); + if (iter == stream_map.end()) { + stream_map[tid] = std::make_unique( *this, graphics_queue_, graphics_queue_family_index_); - return graphics_stream_.at(tid).get(); - } else { - return iter->second.get(); + return stream_map.at(tid).get(); } + return iter->second.get(); } void VulkanDevice::wait_idle() { - for (auto &[tid, stream] : compute_stream_) { + for (auto &[tid, stream] : compute_streams_->map) { stream->command_sync(); } - for (auto &[tid, stream] : graphics_stream_) { + for (auto &[tid, stream] : graphics_streams_->map) { stream->command_sync(); } } diff --git a/taichi/backends/vulkan/vulkan_device.h b/taichi/backends/vulkan/vulkan_device.h index 3fd3a3a75ce10..5a4b9eeeefff8 100644 --- a/taichi/backends/vulkan/vulkan_device.h +++ b/taichi/backends/vulkan/vulkan_device.h @@ -516,9 +516,7 @@ class VulkanStream : public Stream { std::vector submitted_cmdbuffers_; }; -class VulkanDevice : public GraphicsDevice { - friend VulkanSurface; - +class TI_DLL_EXPORT VulkanDevice : public GraphicsDevice { public: struct Params { VkInstance instance; @@ -530,6 +528,7 @@ class VulkanDevice : public GraphicsDevice { uint32_t graphics_queue_family_index; }; + VulkanDevice(); void init_vulkan_structs(Params ¶ms); ~VulkanDevice() override; @@ -620,6 +619,8 @@ class VulkanDevice : public GraphicsDevice { vkapi::IVkDescriptorSet alloc_desc_set(vkapi::IVkDescriptorSetLayout layout); private: + friend VulkanSurface; + void create_vma_allocator(); void new_descriptor_pool(); @@ -635,9 +636,9 @@ class VulkanDevice : public GraphicsDevice { VkQueue graphics_queue_; uint32_t graphics_queue_family_index_; - 
unordered_map> compute_stream_; - unordered_map> - graphics_stream_; + struct ThreadLocalStreams; + std::unique_ptr compute_streams_{nullptr}; + std::unique_ptr graphics_streams_{nullptr}; // Memory allocation struct AllocationInternal { diff --git a/taichi/backends/vulkan/vulkan_loader.h b/taichi/backends/vulkan/vulkan_loader.h index 408b0262d71c0..3b5c0e1ad7242 100644 --- a/taichi/backends/vulkan/vulkan_loader.h +++ b/taichi/backends/vulkan/vulkan_loader.h @@ -10,7 +10,7 @@ namespace taichi { namespace lang { namespace vulkan { -class VulkanLoader { +class TI_DLL_EXPORT VulkanLoader { public: static VulkanLoader &instance() { static VulkanLoader instance; From b3446e905a98ebd469e6a18c04d3b4ca957ec922 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Sat, 30 Apr 2022 10:57:03 +0800 Subject: [PATCH 020/176] [Build] [refactor] Define Cmake OpenGL runtime target (#4887) * Move LLVM Cmake to its own dir * Suppress warning from submodules * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use current source dir * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Separate Vulkan runtime files from codegen * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use keywords instead of plain target_link_libraries * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Separate opengl runtime files from backend * Remove some warnings * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Minor * Add glfw include * Add link to taichi core * Update taichi/program/extension.cpp Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: yekuang --- cmake/TaichiCore.cmake | 5 +++++ taichi/backends/opengl/codegen_opengl.cpp | 4 ++-- taichi/backends/opengl/codegen_opengl.h | 2 +- 
taichi/backends/opengl/opengl_device.cpp | 2 +- taichi/backends/opengl/opengl_program.h | 4 ++-- taichi/backends/opengl/shaders/indirect.glsl.h | 2 +- taichi/backends/opengl/struct_opengl.h | 2 +- taichi/codegen/spirv/spirv_ir_builder.cpp | 2 +- taichi/program/extension.cpp | 1 - taichi/python/export_misc.cpp | 2 +- taichi/runtime/opengl/CMakeLists.txt | 16 ++++++++++++++++ .../{backends => runtime}/opengl/opengl_api.cpp | 2 +- taichi/{backends => runtime}/opengl/opengl_api.h | 2 +- .../opengl/opengl_kernel_launcher.h | 0 .../opengl/shaders/runtime.h | 0 15 files changed, 33 insertions(+), 13 deletions(-) create mode 100644 taichi/runtime/opengl/CMakeLists.txt rename taichi/{backends => runtime}/opengl/opengl_api.cpp (99%) rename taichi/{backends => runtime}/opengl/opengl_api.h (98%) rename taichi/{backends => runtime}/opengl/opengl_kernel_launcher.h (100%) rename taichi/{backends => runtime}/opengl/shaders/runtime.h (100%) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 7017364300774..3096832be40c7 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -251,9 +251,11 @@ if (APPLE) ) endif() +# TODO: replace these includes per target basis include_directories(${CMAKE_SOURCE_DIR}) include_directories(external/include) include_directories(external/spdlog/include) +include_directories(external/glad/include) include_directories(external/SPIRV-Tools/include) include_directories(external/PicoSHA2) if (TI_WITH_OPENGL) @@ -351,6 +353,9 @@ if (TI_WITH_OPENGL) add_subdirectory(external/SPIRV-Cross) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Cross) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE spirv-cross-glsl spirv-cross-core) + + add_subdirectory(taichi/runtime/opengl) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE opengl_runtime) endif() if (TI_WITH_DX11) diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index 173d8cd1c8aee..7f232b26315b5 100644 --- 
a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -3,7 +3,7 @@ #include -#include "taichi/backends/opengl/opengl_api.h" +#include "taichi/runtime/opengl/opengl_api.h" #include "taichi/backends/opengl/opengl_data_types.h" #include "taichi/backends/opengl/opengl_kernel_util.h" #include "taichi/ir/ir.h" @@ -31,7 +31,7 @@ namespace shaders { #define TI_INSIDE_OPENGL_CODEGEN #include "taichi/backends/opengl/shaders/atomics_macro_f32.glsl.h" -#include "taichi/backends/opengl/shaders/runtime.h" +#include "taichi/runtime/opengl/shaders/runtime.h" #include "taichi/backends/opengl/shaders/random.glsl.h" #include "taichi/backends/opengl/shaders/fast_pow.glsl.h" #include "taichi/backends/opengl/shaders/print.glsl.h" diff --git a/taichi/backends/opengl/codegen_opengl.h b/taichi/backends/opengl/codegen_opengl.h index a630f291b3959..8e4dba27f3977 100644 --- a/taichi/backends/opengl/codegen_opengl.h +++ b/taichi/backends/opengl/codegen_opengl.h @@ -3,7 +3,7 @@ #include "taichi/inc/constants.h" #include "taichi/lang_util.h" #include "taichi/backends/opengl/struct_opengl.h" -#include "taichi/backends/opengl/opengl_api.h" +#include "taichi/runtime/opengl/opengl_api.h" #include #include diff --git a/taichi/backends/opengl/opengl_device.cpp b/taichi/backends/opengl/opengl_device.cpp index 2eda864bab310..b399cbb12e9ab 100644 --- a/taichi/backends/opengl/opengl_device.cpp +++ b/taichi/backends/opengl/opengl_device.cpp @@ -1,5 +1,5 @@ #include "opengl_device.h" -#include "opengl_api.h" +#include "taichi/runtime/opengl/opengl_api.h" namespace taichi { namespace lang { diff --git a/taichi/backends/opengl/opengl_program.h b/taichi/backends/opengl/opengl_program.h index d6f8e9e120307..1d495f7654a3a 100644 --- a/taichi/backends/opengl/opengl_program.h +++ b/taichi/backends/opengl/opengl_program.h @@ -2,8 +2,8 @@ #include "taichi/backends/opengl/struct_opengl.h" -#include "taichi/backends/opengl/opengl_kernel_launcher.h" -#include 
"taichi/backends/opengl/opengl_api.h" +#include "taichi/runtime/opengl/opengl_kernel_launcher.h" +#include "taichi/runtime/opengl/opengl_api.h" #include "taichi/backends/opengl/codegen_opengl.h" #include "taichi/system/memory_pool.h" diff --git a/taichi/backends/opengl/shaders/indirect.glsl.h b/taichi/backends/opengl/shaders/indirect.glsl.h index 04b215a7ef50d..09170c998c5de 100644 --- a/taichi/backends/opengl/shaders/indirect.glsl.h +++ b/taichi/backends/opengl/shaders/indirect.glsl.h @@ -5,7 +5,7 @@ "#version 430 core\nprecision highp float;\n" #define TI_INSIDE_OPENGL_CODEGEN #define TI_OPENGL_NESTED_INCLUDE -#include "taichi/backends/opengl/shaders/runtime.h" +#include "taichi/runtime/opengl/shaders/runtime.h" #undef TI_OPENGL_NESTED_INCLUDE #undef TI_INSIDE_OPENGL_CODEGEN STR( diff --git a/taichi/backends/opengl/struct_opengl.h b/taichi/backends/opengl/struct_opengl.h index 475bf97d52270..2d9faad8a6d5d 100644 --- a/taichi/backends/opengl/struct_opengl.h +++ b/taichi/backends/opengl/struct_opengl.h @@ -1,7 +1,7 @@ // Codegen for the hierarchical data structure #pragma once -#include "taichi/backends/opengl/opengl_kernel_launcher.h" +#include "taichi/runtime/opengl/opengl_kernel_launcher.h" #include "taichi/backends/opengl/opengl_data_types.h" #include "taichi/ir/snode.h" diff --git a/taichi/codegen/spirv/spirv_ir_builder.cpp b/taichi/codegen/spirv/spirv_ir_builder.cpp index b5e58359d915d..f3386e98b0891 100644 --- a/taichi/codegen/spirv/spirv_ir_builder.cpp +++ b/taichi/codegen/spirv/spirv_ir_builder.cpp @@ -1206,7 +1206,7 @@ void IRBuilder::init_random_function(Value global_tmp_) { store_var(rand_z_, _521288629u); store_var(rand_w_, _88675123u); - enum spv::Op add_op = spv::OpIAdd; + // enum spv::Op add_op = spv::OpIAdd; bool use_atomic_increment = false; // use atomic increment for DX API to avoid error X3694 diff --git a/taichi/program/extension.cpp b/taichi/program/extension.cpp index 0db7e436fca61..cff4620c5e913 100644 --- a/taichi/program/extension.cpp +++ 
b/taichi/program/extension.cpp @@ -1,5 +1,4 @@ #include "extension.h" -//#include "taichi/backends/opengl/opengl_api.h" #include #include diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index a1abb389ab782..2ede85517e551 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -4,7 +4,7 @@ *******************************************************************************/ #include "taichi/backends/metal/api.h" -#include "taichi/backends/opengl/opengl_api.h" +#include "taichi/runtime/opengl/opengl_api.h" #include "taichi/runtime/vulkan/runtime.h" #include "taichi/backends/dx/dx_api.h" #include "taichi/common/core.h" diff --git a/taichi/runtime/opengl/CMakeLists.txt b/taichi/runtime/opengl/CMakeLists.txt new file mode 100644 index 0000000000000..e736722bade47 --- /dev/null +++ b/taichi/runtime/opengl/CMakeLists.txt @@ -0,0 +1,16 @@ +# ./taichi/runtime/opengl/CMakeLists.txt + +add_library(opengl_runtime) +target_sources(opengl_runtime + PRIVATE + opengl_api.cpp + ) +target_include_directories(opengl_runtime + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/shaders + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/glad/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/glfw/include + ) diff --git a/taichi/backends/opengl/opengl_api.cpp b/taichi/runtime/opengl/opengl_api.cpp similarity index 99% rename from taichi/backends/opengl/opengl_api.cpp rename to taichi/runtime/opengl/opengl_api.cpp index 9e870814e75f0..cbb05d0340b56 100644 --- a/taichi/backends/opengl/opengl_api.cpp +++ b/taichi/runtime/opengl/opengl_api.cpp @@ -4,7 +4,7 @@ #include "taichi/backends/opengl/opengl_kernel_util.h" #include "taichi/backends/opengl/opengl_utils.h" -#include "taichi/backends/opengl/shaders/runtime.h" +#include "taichi/runtime/opengl/shaders/runtime.h" #include "taichi/ir/transforms.h" #include "taichi/program/kernel.h" #include 
"taichi/program/program.h" diff --git a/taichi/backends/opengl/opengl_api.h b/taichi/runtime/opengl/opengl_api.h similarity index 98% rename from taichi/backends/opengl/opengl_api.h rename to taichi/runtime/opengl/opengl_api.h index 6f4ee6e27a8f5..7fb5fe203e1be 100644 --- a/taichi/backends/opengl/opengl_api.h +++ b/taichi/runtime/opengl/opengl_api.h @@ -5,7 +5,7 @@ #include #include "taichi/backends/device.h" -#include "taichi/backends/opengl/opengl_kernel_launcher.h" +#include "taichi/runtime/opengl/opengl_kernel_launcher.h" #include "taichi/backends/opengl/opengl_kernel_util.h" #include "taichi/common/core.h" #include "taichi/ir/offloaded_task_type.h" diff --git a/taichi/backends/opengl/opengl_kernel_launcher.h b/taichi/runtime/opengl/opengl_kernel_launcher.h similarity index 100% rename from taichi/backends/opengl/opengl_kernel_launcher.h rename to taichi/runtime/opengl/opengl_kernel_launcher.h diff --git a/taichi/backends/opengl/shaders/runtime.h b/taichi/runtime/opengl/shaders/runtime.h similarity index 100% rename from taichi/backends/opengl/shaders/runtime.h rename to taichi/runtime/opengl/shaders/runtime.h From 6d223da3f9990cdf47d81470df998b85bfdcb438 Mon Sep 17 00:00:00 2001 From: Gabriel H <64807734+ghuau-innopeak@users.noreply.github.com> Date: Fri, 29 Apr 2022 21:02:05 -0700 Subject: [PATCH 021/176] [vulkan] Fix typo for waitSemaphoreCount (#4892) --- taichi/backends/vulkan/vulkan_device.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index b171a79927a64..97a1954d8b884 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -2345,9 +2345,9 @@ void VulkanSurface::present_image( VkPresentInfoKHR presentInfo{}; presentInfo.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR; - presentInfo.waitSemaphoreCount = 1; + presentInfo.waitSemaphoreCount = vk_wait_semaphores.size(); presentInfo.pWaitSemaphores = 
vk_wait_semaphores.data(); - presentInfo.swapchainCount = vk_wait_semaphores.size(); + presentInfo.swapchainCount = 1; presentInfo.pSwapchains = &swapchain_; presentInfo.pImageIndices = &image_index_; presentInfo.pResults = nullptr; From e583b2d4afc78407ff52281a1abd356801721c8a Mon Sep 17 00:00:00 2001 From: Bob Cao Date: Fri, 29 Apr 2022 23:09:38 -0700 Subject: [PATCH 022/176] [vulkan] Add new VMA vulkan functions. (#4893) * Add new VMA vulkan functions. * fix --- taichi/backends/vulkan/vulkan_device.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index 97a1954d8b884..83c278a0bd058 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -2007,6 +2007,10 @@ void VulkanDevice::create_vma_allocator() { vk_vma_functions.vkGetPhysicalDeviceMemoryProperties2KHR = PFN_vkGetPhysicalDeviceMemoryProperties2KHR(vkGetInstanceProcAddr( volkGetLoadedInstance(), "vkGetPhysicalDeviceMemoryProperties2KHR")); + vk_vma_functions.vkGetDeviceBufferMemoryRequirements = + table.vkGetDeviceBufferMemoryRequirements; + vk_vma_functions.vkGetDeviceImageMemoryRequirements = + table.vkGetDeviceImageMemoryRequirements; allocatorInfo.pVulkanFunctions = &vk_vma_functions; From eedf20d90d8d0f8d9132fbb8a8b3d8182fbb2a93 Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Tue, 3 May 2022 09:36:52 +0800 Subject: [PATCH 023/176] Use Ninja generator on Windows and skip generator test (#4896) --- setup.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 53752a499083f..902a1e30e7dfb 100644 --- a/setup.py +++ b/setup.py @@ -118,8 +118,12 @@ def get_cmake_args(): cfg = 'MinSizeRel' else: cfg = None + build_options = [] if cfg: - sys.argv[2:2] = ['--build-type', cfg] + build_options.extend(['--build-type', cfg]) + if sys.platform == 'win32': + build_options.extend(['-G', 'Ninja', '--skip-generator-test']) + sys.argv[2:2] = 
build_options cmake_args += [ f'-DTI_VERSION_MAJOR={TI_VERSION_MAJOR}', From 3d6fcb058f7a533c471d872e6e9c4cc85c940373 Mon Sep 17 00:00:00 2001 From: 0xzhang <33616362+0xzhang@users.noreply.github.com> Date: Wed, 4 May 2022 16:21:46 +0800 Subject: [PATCH 024/176] [Lang] [test] Copy-free interaction between Taichi and PaddlePaddle (#4886) * Implement has_paddle(), to_paddle_type() and update to_taichi_type in python\taichi\lang\util.py * Implement get_paddle_callbacks() and update get_function_body(), match_ext_arr() in python\taichi\lang\kernel_impl.py * Add test test_io_devices() in tests\python\test_torch_io.py * Implement callback for CPU-GPU/GPU-CPU copy between Taichi and Paddle * Partially implement to_torch()/from_torch() according to PyTorch in Taichi * Fix paddle.Tensor's backend check * Update tests for from_paddle()/to_paddle() * [doc] Update Global settings with TI_ENABLE_PADDLE * Fix to avoid fail when only import paddle * [test] Fix the expected list alphabetically * [doc] Add info about paddle.Tensor * [ci] Try to test paddle's GPU version * Fix the usage of paddle.ones * Fix f16 tests for paddle * Fixed supported archs for tests of paddle * Use 1 thread run tests for torch and paddle * Fix linux test * Fix windows test * Unify the name to Paddle * Add tests for paddle * Replace usage of device to place for paddle * Paddle's GPU develop package on Linux import error --- .github/workflows/scripts/unix_test.sh | 26 +- .github/workflows/scripts/win_test.ps1 | 12 +- ci/scripts/ubuntu_build_test.sh | 3 +- ci/scripts/ubuntu_build_test_cpu.sh | 6 +- ci/windows/win_build_test.ps1 | 2 +- docs/lang/articles/basic/external.md | 63 +++-- docs/lang/articles/misc/global_settings.md | 1 + python/taichi/lang/field.py | 42 ++- python/taichi/lang/kernel_impl.py | 56 +++- python/taichi/lang/matrix.py | 28 +- python/taichi/lang/mesh.py | 9 + python/taichi/lang/struct.py | 27 ++ python/taichi/lang/util.py | 73 ++++- tests/python/test_api.py | 17 +- 
tests/python/test_f16.py | 46 +++- .../python/test_get_external_tensor_shape.py | 20 +- tests/python/test_paddle_io.py | 250 ++++++++++++++++++ 17 files changed, 622 insertions(+), 59 deletions(-) create mode 100644 tests/python/test_paddle_io.py diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh index 93f5f4edbe298..e7294f8387696 100755 --- a/.github/workflows/scripts/unix_test.sh +++ b/.github/workflows/scripts/unix_test.sh @@ -22,11 +22,16 @@ python3 -m pip install dist/*.whl if [ -z "$GPU_TEST" ]; then python3 -m pip install -r requirements_test.txt python3 -m pip install "torch; python_version < '3.10'" + # Paddle's develop package doesn't support CI's MACOS machine at present + if [[ $OSTYPE == "linux-"* ]]; then + python3 -m pip install "paddlepaddle==0.0.0; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html + fi else ## Only GPU machine uses system python. export PATH=$PATH:$HOME/.local/bin # pip will skip packages if already installed python3 -m pip install -r requirements_test.txt + # Import Paddle's develop GPU package will occur error `Illegal Instruction`. fi ti diagnose ti changelog @@ -38,27 +43,32 @@ TI_LIB_DIR="$TI_PATH/_lib/runtime" ./build/taichi_cpp_tests if [ -z "$GPU_TEST" ]; then if [[ $PLATFORM == *"m1"* ]]; then # Split per arch to avoid flaky test - python3 tests/run_tests.py -vr2 -t4 -k "not torch" -a cpu + python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cpu # Run metal and vulkan separately so that they don't use M1 chip simultaneously. 
- python3 tests/run_tests.py -vr2 -t4 -k "not torch" -a vulkan - python3 tests/run_tests.py -vr2 -t2 -k "not torch" -a metal + python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a vulkan + python3 tests/run_tests.py -vr2 -t2 -k "not torch and not paddle" -a metal python3 tests/run_tests.py -vr2 -t1 -k "torch" -a "$TI_WANTED_ARCHS" else - python3 tests/run_tests.py -vr2 -t4 -a "$TI_WANTED_ARCHS" + # Fail fast, give priority to the error-prone tests + if [[ $OSTYPE == "linux-"* ]]; then + python3 tests/run_tests.py -vr2 -t1 -k "paddle" -a "$TI_WANTED_ARCHS" + fi + python3 tests/run_tests.py -vr2 -t4 -k "not paddle" -a "$TI_WANTED_ARCHS" fi else # Split per arch to increase parallelism for linux GPU tests if [[ $TI_WANTED_ARCHS == *"cuda"* ]]; then - python3 tests/run_tests.py -vr2 -t4 -k "not torch" -a cuda + python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cuda fi if [[ $TI_WANTED_ARCHS == *"cpu"* ]]; then - python3 tests/run_tests.py -vr2 -t8 -k "not torch" -a cpu + python3 tests/run_tests.py -vr2 -t8 -k "not torch and not paddle" -a cpu fi if [[ $TI_WANTED_ARCHS == *"vulkan"* ]]; then - python3 tests/run_tests.py -vr2 -t8 -k "not torch" -a vulkan + python3 tests/run_tests.py -vr2 -t8 -k "not torch and not paddle" -a vulkan fi if [[ $TI_WANTED_ARCHS == *"opengl"* ]]; then - python3 tests/run_tests.py -vr2 -t4 -k "not torch" -a opengl + python3 tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a opengl fi python3 tests/run_tests.py -vr2 -t1 -k "torch" -a "$TI_WANTED_ARCHS" + # Paddle's paddle.fluid.core.Tensor._ptr() is only available on develop branch, and CUDA version on linux will get error `Illegal Instruction` fi diff --git a/.github/workflows/scripts/win_test.ps1 b/.github/workflows/scripts/win_test.ps1 index 40ab79826257d..7b6c3f5da21fc 100644 --- a/.github/workflows/scripts/win_test.ps1 +++ b/.github/workflows/scripts/win_test.ps1 @@ -9,20 +9,24 @@ pip install -r requirements_test.txt # TODO relax this when torch 
supports 3.10 if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { pip install "torch==1.10.1+cu113; python_version < '3.10'" -f https://download.pytorch.org/whl/cu113/torch_stable.html + pip install "paddlepaddle-gpu==0.0.0.post112; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/windows/gpu/develop.html } else { pip install "torch; python_version < '3.10'" + pip install "paddlepaddle==0.0.0; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/windows/cpu-mkl-avx/develop.html } +# Fail fast, give priority to the error-prone tests +python tests/run_tests.py -vr2 -t1 -k "paddle" -a "$env:TI_WANTED_ARCHS" if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { - python tests/run_tests.py -vr2 -t4 -k "not torch" -a cuda + python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cuda if (-not $?) { exit 1 } } if ("$env:TI_WANTED_ARCHS".Contains("cpu")) { - python tests/run_tests.py -vr2 -t6 -k "not torch" -a cpu + python tests/run_tests.py -vr2 -t6 -k "not torch and not paddle" -a cpu if (-not $?) { exit 1 } } if ("$env:TI_WANTED_ARCHS".Contains("opengl")) { - python tests/run_tests.py -vr2 -t4 -k "not torch" -a opengl + python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a opengl if (-not $?) { exit 1 } } -python tests/run_tests.py -vr2 -t2 -k "torch" -a "$env:TI_WANTED_ARCHS" +python tests/run_tests.py -vr2 -t1 -k "torch" -a "$env:TI_WANTED_ARCHS" if (-not $?) 
{ exit 1 } diff --git a/ci/scripts/ubuntu_build_test.sh b/ci/scripts/ubuntu_build_test.sh index ed5acc79c99a8..b68b2218be2c7 100755 --- a/ci/scripts/ubuntu_build_test.sh +++ b/ci/scripts/ubuntu_build_test.sh @@ -31,5 +31,6 @@ export TI_IN_DOCKER=true # Run tests ti diagnose -python tests/run_tests.py -vr2 -t2 -k "not ndarray and not torch" +# Paddle's paddle.fluid.core.Tensor._ptr() is only available on develop branch, and CUDA version on linux will get error `Illegal Instruction` +python tests/run_tests.py -vr2 -t2 -k "not ndarray and not torch and not paddle" python tests/run_tests.py -vr2 -t1 -k "ndarray or torch" diff --git a/ci/scripts/ubuntu_build_test_cpu.sh b/ci/scripts/ubuntu_build_test_cpu.sh index feba31b80e874..abfafbd2e2bb3 100755 --- a/ci/scripts/ubuntu_build_test_cpu.sh +++ b/ci/scripts/ubuntu_build_test_cpu.sh @@ -22,6 +22,8 @@ git clone --recursive https://github.com/taichi-dev/taichi --branch=master cd taichi git checkout $SHA python3 -m pip install -r requirements_dev.txt -i http://repo.taichigraphics.com/repository/pypi/simple --trusted-host repo.taichigraphics.com +# Paddle's paddle.fluid.core.Tensor._ptr() is only available on develop branch +python3 -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN:BOOL=OFF -DTI_WITH_CUDA:BOOL=OFF -DTI_WITH_OPENGL:BOOL=OFF" python3 setup.py install # Add Docker specific ENV @@ -29,5 +31,5 @@ export TI_IN_DOCKER=true # Run tests ti diagnose -python tests/run_tests.py -vr2 -t2 -k "not ndarray and not torch" -python tests/run_tests.py -vr2 -t1 -k "ndarray or torch" +python tests/run_tests.py -vr2 -t2 -k "not ndarray and not torch and not paddle" +python tests/run_tests.py -vr2 -t1 -k "ndarray or torch or paddle" diff --git a/ci/windows/win_build_test.ps1 b/ci/windows/win_build_test.ps1 index eea924a966dce..7d374b45510ea 100644 --- a/ci/windows/win_build_test.ps1 +++ b/ci/windows/win_build_test.ps1 @@ -59,5 +59,5 @@ python 
setup.py develop WriteInfo("Build finished") WriteInfo("Testing Taichi") -python tests/run_tests.py -vr2 -t2 -k "not torch" -a cpu +python tests/run_tests.py -vr2 -t2 -k "not torch and not paddle" -a cpu WriteInfo("Test finished") diff --git a/docs/lang/articles/basic/external.md b/docs/lang/articles/basic/external.md index e343671bf1730..b9774c05a5555 100644 --- a/docs/lang/articles/basic/external.md +++ b/docs/lang/articles/basic/external.md @@ -4,17 +4,12 @@ sidebar_position: 5 # Interacting with external arrays -Although Taichi fields are mainly used in Taichi-scope, in some cases -efficiently manipulating Taichi field data in Python-scope could also be +Although Taichi fields are mainly used in Taichi-scope, in some cases efficiently manipulating Taichi field data in Python-scope could also be helpful. -We provide various interfaces to copy the data between Taichi fields and -external arrays. External arrays refer to NumPy arrays or PyTorch tensors. -Let's take a look at the most common usage: interacting with NumPy arrays. +We provide various interfaces to copy the data between Taichi fields and external arrays. External arrays refer to NumPy arrays, PyTorch tensors or Paddle Tensors. Let's take a look at the most common usage: interacting with NumPy arrays. -**Export data in Taichi fields to NumPy arrays** via `to_numpy()`. This -allows us to export computation results to other Python packages that -support NumPy, e.g. `matplotlib`. +**Export data in Taichi fields to NumPy arrays** via `to_numpy()`. This allows us to export computation results to other Python packages that support NumPy, e.g. `matplotlib`. ```python {8} @ti.kernel @@ -28,8 +23,7 @@ x_np = x.to_numpy() print(x_np) # np.array([0, 2, 4, 6]) ``` -**Import data from NumPy arrays to Taichi fields** via `from_numpy()`. -This allows us to initialize Taichi fields via NumPy arrays: +**Import data from NumPy arrays to Taichi fields** via `from_numpy()`. 
This allows us to initialize Taichi fields via NumPy arrays: ```python {3} x = ti.field(ti.f32, 4) @@ -59,7 +53,28 @@ print(x[1]) # 7 print(x[2]) # 3 print(x[3]) # 5 ``` +And Taichi fields also can be **imported from and exported to Paddle tensors**: + +```python +@ti.kernel +def my_kernel(): + for i in x: + x[i] = i * 2 + +x = ti.field(ti.f32, 4) +my_kernel() +x_paddle = x.to_paddle() +print(x_paddle) # paddle.Tensor([0, 2, 4, 6]) + +x.from_numpy(paddle.to_tensor([1, 7, 3, 5])) +print(x[0]) # 1 +print(x[1]) # 7 +print(x[2]) # 3 +print(x[3]) # 5 +``` + When calling `to_torch()`, specify the PyTorch device where the Taichi field is exported using the `device` argument: + ```python x = ti.field(ti.f32, 4) x.fill(3.0) @@ -67,13 +82,13 @@ x_torch = x.to_torch(device="cuda:0") print(x_torch.device) # device(type='cuda', index=0) ``` +For Paddle, specify the device by `paddle.CPUPlace()` or `paddle.CUDAPlace(n)` where n is an optional ID, default is 0. + ## External array shapes -Shapes of Taichi fields and those of corresponding NumPy arrays or PyTorch tensors are closely -connected via the following rules: +Shapes of Taichi fields and those of corresponding NumPy arrays, PyTorch tensors or Paddle Tensors are closely connected via the following rules: -- For scalar fields, **the shape of NumPy array or PyTorch tensor equals the shape of - the Taichi field**: +- For scalar fields, **the shape of NumPy array, PyTorch tensor or Paddle Tensor equals the shape of the Taichi field** ```python field = ti.field(ti.i32, shape=(256, 512)) @@ -85,8 +100,7 @@ array.shape # (256, 512) field.from_numpy(array) # the input array must be of shape (256, 512) ``` -- For vector fields, if the vector is `n`-D, then **the shape of NumPy - array or Pytorch tensor should be** `(*field_shape, vector_n)`: +- For vector fields, if the vector is `n`-D, then **the shape of NumPy array, PyTorch tensor or Paddle Tensor should be** `(*field_shape, vector_n)`: ```python field = ti.Vector.field(3, 
ti.i32, shape=(256, 512)) @@ -99,8 +113,7 @@ array.shape # (256, 512, 3) field.from_numpy(array) # the input array must be of shape (256, 512, 3) ``` -- For matrix fields, if the matrix is `n`-by-`m` (`n x m`), then **the shape of NumPy -array or Pytorch Tensor should be** `(*field_shape, matrix_n, matrix_m)`: +- For matrix fields, if the matrix is `n`-by-`m` (`n x m`), then **the shape of NumPy array, PyTorch tensor or Paddle Tensor should be** `(*field_shape, matrix_n, matrix_m)`: ```python field = ti.Matrix.field(3, 4, ti.i32, shape=(256, 512)) @@ -114,8 +127,7 @@ array.shape # (256, 512, 3, 4) field.from_numpy(array) # the input array must be of shape (256, 512, 3, 4) ``` -- For struct fields, the external array will be exported as **a dictionary of NumPy arrays or PyTorch tensors** with keys -being struct member names and values being struct member arrays. Nested structs will be exported as nested dictionaries: +- For struct fields, the external array will be exported as **a dictionary of NumPy arrays, PyTorch tensors or Paddle Tensors** with keys being struct member names and values being struct member arrays. Nested structs will be exported as nested dictionaries: ```python field = ti.Struct.field({'a': ti.i32, 'b': ti.types.vector(float, 3)} shape=(256, 512)) @@ -131,8 +143,7 @@ field.from_numpy(array_dict) # the input array must have the same keys as the fi ## Using external arrays as Taichi kernel arguments -Use type hint `ti.types.ndarray()` to pass external arrays as kernel -arguments. For example: +Use type hint `ti.types.ndarray()` to pass external arrays as kernel arguments. For example: ```python {10} import taichi as ti @@ -163,8 +174,7 @@ for i in range(n): assert a[i, j] == i * j + i + j ``` -Note that the elements in an external array must be indexed using a single square bracket. 
-This contrasts with a Taichi vector or matrix field where field and matrix indices are indexed separately: +Note that the elements in an external array must be indexed using a single square bracket. This contrasts with a Taichi vector or matrix field where field and matrix indices are indexed separately: ```python @ti.kernel def copy_vector(x: ti.template(), y: ti.types.ndarray()): @@ -174,9 +184,8 @@ def copy_vector(x: ti.template(), y: ti.types.ndarray()): # y[i][j][k] = x[i, j][k] incorrect # y[i, j][k] = x[i, j][k] incorrect ``` -Also, external arrays in a Taichi kernel are indexed using its **physical memory layout**. For PyTorch users, -this implies that the PyTorch tensor [needs to be made contiguous](https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html) -before being passed into a Taichi kernel: +Also, external arrays in a Taichi kernel are indexed using its **physical memory layout**. For PyTorch users, this implies that the PyTorch tensor [needs to be made contiguous](https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html) before being passed into a Taichi kernel: + ```python @ti.kernel def copy_scalar(x: ti.template(), y: ti.types.ndarray()): diff --git a/docs/lang/articles/misc/global_settings.md b/docs/lang/articles/misc/global_settings.md index ef87dcdc886b0..ba501fb455b4c 100644 --- a/docs/lang/articles/misc/global_settings.md +++ b/docs/lang/articles/misc/global_settings.md @@ -29,6 +29,7 @@ sidebar_position: 7 - To start program in debug mode: `ti.init(debug=True)` or `ti debug your_script.py`. - To disable importing torch on start up: `export TI_ENABLE_TORCH=0`. +- To disable importing paddle on start up: `export TI_ENABLE_PADDLE=0`. 
## Logging diff --git a/python/taichi/lang/field.py b/python/taichi/lang/field.py index 3c66469ab8b3a..5213ebd99c9af 100644 --- a/python/taichi/lang/field.py +++ b/python/taichi/lang/field.py @@ -1,6 +1,7 @@ import taichi.lang from taichi._lib import core as _ti_core -from taichi.lang.util import python_scope, to_numpy_type, to_pytorch_type +from taichi.lang.util import (python_scope, to_numpy_type, to_paddle_type, + to_pytorch_type) class Field: @@ -132,6 +133,18 @@ def to_torch(self, device=None): """ raise NotImplementedError() + @python_scope + def to_paddle(self, place=None): + """Converts `self` to a paddle tensor. + + Args: + place (paddle.CPUPlace()/CUDAPlace(n), optional): The desired place of returned tensor. + + Returns: + paddle.Tensor: The result paddle tensor. + """ + raise NotImplementedError() + @python_scope def from_numpy(self, arr): """Loads all elements from a numpy array. @@ -154,6 +167,17 @@ def from_torch(self, arr): """ self.from_numpy(arr.contiguous()) + @python_scope + def from_paddle(self, arr): + """Loads all elements from a paddle tensor. + + The shape of the paddle tensor needs to be the same as `self`. + + Args: + arr (paddle.Tensor): The source paddle tensor. + """ + self.from_numpy(arr) + @python_scope def copy_from(self, other): """Copies all elements from another field. @@ -267,6 +291,22 @@ def to_torch(self, device=None): taichi.lang.runtime_ops.sync() return arr + @python_scope + def to_paddle(self, place=None): + """Converts this field to a `paddle.Tensor`. + """ + import paddle # pylint: disable=C0415 + + # pylint: disable=E1101 + # paddle.empty() doesn't support argument `place`` + arr = paddle.to_tensor(paddle.zeros(self.shape, + to_paddle_type(self.dtype)), + place=place) + from taichi._kernels import tensor_to_ext_arr # pylint: disable=C0415 + tensor_to_ext_arr(self, arr) + taichi.lang.runtime_ops.sync() + return arr + @python_scope def from_numpy(self, arr): """Copies the data from a `numpy.ndarray` into this field. 
diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 58d02087c064b..0b820c11eac04 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -20,7 +20,7 @@ from taichi.lang.kernel_arguments import KernelArgument from taichi.lang.matrix import Matrix, MatrixType from taichi.lang.shell import _shell_pop_print, oinspect -from taichi.lang.util import has_pytorch, to_taichi_type +from taichi.lang.util import has_paddle, has_pytorch, to_taichi_type from taichi.types import (ndarray_type, primitive_types, sparse_matrix_builder, template) @@ -29,6 +29,9 @@ if has_pytorch(): import torch +if has_paddle(): + import paddle + def func(fn, is_real_function=False): """Marks a function as callable in Taichi-scope. @@ -574,6 +577,42 @@ def call_back(): callbacks.append(get_call_back(v, gpu_v)) return tmp, callbacks + def get_paddle_callbacks(self, v, has_pp): + callbacks = [] + + def get_call_back(u, v): + def call_back(): + u.copy_(v, False) + + return call_back + + assert has_pp + assert isinstance(v, paddle.Tensor) + + tmp = v.value().get_tensor() + taichi_arch = self.runtime.prog.config.arch + + if v.place.is_gpu_place(): + # External tensor on cuda + if taichi_arch != _ti_core.Arch.cuda: + # copy data back to cpu + host_v = v.cpu() + tmp = host_v.value().get_tensor() + callbacks.append(get_call_back(v, host_v)) + elif v.place.is_cpu_place(): + # External tensor on cpu + if taichi_arch == _ti_core.Arch.cuda: + gpu_v = v.cuda() + tmp = gpu_v.value().get_tensor() + callbacks.append(get_call_back(v, gpu_v)) + else: + # Paddle do support many other backends like XPU, NPU, MLU, IPU. + raise TaichiRuntimeError( + f"Taichi do not support backend {v.place} that Paddle support." 
+ ) + + return tmp, callbacks + def get_function_body(self, t_kernel): # The actual function body def func__(*args): @@ -585,6 +624,7 @@ def func__(*args): callbacks = [] has_external_arrays = False has_torch = has_pytorch() + has_pp = has_paddle() actual_argument_slot = 0 launch_ctx = t_kernel.make_launch_context() @@ -618,6 +658,8 @@ def func__(*args): ndarray_type.NdarrayType) and (self.match_ext_arr(v)): has_external_arrays = True is_numpy = isinstance(v, np.ndarray) + is_torch = isinstance(v, + torch.Tensor) if has_torch else False if is_numpy: tmp = np.ascontiguousarray(v) # Purpose: DO NOT GC |tmp|! @@ -625,7 +667,7 @@ def func__(*args): launch_ctx.set_arg_external_array_with_shape( actual_argument_slot, int(tmp.ctypes.data), tmp.nbytes, v.shape) - else: + elif is_torch: is_ndarray = False tmp, torch_callbacks = self.get_torch_callbacks( v, has_torch, is_ndarray) @@ -633,6 +675,14 @@ def func__(*args): launch_ctx.set_arg_external_array_with_shape( actual_argument_slot, int(tmp.data_ptr()), tmp.element_size() * tmp.nelement(), v.shape) + else: + # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch + tmp, paddle_callbacks = self.get_paddle_callbacks( + v, has_pp) + callbacks += paddle_callbacks + launch_ctx.set_arg_external_array_with_shape( + actual_argument_slot, int(tmp._ptr()), + v.element_size() * v.size, v.shape) elif isinstance(needed, MatrixType): if id(needed.dtype) in primitive_types.real_type_ids: @@ -725,6 +775,8 @@ def match_ext_arr(v): has_array = isinstance(v, np.ndarray) if not has_array and has_pytorch(): has_array = isinstance(v, torch.Tensor) + if not has_array and has_paddle(): + has_array = isinstance(v, paddle.Tensor) return has_array def ensure_compiled(self, *args): diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 5f1f22db25489..d0835835c6481 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -14,8 +14,8 @@ from taichi.lang.field import Field, 
ScalarField, SNodeHostAccess from taichi.lang.swizzle_generator import SwizzleGenerator from taichi.lang.util import (cook_dtype, in_python_scope, python_scope, - taichi_scope, to_numpy_type, to_pytorch_type, - warning) + taichi_scope, to_numpy_type, to_paddle_type, + to_pytorch_type, warning) from taichi.types import primitive_types from taichi.types.compound_types import CompoundType @@ -1455,6 +1455,30 @@ def to_torch(self, device=None, keep_dims=False): runtime_ops.sync() return arr + def to_paddle(self, place=None, keep_dims=False): + """Converts the field instance to a Paddle tensor. + + Args: + place (paddle.CPUPlace()/CUDAPlace(n), optional): The desired place of returned tensor. + keep_dims (bool, optional): Whether to keep the dimension after conversion. + See :meth:`~taichi.lang.field.MatrixField.to_numpy` for more detailed explanation. + + Returns: + paddle.Tensor: The result paddle tensor. + """ + import paddle # pylint: disable=C0415 + as_vector = self.m == 1 and not keep_dims + shape_ext = (self.n, ) if as_vector else (self.n, self.m) + # pylint: disable=E1101 + # paddle.empty() doesn't support argument `place`` + arr = paddle.to_tensor(paddle.empty(self.shape + shape_ext, + to_paddle_type(self.dtype)), + place=place) + from taichi._kernels import matrix_to_ext_arr # pylint: disable=C0415 + matrix_to_ext_arr(self, arr, as_vector) + runtime_ops.sync() + return arr + @python_scope def from_numpy(self, arr): """Copies an `numpy.ndarray` into this field. 
diff --git a/python/taichi/lang/mesh.py b/python/taichi/lang/mesh.py index 6b9e474b1ae75..08a72305c0c6a 100644 --- a/python/taichi/lang/mesh.py +++ b/python/taichi/lang/mesh.py @@ -171,6 +171,11 @@ def from_torch(self, array_dict): for k, v in self._items: v.from_torch(array_dict[k]) + @python_scope + def from_paddle(self, array_dict): + for k, v in self._items: + v.from_paddle(array_dict[k]) + @python_scope def to_numpy(self): return {k: v.to_numpy() for k, v in self._items} @@ -179,6 +184,10 @@ def to_numpy(self): def to_torch(self, device=None): return {k: v.to_torch(device=device) for k, v in self._items} + @python_scope + def to_paddle(self, place=None): + return {k: v.to_paddle(place=place) for k, v in self._items} + @python_scope def __len__(self): return _ti_core.get_num_elements(self.mesh.mesh_ptr, self._type) diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py index 3574e80eed57d..b2965f3f4c7bc 100644 --- a/python/taichi/lang/struct.py +++ b/python/taichi/lang/struct.py @@ -504,6 +504,17 @@ def from_torch(self, array_dict): for k, v in self._items: v.from_torch(array_dict[k]) + @python_scope + def from_paddle(self, array_dict): + """Copies the data from a set of `paddle.Tensor` into this field. + + The argument `array_dict` must be a dictionay-like object, it + contains all the keys in this field and the copying process + between corresponding items can be performed. + """ + for k, v in self._items: + v.from_paddle(array_dict[k]) + @python_scope def to_numpy(self): """Converts the Struct field instance to a dictionary of NumPy arrays. @@ -531,6 +542,22 @@ def to_torch(self, device=None): """ return {k: v.to_torch(device=device) for k, v in self._items} + @python_scope + def to_paddle(self, place=None): + """Converts the Struct field instance to a dictionary of Paddle tensors. + + The dictionary may be nested when converting nested structs. 
+ + Args: + place (paddle.CPUPlace()/CUDAPlace(n), optional): The + desired place of returned tensor. + + Returns: + Dict[str, Union[paddle.Tensor, Dict]]: The result + Paddle tensor. + """ + return {k: v.to_paddle(place=place) for k, v in self._items} + @python_scope def __setitem__(self, indices, element): self._initialize_host_accessors() diff --git a/python/taichi/lang/util.py b/python/taichi/lang/util.py index feb18831d91cb..c2562c31d782c 100644 --- a/python/taichi/lang/util.py +++ b/python/taichi/lang/util.py @@ -10,6 +10,7 @@ u16, u32, u64) _has_pytorch = False +_has_paddle = False _env_torch = os.environ.get('TI_ENABLE_TORCH', '1') if not _env_torch or int(_env_torch): @@ -19,6 +20,14 @@ except: pass +_env_paddle = os.environ.get('TI_ENABLE_PADDLE', '1') +if not _env_paddle or int(_env_paddle): + try: + import paddle + _has_paddle = True + except: + pass + def has_pytorch(): """Whether has pytorch in the current Python environment. @@ -30,6 +39,15 @@ def has_pytorch(): return _has_pytorch +def has_paddle(): + """Whether has paddle in the current Python environment. + + Returns: + bool: True if has paddle else False. + """ + return _has_paddle + + from distutils.spawn import find_executable # Taichi itself uses llvm-10.0.0 to compile. @@ -127,8 +145,40 @@ def to_pytorch_type(dt): assert False +def to_paddle_type(dt): + """Convert taichi data type to its counterpart in paddle. + + Args: + dt (DataType): The desired data type to convert. + + Returns: + DataType: The counterpart data type in paddle. 
+ + """ + if dt == f32: + return paddle.float32 + if dt == f64: + return paddle.float64 + if dt == i32: + return paddle.int32 + if dt == i64: + return paddle.int64 + if dt == i8: + return paddle.int8 + if dt == i16: + return paddle.int16 + if dt == u8: + return paddle.uint8 + if dt == f16: + return paddle.float16 + if dt in (u16, u32, u64): + raise RuntimeError( + f'Paddle doesn\'t support {dt.to_string()} data type.') + assert False + + def to_taichi_type(dt): - """Convert numpy or torch data type to its counterpart in taichi. + """Convert numpy or torch or paddle data type to its counterpart in taichi. Args: dt (DataType): The desired data type to convert. @@ -185,6 +235,27 @@ def to_taichi_type(dt): raise RuntimeError( f'PyTorch doesn\'t support {dt.to_string()} data type.') + if has_paddle(): + if dt == paddle.float32: + return f32 + if dt == paddle.float64: + return f64 + if dt == paddle.int32: + return i32 + if dt == paddle.int64: + return i64 + if dt == paddle.int8: + return i8 + if dt == paddle.int16: + return i16 + if dt == paddle.uint8: + return u8 + if dt == paddle.float16: + return f16 + if dt in (u16, u32, u64): + raise RuntimeError( + f'Paddle doesn\'t support {dt.to_string()} data type.') + raise AssertionError(f"Unknown type {dt}") diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 6f6dc5a765901..df555dc01512b 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -60,8 +60,8 @@ def _get_expected_matrix_apis(): 'wasm', 'x64', 'x86_64', 'zero' ] user_api[ti.Field] = [ - 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_torch', 'parent', - 'shape', 'snode', 'to_numpy', 'to_torch' + 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 'from_torch', + 'parent', 'shape', 'snode', 'to_numpy', 'to_paddle', 'to_torch' ] user_api[ti.FieldsBuilder] = [ 'bit_array', 'bit_struct', 'bitmasked', 'deactivate_all', 'dense', @@ -77,8 +77,9 @@ def _get_expected_matrix_apis(): ] user_api[ti.Matrix] = 
_get_expected_matrix_apis() user_api[ti.MatrixField] = [ - 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_torch', - 'get_scalar_field', 'parent', 'shape', 'snode', 'to_numpy', 'to_torch' + 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 'from_torch', + 'get_scalar_field', 'parent', 'shape', 'snode', 'to_numpy', 'to_paddle', + 'to_torch' ] user_api[ti.MatrixNdarray] = [ 'copy_from', 'element_shape', 'fill', 'from_numpy', 'to_numpy' @@ -89,17 +90,17 @@ def _get_expected_matrix_apis(): 'dynamic', 'lazy_grad', 'parent', 'place', 'pointer', 'shape' ] user_api[ti.ScalarField] = [ - 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_torch', 'parent', - 'shape', 'snode', 'to_numpy', 'to_torch' + 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 'from_torch', + 'parent', 'shape', 'snode', 'to_numpy', 'to_paddle', 'to_torch' ] user_api[ti.ScalarNdarray] = [ 'copy_from', 'element_shape', 'fill', 'from_numpy', 'to_numpy' ] user_api[ti.Struct] = ['field', 'fill', 'items', 'keys', 'to_dict'] user_api[ti.StructField] = [ - 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_torch', + 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 'from_torch', 'get_member_field', 'keys', 'parent', 'shape', 'snode', 'to_numpy', - 'to_torch' + 'to_paddle', 'to_torch' ] user_api[ti.VectorNdarray] = [ 'copy_from', 'element_shape', 'fill', 'from_numpy', 'to_numpy' diff --git a/tests/python/test_f16.py b/tests/python/test_f16.py index 5f933aea8fd81..2cf9728b65b7d 100644 --- a/tests/python/test_f16.py +++ b/tests/python/test_f16.py @@ -2,7 +2,7 @@ import numpy as np import pytest -from taichi.lang.util import has_pytorch +from taichi.lang.util import has_paddle, has_pytorch import taichi as ti from tests import test_utils @@ -101,6 +101,50 @@ def init(): assert (z[i] == i * 3) +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan) +def test_to_paddle(): + import paddle + n = 16 + x = 
ti.field(ti.f16, shape=n) + + @ti.kernel + def init(): + for i in x: + x[i] = i * 2 + + init() + y = x.to_paddle() + # paddle's operator slice doesn't have kernel for f16, so cast to f32 + y = y.cast(paddle.float32) + print(y) + for i in range(n): + assert (y[i] == 2 * i) + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan) +def test_from_paddle(): + import paddle + n = 16 + y = ti.field(dtype=ti.f16, shape=n) + # paddle doesn't have arrange implementation for float16 so we need to create other type first and then convert + x = paddle.arange(0, n).cast(paddle.float16) + y.from_paddle(x) + + @ti.kernel + def init(): + for i in y: + y[i] = 3 * i + + init() + z = y.to_paddle() + # paddle's operator slice doesn't have kernel for f16, so cast to f32 + z = z.cast(paddle.float32) + for i in range(n): + assert (z[i] == i * 3) + + @test_utils.test(arch=archs_support_f16) def test_binary_op(): dtype = ti.f16 diff --git a/tests/python/test_get_external_tensor_shape.py b/tests/python/test_get_external_tensor_shape.py index bc0efc11f3818..5f131c42849fc 100644 --- a/tests/python/test_get_external_tensor_shape.py +++ b/tests/python/test_get_external_tensor_shape.py @@ -1,6 +1,6 @@ import numpy as np import pytest -from taichi.lang.util import has_pytorch +from taichi.lang.util import has_paddle, has_pytorch import taichi as ti from tests import test_utils @@ -8,6 +8,9 @@ if has_pytorch(): import torch +if has_paddle(): + import paddle + @pytest.mark.parametrize('size', [[1], [1, 2, 3, 4]]) @test_utils.test() @@ -70,3 +73,18 @@ def func(x: ti.types.ndarray(), index: ti.template()) -> ti.i32: y_hat = func(x_hat, idx) assert y_ref == y_hat, "Size of axis {} should equal {} and not {}.".format( idx, y_ref, y_hat) + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@pytest.mark.parametrize('size', [[1, 2, 3, 4]]) +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def 
test_get_external_tensor_shape_access_paddle(size): + @ti.kernel + def func(x: ti.types.ndarray(), index: ti.template()) -> ti.i32: + return x.shape[index] + + x_hat = paddle.ones(shape=size, dtype=paddle.int32) + for idx, y_ref in enumerate(size): + y_hat = func(x_hat, idx) + assert y_ref == y_hat, "Size of axis {} should equal {} and not {}.".format( + idx, y_ref, y_hat) diff --git a/tests/python/test_paddle_io.py b/tests/python/test_paddle_io.py new file mode 100644 index 0000000000000..a6f906ab8a61c --- /dev/null +++ b/tests/python/test_paddle_io.py @@ -0,0 +1,250 @@ +import numpy as np +import pytest +from taichi.lang import impl +from taichi.lang.util import has_paddle + +import taichi as ti +from tests import test_utils + +if has_paddle(): + import paddle + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_io_devices(): + n = 32 + x = ti.field(dtype=ti.i32, shape=n) + + @ti.kernel + def load(y: ti.types.ndarray()): + for i in x: + x[i] = y[i] + 10 + + @ti.kernel + def inc(): + for i in x: + x[i] += i + + @ti.kernel + def store(y: ti.types.ndarray()): + for i in x: + y[i] = x[i] * 2 + + devices = [paddle.CPUPlace()] + if paddle.device.is_compiled_with_cuda(): + devices.append(paddle.CUDAPlace(0)) + for device in devices: + y = paddle.to_tensor(np.ones(shape=n, dtype=np.int32), place=device) + + load(y) + inc() + store(y) + + y = y.cpu().numpy() + + for i in range(n): + assert y[i] == (11 + i) * 2 + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_io(): + n = 32 + + @ti.kernel + def paddle_kernel(zero: ti.types.ndarray()): + for i in range(n): + zero[i] += i * i + + x_zero = paddle.zeros(shape=[n], dtype=paddle.int32) + paddle_kernel(x_zero) + for i in range(n): + assert x_zero[i] == i * i + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def 
test_io_2d(): + n = 32 + + @ti.kernel + def paddle_kernel(zero: ti.types.ndarray()): + for i in range(n): + for j in range(n): + zero[i, j] += i * j + + x_zero = paddle.zeros(shape=(n, n), dtype=paddle.int32) + paddle_kernel(x_zero) + for i in range(n): + for j in range(n): + assert x_zero[i, j] == i * j + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_io_3d(): + n = 32 + + @ti.kernel + def paddle_kernel(zero: ti.types.ndarray()): + for i in range(n): + for j in range(n): + for k in range(n): + zero[i, j, k] += i * j * k + + x_zero = paddle.zeros(shape=(n, n, n), dtype=paddle.int32) + paddle_kernel(x_zero) + for i in range(n): + for j in range(n): + for k in range(n): + assert x_zero[i, j, k] == i * j * k + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_io_simple(): + n = 32 + + x1 = ti.field(ti.f32, shape=(n, n)) + p1 = paddle.Tensor(3 * np.ones((n, n), dtype=np.float32)) + + x2 = ti.Matrix.field(2, 3, ti.f32, shape=(n, n)) + p2 = paddle.Tensor(3 * np.ones((n, n, 2, 3), dtype=np.float32)) + + x1.from_paddle(p1) + for i in range(n): + for j in range(n): + assert x1[i, j] == 3 + + x2.from_paddle(p2) + for i in range(n): + for j in range(n): + for k in range(2): + for l in range(3): + assert x2[i, j][k, l] == 3 + + p3 = x2.to_paddle() + assert (p2 == p3).all() + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_io_zeros(): + mat = ti.Matrix.field(2, 6, dtype=ti.f32, shape=(), needs_grad=True) + zeros = paddle.zeros((2, 6)) + zeros[1, 2] = 3 + mat.from_paddle(zeros + 1) + + assert mat[None][1, 2] == 4 + + zeros = mat.to_paddle() + assert zeros[1, 2] == 4 + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_io_struct(): + n = 16 + x1 = ti.Struct.field({"a": ti.i32, "b": 
ti.f32}, shape=(n, )) + p1 = { + "a": paddle.Tensor(2 * np.ones(n, dtype=np.int32)), + "b": paddle.Tensor(3 * np.ones(n, dtype=np.float32)), + } + + x1.from_paddle(p1) + for i in range(n): + assert x1[i].a == 2 + assert x1[i].b == 3 + + p2 = x1.to_paddle() + for k in p1: + assert (p1[k] == p2[k]).all() + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_fused_kernels(): + n = 12 + X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n)) + s = impl.get_runtime().get_num_compiled_functions() + p = X.to_paddle() + assert impl.get_runtime().get_num_compiled_functions() == s + 1 + X.from_paddle(p) + assert impl.get_runtime().get_num_compiled_functions() == s + 2 + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_devices(): + n = 12 + X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n)) + assert X.to_paddle(place=paddle.CPUPlace()).place.is_cpu_place() + + if paddle.device.is_compiled_with_cuda(): + assert X.to_paddle(place=paddle.CUDAPlace(0)).place.is_gpu_place() + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_shape_matrix(): + n = 12 + x = ti.Matrix.field(3, 2, ti.f32, shape=(n, n)) + X = x.to_paddle() + for i in range(n): + for j in range(n): + for k in range(3): + for l in range(2): + X[i, j, k, l] = i * 10 + j + k * 100 + l * 1000 + + x.from_paddle(X) + X1 = x.to_paddle() + x.from_paddle(X1) + X1 = x.to_paddle() + + assert (X == X1).all() + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_shape_vector(): + n = 12 + x = ti.Vector.field(3, ti.f32, shape=(n, n)) + X = x.to_paddle() + for i in range(n): + for j in range(n): + for k in range(3): + X[i, j, k] = i * 10 + j + k * 100 + + x.from_paddle(X) + X1 = x.to_paddle() + x.from_paddle(X1) + X1 = x.to_paddle() + + assert (X 
== X1).all() + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_paddle_zero(): + @ti.kernel + def test_paddle(arr: ti.types.ndarray()): + pass + + test_paddle(paddle.zeros([0], dtype=paddle.int32)) + test_paddle(paddle.zeros([0, 5], dtype=paddle.int32)) + test_paddle(paddle.zeros([5, 0, 5], dtype=paddle.int32)) + + +@pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_paddle_view(): + @ti.kernel + def copy(x: ti.types.ndarray(), y: ti.types.ndarray()): + for i, j in x: + y[i, j] = x[i, j] + + x = paddle.to_tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]).T + y = ti.ndarray(int, (3, 3)) + + copy(x, y) From 9a0f32ac655cd2d4b351e64236f9c332256fde03 Mon Sep 17 00:00:00 2001 From: 0xzhang <33616362+0xzhang@users.noreply.github.com> Date: Fri, 6 May 2022 09:59:03 +0800 Subject: [PATCH 025/176] [test] Cancel tests for Paddle on GPU (#4914) --- .github/workflows/scripts/win_test.ps1 | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/scripts/win_test.ps1 b/.github/workflows/scripts/win_test.ps1 index 7b6c3f5da21fc..bc838b0478245 100644 --- a/.github/workflows/scripts/win_test.ps1 +++ b/.github/workflows/scripts/win_test.ps1 @@ -9,13 +9,12 @@ pip install -r requirements_test.txt # TODO relax this when torch supports 3.10 if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { pip install "torch==1.10.1+cu113; python_version < '3.10'" -f https://download.pytorch.org/whl/cu113/torch_stable.html - pip install "paddlepaddle-gpu==0.0.0.post112; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/windows/gpu/develop.html } else { pip install "torch; python_version < '3.10'" pip install "paddlepaddle==0.0.0; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/windows/cpu-mkl-avx/develop.html } # Fail fast, give priority to the error-prone tests -python tests/run_tests.py -vr2 -t1 -k "paddle" -a 
"$env:TI_WANTED_ARCHS" +python tests/run_tests.py -vr2 -t1 -k "paddle" -a cpu if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cuda if (-not $?) { exit 1 } From 894727d7a448a8a39331e2d6038bd824820e37f3 Mon Sep 17 00:00:00 2001 From: yixu Date: Fri, 6 May 2022 10:55:06 +0800 Subject: [PATCH 026/176] remove debug print (#4883) --- python/taichi/lang/mesh.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/taichi/lang/mesh.py b/python/taichi/lang/mesh.py index 08a72305c0c6a..36e182cbdff27 100644 --- a/python/taichi/lang/mesh.py +++ b/python/taichi/lang/mesh.py @@ -351,7 +351,6 @@ def update_relation(self, from_order, to_order): rel_type = MeshRelationType(relation_by_orders(from_order, to_order)) if rel_type not in self.relation_set: meta = self.patcher.get_relation_meta(from_order, to_order) - print('new relation') def fun(arr, dtype): field = impl.field(dtype=dtype, shape=arr.shape) From 64e0ba81fad09a5d95824d3a1ec4544dd3ed6064 Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Fri, 6 May 2022 11:19:16 +0800 Subject: [PATCH 027/176] [Doc] Updated broken links (#4912) * [Doc] Updated broken links * Updated links that may break. * Added .md --- docs/lang/articles/advanced/odop.md | 2 +- docs/lang/articles/advanced/sparse.md | 6 +++--- .../differences_between_taichi_and_python_programs.md | 6 +++--- docs/lang/articles/basic/field.md | 2 +- docs/lang/articles/basic/overview.md | 2 +- docs/lang/articles/reference.md | 10 +++++----- 6 files changed, 14 insertions(+), 14 deletions(-) diff --git a/docs/lang/articles/advanced/odop.md b/docs/lang/articles/advanced/odop.md index 7d008adf104c3..0384482c8fb01 100644 --- a/docs/lang/articles/advanced/odop.md +++ b/docs/lang/articles/advanced/odop.md @@ -41,7 +41,7 @@ a = TiArray(32) a.inc() ``` -Programmers used to define Taichi fields in `__init__` functions of `@ti.data_oriented` classes. 
With the new **Dynamic SNode** feature (released in `v0.8.0`, see [Field (advanced)](/lang/articles/advanced/layout.md#dynamic-field-allocation-and-destruction) for more details), you can define Taichi fields **at any places** of Python-scope functions. For example, +Programmers used to define Taichi fields in `__init__` functions of `@ti.data_oriented` classes. With the new **Dynamic SNode** feature (released in `v0.8.0`, see [Field (advanced)](layout.md#dynamic-field-allocation-and-destruction) for more details), you can define Taichi fields **at any places** of Python-scope functions. For example, ```python {21,25} import taichi as ti diff --git a/docs/lang/articles/advanced/sparse.md b/docs/lang/articles/advanced/sparse.md index ff7557f3dadab..2a9b4389e5c97 100644 --- a/docs/lang/articles/advanced/sparse.md +++ b/docs/lang/articles/advanced/sparse.md @@ -5,7 +5,7 @@ sidebar_position: 3 # Sparse spatial data structures :::note -Prerequisite: please read the [Fields](lang/articles/basic/field.md), [Fields (advanced)](lang/articles/advanced/layout.md), and [SNodes](lang/articles/misc/internal.md#data-structure-organization) first. +Prerequisite: please read the [Fields](../basic/field.md), [Fields (advanced)](layout.md), and [SNodes](../misc/internal.md#data-structure-organization) first. ::: ![image](https://raw.githubusercontent.com/taichi-dev/public_files/master/taichi/doc/sparse_grids_3d.jpg) @@ -14,7 +14,7 @@ Figure: A 3D fluid simulation that uses both particles and grids. Left to right: ## Motivation High-resolution 2D/3D grids are often needed in large-scale spatial computation, such as physical simulation, rendering, and 3D reconstruction. -However, these grids tend to consume a huge amount of memory space and computation if we use dense data structures (see [field](lang/articles/basic/field.md) and [field advanced](lang/articles/advanced/layout.md)). 
+However, these grids tend to consume a huge amount of memory space and computation if we use dense data structures (see [field](../basic/field.md) and [field advanced](layout.md)). While a programmer may allocate large dense grids to store spatial data (especially physical quantities such as a density or velocity field), oftentimes, they only care about a small fraction of this dense grid since the rest may be empty space (vacuum or air). @@ -48,7 +48,7 @@ In Taichi, programmers can compose data structures similar to VDB and SPGrid wit :::note -Sparse matrices are usually **not** implemented in Taichi via sparse spatial data structures. See [sparse matrix](lang/articles/advanced/sparse_matrix.md) instead. +Sparse matrices are usually **not** implemented in Taichi via sparse spatial data structures. See [sparse matrix](sparse_matrix.md) instead. ::: ## Sparse spatial data structures in Taichi diff --git a/docs/lang/articles/basic/differences_between_taichi_and_python_programs.md b/docs/lang/articles/basic/differences_between_taichi_and_python_programs.md index c2c6b7dbd7bc3..0404eb8f33f5c 100644 --- a/docs/lang/articles/basic/differences_between_taichi_and_python_programs.md +++ b/docs/lang/articles/basic/differences_between_taichi_and_python_programs.md @@ -53,7 +53,7 @@ def discarded_after_first_return(a: ti.i32) -> ti.i32: discarded_after_first_return(0) # OK: returns 1 ``` -- If there are [compile-time evaluations](/lang/articles/advanced/meta.md#compile-time-evaluations) in the code, make sure there is a return statement under all circumstances. +- If there are [compile-time evaluations](../advanced/meta.md#compile-time-evaluations) in the code, make sure there is a return statement under all circumstances. Otherwise, error occurs when a branch is chosen which does not have return statement. ```python {7-8,15-16,21,23-24} @ti.kernel @@ -123,7 +123,7 @@ Currently, Taichi does not support `set`. 
List and dictionary before assigning to a variable works as the python list and dictionary. However, after assigning to a variable, the content of the list and the values (not keys) of the dictionary are converted to Taichi variables. -Taichi does not have a runtime implementation of `in` currently. Therefore, operator `in` and `not in` only works in [static scope](/lang/articles/advanced/meta.md#static-scope) (inside `ti.static()`). +Taichi does not have a runtime implementation of `in` currently. Therefore, operator `in` and `not in` only works in [static scope](../advanced/meta.md#static-scope) (inside `ti.static()`). ```python {3,11-12,20} @ti.kernel @@ -155,7 +155,7 @@ Taichi partially supports list comprehension and dictionary comprehension, but does not support set comprehension. For list comprehensions and dictionary comprehensions, the `if`s and `for`s in them are evaluated at compile time. -The iterators and conditions are implicitly in [static scope](/lang/articles/advanced/meta.md#static-scope). +The iterators and conditions are implicitly in [static scope](../advanced/meta.md#static-scope). ### Operator `is` diff --git a/docs/lang/articles/basic/field.md b/docs/lang/articles/basic/field.md index 60e83fde7c44f..8fdd4fd6a94b7 100644 --- a/docs/lang/articles/basic/field.md +++ b/docs/lang/articles/basic/field.md @@ -74,7 +74,7 @@ while gui.running: ``` :::tip -With Taichi versions earlier than v0.8.0, you cannot allocate new fields after executing a kernel. Starting from v0.8.0, you can use the `FieldsBuilder` class to dynamically allocate or destruct fields. See the [Field (advanced)](/lang/articles/advanced/layout.md) for more information. +With Taichi versions earlier than v0.8.0, you cannot allocate new fields after executing a kernel. Starting from v0.8.0, you can use the `FieldsBuilder` class to dynamically allocate or destruct fields. See the [Field (advanced)](./advanced/layout.md) for more information. 
::: :::caution WARNING diff --git a/docs/lang/articles/basic/overview.md b/docs/lang/articles/basic/overview.md index 6944c360a3f90..2eeda69a778f6 100644 --- a/docs/lang/articles/basic/overview.md +++ b/docs/lang/articles/basic/overview.md @@ -17,7 +17,7 @@ To be fair, a domain-specific language (DSL) with a Python frontend is not somet * Taichi heavily optimizes the source code using various compiler technologies: common subexpression elimination, dead code elimination, control flow graph analysis, etc. These optimizations are backend neutral, because Taichi hosts its own intermediate representation (IR) layer. * JIT compilation provides additional optimization opportunities. -That said, Taichi goes beyond a Python JIT transpiler. One of the initial design goals is to *decouple the computation from the data structures*. The mechanism that Taichi provides is a set of generic data containers, called *SNode* (/ˈsnoʊd/). SNodes can be used to compose hierarchical, dense or sparse, multi-dimensional fields conveniently. Switching between array-of-structures and structure-of-arrays layouts is usually a matter of ≤10 lines of code. This has sparked many use cases in numerical simulation. If you are interested to learn them, please check out [Fields (advanced)](https://docs.taichi-lang.org/lang/articles/layout), [Sparse spatial data structures](https://docs.taichi-lang.org/lang/articles/sparse), or [the original Taichi paper](https://yuanming.taichi.graphics/publication/2019-taichi/taichi-lang.pdf). +That said, Taichi goes beyond a Python JIT transpiler. One of the initial design goals is to *decouple the computation from the data structures*. The mechanism that Taichi provides is a set of generic data containers, called *SNode* (/ˈsnoʊd/). SNodes can be used to compose hierarchical, dense or sparse, multi-dimensional fields conveniently. Switching between array-of-structures and structure-of-arrays layouts is usually a matter of ≤10 lines of code. 
This has sparked many use cases in numerical simulation. If you are interested to learn them, please check out [Fields (advanced)](../advanced/layout.md), [Sparse spatial data structures](../advanced/sparse.md), or [the original Taichi paper](https://yuanming.taichi.graphics/publication/2019-taichi/taichi-lang.pdf). The concept of decoupling is further extended to the type system. With GPU memory capacity and bandwidth becoming the major bottlenecks nowadays, it is vital to be able to pack more data per memory unit. Since 2021, Taichi has introduced customizable quantized types, allowing for the definition of fixed point or floating point numbers with arbitrary bits (still needs to be under 64). This has allowed an MPM simulation of over 400 million particles on a single GPU device. Learn more details in [the QuanTaichi paper](https://yuanming.taichi.graphics/publication/2021-quantaichi/quantaichi.pdf). diff --git a/docs/lang/articles/reference.md b/docs/lang/articles/reference.md index 41026974fc8c6..3af07ef06a1c3 100644 --- a/docs/lang/articles/reference.md +++ b/docs/lang/articles/reference.md @@ -235,8 +235,8 @@ attributeref ::= primary "." identifier Attribute references are evaluated at compile time. The `primary` must be evaluated to a Python value with an attribute named `identifier`. Common use cases in Taichi include metadata queries of -[field](https://docs.taichi-lang.org/lang/articles/meta#field-metadata) and -[matrices](https://docs.taichi-lang.org/lang/articles/meta#matrix--vector-metadata). +[field](advanced/meta.md#field-metadata) and +[matrices](advanced/meta.md#matrix--vector-metadata). #### Subscriptions @@ -440,9 +440,9 @@ The `positional_arguments` is evaluated at compile time, and the items inside mu - When multiple arguments are passed in, it returns a tuple containing all the arguments in the same order as they are passed. 
The static expressions work as a mechanism to trigger many metaprogramming functions in Taichi, -such as [compile-time loop unrolling and compile-time branching](lang/articles/advanced/meta.md#compile-time-evaluations). +such as [compile-time loop unrolling and compile-time branching](advanced/meta.md#compile-time-evaluations). -The static expressions can also be used to [create aliases for Taichi fields and Taichi functions](lang/articles/advanced/syntax_sugars.md#aliases). +The static expressions can also be used to [create aliases for Taichi fields and Taichi functions](advanced/syntax_sugars.md#aliases). ### Expression lists @@ -728,7 +728,7 @@ The `iter_expression` of ndrange `for` statement must be a call to `ti.ndrange() - If the `iter_expression` is a call to `ti.range()`, it is a normal ndrange `for`. - If the `iter_expression` is a call to `ti.grouped(ti.range())`, it is a grouped ndrange `for`. -You can use grouped `for` loops to write [dimensionality-independent programs](lang/articles/advanced/meta.md#dimensionality-independent-programming-using-grouped-indices). +You can use grouped `for` loops to write [dimensionality-independent programs](advanced/meta.md#dimensionality-independent-programming-using-grouped-indices). `ti.ndrange` receives arbitrary numbers of arguments. The k-th argument represents the iteration range of the k-th dimension, From 92bb3593f6ef1575c41c39a18341072bfb044f70 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Fri, 6 May 2022 12:02:55 +0800 Subject: [PATCH 028/176] [test] Exit on error during Paddle windows test (#4910) * [test] Exit on error during Paddle windows test * Check if paddle test leaks memory * Increase device memory and reduce thread number * Revert "Check if paddle test leaks memory" This reverts commit e0522b0e520050fb50d2c338a2a7d0b2a363bfb0. 
* Disable paddle for non-paddle test --- .github/workflows/scripts/win_test.ps1 | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/scripts/win_test.ps1 b/.github/workflows/scripts/win_test.ps1 index bc838b0478245..9cd94f1d66f30 100644 --- a/.github/workflows/scripts/win_test.ps1 +++ b/.github/workflows/scripts/win_test.ps1 @@ -15,6 +15,11 @@ if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { } # Fail fast, give priority to the error-prone tests python tests/run_tests.py -vr2 -t1 -k "paddle" -a cpu +if (-not $?) { exit 1 } + +# Disable paddle for the remaining test +$env:TI_ENABLE_PADDLE = "0" + if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cuda if (-not $?) { exit 1 } From abd0136ee6f46abb69ae34fa08b14caae0a149f9 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 6 May 2022 12:07:30 +0800 Subject: [PATCH 029/176] [build] Warning Suppression PR #2: Fixed codebase warnings (#4909) --- taichi/backends/vulkan/vulkan_device.cpp | 5 +---- taichi/backends/vulkan/vulkan_device_creator.cpp | 2 +- taichi/backends/wasm/codegen_wasm.cpp | 2 +- taichi/codegen/spirv/spirv_codegen.cpp | 1 - taichi/program/async_engine.cpp | 8 +++++++- taichi/program/state_flow_graph.cpp | 2 +- taichi/util/io.h | 6 +++++- tests/cpp/ir/ir_type_promotion_test.cpp | 2 +- tests/cpp/transforms/alg_simp_test.cpp | 3 ++- 9 files changed, 19 insertions(+), 12 deletions(-) diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index 83c278a0bd058..62b1b4fe6b865 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -1396,9 +1396,8 @@ void VulkanDevice::dealloc_memory(DeviceAllocation handle) { TI_ASSERT_INFO(map_pair != allocations_.end(), "Invalid handle (double free?) 
{}", handle.alloc_id); - AllocationInternal &alloc = map_pair->second; - #ifdef TI_VULKAN_DEBUG_ALLOCATIONS + AllocationInternal &alloc = map_pair->second; TI_TRACE("Dealloc VK buffer {}, alloc_id={}", (void *)alloc.buffer, handle.alloc_id); #endif @@ -1831,8 +1830,6 @@ void VulkanDevice::destroy_image(DeviceAllocation handle) { TI_ASSERT_INFO(map_pair != image_allocations_.end(), "Invalid handle (double free?) {}", handle.alloc_id); - ImageAllocInternal &alloc_int = map_pair->second; - image_allocations_.erase(handle.alloc_id); } diff --git a/taichi/backends/vulkan/vulkan_device_creator.cpp b/taichi/backends/vulkan/vulkan_device_creator.cpp index 030398947b2d8..4cbce62120532 100644 --- a/taichi/backends/vulkan/vulkan_device_creator.cpp +++ b/taichi/backends/vulkan/vulkan_device_creator.cpp @@ -455,7 +455,7 @@ void VulkanDeviceCreator::create_logical_device() { bool has_swapchain = false; - bool portability_subset_enabled = false; + [[maybe_unused]] bool portability_subset_enabled = false; for (auto &ext : extension_properties) { TI_TRACE("Vulkan device extension {} ({})", ext.extensionName, diff --git a/taichi/backends/wasm/codegen_wasm.cpp b/taichi/backends/wasm/codegen_wasm.cpp index bba625393ecf8..7170eab5e1779 100644 --- a/taichi/backends/wasm/codegen_wasm.cpp +++ b/taichi/backends/wasm/codegen_wasm.cpp @@ -31,7 +31,7 @@ class CodeGenLLVMWASM : public CodeGenLLVM { } void create_offload_range_for(OffloadedStmt *stmt) override { - int step = 1; + [[maybe_unused]] int step = 1; // In parallel for-loops reversing the order doesn't make sense. 
// However, we may need to support serial offloaded range for's in the diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 0d841a94e2b27..b8901c20c4dde 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -474,7 +474,6 @@ class TaskCodegen : public IRVisitor { void visit(GlobalStoreStmt *stmt) override { TI_ASSERT(stmt->width() == 1); - const auto dt = stmt->val->element_type(); spirv::Value val = ir_->query_value(stmt->val->raw_name()); diff --git a/taichi/program/async_engine.cpp b/taichi/program/async_engine.cpp index a8e644ef610f2..ffd4cae24d199 100644 --- a/taichi/program/async_engine.cpp +++ b/taichi/program/async_engine.cpp @@ -325,8 +325,14 @@ void AsyncEngine::debug_sfg(const std::string &stage) { std::ofstream dot_file(dot_fn + ".dot"); dot_file << dot; } - std::system( + + int return_code = std::system( fmt::format("dot -Tpdf -o {}.pdf {}.dot", dot_fn, dot_fn).c_str()); + if (return_code != 0) { + throw std::runtime_error( + fmt::format("Unable to convert {dot_fn}.dot into {dot_fn}.pdf") + .c_str()); + } } TLANG_NAMESPACE_END diff --git a/taichi/program/state_flow_graph.cpp b/taichi/program/state_flow_graph.cpp index aa4e6a1ae9e3b..b1354b644c8ef 100644 --- a/taichi/program/state_flow_graph.cpp +++ b/taichi/program/state_flow_graph.cpp @@ -447,7 +447,7 @@ bool StateFlowGraph::optimize_listgen() { for (int i = i_start; i < listgens.size(); i++) { auto node_a = listgens[i]; - bool erased_any = false; + [[maybe_unused]] bool erased_any = false; auto new_i = i; diff --git a/taichi/util/io.h b/taichi/util/io.h index e3e372e90a63f..df95671738d79 100644 --- a/taichi/util/io.h +++ b/taichi/util/io.h @@ -22,7 +22,11 @@ inline void create_directories(const std::string &dir) { #if defined(TI_PLATFORM_WINDOWS) std::filesystem::create_directories(dir); #else - std::system(fmt::format("mkdir -p {}", dir).c_str()); + int return_code = std::system(fmt::format("mkdir -p {}", 
dir).c_str()); + if (return_code != 0) { + throw std::runtime_error( + fmt::format("Unable to create directory at: {dir}").c_str()); + } #endif } diff --git a/tests/cpp/ir/ir_type_promotion_test.cpp b/tests/cpp/ir/ir_type_promotion_test.cpp index 6f444763d7bd0..b4b71b5044092 100644 --- a/tests/cpp/ir/ir_type_promotion_test.cpp +++ b/tests/cpp/ir/ir_type_promotion_test.cpp @@ -13,7 +13,7 @@ TEST(IRTypePromotionTest, ShiftOp) { // (u8)x << (i32)1 -> (u8)res auto *lhs = builder.create_arg_load(0, get_data_type(), false); - auto *res = builder.create_shl(lhs, builder.get_int32(1)); + builder.create_shl(lhs, builder.get_int32(1)); auto ir = builder.extract_ir(); ASSERT_TRUE(ir->is()); diff --git a/tests/cpp/transforms/alg_simp_test.cpp b/tests/cpp/transforms/alg_simp_test.cpp index 469e40b34c371..a8e8f2b850ac6 100644 --- a/tests/cpp/transforms/alg_simp_test.cpp +++ b/tests/cpp/transforms/alg_simp_test.cpp @@ -96,7 +96,8 @@ TEST_F(AlgebraicSimplicationTest, SimplifyMultiplyZeroFastMath) { auto add = block->push_back(BinaryOpType::add, mul, one); auto global_store_addr = block->push_back( 4, TypeFactory::create_vector_or_scalar_type(1, PrimitiveType::i32)); - auto global_store = block->push_back(global_store_addr, add); + [[maybe_unused]] auto global_store = + block->push_back(global_store_addr, add); CompileConfig config_without_fast_math; config_without_fast_math.fast_math = false; From e3d58c660820fe696f0c23c7941a67ee90f4a374 Mon Sep 17 00:00:00 2001 From: Zeyu Li <47965866+GaleSeLee@users.noreply.github.com> Date: Fri, 6 May 2022 16:48:49 +0800 Subject: [PATCH 030/176] [SIMT] Add syncwarp warp intrinsics (#4917) * add warp_barries warp instrinsic add warp_barrier unit test fix error: add Args mask in warp.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/simt/warp.py | 7 ++++--- 
tests/python/test_simt.py | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index a843c0220adf4..d8bd7424201cb 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -102,9 +102,10 @@ def active_mask(): pass -def sync(): - # TODO - pass +def sync(mask): + expr.Expr( + _ti_core.insert_internal_func_call("warp_barrier", + expr.make_expr_group(mask), False)) __all__ = [ diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 144a609465ad0..5730bd7d41293 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -257,8 +257,21 @@ def test_active_mask(): @test_utils.test(arch=ti.cuda) def test_sync(): - # TODO - pass + a = ti.field(dtype=ti.u32, shape=32) + + @ti.kernel + def foo(): + ti.loop_config(block_dim=32) + for i in range(32): + a[i] = i + ti.simt.warp.sync(ti.u32(0xFFFFFFFF)) + for i in range(16): + a[i] = a[i + 16] + + foo() + + for i in range(32): + assert a[i] == i % 16 + 16 # Higher level primitives test From a951d056050a83d177e439f25c47ed8954f5c9da Mon Sep 17 00:00:00 2001 From: yekuang Date: Fri, 6 May 2022 17:41:50 +0800 Subject: [PATCH 031/176] [refactor] Create MatrixImpl to differentiate Taichi and Python scopes (#4853) * wip * wip * wip * wip * wip * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * cleanup * fix impl._subscript() * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix mesh * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix useless __init__ * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix py-scope subscript * fix swizzle * fix doc * fix api * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/matrix.py | 384 +++++++++++++++++++++-------------- python/taichi/lang/mesh.py | 5 +- tests/python/test_api.py | 39 +++- 3 files changed, 266 insertions(+), 162 deletions(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index d0835835c6481..8ff72b03dd0bf 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -47,7 +47,7 @@ def gen_property(attr, attr_idx, key_group): def prop_getter(instance): checker(instance, attr) - return instance._get_entry_and_read([attr_idx]) + return instance._impl._get_entry_and_read([attr_idx]) @python_scope def prop_setter(instance, value): @@ -73,7 +73,8 @@ def prop_getter(instance): checker(instance, pattern) res = [] for ch in pattern: - res.append(instance._get_entry(key_group.index(ch))) + res.append( + instance._impl._get_entry(key_group.index(ch))) return Vector(res, is_ref=True) def prop_setter(instance, value): @@ -96,6 +97,184 @@ def prop_setter(instance, value): return cls +class _MatrixBaseImpl: + def __init__(self, m, n, entries): + self.m = m + self.n = n + self.entries = entries + + def _get_entry(self, *indices): + return self.entries[self._linearize_entry_id(*indices)] + + def _get_entry_and_read(self, indices): + # Can be invoked in both Python and Taichi scope. `indices` must be + # compile-time constants (e.g. 
Python values) + ret = self._get_entry(*indices) + + if isinstance(ret, SNodeHostAccess): + ret = ret.accessor.getter(*ret.key) + elif isinstance(ret, NdarrayHostAccess): + ret = ret.getter() + return ret + + def _linearize_entry_id(self, *args): + assert 1 <= len(args) <= 2 + if len(args) == 1 and isinstance(args[0], (list, tuple)): + args = args[0] + if len(args) == 1: + args = args + (0, ) + # TODO(#1004): See if it's possible to support indexing at runtime + for i, a in enumerate(args): + if not isinstance(a, int): + raise TaichiSyntaxError( + f'The {i}-th index of a Matrix/Vector must be a compile-time constant ' + f'integer, got {type(a)}.\n' + 'This is because matrix operations will be **unrolled** at compile-time ' + 'for performance reason.\n' + 'If you want to *iterate through matrix elements*, use a static range:\n' + ' for i in ti.static(range(3)):\n' + ' print(i, "-th component is", vec[i])\n' + 'See https://docs.taichi-lang.org/lang/articles/meta#when-to-use-tistatic-with-for-loops for more details.' + 'Or turn on ti.init(..., dynamic_index=True) to support indexing with variables!' + ) + assert 0 <= args[0] < self.n, \ + f"The 0-th matrix index is out of range: 0 <= {args[0]} < {self.n}" + assert 0 <= args[1] < self.m, \ + f"The 1-th matrix index is out of range: 0 <= {args[1]} < {self.m}" + return args[0] * self.m + args[1] + + +class _PyScopeMatrixImpl(_MatrixBaseImpl): + @python_scope + def __getitem__(self, indices): + """Access to the element at the given indices in a matrix. + + Args: + indices (Sequence[Expr]): the indices of the element. + + Returns: + The value of the element at a specific position of a matrix. 
+ + """ + return self.subscript_scope_ignored(indices) + + def subscript_scope_ignored(self, indices): + if not isinstance(indices, (list, tuple)): + indices = [indices] + assert len(indices) in [1, 2] + i = indices[0] + j = 0 if len(indices) == 1 else indices[1] + if isinstance(i, slice) or isinstance(j, slice): + return self._get_slice(i, j) + return self._get_entry_and_read([i, j]) + + @python_scope + def __setitem__(self, indices, item): + """Set the element value at the given indices in a matrix. + + Args: + indices (Sequence[Expr]): the indices of a element. + + """ + if not isinstance(indices, (list, tuple)): + indices = [indices] + assert len(indices) in [1, 2] + i = indices[0] + j = 0 if len(indices) == 1 else indices[1] + idx = self._linearize_entry_id(i, j) + if isinstance(self.entries[idx], SNodeHostAccess): + self.entries[idx].accessor.setter(item, *self.entries[idx].key) + elif isinstance(self.entries[idx], NdarrayHostAccess): + self.entries[idx].setter(item) + else: + self.entries[idx] = item + + def _get_slice(self, a, b): + if not isinstance(a, slice): + a = [a] + else: + a = range(a.start or 0, a.stop or self.n, a.step or 1) + if not isinstance(b, slice): + b = [b] + else: + b = range(b.start or 0, b.stop or self.m, b.step or 1) + return Matrix([[self(i, j) for j in b] for i in a]) + + def _set_entries(self, value): + if not isinstance(value, (list, tuple)): + value = list(value) + if not isinstance(value[0], (list, tuple)): + value = [[i] for i in value] + for i in range(self.n): + for j in range(self.m): + self[i, j] = value[i][j] + + +class _TiScopeMatrixImpl(_MatrixBaseImpl): + def __init__(self, m, n, entries, local_tensor_proxy, + dynamic_index_stride): + super().__init__(m, n, entries) + self.any_array_access = None + self.local_tensor_proxy = local_tensor_proxy + self.dynamic_index_stride = dynamic_index_stride + + @taichi_scope + def _subscript(self, is_global_mat, *indices): + assert len(indices) in [1, 2] + i = indices[0] + j = 0 if 
len(indices) == 1 else indices[1] + has_slice = False + if isinstance(i, slice): + i = self._calc_slice(i, 0) + has_slice = True + if isinstance(j, slice): + j = self._calc_slice(j, 1) + has_slice = True + + if has_slice: + if not isinstance(i, list): + i = [i] + if not isinstance(j, list): + j = [j] + if len(indices) == 1: + return Vector([self._subscript(is_global_mat, a) for a in i]) + return Matrix([[self._subscript(is_global_mat, a, b) for b in j] + for a in i]) + + if self.any_array_access: + return self.any_array_access.subscript(i, j) + if self.local_tensor_proxy is not None: + assert self.dynamic_index_stride is not None + if len(indices) == 1: + return impl.make_tensor_element_expr(self.local_tensor_proxy, + (i, ), (self.n, ), + self.dynamic_index_stride) + return impl.make_tensor_element_expr(self.local_tensor_proxy, + (i, j), (self.n, self.m), + self.dynamic_index_stride) + if impl.current_cfg( + ).dynamic_index and is_global_mat and self.dynamic_index_stride: + return impl.make_tensor_element_expr(self.entries[0].ptr, (i, j), + (self.n, self.m), + self.dynamic_index_stride) + return self._get_entry(i, j) + + def _calc_slice(self, index, dim): + start, stop, step = index.start or 0, index.stop or ( + self.n if dim == 0 else self.m), index.step or 1 + + def helper(x): + # TODO(mzmzm): support variable in slice + if isinstance(x, expr.Expr): + raise TaichiCompilationError( + "Taichi does not support variables in slice now, please use constant instead of it." + ) + return x + + start, stop, step = helper(start), helper(stop), helper(step) + return [_ for _ in range(start, stop, step)] + + @_gen_swizzles class Matrix(TaichiOperations): """The matrix class. 
@@ -141,10 +320,8 @@ class Matrix(TaichiOperations): _is_taichi_class = True def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): - self.local_tensor_proxy = None - self.any_array_access = None - self.grad = None - self.dynamic_index_stride = None + local_tensor_proxy = None + dynamic_index_stride = None if not isinstance(arr, (list, tuple, np.ndarray)): raise TaichiTypeError( @@ -182,18 +359,18 @@ def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): raise Exception( 'dt required when using dynamic_index for local tensor' ) - self.local_tensor_proxy = impl.expr_init_local_tensor( + local_tensor_proxy = impl.expr_init_local_tensor( [len(arr)], dt, expr.make_expr_group([expr.Expr(x) for x in arr])) - self.dynamic_index_stride = 1 + dynamic_index_stride = 1 mat = [] for i in range(len(arr)): mat.append( list([ impl.make_tensor_element_expr( - self.local_tensor_proxy, + local_tensor_proxy, (expr.Expr(i, dtype=primitive_types.i32), ), - (len(arr), ), self.dynamic_index_stride) + (len(arr), ), dynamic_index_stride) ])) else: # now init a Matrix if in_python_scope() or is_ref: @@ -225,28 +402,27 @@ def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): raise Exception( 'dt required when using dynamic_index for local tensor' ) - self.local_tensor_proxy = impl.expr_init_local_tensor( + local_tensor_proxy = impl.expr_init_local_tensor( [len(arr), len(arr[0])], dt, expr.make_expr_group( [expr.Expr(x) for row in arr for x in row])) - self.dynamic_index_stride = 1 + dynamic_index_stride = 1 mat = [] for i in range(len(arr)): mat.append([]) for j in range(len(arr[0])): mat[i].append( impl.make_tensor_element_expr( - self.local_tensor_proxy, + local_tensor_proxy, (expr.Expr(i, dtype=primitive_types.i32), expr.Expr(j, dtype=primitive_types.i32)), - (len(arr), len(arr[0])), - self.dynamic_index_stride)) + (len(arr), len(arr[0])), dynamic_index_stride)) self.n = len(mat) if len(mat) > 0: self.m = len(mat[0]) else: self.m = 1 - 
self.entries = [x for row in mat for x in row] + entries = [x for row in mat for x in row] if self.n * self.m > 32 and not suppress_warning: warning( @@ -259,6 +435,12 @@ def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): ' for more details.', UserWarning, stacklevel=2) + m, n = self.m, self.n + if in_python_scope(): + self._impl = _PyScopeMatrixImpl(m, n, entries) + else: + self._impl = _TiScopeMatrixImpl(m, n, entries, local_tensor_proxy, + dynamic_index_stride) def _element_wise_binary(self, foo, other): other = self._broadcast_copy(other) @@ -317,39 +499,8 @@ def __matmul__(self, other): for k in range(1, other.n): acc = acc + self(i, k) * other(k, j) entries[i].append(acc) - # A hack way to check if this is a vector from `taichi.math`, - # to avoid importing a deleted name across modules. - if isinstance(other, Matrix) and (hasattr(other, "_DIM")): - return type(other)(*[x for x, in entries]) - return Matrix(entries) - def _linearize_entry_id(self, *args): - assert 1 <= len(args) <= 2 - if len(args) == 1 and isinstance(args[0], (list, tuple)): - args = args[0] - if len(args) == 1: - args = args + (0, ) - # TODO(#1004): See if it's possible to support indexing at runtime - for i, a in enumerate(args): - if not isinstance(a, int): - raise TaichiSyntaxError( - f'The {i}-th index of a Matrix/Vector must be a compile-time constant ' - f'integer, got {type(a)}.\n' - 'This is because matrix operations will be **unrolled** at compile-time ' - 'for performance reason.\n' - 'If you want to *iterate through matrix elements*, use a static range:\n' - ' for i in ti.static(range(3)):\n' - ' print(i, "-th component is", vec[i])\n' - 'See https://docs.taichi-lang.org/lang/articles/meta#when-to-use-tistatic-with-for-loops for more details.' - 'Or turn on ti.init(..., dynamic_index=True) to support indexing with variables!' 
- ) - assert 0 <= args[0] < self.n, \ - f"The 0-th matrix index is out of range: 0 <= {args[0]} < {self.n}" - assert 0 <= args[1] < self.m, \ - f"The 1-th matrix index is out of range: 0 <= {args[1]} < {self.m}" - return args[0] * self.m + args[1] - # host access & python scope operation def __len__(self): """Get the length of each row of a matrix""" @@ -372,14 +523,7 @@ def __getitem__(self, indices): The value of the element at a specific position of a matrix. """ - if not isinstance(indices, (list, tuple)): - indices = [indices] - assert len(indices) in [1, 2] - i = indices[0] - j = 0 if len(indices) == 1 else indices[1] - if isinstance(i, slice) or isinstance(j, slice): - return self._get_slice(i, j) - return self._get_entry_and_read([i, j]) + return self._impl[indices] @python_scope def __setitem__(self, indices, item): @@ -389,115 +533,47 @@ def __setitem__(self, indices, item): indices (Sequence[Expr]): the indices of a element. """ - if not isinstance(indices, (list, tuple)): - indices = [indices] - assert len(indices) in [1, 2] - i = indices[0] - j = 0 if len(indices) == 1 else indices[1] - idx = self._linearize_entry_id(i, j) - if isinstance(self.entries[idx], SNodeHostAccess): - self.entries[idx].accessor.setter(item, *self.entries[idx].key) - elif isinstance(self.entries[idx], NdarrayHostAccess): - self.entries[idx].setter(item) - else: - self.entries[idx] = item + self._impl[indices] = item def __call__(self, *args, **kwargs): # TODO: It's quite hard to search for __call__, consider replacing this # with a method of actual names? assert kwargs == {} - return self._get_entry_and_read(args) - - def _get_entry_and_read(self, indices): - # Can be invoked in both Python and Taichi scope. `indices` must be - # compile-time constants (e.g. 
Python values) - ret = self._get_entry(*indices) - - if isinstance(ret, SNodeHostAccess): - ret = ret.accessor.getter(*ret.key) - elif isinstance(ret, NdarrayHostAccess): - ret = ret.getter() - return ret + return self._impl._get_entry_and_read(args) @python_scope def _set_entries(self, value): - if not isinstance(value, (list, tuple)): - value = list(value) - if not isinstance(value[0], (list, tuple)): - value = [[i] for i in value] - for i in range(self.n): - for j in range(self.m): - self[i, j] = value[i][j] + self._impl._set_entries(value) - def _get_entry(self, *args): - return self.entries[self._linearize_entry_id(*args)] + @property + def entries(self): + return self._impl.entries - def _get_slice(self, a, b): - if not isinstance(a, slice): - a = [a] - else: - a = range(a.start or 0, a.stop or self.n, a.step or 1) - if not isinstance(b, slice): - b = [b] - else: - b = range(b.start or 0, b.stop or self.m, b.step or 1) - return Matrix([[self(i, j) for j in b] for i in a]) + @property + def any_array_access(self): + return self._impl.any_array_access - def _cal_slice(self, index, dim): - start, stop, step = index.start or 0, index.stop or ( - self.n if dim == 0 else self.m), index.step or 1 + @any_array_access.setter + def any_array_access(self, value): + self._impl.any_array_access = value - def helper(x): - # TODO(mzmzm): support variable in slice - if isinstance(x, expr.Expr): - raise TaichiCompilationError( - "Taichi does not support variables in slice now, please use constant instead of it." 
- ) - return x + @property + def local_tensor_proxy(self): + return self._impl.local_tensor_proxy - start, stop, step = helper(start), helper(stop), helper(step) - return [_ for _ in range(start, stop, step)] + @property + def dynamic_index_stride(self): + return self._impl.dynamic_index_stride @taichi_scope def _subscript(self, *indices): - assert len(indices) in [1, 2] - i = indices[0] - j = 0 if len(indices) == 1 else indices[1] - has_slice = False - if isinstance(i, slice): - i = self._cal_slice(i, 0) - has_slice = True - if isinstance(j, slice): - j = self._cal_slice(j, 1) - has_slice = True - - if has_slice: - if not isinstance(i, list): - i = [i] - if not isinstance(j, list): - j = [j] - if len(indices) == 1: - return Vector([self._subscript(a) for a in i]) - return Matrix([[self._subscript(a, b) for b in j] for a in i]) - - if self.any_array_access: - return self.any_array_access.subscript(i, j) - if self.local_tensor_proxy is not None: - assert self.dynamic_index_stride is not None - if len(indices) == 1: - return impl.make_tensor_element_expr(self.local_tensor_proxy, - (i, ), (self.n, ), - self.dynamic_index_stride) - return impl.make_tensor_element_expr(self.local_tensor_proxy, - (i, j), (self.n, self.m), - self.dynamic_index_stride) - if impl.current_cfg().dynamic_index and isinstance( - self, - _MatrixFieldElement) and self.dynamic_index_stride is not None: - return impl.make_tensor_element_expr(self.entries[0].ptr, (i, j), - (self.n, self.m), - self.dynamic_index_stride) - return self._get_entry(i, j) + if isinstance(self._impl, _PyScopeMatrixImpl): + # This can happpen in these cases: + # 1. A Python scope matrix is passed into a Taichi kernel as ti.template() + # 2. Taichi kernel directlly uses a matrix (global variable) created in the Python scope. 
+ return self._impl.subscript_scope_ignored(indices) + is_global_mat = isinstance(self, _MatrixFieldElement) + return self._impl._subscript(is_global_mat, *indices) def to_list(self): """Return this matrix as a 1D `list`. @@ -1296,11 +1372,11 @@ def __init__(self, n, m, entries): assert n * m == len(entries), "Number of entries doesn't match n * m" self.n = n self.m = m - self.entries = entries - self.local_tensor_proxy = None - self.any_array_access = None - self.grad = None - self.dynamic_index_stride = None + self._impl = _TiScopeMatrixImpl(m, + n, + entries, + local_tensor_proxy=None, + dynamic_index_stride=None) class _MatrixFieldElement(_IntermediateMatrix): @@ -1315,7 +1391,7 @@ def __init__(self, field, indices): expr.Expr(ti_core.subscript(e.ptr, indices)) for e in field._get_field_members() ]) - self.dynamic_index_stride = field.dynamic_index_stride + self._impl.dynamic_index_stride = field.dynamic_index_stride class MatrixField(Field): diff --git a/python/taichi/lang/mesh.py b/python/taichi/lang/mesh.py index 36e182cbdff27..6d2d7031e1201 100644 --- a/python/taichi/lang/mesh.py +++ b/python/taichi/lang/mesh.py @@ -7,8 +7,7 @@ from taichi.lang.enums import Layout from taichi.lang.exception import TaichiSyntaxError from taichi.lang.field import Field, ScalarField -from taichi.lang.matrix import (MatrixField, _IntermediateMatrix, - _MatrixFieldElement) +from taichi.lang.matrix import Matrix, MatrixField, _MatrixFieldElement from taichi.lang.struct import StructField from taichi.lang.util import python_scope from taichi.types import i32, u16, u32 @@ -84,7 +83,7 @@ def __getitem__(self, key): self._initialize_host_accessors() key = self.g2r_field[key] key = self._pad_key(key) - return _IntermediateMatrix(self.n, self.m, self._host_access(key)) + return Matrix(self._host_access(key), is_ref=True) class MeshElementField: diff --git a/tests/python/test_api.py b/tests/python/test_api.py index df555dc01512b..23d5843aa70c9 100644 --- a/tests/python/test_api.py +++ 
b/tests/python/test_api.py @@ -19,11 +19,40 @@ def _get_matrix_swizzle_apis(): def _get_expected_matrix_apis(): base = [ - 'all', 'any', 'cast', 'cols', 'cross', 'determinant', 'diag', 'dot', - 'field', 'fill', 'identity', 'inverse', 'max', 'min', 'ndarray', - 'norm', 'norm_inv', 'norm_sqr', 'normalized', 'one', 'outer_product', - 'rotation2d', 'rows', 'sum', 'to_list', 'to_numpy', 'trace', - 'transpose', 'unit', 'zero' + 'all', + 'any', + 'any_array_access', + 'cast', + 'cols', + 'cross', + 'determinant', + 'diag', + 'dot', + 'dynamic_index_stride', + 'entries', + 'field', + 'fill', + 'identity', + 'inverse', + 'local_tensor_proxy', + 'max', + 'min', + 'ndarray', + 'norm', + 'norm_inv', + 'norm_sqr', + 'normalized', + 'one', + 'outer_product', + 'rotation2d', + 'rows', + 'sum', + 'to_list', + 'to_numpy', + 'trace', + 'transpose', + 'unit', + 'zero', ] res = base + _get_matrix_swizzle_apis() return sorted(res) From cebee890793733b164f83f908764244b864511d7 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Sat, 7 May 2022 10:32:29 +0800 Subject: [PATCH 032/176] [build] Warning Suppression PR #1: Turned on -Wno-ignored-attributes & Removed unused functions (#4916) --- cmake/TaichiCXXFlags.cmake | 18 ++++++++++ .../backends/interop/vulkan_cpu_interop.cpp | 1 - taichi/backends/vulkan/vulkan_program.cpp | 4 +-- taichi/backends/vulkan/vulkan_utils.cpp | 36 ------------------- 4 files changed, 20 insertions(+), 39 deletions(-) delete mode 100644 taichi/backends/vulkan/vulkan_utils.cpp diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index 79fe36770b646..da3ff8432982a 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -46,7 +46,25 @@ else() message("Invalid compiler ${CMAKE_CXX_COMPILER_ID} detected.") message(FATAL_ERROR "clang and MSVC are the only supported compilers for Taichi compiler development. 
Consider using 'cmake -DCMAKE_CXX_COMPILER=clang' if you are on Linux.") endif() + + # [Global] CXX compilation option to enable all warnings. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall ") + + # [Global] By default, CXX compiler will throw a warning if it decides to ignore an attribute, for example "[[ maybe unused ]]". + # However, this behaviour diverges across different compilers (GCC/CLANG), as well as different compiler versions. + # Therefore we disable such warnings for now. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ignored-attributes ") + + # [Global] Clang warns if a C++ pointer's nullability wasn't marked explicitly (__nonnull, nullable, ...). + # Nullability seems to be a clang-specific feature, thus we disable this warning. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nullability-completeness ") + + # [Global] Disable warning for unused-private-field for convenience in development. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-private-field ") + + # [Global] By evaluating "constexpr", compiler throws a warning for functions known to be dead at compile time. + # However, some of these "constexpr" are debug flags and will be manually enabled upon debuging. 
+ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unneeded-internal-declaration ") endif () message("Building for processor ${CMAKE_SYSTEM_PROCESSOR}") diff --git a/taichi/backends/interop/vulkan_cpu_interop.cpp b/taichi/backends/interop/vulkan_cpu_interop.cpp index 11359dcc2b8bb..126d9eca7e256 100644 --- a/taichi/backends/interop/vulkan_cpu_interop.cpp +++ b/taichi/backends/interop/vulkan_cpu_interop.cpp @@ -22,7 +22,6 @@ void memcpy_cpu_to_vulkan_via_staging(DevicePtr dst, VulkanDevice *vk_dev = dynamic_cast(dst.device); CpuDevice *cpu_dev = dynamic_cast(src.device); - DeviceAllocation dst_alloc(dst); DeviceAllocation src_alloc(src); CpuDevice::AllocInfo src_alloc_info = cpu_dev->get_alloc_info(src_alloc); diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp index 81184bdd56722..63511189d99fb 100644 --- a/taichi/backends/vulkan/vulkan_program.cpp +++ b/taichi/backends/vulkan/vulkan_program.cpp @@ -71,8 +71,8 @@ FunctionType compile_to_executable(Kernel *kernel, VkRuntime *runtime, SNodeTreeManager *snode_tree_mgr) { auto handle = runtime->register_taichi_kernel( - std::move(run_codegen(kernel, runtime->get_ti_device(), - snode_tree_mgr->get_compiled_structs()))); + run_codegen(kernel, runtime->get_ti_device(), + snode_tree_mgr->get_compiled_structs())); return [runtime, handle](RuntimeContext &ctx) { runtime->launch_kernel(handle, &ctx); }; diff --git a/taichi/backends/vulkan/vulkan_utils.cpp b/taichi/backends/vulkan/vulkan_utils.cpp deleted file mode 100644 index e54133ce0757a..0000000000000 --- a/taichi/backends/vulkan/vulkan_utils.cpp +++ /dev/null @@ -1,36 +0,0 @@ -#include "taichi/backends/vulkan/vulkan_utils.h" - -#include - -namespace taichi { -namespace lang { -namespace vulkan { -namespace { - -std::vector GetInstanceExtensionProperties() { - constexpr char *kNoLayerName = nullptr; - uint32_t count = 0; - vkEnumerateInstanceExtensionProperties(kNoLayerName, &count, nullptr); - std::vector extensions(count); - 
vkEnumerateInstanceExtensionProperties(kNoLayerName, &count, - extensions.data()); - return extensions; -} - -std::vector GetDeviceExtensionProperties( - VkPhysicalDevice physicalDevice) { - constexpr char *kNoLayerName = nullptr; - uint32_t count = 0; - vkEnumerateDeviceExtensionProperties(physicalDevice, kNoLayerName, &count, - nullptr); - std::vector extensions(count); - vkEnumerateDeviceExtensionProperties(physicalDevice, kNoLayerName, &count, - extensions.data()); - return extensions; -} - -} // namespace - -} // namespace vulkan -} // namespace lang -} // namespace taichi From 72a1517bee31da8c9eab8f8856cae87a32cd4ad3 Mon Sep 17 00:00:00 2001 From: Zeyu Li <47965866+GaleSeLee@users.noreply.github.com> Date: Sat, 7 May 2022 11:00:41 +0800 Subject: [PATCH 033/176] [SIMT] Add activemask warp intrinsics (#4918) * add activemask warp intrinsic add test function call del extra space unit-test print->assert * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/simt/warp.py | 5 +++-- tests/python/test_simt.py | 14 ++++++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index d8bd7424201cb..21a45329e0810 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -98,8 +98,9 @@ def match_all(): def active_mask(): - # TODO - pass + return expr.Expr( + _ti_core.insert_internal_func_call("cuda_active_mask", + expr.make_expr_group(), False)) def sync(mask): diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 5730bd7d41293..2d7204b509d54 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -251,8 +251,18 @@ def test_match_all(): @test_utils.test(arch=ti.cuda) def test_active_mask(): - # TODO - pass + a = ti.field(dtype=ti.u32, shape=32) + + @ti.kernel + def 
foo(): + ti.loop_config(block_dim=16) + for i in range(32): + a[i] = ti.simt.warp.active_mask() + + foo() + + for i in range(32): + assert a[i] == 65535 @test_utils.test(arch=ti.cuda) From 7324903172c69382cb3e065b6634af35d938fd3b Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Sat, 7 May 2022 12:43:02 +0800 Subject: [PATCH 034/176] [build] Warning Suppression PR #3: Eliminate warnings from third-party headers (#4920) * [build] Warning Suppression PR #1: Turned on -Wno-ignored-attributes & Removed unused functions * [build] Warning Suppression PR #2: Eliminate warnings from third-party headers * Fixed an warning with enum comparison --- cmake/TaichiCore.cmake | 4 +++- taichi/backends/vulkan/vulkan_device.cpp | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 3096832be40c7..1a41bc2020552 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -379,7 +379,9 @@ if (TI_WITH_VULKAN) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Headers/include) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Reflect) - target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/VulkanMemoryAllocator/include) + + # By specifying SYSTEM, we suppressed the warnings from third-party headers. 
+ target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMemoryAllocator/include) if (LINUX) # shaderc requires pthread diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index 62b1b4fe6b865..a739758e69f88 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -278,7 +278,7 @@ void VulkanPipeline::create_descriptor_set_layout(const Params ¶ms) { for (auto var : variables) { // We want to remove auxiliary outputs such as frag depth - if (var->built_in == -1) { + if (static_cast(var->built_in) == -1) { render_target_count++; } } From 37c47b716f39e7d516bf30305238d86df1a7826c Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Sat, 7 May 2022 15:15:36 +0800 Subject: [PATCH 035/176] [build] Warning Suppression PR #4: Fixed warnings with MacOS (#4926) * [build] Warning Suppression PR #1: Turned on -Wno-ignored-attributes & Removed unused functions * [build] Warning Suppression PR #2: Eliminate warnings from third-party headers * Fixed an warning with enum comparison * [build] Warning Suppression PR #4: Fixed Mac-specific warnings --- taichi/backends/metal/codegen_metal.cpp | 2 +- taichi/backends/metal/device.cpp | 7 +++++++ taichi/backends/metal/kernel_manager.cpp | 4 ++-- taichi/backends/metal/metal_program.cpp | 2 +- taichi/backends/opengl/codegen_opengl.cpp | 2 +- taichi/transforms/make_mesh_block_local.cpp | 2 +- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index 14bc796597aa1..b6b5c0c3c0937 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ -190,7 +190,7 @@ class KernelCodegenImpl : public IRVisitor { for (int i = 0; i < compiled_snode_trees_.size(); ++i) { const auto &cst = compiled_snode_trees_[i]; - for (const auto [node_id, _] : cst.snode_descriptors) { + for (const auto &[node_id, _] : 
cst.snode_descriptors) { RootInfo ri{}; ri.snode_id = cst.root_id; ri.index_in_cst = i; diff --git a/taichi/backends/metal/device.cpp b/taichi/backends/metal/device.cpp index aa75d61847625..26b363966f7af 100644 --- a/taichi/backends/metal/device.cpp +++ b/taichi/backends/metal/device.cpp @@ -245,6 +245,11 @@ class StreamImpl : public Stream { const std::vector &wait_semaphores) override { auto *cb = static_cast(cmdlist)->command_buffer(); commit_command_buffer(cb); + + // FIXME: Implement semaphore mechanism for Metal backend + // and return the actual semaphore corresponding to the submitted + // cmds. + return nullptr; } StreamSemaphore submit_synced( CommandList *cmdlist, @@ -252,6 +257,8 @@ class StreamImpl : public Stream { auto *cb = static_cast(cmdlist)->command_buffer(); commit_command_buffer(cb); wait_until_completed(cb); + + return nullptr; } void command_sync() override { diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index 96863e178b973..9f3524cc35eed 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -188,7 +188,7 @@ class UserMtlKernel : public CompiledMtlKernelBase { // 0 is valid for |num_threads|! 
TI_ASSERT(kernel_attribs_.advisory_total_num_threads >= 0); BindBuffers buffers; - for (const auto b : kernel_attribs_.buffers) { + for (const auto &b : kernel_attribs_.buffers) { buffers.push_back({input_buffers.find(b)->second, b}); } launch_if_not_empty(std::move(buffers), command_buffer); @@ -215,7 +215,7 @@ class SparseRuntimeMtlKernelBase : public CompiledMtlKernelBase { void launch(InputBuffersMap &input_buffers, MTLCommandBuffer *command_buffer) override { BindBuffers buffers; - for (const auto b : kernel_attribs_.buffers) { + for (const auto &b : kernel_attribs_.buffers) { if (b.type() == BufferDescriptor::Type::Context) { buffers.push_back({args_buffer_.get(), b}); } else { diff --git a/taichi/backends/metal/metal_program.cpp b/taichi/backends/metal/metal_program.cpp index 209aeefef3f52..e7eb52ddf166a 100644 --- a/taichi/backends/metal/metal_program.cpp +++ b/taichi/backends/metal/metal_program.cpp @@ -11,7 +11,7 @@ namespace { std::unordered_set find_all_dense_snodes( const metal::SNodeDescriptorsMap &snodes_map) { std::unordered_set res; - for (const auto [_, desc] : snodes_map) { + for (const auto &[_, desc] : snodes_map) { const auto *sn = desc.snode; if (sn->type == SNodeType::dense) { res.insert(sn); diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index 7f232b26315b5..dbc762574c88a 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -224,7 +224,7 @@ class KernelGen : public IRVisitor { kernel_header += shaders::kOpenGlAtomicF32Source_gtmp; } std::unordered_set arr_ids; - for ([[maybe_unused]] const auto [arr_id, bind_idx] : + for ([[maybe_unused]] const auto &[arr_id, bind_idx] : used.arr_arg_to_bind_idx) { arr_ids.insert(arr_id); } diff --git a/taichi/transforms/make_mesh_block_local.cpp b/taichi/transforms/make_mesh_block_local.cpp index 86480250a75cd..c0add06662531 100644 --- a/taichi/transforms/make_mesh_block_local.cpp +++ 
b/taichi/transforms/make_mesh_block_local.cpp @@ -441,7 +441,7 @@ MakeMeshBlockLocal::MakeMeshBlockLocal(OffloadedStmt *offload, TI_TRACE("available cache attributes bytes = {}", available_bytes); TI_TRACE("caches size = {}", caches->caches.size()); std::vector priority_caches; - for (const auto [snode, cache] : caches->caches) { + for (const auto &[snode, cache] : caches->caches) { priority_caches.push_back(cache); } std::sort(priority_caches.begin(), priority_caches.end(), From 278b4beee59cc8fe329936895db5255fab3ea75b Mon Sep 17 00:00:00 2001 From: yekuang Date: Sat, 7 May 2022 18:19:36 +0800 Subject: [PATCH 036/176] [refactor] Simplify Matrix's initializer (#4923) * [refactor] Simplify Matrix's initializer * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update python/taichi/lang/matrix.py * Update python/taichi/lang/matrix.py Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/matrix.py | 182 ++++++++++++++++++++--------------- 1 file changed, 103 insertions(+), 79 deletions(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 8ff72b03dd0bf..2cd17d3064e7d 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -275,6 +275,98 @@ def helper(x): return [_ for _ in range(start, stop, step)] +class _MatrixEntriesInitializer: + def pyscope_or_ref(self, arr): + raise NotImplementedError('Override') + + def no_dynamic_index(self, arr, dt): + raise NotImplementedError('Override') + + def with_dynamic_index(self, arr, dt): + raise NotImplementedError('Override') + + def _get_entry_to_infer(self, arr): + raise NotImplementedError('Override') + + def infer_dt(self, arr): + entry = self._get_entry_to_infer(arr) + if isinstance(entry, (int, np.integer)): + return impl.get_runtime().default_ip + 
if isinstance(entry, float): + return impl.get_runtime().default_fp + if isinstance(entry, expr.Expr): + dt = entry.ptr.get_ret_type() + if dt == ti_core.DataType_unknown: + raise TypeError( + 'Element type of the matrix cannot be inferred. Please set dt instead for now.' + ) + return dt + raise Exception( + 'dt required when using dynamic_index for local tensor') + + +def _make_entries_initializer(is_matrix: bool) -> _MatrixEntriesInitializer: + class _VecImpl(_MatrixEntriesInitializer): + def pyscope_or_ref(self, arr): + return [[x] for x in arr] + + def no_dynamic_index(self, arr, dt): + return [[impl.expr_init(ops_mod.cast(x, dt) if dt else x)] + for x in arr] + + def with_dynamic_index(self, arr, dt): + local_tensor_proxy = impl.expr_init_local_tensor( + [len(arr)], dt, + expr.make_expr_group([expr.Expr(x) for x in arr])) + dynamic_index_stride = 1 + mat = [] + for i in range(len(arr)): + mat.append( + list([ + impl.make_tensor_element_expr( + local_tensor_proxy, + (expr.Expr(i, dtype=primitive_types.i32), ), + (len(arr), ), dynamic_index_stride) + ])) + return local_tensor_proxy, dynamic_index_stride, mat + + def _get_entry_to_infer(self, arr): + return arr[0] + + class _MatImpl(_MatrixEntriesInitializer): + def pyscope_or_ref(self, arr): + return [list(row) for row in arr] + + def no_dynamic_index(self, arr, dt): + return [[ + impl.expr_init(ops_mod.cast(x, dt) if dt else x) for x in row + ] for row in arr] + + def with_dynamic_index(self, arr, dt): + local_tensor_proxy = impl.expr_init_local_tensor( + [len(arr), len(arr[0])], dt, + expr.make_expr_group( + [expr.Expr(x) for row in arr for x in row])) + + dynamic_index_stride = 1 + mat = [] + for i in range(len(arr)): + mat.append([]) + for j in range(len(arr[0])): + mat[i].append( + impl.make_tensor_element_expr( + local_tensor_proxy, + (expr.Expr(i, dtype=primitive_types.i32), + expr.Expr(j, dtype=primitive_types.i32)), + (len(arr), len(arr[0])), dynamic_index_stride)) + return local_tensor_proxy, 
dynamic_index_stride, mat + + def _get_entry_to_infer(self, arr): + return arr[0][0] + + return _MatImpl() if is_matrix else _VecImpl() + + @_gen_swizzles class Matrix(TaichiOperations): """The matrix class. @@ -331,55 +423,14 @@ def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): mat = [] elif isinstance(arr[0], Matrix): raise Exception('cols/rows required when using list of vectors') - elif not isinstance(arr[0], Iterable): # now init a Vector - if in_python_scope() or is_ref: - mat = [[x] for x in arr] - elif not impl.current_cfg().dynamic_index: - mat = [[impl.expr_init(ops_mod.cast(x, dt) if dt else x)] - for x in arr] - else: - if not ti_core.is_extension_supported( - impl.current_cfg().arch, - ti_core.Extension.dynamic_index): - raise Exception( - f"Backend {impl.current_cfg().arch} doesn't support dynamic index" - ) - if dt is None: - if isinstance(arr[0], (int, np.integer)): - dt = impl.get_runtime().default_ip - elif isinstance(arr[0], float): - dt = impl.get_runtime().default_fp - elif isinstance(arr[0], expr.Expr): - dt = arr[0].ptr.get_ret_type() - if dt == ti_core.DataType_unknown: - raise TypeError( - 'Element type of the matrix cannot be inferred. Please set dt instead for now.' 
- ) - else: - raise Exception( - 'dt required when using dynamic_index for local tensor' - ) - local_tensor_proxy = impl.expr_init_local_tensor( - [len(arr)], dt, - expr.make_expr_group([expr.Expr(x) for x in arr])) - dynamic_index_stride = 1 - mat = [] - for i in range(len(arr)): - mat.append( - list([ - impl.make_tensor_element_expr( - local_tensor_proxy, - (expr.Expr(i, dtype=primitive_types.i32), ), - (len(arr), ), dynamic_index_stride) - ])) - else: # now init a Matrix + else: + is_matrix = isinstance(arr[0], Iterable) + initializer = _make_entries_initializer(is_matrix) + if in_python_scope() or is_ref: - mat = [list(row) for row in arr] + mat = initializer.pyscope_or_ref(arr) elif not impl.current_cfg().dynamic_index: - mat = [[ - impl.expr_init(ops_mod.cast(x, dt) if dt else x) - for x in row - ] for row in arr] + mat = initializer.no_dynamic_index(arr, dt) else: if not ti_core.is_extension_supported( impl.current_cfg().arch, @@ -388,40 +439,13 @@ def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): f"Backend {impl.current_cfg().arch} doesn't support dynamic index" ) if dt is None: - if isinstance(arr[0][0], (int, np.integer)): - dt = impl.get_runtime().default_ip - elif isinstance(arr[0][0], float): - dt = impl.get_runtime().default_fp - elif isinstance(arr[0][0], expr.Expr): - dt = arr[0][0].ptr.get_ret_type() - if dt == ti_core.DataType_unknown: - raise TypeError( - 'Element type of the matrix cannot be inferred. Please set dt instead for now.' 
- ) - else: - raise Exception( - 'dt required when using dynamic_index for local tensor' - ) - local_tensor_proxy = impl.expr_init_local_tensor( - [len(arr), len(arr[0])], dt, - expr.make_expr_group( - [expr.Expr(x) for row in arr for x in row])) - dynamic_index_stride = 1 - mat = [] - for i in range(len(arr)): - mat.append([]) - for j in range(len(arr[0])): - mat[i].append( - impl.make_tensor_element_expr( - local_tensor_proxy, - (expr.Expr(i, dtype=primitive_types.i32), - expr.Expr(j, dtype=primitive_types.i32)), - (len(arr), len(arr[0])), dynamic_index_stride)) - self.n = len(mat) + dt = initializer.infer_dt(arr) + local_tensor_proxy, dynamic_index_stride, mat = initializer.with_dynamic_index( + arr, dt) + + self.n, self.m = len(mat), 1 if len(mat) > 0: self.m = len(mat[0]) - else: - self.m = 1 entries = [x for row in mat for x in row] if self.n * self.m > 32 and not suppress_warning: From b485bde4dfb9ede34319f927ffd2fd797912b424 Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Sun, 8 May 2022 10:39:25 +0800 Subject: [PATCH 037/176] [Doc] Updated relative path (#4929) * Update field.md * Updated one broken link. --- docs/lang/articles/basic/field.md | 2 +- docs/lang/articles/contribution/contributor_guide.md | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/lang/articles/basic/field.md b/docs/lang/articles/basic/field.md index 8fdd4fd6a94b7..38ba15c7537f4 100644 --- a/docs/lang/articles/basic/field.md +++ b/docs/lang/articles/basic/field.md @@ -74,7 +74,7 @@ while gui.running: ``` :::tip -With Taichi versions earlier than v0.8.0, you cannot allocate new fields after executing a kernel. Starting from v0.8.0, you can use the `FieldsBuilder` class to dynamically allocate or destruct fields. See the [Field (advanced)](./advanced/layout.md) for more information. +With Taichi versions earlier than v0.8.0, you cannot allocate new fields after executing a kernel. 
Starting from v0.8.0, you can use the `FieldsBuilder` class to dynamically allocate or destruct fields. See the [Field (advanced)](../advanced/layout.md) for more information. ::: :::caution WARNING diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/lang/articles/contribution/contributor_guide.md index 04f16bbaa64f4..72f935e960461 100644 --- a/docs/lang/articles/contribution/contributor_guide.md +++ b/docs/lang/articles/contribution/contributor_guide.md @@ -85,16 +85,16 @@ Except for minor updates, most PRs start from a developer taking over an issue. As part of the effort to increase visibility of the community and to improve developer experience, we highly recommend including documentation updates in your PR if applicable. Here are some of the documentation-specific references and tips: - Documentation source files are hosted under [docs/](https://github.com/taichi-dev/taichi/blob/master/docs/). -- We use GitHub Flavored Markdown (GFM) and [Docusaurus](https://docusaurus.io/) to build our documentation site. For information on the supported Markdown syntax, see the [Documentation Writing Guide](./doc_writing). +- We use GitHub Flavored Markdown (GFM) and [Docusaurus](https://docusaurus.io/) to build our documentation site. For information on the supported Markdown syntax, see the [Documentation Writing Guide](./doc_writing.md). - When it comes to writing, we adhere to the [Google Developer Documentation Style Guide](https://developers.google.com/style/). -- For instructions on setting up a local server and previewing your updated documentation in real-time, see the [Local Development](https://github.com/taichi-dev/docs.taichi-lang.org#local-development). +- For instructions on setting up a local server and previewing your updated documentation in real-time, see the [Local Development](https://github.com/taichi-dev/docs.taichi.graphics#local-development). 
## Add test cases for your local changes If your PR is to implement a new feature, we recommend that you write your own test cases to cover corner cases for your codes before filing a PR. -- To write a Python test case, see the [Workflow for writing a Python test](./write_test). -- To write a C++ test case, see the [Workflow for writing a C++ test](./writing_cpp_tests). +- To write a Python test case, see the [Workflow for writing a Python test](./write_test.md). +- To write a C++ test case, see the [Workflow for writing a C++ test](./writing_cpp_tests.md). ## Conduct style checks and integration tests locally @@ -136,7 +136,7 @@ No problem, the CI bot will run the code checkers and format your codes automati -> For more style information for your C++ code, see [our C++ style](./cpp_style). +> For more style information for your C++ code, see [our C++ style](./cpp_style.md). ### Run integration tests From 736ebd54b534d036e906124e1de74e6f6aa72193 Mon Sep 17 00:00:00 2001 From: pengyu <6712304+FantasyVR@users.noreply.github.com> Date: Mon, 9 May 2022 11:01:45 +0800 Subject: [PATCH 038/176] [Lang] Support sparse matrix datatype and storage format configuration (#4673) * Add sparse matrix datatype configuration * create sparse matrix with datatype in Python * sparse solver takes as sparse matrix with datatype parameters * operator overloading with bug * fix operator overloading bugs * Add more operator overloading functions * EigenSparseMatrix operator overloading * improve * Clang-tidy * add more datatype EigenSparseMatrix * get/set element bug fix * Bugfix:sparse matrix shape configuration * improve sparse matrix test cases * Update tests/python/test_sparse_matrix.py Co-authored-by: Yi Xu * improve * Update taichi/program/sparse_matrix.h Co-authored-by: Yi Xu Co-authored-by: Yi Xu Co-authored-by: taichiCourse01 --- misc/sparse_matrix.py | 60 +++++---- python/taichi/linalg/sparse_matrix.py | 41 +++++- taichi/program/sparse_matrix.cpp | 152 +++++++++++----------- 
taichi/program/sparse_matrix.h | 176 ++++++++++++++++++++++---- taichi/program/sparse_solver.cpp | 43 ++++--- taichi/program/sparse_solver.h | 2 +- taichi/python/export_lang.cpp | 63 ++++++--- tests/python/test_sparse_matrix.py | 135 ++++++++++++++------ tests/python/test_spmv.py | 38 +++++- 9 files changed, 496 insertions(+), 214 deletions(-) diff --git a/misc/sparse_matrix.py b/misc/sparse_matrix.py index a56054f7aedcd..f96748104e618 100644 --- a/misc/sparse_matrix.py +++ b/misc/sparse_matrix.py @@ -4,8 +4,16 @@ n = 8 -K = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) -f = ti.linalg.SparseMatrixBuilder(n, 1, max_num_triplets=100) +K = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=ti.f32, + storage_format='col_major') +f = ti.linalg.SparseMatrixBuilder(n, + 1, + max_num_triplets=100, + dtype=ti.f32, + storage_format='col_major') @ti.kernel @@ -13,8 +21,8 @@ def fill(A: ti.types.sparse_matrix_builder(), b: ti.types.sparse_matrix_builder(), interval: ti.i32): for i in range(n): if i > 0: - A[i - 1, i] += -1.0 - A[i, i] += 1 + A[i - 1, i] += -2.0 + A[i, i] += 1.0 if i < n - 1: A[i + 1, i] += -1.0 A[i, i] += 1.0 @@ -33,32 +41,36 @@ def fill(A: ti.types.sparse_matrix_builder(), print(">>>> A = K.build()") print(A) -print(">>>> Summation: C = A + A") -C = A + A +print(">>>> Summation: B = A + A") +B = A + A +print(B) + +print(">>>> Summation: B += A") +B += A +print(B) + +print(">>>> Subtraction: C = B - A") +C = B - A +print(C) + +print(">>>> Subtraction: C -= A") +C -= A print(C) -print(">>>> Subtraction: D = A - A") -D = A - A +print(">>>> Multiplication with a scalar on the right: D = A * 3.0") +D = A * 3.0 print(D) -print(">>>> Multiplication with a scalar on the right: E = A * 3.0") -E = A * 3.0 -print(E) +print(">>>> Multiplication with a scalar on the left: D = 3.0 * A") +D = 3.0 * A +print(D) -print(">>>> Multiplication with a scalar on the left: E = 3.0 * A") -E = 3.0 * A +print(">>>> Transpose: E = D.transpose()") +E = 
D.transpose() print(E) -print(">>>> Transpose: F = A.transpose()") -F = A.transpose() +print(">>>> Matrix multiplication: F= E @ A") +F = E @ A print(F) -print(">>>> Matrix multiplication: G = E @ A") -G = E @ A -print(G) - -print(">>>> Element-wise multiplication: H = E * A") -H = E * A -print(H) - -print(f">>>> Element Access: A[0,0] = {A[0,0]}") +print(f">>>> Element Access: F[0,0] = {F[0,0]}") diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py index 928aec91d1382..40d87cd796f3a 100644 --- a/python/taichi/linalg/sparse_matrix.py +++ b/python/taichi/linalg/sparse_matrix.py @@ -15,16 +15,32 @@ class SparseMatrix: m (int): the second dimension of a sparse matrix. sm (SparseMatrix): another sparse matrix that will be built from. """ - def __init__(self, n=None, m=None, sm=None, dtype=f32): + def __init__(self, + n=None, + m=None, + sm=None, + dtype=f32, + storage_format="col_major"): if sm is None: self.n = n self.m = m if m else n - self.matrix = get_runtime().prog.create_sparse_matrix(n, m) + self.matrix = get_runtime().prog.create_sparse_matrix( + n, m, dtype, storage_format) else: self.n = sm.num_rows() self.m = sm.num_cols() self.matrix = sm + def __iadd__(self, other): + """Addition operation for sparse matrix. + + Returns: + The result sparse matrix of the addition. + """ + assert self.n == other.n and self.m == other.m, f"Dimension mismatch between sparse matrices ({self.n}, {self.m}) and ({other.n}, {other.m})" + self.matrix += other.matrix + return self + def __add__(self, other): """Addition operation for sparse matrix. @@ -35,6 +51,16 @@ def __add__(self, other): sm = self.matrix + other.matrix return SparseMatrix(sm=sm) + def __isub__(self, other): + """Subtraction operation for sparse matrix. + + Returns: + The result sparse matrix of the subtraction. 
+ """ + assert self.n == other.n and self.m == other.m, f"Dimension mismatch between sparse matrices ({self.n}, {self.m}) and ({other.n}, {other.m})" + self.matrix -= other.matrix + return self + def __sub__(self, other): """Subtraction operation for sparse matrix. @@ -54,7 +80,7 @@ def __mul__(self, other): The result of multiplication. """ if isinstance(other, float): - sm = self.matrix * other + sm = other * self.matrix return SparseMatrix(sm=sm) if isinstance(other, SparseMatrix): assert self.n == other.n and self.m == other.m, f"Dimension mismatch between sparse matrices ({self.n}, {self.m}) and ({other.n}, {other.m})" @@ -72,7 +98,7 @@ def __rmul__(self, other): The result of multiplication. """ if isinstance(other, float): - sm = other * self.matrix + sm = self.matrix * other return SparseMatrix(sm=sm) return None @@ -135,18 +161,21 @@ class SparseMatrixBuilder: num_rows (int): the first dimension of a sparse matrix. num_cols (int): the second dimension of a sparse matrix. max_num_triplets (int): the maximum number of triplets. + dtype (ti.dtype): the data type of the sparse matrix. + storage_format (str): the storage format of the sparse matrix. 
""" def __init__(self, num_rows=None, num_cols=None, max_num_triplets=0, - dtype=f32): + dtype=f32, + storage_format="col_major"): self.num_rows = num_rows self.num_cols = num_cols if num_cols else num_rows self.dtype = dtype if num_rows is not None: self.ptr = get_runtime().prog.create_sparse_matrix_builder( - num_rows, num_cols, max_num_triplets, dtype) + num_rows, num_cols, max_num_triplets, dtype, storage_format) def _get_addr(self): """Get the address of the sparse matrix""" diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index 4d2a26f32ce41..1952b1b078bf3 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -1,21 +1,53 @@ #include "taichi/program/sparse_matrix.h" #include +#include +#include +#include #include "Eigen/Dense" #include "Eigen/SparseLU" +#define BUILD(TYPE) \ + { \ + using T = Eigen::Triplet; \ + std::vector *triplets = static_cast *>(triplets_adr); \ + matrix_.setFromTriplets(triplets->begin(), triplets->end()); \ + } + +#define MAKE_MATRIX(TYPE, STORAGE) \ + { \ + Pair("f" #TYPE, #STORAGE), \ + [](int rows, int cols, DataType dt) -> std::unique_ptr { \ + using FC = Eigen::SparseMatrix; \ + return std::make_unique>(rows, cols, dt); \ + } \ + } + +namespace { +using Pair = std::pair; +struct key_hash { + std::size_t operator()(const Pair &k) const { + auto h1 = std::hash{}(k.first); + auto h2 = std::hash{}(k.second); + return h1 ^ h2; + } +}; +} // namespace + namespace taichi { namespace lang { SparseMatrixBuilder::SparseMatrixBuilder(int rows, int cols, int max_num_triplets, - DataType dtype) + DataType dtype, + const std::string &storage_format) : rows_(rows), cols_(cols), max_num_triplets_(max_num_triplets), - dtype_(dtype) { + dtype_(dtype), + storage_format_(storage_format) { auto element_size = data_type_size(dtype); TI_ASSERT((element_size == 4 || element_size == 8)); data_base_ptr_ = @@ -50,7 +82,7 @@ void SparseMatrixBuilder::print_triplets() { } template -SparseMatrix 
SparseMatrixBuilder::build_template() { +void SparseMatrixBuilder::build_template(std::unique_ptr &m) { using V = Eigen::Triplet; std::vector triplets; T *data = reinterpret_cast(data_base_ptr_.get()); @@ -58,25 +90,27 @@ SparseMatrix SparseMatrixBuilder::build_template() { triplets.push_back(V(((G *)data)[i * 3], ((G *)data)[i * 3 + 1], taichi_union_cast(data[i * 3 + 2]))); } - SparseMatrix sm(rows_, cols_); - sm.get_matrix().setFromTriplets(triplets.begin(), triplets.end()); + m->build_triplets(static_cast(&triplets)); clear(); - return sm; } -SparseMatrix SparseMatrixBuilder::build() { +std::unique_ptr SparseMatrixBuilder::build() { TI_ASSERT(built_ == false); built_ = true; + auto sm = make_sparse_matrix(rows_, cols_, dtype_, storage_format_); auto element_size = data_type_size(dtype_); switch (element_size) { case 4: - return build_template(); + build_template(sm); + break; case 8: - return build_template(); + build_template(sm); + break; default: TI_ERROR("Unsupported sparse matrix data type!"); break; } + return sm; } void SparseMatrixBuilder::clear() { @@ -84,82 +118,48 @@ void SparseMatrixBuilder::clear() { num_triplets_ = 0; } -SparseMatrix::SparseMatrix(Eigen::SparseMatrix &matrix) { - this->matrix_ = matrix; -} - -SparseMatrix::SparseMatrix(int rows, int cols) : matrix_(rows, cols) { -} - -const std::string SparseMatrix::to_string() const { +template +const std::string EigenSparseMatrix::to_string() const { Eigen::IOFormat clean_fmt(4, 0, ", ", "\n", "[", "]"); // Note that the code below first converts the sparse matrix into a dense one. 
// https://stackoverflow.com/questions/38553335/how-can-i-print-in-console-a-formatted-sparse-matrix-with-eigen std::ostringstream ostr; - ostr << Eigen::MatrixXf(matrix_).format(clean_fmt); + ostr << Eigen::MatrixXf(matrix_.template cast()).format(clean_fmt); return ostr.str(); } -const int SparseMatrix::num_rows() const { - return matrix_.rows(); -} -const int SparseMatrix::num_cols() const { - return matrix_.cols(); -} - -Eigen::SparseMatrix &SparseMatrix::get_matrix() { - return matrix_; -} - -const Eigen::SparseMatrix &SparseMatrix::get_matrix() const { - return matrix_; -} - -SparseMatrix operator+(const SparseMatrix &sm1, const SparseMatrix &sm2) { - Eigen::SparseMatrix res(sm1.matrix_ + sm2.matrix_); - return SparseMatrix(res); -} - -SparseMatrix operator-(const SparseMatrix &sm1, const SparseMatrix &sm2) { - Eigen::SparseMatrix res(sm1.matrix_ - sm2.matrix_); - return SparseMatrix(res); -} - -SparseMatrix operator*(float scale, const SparseMatrix &sm) { - Eigen::SparseMatrix res(scale * sm.matrix_); - return SparseMatrix(res); -} - -SparseMatrix operator*(const SparseMatrix &sm, float scale) { - return scale * sm; -} - -SparseMatrix operator*(const SparseMatrix &sm1, const SparseMatrix &sm2) { - Eigen::SparseMatrix res(sm1.matrix_.cwiseProduct(sm2.matrix_)); - return SparseMatrix(res); -} - -SparseMatrix SparseMatrix::matmul(const SparseMatrix &sm) { - Eigen::SparseMatrix res(matrix_ * sm.matrix_); - return SparseMatrix(res); -} - -Eigen::VectorXf SparseMatrix::mat_vec_mul( - const Eigen::Ref &b) { - return matrix_ * b; -} - -SparseMatrix SparseMatrix::transpose() { - Eigen::SparseMatrix res(matrix_.transpose()); - return SparseMatrix(res); -} - -float32 SparseMatrix::get_element(int row, int col) { - return matrix_.coeff(row, col); +template +void EigenSparseMatrix::build_triplets(void *triplets_adr) { + std::string sdtype = taichi::lang::data_type_name(dtype_); + if (sdtype == "f32") { + BUILD(32) + } else if (sdtype == "f64") { + BUILD(64) + } else { + 
TI_ERROR("Unsupported sparse matrix data type {}!", sdtype); + } } -void SparseMatrix::set_element(int row, int col, float32 value) { - matrix_.coeffRef(row, col) = value; +std::unique_ptr make_sparse_matrix( + int rows, + int cols, + DataType dt, + const std::string &storage_format = "col_major") { + using func_type = std::unique_ptr (*)(int, int, DataType); + static const std::unordered_map map = { + MAKE_MATRIX(32, ColMajor), MAKE_MATRIX(32, RowMajor), + MAKE_MATRIX(64, ColMajor), MAKE_MATRIX(64, RowMajor)}; + std::unordered_map format_map = { + {"col_major", "ColMajor"}, {"row_major", "RowMajor"}}; + std::string tdt = taichi::lang::data_type_name(dt); + Pair key = std::make_pair(tdt, format_map.at(storage_format)); + auto it = map.find(key); + if (it != map.end()) { + auto func = map.at(key); + return func(rows, cols, dt); + } else + TI_ERROR("Unsupported sparse matrix data type: {}, storage format: {}", tdt, + storage_format); } } // namespace lang diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index f5baa62539912..5aedfd1b1d7b2 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -13,11 +13,15 @@ class SparseMatrix; class SparseMatrixBuilder { public: - SparseMatrixBuilder(int rows, int cols, int max_num_triplets, DataType dtype); + SparseMatrixBuilder(int rows, + int cols, + int max_num_triplets, + DataType dtype, + const std::string &storage_format); void print_triplets(); - SparseMatrix build(); + std::unique_ptr build(); void clear(); @@ -26,7 +30,7 @@ class SparseMatrixBuilder { void print_template(); template - SparseMatrix build_template(); + void build_template(std::unique_ptr &); private: uint64 num_triplets_{0}; @@ -36,37 +40,153 @@ class SparseMatrixBuilder { uint64 max_num_triplets_{0}; bool built_{false}; DataType dtype_{PrimitiveType::f32}; + std::string storage_format_{"col_major"}; }; class SparseMatrix { public: - SparseMatrix() = delete; - SparseMatrix(int rows, int cols); - 
SparseMatrix(Eigen::SparseMatrix &matrix); - - const int num_rows() const; - const int num_cols() const; - const std::string to_string() const; - Eigen::SparseMatrix &get_matrix(); - const Eigen::SparseMatrix &get_matrix() const; - float32 get_element(int row, int col); - void set_element(int row, int col, float32 value); - - friend SparseMatrix operator+(const SparseMatrix &sm1, - const SparseMatrix &sm2); - friend SparseMatrix operator-(const SparseMatrix &sm1, - const SparseMatrix &sm2); - friend SparseMatrix operator*(float scale, const SparseMatrix &sm); - friend SparseMatrix operator*(const SparseMatrix &sm, float scale); - friend SparseMatrix operator*(const SparseMatrix &sm1, - const SparseMatrix &sm2); - SparseMatrix matmul(const SparseMatrix &sm); - Eigen::VectorXf mat_vec_mul(const Eigen::Ref &b); - - SparseMatrix transpose(); + SparseMatrix() : rows_(0), cols_(0), dtype_(PrimitiveType::f32){}; + SparseMatrix(int rows, int cols, DataType dt = PrimitiveType::f32) + : rows_{rows}, cols_(cols), dtype_(dt){}; + SparseMatrix(SparseMatrix &sm) + : rows_(sm.rows_), cols_(sm.cols_), dtype_(sm.dtype_) { + } + SparseMatrix(SparseMatrix &&sm) + : rows_(sm.rows_), cols_(sm.cols_), dtype_(sm.dtype_) { + } + virtual ~SparseMatrix() = default; + + virtual void build_triplets(void *triplets_adr){}; + + inline const int num_rows() const { + return rows_; + } + + inline const int num_cols() const { + return cols_; + } + + virtual const std::string to_string() const { + return nullptr; + } + + virtual const void *get_matrix() const { + return nullptr; + } + + template + T get_element(int row, int col) { + std::cout << "get_element not implemented" << std::endl; + return 0; + } + + template + void set_element(int row, int col, T value) { + std::cout << "set_element not implemented" << std::endl; + return; + } + + protected: + int rows_{0}; + int cols_{0}; + DataType dtype_{PrimitiveType::f32}; +}; + +template +class EigenSparseMatrix : public SparseMatrix { + public: + 
explicit EigenSparseMatrix(int rows, int cols, DataType dt) + : SparseMatrix(rows, cols, dt), matrix_(rows, cols) { + } + explicit EigenSparseMatrix(EigenSparseMatrix &sm) + : SparseMatrix(sm.num_rows(), sm.num_cols(), sm.dtype_), + matrix_(sm.matrix_) { + } + explicit EigenSparseMatrix(EigenSparseMatrix &&sm) + : SparseMatrix(sm.num_rows(), sm.num_cols(), sm.dtype_), + matrix_(sm.matrix_) { + } + explicit EigenSparseMatrix(const EigenMatrix &em) + : SparseMatrix(em.rows(), em.cols()), matrix_(em) { + } + + ~EigenSparseMatrix() override = default; + void build_triplets(void *triplets_adr) override; + const std::string to_string() const override; + + const void *get_matrix() const override { + return &matrix_; + }; + + virtual EigenSparseMatrix &operator+=(const EigenSparseMatrix &other) { + this->matrix_ += other.matrix_; + return *this; + }; + + friend EigenSparseMatrix operator+(const EigenSparseMatrix &lhs, + const EigenSparseMatrix &rhs) { + return EigenSparseMatrix(lhs.matrix_ + rhs.matrix_); + }; + + virtual EigenSparseMatrix &operator-=(const EigenSparseMatrix &other) { + this->matrix_ -= other.matrix_; + return *this; + } + + friend EigenSparseMatrix operator-(const EigenSparseMatrix &lhs, + const EigenSparseMatrix &rhs) { + return EigenSparseMatrix(lhs.matrix_ - rhs.matrix_); + }; + + virtual EigenSparseMatrix &operator*=(float scale) { + this->matrix_ *= scale; + return *this; + } + + friend EigenSparseMatrix operator*(const EigenSparseMatrix &sm, float scale) { + return EigenSparseMatrix(sm.matrix_ * scale); + } + + friend EigenSparseMatrix operator*(float scale, const EigenSparseMatrix &sm) { + return EigenSparseMatrix(sm.matrix_ * scale); + } + + friend EigenSparseMatrix operator*(const EigenSparseMatrix &lhs, + const EigenSparseMatrix &rhs) { + return EigenSparseMatrix(lhs.matrix_.cwiseProduct(rhs.matrix_)); + } + + EigenSparseMatrix transpose() { + return EigenSparseMatrix(matrix_.transpose()); + } + + EigenSparseMatrix matmul(const EigenSparseMatrix 
&sm) { + return EigenSparseMatrix(matrix_ * sm.matrix_); + } + + template + T get_element(int row, int col) { + return matrix_.coeff(row, col); + } + + template + void set_element(int row, int col, T value) { + matrix_.coeffRef(row, col) = value; + } + + template + VT mat_vec_mul(const Eigen::Ref &b) { + return matrix_ * b; + } private: - Eigen::SparseMatrix matrix_; + EigenMatrix matrix_; }; + +std::unique_ptr make_sparse_matrix( + int rows, + int cols, + DataType dt, + const std::string &storage_format); } // namespace lang } // namespace taichi diff --git a/taichi/program/sparse_solver.cpp b/taichi/program/sparse_solver.cpp index 77d0bf8981921..4743a511ced50 100644 --- a/taichi/program/sparse_solver.cpp +++ b/taichi/program/sparse_solver.cpp @@ -9,7 +9,8 @@ {#dt, #type, #order}, []() -> std::unique_ptr { \ using T = Eigen::Simplicial##type, Eigen::Lower, \ Eigen::order##Ordering>; \ - return std::make_unique>(); \ + return std::make_unique< \ + EigenSparseSolver>>(); \ } \ } @@ -28,32 +29,41 @@ struct key_hash { namespace taichi { namespace lang { -template -bool EigenSparseSolver::compute(const SparseMatrix &sm) { - solver_.compute(sm.get_matrix()); +#define GET_EM(sm) \ + const EigenMatrix *mat = (const EigenMatrix *)(sm.get_matrix()); + +template +bool EigenSparseSolver::compute( + const SparseMatrix &sm) { + GET_EM(sm); + solver_.compute(*mat); if (solver_.info() != Eigen::Success) { return false; } else return true; } -template -void EigenSparseSolver::analyze_pattern(const SparseMatrix &sm) { - solver_.analyzePattern(sm.get_matrix()); +template +void EigenSparseSolver::analyze_pattern( + const SparseMatrix &sm) { + GET_EM(sm); + solver_.analyzePattern(*mat); } -template -void EigenSparseSolver::factorize(const SparseMatrix &sm) { - solver_.factorize(sm.get_matrix()); +template +void EigenSparseSolver::factorize( + const SparseMatrix &sm) { + GET_EM(sm); + solver_.factorize(*mat); } -template -Eigen::VectorXf EigenSparseSolver::solve( +template 
+Eigen::VectorXf EigenSparseSolver::solve( const Eigen::Ref &b) { return solver_.solve(b); } -template -bool EigenSparseSolver::info() { +template +bool EigenSparseSolver::info() { return solver_.info() == Eigen::Success; } @@ -78,8 +88,9 @@ std::unique_ptr make_sparse_solver(DataType dt, auto solver_func = solver_factory.at(solver_key); return solver_func(); } else if (solver_type == "LU") { - using LU = Eigen::SparseLU>; - return std::make_unique>(); + using EigenMatrix = Eigen::SparseMatrix; + using LU = Eigen::SparseLU; + return std::make_unique>(); } else TI_ERROR("Not supported sparse solver type: {}", solver_type); } diff --git a/taichi/program/sparse_solver.h b/taichi/program/sparse_solver.h index 8325802bdc288..e156e826a0db1 100644 --- a/taichi/program/sparse_solver.h +++ b/taichi/program/sparse_solver.h @@ -17,7 +17,7 @@ class SparseSolver { virtual bool info() = 0; }; -template +template class EigenSparseSolver : public SparseSolver { private: EigenSolver solver_; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 8b2800dfcca02..29374e70fad48 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -366,16 +366,18 @@ void export_lang(py::module &m) { py::return_value_policy::reference) .def("create_sparse_matrix_builder", [](Program *program, int n, int m, uint64 max_num_entries, - DataType dtype) { + DataType dtype, const std::string &storage_format) { TI_ERROR_IF(!arch_is_cpu(program->config.arch), "SparseMatrix only supports CPU for now."); - return SparseMatrixBuilder(n, m, max_num_entries, dtype); + return SparseMatrixBuilder(n, m, max_num_entries, dtype, + storage_format); }) .def("create_sparse_matrix", - [](Program *program, int n, int m) { + [](Program *program, int n, int m, DataType dtype, + std::string storage_format) { TI_ERROR_IF(!arch_is_cpu(program->config.arch), "SparseMatrix only supports CPU for now."); - return SparseMatrix(n, m); + return make_sparse_matrix(n, m, dtype, 
storage_format); }) .def( "dump_dot", @@ -936,28 +938,57 @@ void export_lang(py::module &m) { }, py::return_value_policy::reference); + // Sparse Matrix py::class_(m, "SparseMatrixBuilder") .def("print_triplets", &SparseMatrixBuilder::print_triplets) .def("build", &SparseMatrixBuilder::build) .def("get_addr", [](SparseMatrixBuilder *mat) { return uint64(mat); }); py::class_(m, "SparseMatrix") + .def(py::init<>()) + .def(py::init(), py::arg("rows"), py::arg("cols"), + py::arg("dt") = PrimitiveType::f32) + .def(py::init()) .def("to_string", &SparseMatrix::to_string) - .def(py::self + py::self, py::return_value_policy::reference_internal) - .def(py::self - py::self, py::return_value_policy::reference_internal) - .def(float() * py::self, py::return_value_policy::reference_internal) - .def(py::self * float(), py::return_value_policy::reference_internal) - .def(py::self * py::self, py::return_value_policy::reference_internal) - .def("matmul", &SparseMatrix::matmul, - py::return_value_policy::reference_internal) - .def("mat_vec_mul", &SparseMatrix::mat_vec_mul) - .def("transpose", &SparseMatrix::transpose, - py::return_value_policy::reference_internal) - .def("get_element", &SparseMatrix::get_element) - .def("set_element", &SparseMatrix::set_element) + .def("get_element", &SparseMatrix::get_element) + .def("set_element", &SparseMatrix::set_element) .def("num_rows", &SparseMatrix::num_rows) .def("num_cols", &SparseMatrix::num_cols); +#define MAKE_SPARSE_MATRIX(TYPE, STORAGE, VTYPE) \ + using STORAGE##TYPE##EigenMatrix = \ + Eigen::SparseMatrix; \ + py::class_, SparseMatrix>( \ + m, #VTYPE #STORAGE "_EigenSparseMatrix") \ + .def(py::init()) \ + .def(py::init &>()) \ + .def(py::init()) \ + .def(py::self += py::self) \ + .def(py::self + py::self) \ + .def(py::self -= py::self) \ + .def(py::self - py::self) \ + .def(py::self *= float##TYPE()) \ + .def(py::self *float##TYPE()) \ + .def(float##TYPE() * py::self) \ + .def(py::self *py::self) \ + .def("matmul", 
&EigenSparseMatrix::matmul) \ + .def("transpose", \ + &EigenSparseMatrix::transpose) \ + .def("get_element", \ + &EigenSparseMatrix::get_element< \ + float##TYPE>) \ + .def("set_element", \ + &EigenSparseMatrix::set_element< \ + float##TYPE>) \ + .def("mat_vec_mul", \ + &EigenSparseMatrix::mat_vec_mul< \ + Eigen::VectorX##VTYPE>); + + MAKE_SPARSE_MATRIX(32, ColMajor, f); + MAKE_SPARSE_MATRIX(32, RowMajor, f); + MAKE_SPARSE_MATRIX(64, ColMajor, d); + MAKE_SPARSE_MATRIX(64, RowMajor, d); + py::class_(m, "SparseSolver") .def("compute", &SparseSolver::compute) .def("analyze_pattern", &SparseSolver::analyze_pattern) diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py index 179cbefedc31a..5b49316cd81b0 100644 --- a/tests/python/test_sparse_matrix.py +++ b/tests/python/test_sparse_matrix.py @@ -4,14 +4,18 @@ from tests import test_utils -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_builder_deprecated_anno(dtype): +def test_sparse_matrix_builder_deprecated_anno(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -25,14 +29,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[i, j] == i + j -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_builder(dtype): +def test_sparse_matrix_builder(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + 
storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -46,14 +54,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[i, j] == i + j -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_shape(dtype): +def test_sparse_matrix_shape(dtype, storage_format): n, m = 8, 9 Abuilder = ti.linalg.SparseMatrixBuilder(n, m, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -65,14 +77,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A.shape() == (n, m) -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_element_access(dtype): +def test_sparse_matrix_element_access(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -85,14 +101,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[i, i] == i -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_element_modify(dtype): +def test_sparse_matrix_element_modify(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: 
ti.types.sparse_matrix_builder()): @@ -105,18 +125,23 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[0, 0] == 1024.0 -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_addition(dtype): +def test_sparse_matrix_addition(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -134,18 +159,23 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[i, j] == 2 * i -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_subtraction(dtype): +def test_sparse_matrix_subtraction(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -163,14 +193,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[i, j] == 2 * j -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_scalar_multiplication(dtype): +def 
test_sparse_matrix_scalar_multiplication(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -185,14 +219,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert B[i, j] == 3 * (i + j) -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_transpose(dtype): +def test_sparse_matrix_transpose(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder()): @@ -207,18 +245,23 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert B[i, j] == A[j, i] -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_elementwise_multiplication(dtype): +def test_sparse_matrix_elementwise_multiplication(dtype, storage_format): n = 8 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -236,18 +279,23 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[i, j] == (i + j) * (i - j) -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 
'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_multiplication(dtype): +def test_sparse_matrix_multiplication(dtype, storage_format): n = 2 Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) Bbuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), @@ -266,18 +314,23 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), assert C[1, 1] == -1.0 -@pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_nonsymmetric_multiplication(dtype): +def test_sparse_matrix_nonsymmetric_multiplication(dtype, storage_format): n, k, m = 2, 3, 4 Abuilder = ti.linalg.SparseMatrixBuilder(n, k, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) Bbuilder = ti.linalg.SparseMatrixBuilder(k, m, max_num_triplets=100, - dtype=dtype) + dtype=dtype, + storage_format=storage_format) @ti.kernel def fill(Abuilder: ti.types.sparse_matrix_builder(), diff --git a/tests/python/test_spmv.py b/tests/python/test_spmv.py index ee0c78b4fd6be..638c5e29563c3 100644 --- a/tests/python/test_spmv.py +++ b/tests/python/test_spmv.py @@ -1,11 +1,21 @@ +import pytest + import taichi as ti from tests import test_utils +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_vector_multiplication1(): +def test_sparse_matrix_vector_multiplication1(dtype, storage_format): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = 
ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype, + storage_format=storage_format) b = ti.field(ti.f32, shape=n) @ti.kernel @@ -23,10 +33,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), b: ti.template()): assert x[i] == 8 * i +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_vector_multiplication2(): +def test_sparse_matrix_vector_multiplication2(dtype, storage_format): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype, + storage_format=storage_format) b = ti.field(ti.f32, shape=n) @ti.kernel @@ -47,10 +65,18 @@ def fill(Abuilder: ti.types.sparse_matrix_builder(), b: ti.template()): assert x[i] == res[i] +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) @test_utils.test(arch=ti.cpu) -def test_sparse_matrix_vector_multiplication3(): +def test_sparse_matrix_vector_multiplication3(dtype, storage_format): n = 8 - Abuilder = ti.linalg.SparseMatrixBuilder(n, n, max_num_triplets=100) + Abuilder = ti.linalg.SparseMatrixBuilder(n, + n, + max_num_triplets=100, + dtype=dtype, + storage_format=storage_format) b = ti.field(ti.f32, shape=n) @ti.kernel From a627ceb5bdf30d475e10b5fa9b559dc333f66b49 Mon Sep 17 00:00:00 2001 From: Chang Yu Date: Mon, 9 May 2022 13:06:47 +0800 Subject: [PATCH 039/176] [lang] Fix type check warnings for ti.Mesh (#4930) * fix * fix --- python/taichi/lang/mesh.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/taichi/lang/mesh.py b/python/taichi/lang/mesh.py index 6d2d7031e1201..42c15e935c0b0 100644 --- a/python/taichi/lang/mesh.py +++ b/python/taichi/lang/mesh.py @@ -10,7 +10,7 @@ from taichi.lang.matrix import 
Matrix, MatrixField, _MatrixFieldElement from taichi.lang.struct import StructField from taichi.lang.util import python_scope -from taichi.types import i32, u16, u32 +from taichi.types import u16, u32 from taichi.types.compound_types import CompoundType from taichi import lang @@ -383,15 +383,15 @@ def __init__(self, data): element["g2r_mapping"] = np.array(element["g2r_mapping"]) self.element_fields[element_type] = {} self.element_fields[element_type]["owned"] = impl.field( - dtype=i32, shape=self.num_patches + 1) + dtype=u32, shape=self.num_patches + 1) self.element_fields[element_type]["total"] = impl.field( - dtype=i32, shape=self.num_patches + 1) + dtype=u32, shape=self.num_patches + 1) self.element_fields[element_type]["l2g"] = impl.field( - dtype=i32, shape=element["l2g_mapping"].shape[0]) + dtype=u32, shape=element["l2g_mapping"].shape[0]) self.element_fields[element_type]["l2r"] = impl.field( - dtype=i32, shape=element["l2r_mapping"].shape[0]) + dtype=u32, shape=element["l2r_mapping"].shape[0]) self.element_fields[element_type]["g2r"] = impl.field( - dtype=i32, shape=element["g2r_mapping"].shape[0]) + dtype=u32, shape=element["g2r_mapping"].shape[0]) for relation in data["relations"]: from_order = relation["from_order"] From 407ff73b4ab78c7440116325784fa1be622dfd62 Mon Sep 17 00:00:00 2001 From: 0xzhang <33616362+0xzhang@users.noreply.github.com> Date: Mon, 9 May 2022 14:38:40 +0800 Subject: [PATCH 040/176] [SIMT] Add uni_sync warp intrinsics (#4927) * [SIMT] Add uni_sync warp intrinsics --- python/taichi/lang/simt/warp.py | 7 ++++--- taichi/llvm/llvm_context.cpp | 3 +++ taichi/runtime/llvm/runtime.cpp | 8 ++++++++ tests/python/test_simt.py | 34 +++++++++++++++++++++++++++++++-- 4 files changed, 47 insertions(+), 5 deletions(-) diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index 21a45329e0810..6075ce5d4eb10 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -14,9 +14,10 @@ def 
any_nonzero(mask, predicate): "cuda_any_sync_i32", expr.make_expr_group(mask, predicate), False)) -def unique(): - # TODO - pass +def unique(mask, predicate): + return expr.Expr( + _ti_core.insert_internal_func_call( + "cuda_uni_sync_i32", expr.make_expr_group(mask, predicate), False)) def ballot(predicate): diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp index 5ae61e84e8b36..f71bf3715de7e 100644 --- a/taichi/llvm/llvm_context.cpp +++ b/taichi/llvm/llvm_context.cpp @@ -363,6 +363,9 @@ std::unique_ptr TaichiLLVMContext::clone_module( patch_intrinsic("cuda_any", Intrinsic::nvvm_vote_any); patch_intrinsic("cuda_any_sync", Intrinsic::nvvm_vote_any_sync); + patch_intrinsic("cuda_uni", Intrinsic::nvvm_vote_uni); + patch_intrinsic("cuda_uni_sync", Intrinsic::nvvm_vote_uni_sync); + patch_intrinsic("cuda_ballot", Intrinsic::nvvm_vote_ballot); patch_intrinsic("cuda_ballot_sync", Intrinsic::nvvm_vote_ballot_sync); diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index 162d312d867cc..a3ff2bfee596f 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -1064,6 +1064,14 @@ int32 cuda_any_sync_i32(u32 mask, int32 predicate) { return (int32)cuda_any_sync(mask, (bool)predicate); } +bool cuda_uni_sync(u32 mask, bool bit) { + return false; +} + +int32 cuda_uni_sync_i32(u32 mask, int32 predicate) { + return (int32)cuda_uni_sync(mask, (bool)predicate); +} + int32 cuda_ballot_sync(int32 mask, bool bit) { return 0; } diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 2d7204b509d54..a29221d3735d6 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -64,8 +64,38 @@ def foo(): @test_utils.test(arch=ti.cuda) def test_unique(): - # TODO - pass + a = ti.field(dtype=ti.u32, shape=32) + b = ti.field(dtype=ti.u32, shape=32) + + @ti.kernel + def check(): + ti.loop_config(block_dim=32) + for i in range(32): + a[i] = ti.simt.warp.unique(ti.u32(0xFFFFFFFF), b[i]) + + for i 
in range(32): + b[i] = 0 + a[i] = -1 + + check() + + for i in range(32): + assert a[i] == 1 + + for i in range(32): + b[i] = i + 100 + + check() + + for i in range(32): + assert a[i] == 1 + + b[np.random.randint(0, 32)] = 0 + + check() + + for i in range(32): + assert a[i] == 0 @test_utils.test(arch=ti.cuda) From a2108071926c2be84270dd86ce8679bb67cd21bd Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 10 May 2022 16:31:36 +0800 Subject: [PATCH 041/176] [build] Enable -Werror on Linux & Mac (#4941) --- cmake/TaichiCXXFlags.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index da3ff8432982a..bbfe321b6152e 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -50,6 +50,9 @@ else() # [Global] CXX compilation option to enable all warnings. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall ") + # [Global] CXX compilation option to treat all warnings as errors. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") + # [Global] By default, CXX compiler will throw a warning if it decides to ignore an attribute, for example "[[ maybe unused ]]". # However, this behaviour diverges across different compilers (GCC/CLANG), as well as different compiler versions. # Therefore we disable such warnings for now. 
From faad1e22ef5eab93425ee93a12d3183e25f55ce3 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 10 May 2022 16:31:51 +0800 Subject: [PATCH 042/176] [build] Turn on -Werror on Linux and Mac platforms (#4928) * [build] Turn on -Werror on Linux and Mac platforms * Added documentations for Werror * Patched documentation --- .../contribution/contributor_guide.md | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/lang/articles/contribution/contributor_guide.md index 72f935e960461..d2b47947b354b 100644 --- a/docs/lang/articles/contribution/contributor_guide.md +++ b/docs/lang/articles/contribution/contributor_guide.md @@ -102,7 +102,7 @@ We highly recommend that you complete code style checks and integration tests on ### Enforce code style -Taichi enfoces code style via [pre-commit](https://pre-commit.com/) hooks, which includes the following checks: +Taichi enforces code style via [pre-commit](https://pre-commit.com/) hooks, which includes the following checks: 1. C++ codes are formatted by `clang-format-10`. 2. Python codes are formatted by `yapf v0.31.0` based on PEP 8 rules. @@ -293,6 +293,63 @@ Here, we do not want to repeat some best practices summarized in the following G - [Code Health: Respectful Reviews == Useful Reviews](https://testing.googleblog.com/2019/11/code-health-respectful-reviews-useful.html) - [How to have your PR merged quickly](https://testing.googleblog.com/2017/06/code-health-too-many-comments-on-your.html) +## Compilation Warnings +Taichi enforces warning-free codes by turning on `-Werror` (treat warning as error) by default. It is highly recommended to resolve a warning as soon as it raises. + +On the other hand, real world issues could be way more complicated than what the compiler expected. So we prepared the following HOWTOs to help resolve some common problems. 
You are also more than welcome to open up an issue or consult the reviewer inplace for further discussions. + +### How to deal with warnings from third-party header files +There is little we can do about third-party warnings other than simply turning them off. + +To mute warnings from specific third-party header files, you can apply `SYSTEM` option when including third-party directories in CMakeFiles. The following example can be found in [cmake/TaichiCore.cmake](https://github.com/taichi-dev/taichi/blob/master/cmake/TaichiCore.cmake): +``` +# Treat files under "external/Vulkan-Headers/include" as system headers, warnings of which will be muted. +include_directories(SYSTEM external/Vulkan-Headers/include) + +# Treat files under "external/VulkanMemoryAllocator/include" as system headers for target "${CORE_LIBRARY_NAME}" +target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMemoryAllocator/include) +``` + +### How to deal with warnings raised when compiling third-party libraries or targets +Ideally, third-party submodules should be built completely independent of Taichi project except for the topological dependency. Unfortunately, due to the design of CMake system, CMake variables from Taichi and its submodules could get messed up in certain circumstances. Refer to the following two steps to mute warnings from third-party targets. + +1. Separate submodule's `CMAKE_CXX_FLAGS` from that configured in Taichi. +2. Remove "-Wall" option from submodule's `CMAKE_CXX_FLAGS`. + +### How to mute specific warning-types across the entire Taichi project +Search for the option to mute certain warning-types on [Clang Compiler User Manual](https://clang.llvm.org/docs/UsersManual.html), usually it starts with `-Wno-`. In the comments, please explain what the warning does and why we should ignore it. 
+ +The following example can be found in [cmake/TaichiCXXFlags.cmake](https://github.com/taichi-dev/taichi/blob/master/cmake/TaichiCXXFlags.cmake) +``` +# [Global] Clang warns if a C++ pointer's nullability wasn't marked explicitly (__nonnull, nullable, ...). +# Nullability seems to be a clang-specific feature, thus we disable this warning. +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-nullability-completeness ") + +# [Global] By evaluating "constexpr", compiler throws a warning for functions known to be dead at compile time. +# However, some of these "constexpr" are debug flags and will be manually enabled upon debuging. +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unneeded-internal-declaration ") +``` + +### How to mute warnings for specific lines of codes (NOT RECOMMENDED) +In rare situations where the warnings cannot be fixed nor muted via regular attempts, one of the last things you can try is to decorate your code with `#pragma clang diagnostic` macro. Be aware that `#pragma`s are not part of the C++ standard and strongly depend on the compiler's implementation. That is to say, the following solution is neither stable nor elegant. + +Wrap the lines of interest with the following two macros, warnings will be ignored for the codes in between. + +You may also replace the `-Wall` with a group of specific warning-types for finer control. + +``` +#if defined(__clang__) + #pragma clang diagnostic push + #pragma clang diagnostic ignored "-Wall" +#endif + +{Your Code Goes Here} + +#if defined(__clang__) + #pragma clang diagnostic pop +#endif +``` + ## Still have issues? If you encounter any issue that is not covered here, feel free to report it by asking us on GitHub discussions or by [opening an issue on GitHub](https://github.com/taichi-dev/taichi/issues/new?labels=potential+bug&template=bug_report.md) and including the details. We are always there to help! 
From 76571859902eb4778a5b1aec54b2b95400d4fa03 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 10 May 2022 16:32:08 +0800 Subject: [PATCH 043/176] [doc] Updated documentations for implicit type casting rules (#4885) * [doc] Updated documentations for type promotion rules * Rearranged type promotion docs --- docs/lang/articles/basic/type.md | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/docs/lang/articles/basic/type.md b/docs/lang/articles/basic/type.md index a245b22690ebf..1e2fd7511ced5 100644 --- a/docs/lang/articles/basic/type.md +++ b/docs/lang/articles/basic/type.md @@ -100,10 +100,33 @@ As a rule of thumb, implicit type casting is a major source of bugs. And Taichi #### Implicit type casting in binary operations -Taichi follows the [implicit conversion rules](https://en.cppreference.com/w/c/language/conversion) for the C programming language and implicitly casts operands in a [binary operation](https://en.wikipedia.org/wiki/Binary_operation) into a *common type* if the operation involves different data types. Following are two most straightforward rules for determining the common type in a binary operation: +Taichi implements its own implicit type casting rules for binary operations, which are slightly different from [those for the C programming language](https://en.cppreference.com/w/c/language/conversion). -- `i32 + f32 = f32` (`int` + `float` = `float`) -- `i32 + i64 = i64` (low precision bits + high precision bits = high precision bits) +In general we have three rules with descending priority: + +1. integral OP floating_point -> floating_point +- `i32 + f32 -> f32` +- `i16 + f16 -> f16` + +2. low_precision_bits OP high_precision_bits -> high_precision_bits +- `i16 + i32 -> i32` +- `u8 + u16 -> u16` + +3. signed OP unsigned -> unsigned +- `u32 + i32 -> u32` +- `u8 + i8 -> u8` + +For conflicting rules, only the highest priority one will be applied. 
+- `u8 + i16 -> i16` (rule #2 conflicts with rule #3: apply rule #2) +- `f16 + i32 -> f16` (rule #1 conflicts with rule #2: apply rule #1) + +A few exceptions: +1. bit-shift operations: always follow lhs's dtype +- `u8 << i32 -> u8` +- `i16 << i8 -> i16` + +2. logical operations: always return i32 +3. comparison operations: always return i32 #### Implicit type casting in assignments From 2586a9fc6c322d1133612cda541ac8dd2bce30a4 Mon Sep 17 00:00:00 2001 From: yekuang Date: Tue, 10 May 2022 16:55:11 +0800 Subject: [PATCH 044/176] [refactor] Remove unused snode_trees in ProgramImpl interface (#4942) * [refactor] Remove unused snode_trees in ProgramImpl interface * Update taichi/codegen/codegen_llvm.h --- taichi/backends/cc/cc_program.cpp | 6 ++---- taichi/backends/cc/cc_program.h | 4 +--- taichi/backends/dx/dx_program.cpp | 6 ++---- taichi/backends/dx/dx_program.h | 6 ++---- taichi/backends/metal/metal_program.cpp | 10 +++------- taichi/backends/metal/metal_program.h | 9 ++------- taichi/backends/opengl/opengl_program.cpp | 12 ++++-------- taichi/backends/opengl/opengl_program.h | 11 +++-------- taichi/backends/vulkan/vulkan_program.cpp | 10 +++------- taichi/backends/vulkan/vulkan_program.h | 8 ++------ taichi/codegen/codegen_llvm.h | 1 + taichi/llvm/llvm_program.cpp | 23 +++++++++++------------ taichi/llvm/llvm_program.h | 15 +++++++-------- taichi/program/program.cpp | 5 ++--- taichi/program/program_impl.cpp | 4 +--- taichi/program/program_impl.h | 10 +++------- 16 files changed, 49 insertions(+), 91 deletions(-) diff --git a/taichi/backends/cc/cc_program.cpp b/taichi/backends/cc/cc_program.cpp index 49d9d1d6e866a..d36cc0167a84a 100644 --- a/taichi/backends/cc/cc_program.cpp +++ b/taichi/backends/cc/cc_program.cpp @@ -32,10 +32,8 @@ void CCProgramImpl::materialize_runtime(MemoryPool *memory_pool, result_buffer_ = *result_buffer_ptr; } -void CCProgramImpl::materialize_snode_tree( - SNodeTree *tree, - std::vector> &, - uint64 *result_buffer) { +void 
CCProgramImpl::materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer) { auto *const root = tree->root(); CCLayoutGen gen(this, root); layout_ = gen.compile(); diff --git a/taichi/backends/cc/cc_program.h b/taichi/backends/cc/cc_program.h index adeea855ef96f..2b4f7e7b11e79 100644 --- a/taichi/backends/cc/cc_program.h +++ b/taichi/backends/cc/cc_program.h @@ -41,9 +41,7 @@ class CCProgramImpl : public ProgramImpl { KernelProfilerBase *, uint64 **result_buffer_ptr) override; - void materialize_snode_tree(SNodeTree *tree, - std::vector> &, - uint64 *result_buffer) override; + void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override; void synchronize() override { // Not implemented yet. diff --git a/taichi/backends/dx/dx_program.cpp b/taichi/backends/dx/dx_program.cpp index 06665d0a18666..1b76bdeffde9d 100644 --- a/taichi/backends/dx/dx_program.cpp +++ b/taichi/backends/dx/dx_program.cpp @@ -51,10 +51,8 @@ void Dx11ProgramImpl::synchronize() { TI_NOT_IMPLEMENTED; } -void Dx11ProgramImpl::materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer_ptr) { +void Dx11ProgramImpl::materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer_ptr) { snode_tree_mgr_->materialize_snode_tree(tree); } diff --git a/taichi/backends/dx/dx_program.h b/taichi/backends/dx/dx_program.h index 493cb5c62c69a..603ee63abc3c4 100644 --- a/taichi/backends/dx/dx_program.h +++ b/taichi/backends/dx/dx_program.h @@ -24,10 +24,8 @@ class Dx11ProgramImpl : public ProgramImpl { void materialize_runtime(MemoryPool *memory_pool, KernelProfilerBase *profiler, uint64 **result_buffer_ptr) override; - virtual void materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer_ptr) override; + virtual void materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer_ptr) override; virtual void destroy_snode_tree(SNodeTree *snode_tree) override; void synchronize() override; diff --git 
a/taichi/backends/metal/metal_program.cpp b/taichi/backends/metal/metal_program.cpp index e7eb52ddf166a..e74b815beed95 100644 --- a/taichi/backends/metal/metal_program.cpp +++ b/taichi/backends/metal/metal_program.cpp @@ -80,16 +80,12 @@ void MetalProgramImpl::materialize_runtime(MemoryPool *memory_pool, metal_kernel_mgr_ = std::make_unique(std::move(params)); } -void MetalProgramImpl::compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) { +void MetalProgramImpl::compile_snode_tree_types(SNodeTree *tree) { (void)compile_snode_tree_types_impl(tree); } -void MetalProgramImpl::materialize_snode_tree( - SNodeTree *tree, - std::vector> &, - uint64 *result_buffer) { +void MetalProgramImpl::materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer) { const auto &csnode_tree = compile_snode_tree_types_impl(tree); metal_kernel_mgr_->add_compiled_snode_tree(csnode_tree); } diff --git a/taichi/backends/metal/metal_program.h b/taichi/backends/metal/metal_program.h index b00f53a40901c..99e6fe7a02edf 100644 --- a/taichi/backends/metal/metal_program.h +++ b/taichi/backends/metal/metal_program.h @@ -30,14 +30,9 @@ class MetalProgramImpl : public ProgramImpl { KernelProfilerBase *profiler, uint64 **result_buffer_ptr) override; - void compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) override; + void compile_snode_tree_types(SNodeTree *tree) override; - void materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer) override; + void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override; void synchronize() override { metal_kernel_mgr_->synchronize(); diff --git a/taichi/backends/opengl/opengl_program.cpp b/taichi/backends/opengl/opengl_program.cpp index 163ad3a089200..8ef7262ae9ee5 100644 --- a/taichi/backends/opengl/opengl_program.cpp +++ b/taichi/backends/opengl/opengl_program.cpp @@ -44,21 +44,17 @@ std::shared_ptr OpenglProgramImpl::get_device_shared() { return 
opengl_runtime_->device; } -void OpenglProgramImpl::compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) { +void OpenglProgramImpl::compile_snode_tree_types(SNodeTree *tree) { // TODO: support materializing multiple snode trees opengl::OpenglStructCompiler scomp; opengl_struct_compiled_ = scomp.run(*(tree->root())); TI_TRACE("OpenGL root buffer size: {} B", opengl_struct_compiled_->root_size); } -void OpenglProgramImpl::materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer) { +void OpenglProgramImpl::materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer) { #ifdef TI_WITH_OPENGL - compile_snode_tree_types(tree, snode_trees_); + compile_snode_tree_types(tree); opengl_runtime_->add_snode_tree(opengl_struct_compiled_->root_size); #else TI_NOT_IMPLEMENTED; diff --git a/taichi/backends/opengl/opengl_program.h b/taichi/backends/opengl/opengl_program.h index 1d495f7654a3a..9f55d6678fae6 100644 --- a/taichi/backends/opengl/opengl_program.h +++ b/taichi/backends/opengl/opengl_program.h @@ -33,14 +33,9 @@ class OpenglProgramImpl : public ProgramImpl { KernelProfilerBase *profiler, uint64 **result_buffer_ptr) override; - void compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) override; - - void materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer) override; + void compile_snode_tree_types(SNodeTree *tree) override; + + void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override; void synchronize() override { } diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp index 63511189d99fb..6ec32b704056e 100644 --- a/taichi/backends/vulkan/vulkan_program.cpp +++ b/taichi/backends/vulkan/vulkan_program.cpp @@ -151,9 +151,7 @@ void VulkanProgramImpl::materialize_runtime(MemoryPool *memory_pool, std::make_unique(vulkan_runtime_.get()); } -void VulkanProgramImpl::compile_snode_tree_types( 
- SNodeTree *tree, - std::vector> &snode_trees) { +void VulkanProgramImpl::compile_snode_tree_types(SNodeTree *tree) { if (vulkan_runtime_) { snode_tree_mgr_->materialize_snode_tree(tree); } else { @@ -163,10 +161,8 @@ void VulkanProgramImpl::compile_snode_tree_types( } } -void VulkanProgramImpl::materialize_snode_tree( - SNodeTree *tree, - std::vector> &, - uint64 *result_buffer) { +void VulkanProgramImpl::materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer) { snode_tree_mgr_->materialize_snode_tree(tree); } diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h index febd0c658e703..f18cd07d23715 100644 --- a/taichi/backends/vulkan/vulkan_program.h +++ b/taichi/backends/vulkan/vulkan_program.h @@ -38,17 +38,13 @@ class VulkanProgramImpl : public ProgramImpl { return 0; // TODO: support sparse in vulkan } - void compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) override; + void compile_snode_tree_types(SNodeTree *tree) override; void materialize_runtime(MemoryPool *memory_pool, KernelProfilerBase *profiler, uint64 **result_buffer_ptr) override; - void materialize_snode_tree(SNodeTree *tree, - std::vector> &, - uint64 *result_buffer) override; + void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override; void synchronize() override { vulkan_runtime_->synchronize(); diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 592ac9205aafc..bbea19ba60dd6 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -1,5 +1,6 @@ // The LLVM backend for CPUs/NVPTX/AMDGPU #pragma once + #ifdef TI_WITH_LLVM #include diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index d329dc2e34d33..e986b7486ac9a 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -147,10 +147,11 @@ void LlvmProgramImpl::synchronize() { std::unique_ptr LlvmProgramImpl::clone_struct_compiler_initial_context( - const 
std::vector> &snode_trees_, + bool has_multiple_snode_trees, TaichiLLVMContext *tlctx) { - if (!snode_trees_.empty()) + if (has_multiple_snode_trees) { return tlctx->clone_struct_module(); + } return tlctx->clone_runtime_module(); } @@ -244,31 +245,29 @@ void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, } } -void LlvmProgramImpl::compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) { +void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { auto *const root = tree->root(); + const bool has_multiple_snode_trees = (num_snode_trees_processed_ > 0); if (arch_is_cpu(config->arch)) { auto host_module = clone_struct_compiler_initial_context( - snode_trees, llvm_context_host_.get()); + has_multiple_snode_trees, llvm_context_host_.get()); struct_compiler_ = std::make_unique( host_arch(), this, std::move(host_module), tree->id()); } else { TI_ASSERT(config->arch == Arch::cuda); auto device_module = clone_struct_compiler_initial_context( - snode_trees, llvm_context_device_.get()); + has_multiple_snode_trees, llvm_context_device_.get()); struct_compiler_ = std::make_unique( Arch::cuda, this, std::move(device_module), tree->id()); } struct_compiler_->run(*root); + ++num_snode_trees_processed_; } -void LlvmProgramImpl::materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer) { - compile_snode_tree_types(tree, snode_trees_); +void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer) { + compile_snode_tree_types(tree); initialize_llvm_runtime_snodes(tree, struct_compiler_.get(), result_buffer); } diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index ee028e33b89d3..a38ff37a50ce7 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -1,4 +1,7 @@ #pragma once + +#include + #include "taichi/llvm/llvm_device.h" #include "taichi/llvm/llvm_offline_cache.h" #include "taichi/system/snode_tree_buffer_manager.h" @@ -62,14 
+65,9 @@ class LlvmProgramImpl : public ProgramImpl { FunctionType compile(Kernel *kernel, OffloadedStmt *offloaded) override; - void compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) override; + void compile_snode_tree_types(SNodeTree *tree) override; - void materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer) override; + void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override; template T fetch_result(int i, uint64 *result_buffer) { @@ -122,7 +120,7 @@ class LlvmProgramImpl : public ProgramImpl { private: std::unique_ptr clone_struct_compiler_initial_context( - const std::vector> &snode_trees_, + bool has_multiple_snode_trees, TaichiLLVMContext *tlctx); /** @@ -173,6 +171,7 @@ class LlvmProgramImpl : public ProgramImpl { std::unique_ptr runtime_mem_info_{nullptr}; std::unique_ptr snode_tree_buffer_manager_{nullptr}; std::unique_ptr struct_compiler_{nullptr}; + std::size_t num_snode_trees_processed_{0}; void *llvm_runtime_{nullptr}; void *preallocated_device_buffer_{nullptr}; // TODO: move to memory allocator diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 9904ed9912d70..f873f8e8debf2 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -204,10 +204,9 @@ SNodeTree *Program::add_snode_tree(std::unique_ptr root, auto tree = std::make_unique(id, std::move(root)); tree->root()->set_snode_tree_id(id); if (compile_only) { - program_impl_->compile_snode_tree_types(tree.get(), snode_trees_); + program_impl_->compile_snode_tree_types(tree.get()); } else { - program_impl_->materialize_snode_tree(tree.get(), snode_trees_, - result_buffer); + program_impl_->materialize_snode_tree(tree.get(), result_buffer); } if (id < snode_trees_.size()) { snode_trees_[id] = std::move(tree); diff --git a/taichi/program/program_impl.cpp b/taichi/program/program_impl.cpp index 08eb8007f1a98..2838c3c2f152e 100644 --- a/taichi/program/program_impl.cpp +++ 
b/taichi/program/program_impl.cpp @@ -6,9 +6,7 @@ namespace lang { ProgramImpl::ProgramImpl(CompileConfig &config_) : config(&config_) { } -void ProgramImpl::compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees) { +void ProgramImpl::compile_snode_tree_types(SNodeTree *tree) { // FIXME: Eventually all the backends should implement this TI_NOT_IMPLEMENTED; } diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h index cd9d9240600f4..86bd361d8157b 100644 --- a/taichi/program/program_impl.h +++ b/taichi/program/program_impl.h @@ -37,17 +37,13 @@ class ProgramImpl { /** * JIT compiles @param tree to backend-specific data types. */ - virtual void compile_snode_tree_types( - SNodeTree *tree, - std::vector> &snode_trees); + virtual void compile_snode_tree_types(SNodeTree *tree); /** * Compiles the @param tree types and allocates runtime buffer for it. */ - virtual void materialize_snode_tree( - SNodeTree *tree, - std::vector> &snode_trees_, - uint64 *result_buffer_ptr) = 0; + virtual void materialize_snode_tree(SNodeTree *tree, + uint64 *result_buffer_ptr) = 0; virtual void destroy_snode_tree(SNodeTree *snode_tree) = 0; From d10c3e31f83477d0adb0819adaacee8fe15ed369 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 10 May 2022 17:55:28 +0800 Subject: [PATCH 045/176] [build] Turned off -Werror temporarily for issues with performance-bot (#4946) --- cmake/TaichiCXXFlags.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index bbfe321b6152e..558862506488c 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -51,7 +51,7 @@ else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall ") # [Global] CXX compilation option to treat all warnings as errors. 
- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") + #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") # [Global] By default, CXX compiler will throw a warning if it decides to ignore an attribute, for example "[[ maybe unused ]]". # However, this behaviour diverges across different compilers (GCC/CLANG), as well as different compiler versions. From 390ccacdcf54247e47eba61e196085bdba31a4bc Mon Sep 17 00:00:00 2001 From: yekuang Date: Tue, 10 May 2022 18:45:44 +0800 Subject: [PATCH 046/176] [refactor] [llvm] Remove struct_compiler_ as a member variable (#4945) --- taichi/codegen/codegen_llvm.cpp | 3 ++- taichi/llvm/llvm_program.cpp | 19 +++++++++++++------ taichi/llvm/llvm_program.h | 10 ++++++---- taichi/struct/struct_llvm.h | 1 + 4 files changed, 22 insertions(+), 11 deletions(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 694edca7c6401..5596189b42dd4 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1,6 +1,7 @@ +#include "taichi/codegen/codegen_llvm.h" + #ifdef TI_WITH_LLVM #include "taichi/analysis/offline_cache_util.h" -#include "taichi/codegen/codegen_llvm.h" #include "taichi/llvm/llvm_offline_cache.h" #include "taichi/ir/statements.h" #include "taichi/struct/struct_llvm.h" diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index e986b7486ac9a..2935735f852f6 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -245,30 +245,37 @@ void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, } } -void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { +std::unique_ptr LlvmProgramImpl::compile_snode_tree_types_impl( + SNodeTree *tree) { auto *const root = tree->root(); const bool has_multiple_snode_trees = (num_snode_trees_processed_ > 0); + std::unique_ptr struct_compiler{nullptr}; if (arch_is_cpu(config->arch)) { auto host_module = clone_struct_compiler_initial_context( has_multiple_snode_trees, 
llvm_context_host_.get()); - struct_compiler_ = std::make_unique( + struct_compiler = std::make_unique( host_arch(), this, std::move(host_module), tree->id()); } else { TI_ASSERT(config->arch == Arch::cuda); auto device_module = clone_struct_compiler_initial_context( has_multiple_snode_trees, llvm_context_device_.get()); - struct_compiler_ = std::make_unique( + struct_compiler = std::make_unique( Arch::cuda, this, std::move(device_module), tree->id()); } - struct_compiler_->run(*root); + struct_compiler->run(*root); ++num_snode_trees_processed_; + return struct_compiler; +} + +void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { + compile_snode_tree_types_impl(tree); } void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) { - compile_snode_tree_types(tree); - initialize_llvm_runtime_snodes(tree, struct_compiler_.get(), result_buffer); + auto struct_compiler = compile_snode_tree_types_impl(tree); + initialize_llvm_runtime_snodes(tree, struct_compiler.get(), result_buffer); } uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 *result_buffer) { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index a38ff37a50ce7..f05647ae8eacd 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -24,19 +24,20 @@ namespace llvm { class Module; -} +} // namespace llvm namespace taichi { namespace lang { + class StructCompiler; namespace cuda { class CudaDevice; -} +} // namespace cuda namespace cpu { class CpuDevice; -} +} // namespace cpu class LlvmProgramImpl : public ProgramImpl { public: @@ -123,6 +124,8 @@ class LlvmProgramImpl : public ProgramImpl { bool has_multiple_snode_trees, TaichiLLVMContext *tlctx); + std::unique_ptr compile_snode_tree_types_impl( + SNodeTree *tree); /** * Initializes the SNodes for LLVM based backends. 
*/ @@ -170,7 +173,6 @@ class LlvmProgramImpl : public ProgramImpl { std::unique_ptr thread_pool_{nullptr}; std::unique_ptr runtime_mem_info_{nullptr}; std::unique_ptr snode_tree_buffer_manager_{nullptr}; - std::unique_ptr struct_compiler_{nullptr}; std::size_t num_snode_trees_processed_{0}; void *llvm_runtime_{nullptr}; void *preallocated_device_buffer_{nullptr}; // TODO: move to memory allocator diff --git a/taichi/struct/struct_llvm.h b/taichi/struct/struct_llvm.h index df16363742b1b..5a725381ae7e5 100644 --- a/taichi/struct/struct_llvm.h +++ b/taichi/struct/struct_llvm.h @@ -1,4 +1,5 @@ #pragma once + #ifdef TI_WITH_LLVM // Codegen for the hierarchical data structure (LLVM) #include "taichi/llvm/llvm_program.h" From d01aa125b39babe7ba729c5d0c2d6d486bc424c6 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 10 May 2022 19:18:59 +0800 Subject: [PATCH 047/176] [build] Limit -Werror to Clang-compiler only (#4947) * [build] Enable -Werror on Linux & Mac * [build] Limit -Werror to Clang-compiler only --- cmake/TaichiCXXFlags.cmake | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index 558862506488c..c5b647e597499 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -50,8 +50,11 @@ else() # [Global] CXX compilation option to enable all warnings. set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall ") +# Due to limited CI coverage, -Werror is only turned on with Clang-compiler for now. +if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") # [Global] CXX compilation option to treat all warnings as errors. - #set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") +endif() # [Global] By default, CXX compiler will throw a warning if it decides to ignore an attribute, for example "[[ maybe unused ]]". # However, this behaviour diverges across different compilers (GCC/CLANG), as well as different compiler versions. 
From fbed595dc43cfeca8fe4c7b7863fb2f4c72f3c02 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Tue, 10 May 2022 21:10:36 +0800 Subject: [PATCH 048/176] [ci] Fix Nightly (#4948) * [ci] Fix nightly test * Add python 3.7 3.9 in nightly --- .github/workflows/release.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index fcfb1bb71f5e4..18d5fdd129ab7 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -56,10 +56,10 @@ jobs: echo '::set-output name=matrix_osx::{"include":[{"name":"taichi","python":"3.8"},{"name":"taichi","python":"3.9"},{"name":"taichi","python":"3.10"}]}"' else # For nightly release, we run on three python versions. - echo '::set-output name=matrix::{"include":[{"name":"taichi-nightly","python":"3.6","conda_python":"py36"},{"name":"taichi-nightly","python":"3.8","conda_python":"py38"},{"name":"taichi-nightly","python":"3.10","conda_python":"py310"}]}"' + echo '::set-output name=matrix::{"include":[{"name":"taichi-nightly","python":"3.6","conda_python":"py36"},{"name":"taichi-nightly","python":"3.7","conda_python":"py37"},{"name":"taichi-nightly","python":"3.8","conda_python":"py38"},{"name":"taichi-nightly","python":"3.9","conda_python":"py39"},{"name":"taichi-nightly","python":"3.10","conda_python":"py310"}]}"' # M1 only supports py38 and py310(conda), so change matrix. - echo '::set-output name=matrix_osx::{"include":[{"name":"taichi-nightly","python":"3.8"},{"name":"taichi-nightly","python":"3.10"}]}"' + echo '::set-output name=matrix_osx::{"include":[{"name":"taichi-nightly","python":"3.8"},{"name":"taichi-nightly","python":"3.9"},{"name":"taichi-nightly","python":"3.10"}]}"' fi build_and_test_linux: @@ -402,6 +402,7 @@ jobs: . 
venv\Scripts\activate.ps1 python -c "import taichi" pip install torch + pip install -r requirements_test.txt ti diagnose python tests/run_tests.py -vr2 -t2 env: From 6d538e171c96193fd2ce27967bd0b17a5797565e Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Tue, 10 May 2022 22:39:40 +0800 Subject: [PATCH 049/176] [Build] Improved building on Windows (#4925) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- CMakeLists.txt | 4 ++-- cmake/TaichiExportCore.cmake | 3 ++- taichi/gui/win32.cpp | 37 ++++++++++++++++++------------------ 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d12963aee94b7..494535cb4187c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,11 +139,11 @@ if (${CLANG_VERSION_MAJOR} VERSION_GREATER ${CLANG_HIGHEST_VERSION}) unset(CLANG_EXECUTABLE) find_program(CLANG_EXECUTABLE NAMES clang-10 clang-11 clang-9 clang-8 clang-7) if (NOT CLANG_EXECUTABLE) - message(FATAL_ERROR "${CLANG_EXECUTABLE} version: ${CLANG_VERSION}, required: <=${CLANG_HIGHEST_VERSION}. Condider passing -DCLANG_PATH=/path/to/clang to cmake to use a specific clang.") + message(FATAL_ERROR "${CLANG_EXECUTABLE} version: ${CLANG_VERSION}, required: <=${CLANG_HIGHEST_VERSION}. Consider passing -DCLANG_EXECUTABLE=/path/to/clang to cmake to use a specific clang.") else() check_clang_version() if (${CLANG_VERSION_MAJOR} VERSION_GREATER ${CLANG_HIGHEST_VERSION}) - message(FATAL_ERROR "${CLANG_EXECUTABLE} version: ${CLANG_VERSION}, required: <=${CLANG_HIGHEST_VERSION}. Condider passing -DCLANG_PATH=/path/to/clang to cmake to use a specific clang.") + message(FATAL_ERROR "${CLANG_EXECUTABLE} version: ${CLANG_VERSION}, required: <=${CLANG_HIGHEST_VERSION}. 
Consider passing -DCLANG_EXECUTABLE=/path/to/clang to cmake to use a specific clang.") endif() endif() endif() diff --git a/cmake/TaichiExportCore.cmake b/cmake/TaichiExportCore.cmake index b4f646fb92e2b..baf0f52589375 100644 --- a/cmake/TaichiExportCore.cmake +++ b/cmake/TaichiExportCore.cmake @@ -5,4 +5,5 @@ set(TAICHI_EXPORT_CORE_NAME taichi_export_core) add_library(${TAICHI_EXPORT_CORE_NAME} SHARED) target_link_libraries(${TAICHI_EXPORT_CORE_NAME} PRIVATE taichi_isolated_core) set_target_properties(${TAICHI_EXPORT_CORE_NAME} PROPERTIES - CMAKE_LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/build") diff --git a/taichi/gui/win32.cpp b/taichi/gui/win32.cpp index b7fa7fa6b6696..3708d62fa7620 100644 --- a/taichi/gui/win32.cpp +++ b/taichi/gui/win32.cpp @@ -153,7 +153,7 @@ void GUI::process_event() { } void GUI::create_window() { - auto CLASS_NAME = L"Taichi Win32 Window"; + const char *CLASS_NAME = "Taichi Win32 Window"; DWORD dwVersion = 0; DWORD dwMajorVersion = 0; @@ -164,13 +164,13 @@ void GUI::create_window() { dwMajorVersion = (DWORD)(LOBYTE(LOWORD(dwVersion))); dwMinorVersion = (DWORD)(HIBYTE(LOWORD(dwVersion))); - WNDCLASS wc = {}; + WNDCLASSA wc = {}; wc.lpfnWndProc = WindowProc; - wc.hInstance = GetModuleHandle(0); + wc.hInstance = GetModuleHandleA(0); wc.lpszClassName = CLASS_NAME; - RegisterClass(&wc); + RegisterClassA(&wc); RECT window_rect; window_rect.left = 0; @@ -180,19 +180,18 @@ void GUI::create_window() { AdjustWindowRect(&window_rect, WS_OVERLAPPEDWINDOW, false); - hwnd = CreateWindowEx(0, // Optional window styles. 
- CLASS_NAME, // Window class - std::wstring(window_name.begin(), window_name.end()) - .data(), // Window text - WS_OVERLAPPEDWINDOW, // Window style - // Size and position - CW_USEDEFAULT, CW_USEDEFAULT, - window_rect.right - window_rect.left, - window_rect.bottom - window_rect.top, - NULL, // Parent window - NULL, // Menu - GetModuleHandle(0), // Instance handle - NULL // Additional application data + hwnd = CreateWindowExA(0, // Optional window styles. + CLASS_NAME, // Window class + window_name.c_str(), // Window text + WS_OVERLAPPEDWINDOW, // Window style + // Size and position + CW_USEDEFAULT, CW_USEDEFAULT, + window_rect.right - window_rect.left, + window_rect.bottom - window_rect.top, + NULL, // Parent window + NULL, // Menu + GetModuleHandleA(0), // Instance handle + NULL // Additional application data ); TI_ERROR_IF(hwnd == NULL, "Window creation failed"); gui_from_hwnd[hwnd] = this; @@ -201,7 +200,7 @@ void GUI::create_window() { // https://www.cnblogs.com/lidabo/archive/2012/07/17/2595452.html LONG style = GetWindowLong(hwnd, GWL_STYLE); style &= ~WS_CAPTION & ~WS_SIZEBOX; - SetWindowLong(hwnd, GWL_STYLE, style); + SetWindowLongA(hwnd, GWL_STYLE, style); SetWindowPos(hwnd, NULL, 0, 0, GetSystemMetrics(SM_CXSCREEN), GetSystemMetrics(SM_CYSCREEN), SWP_NOZORDER); } @@ -235,7 +234,7 @@ void GUI::redraw() { } void GUI::set_title(std::string title) { - SetWindowText(hwnd, std::wstring(title.begin(), title.end()).data()); + SetWindowTextA(hwnd, title.c_str()); } GUI::~GUI() { From c3631c09c1b2416f2e2740628acf78b7c06fb394 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Wed, 11 May 2022 14:32:53 +0800 Subject: [PATCH 050/176] [Lang] Add more functions to math module (#4939) * add more functions to math module * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add more functions to math module * add more functions to math module * add more functions to math module * [pre-commit.ci] auto fixes from 
pre-commit.com hooks for more information, see https://pre-commit.ci * add more functions to math module * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add more functions to math module * add more functions to math module * Update _funcs.py * Update python/taichi/_funcs.py Co-authored-by: pengyu <6712304+FantasyVR@users.noreply.github.com> * Update python/taichi/math/mathimpl.py Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: pengyu <6712304+FantasyVR@users.noreply.github.com> --- python/taichi/_funcs.py | 33 ++++++++++++++++++++++-------- python/taichi/math/__init__.py | 2 +- python/taichi/math/mathimpl.py | 37 ++++++++++++++++++++++++++++------ tests/python/test_api.py | 14 +++++++------ 4 files changed, 65 insertions(+), 21 deletions(-) diff --git a/python/taichi/_funcs.py b/python/taichi/_funcs.py index 6753234c77070..6fce332466981 100644 --- a/python/taichi/_funcs.py +++ b/python/taichi/_funcs.py @@ -100,7 +100,6 @@ def _matrix_outer_product(self, other): @func def polar_decompose2d(A, dt): """Perform polar decomposition (A=UP) for 2x2 matrix. - Mathematical concept refers to https://en.wikipedia.org/wiki/Polar_decomposition. Args: @@ -108,14 +107,32 @@ def polar_decompose2d(A, dt): dt (DataType): date type of elements in matrix `A`, typically accepts ti.f32 or ti.f64. Returns: - Decomposed 2x2 matrices `U` and `P`. + Decomposed 2x2 matrices `U` and `P`. `U` is a 2x2 orthogonal matrix + and `P` is a 2x2 positive or semi-positive definite matrix. 
""" - x, y = A(0, 0) + A(1, 1), A(1, 0) - A(0, 1) - scale = (1.0 / ops.sqrt(x * x + y * y)) - c = x * scale - s = y * scale - r = Matrix([[c, -s], [s, c]], dt=dt) - return r, r.transpose() @ A + U = Matrix.identity(dt, 2) + P = ops.cast(A, dt) + zero = ops.cast(0.0, dt) + # if A is a zero matrix we simply return the pair (I, A) + if (A[0, 0] == zero and A[0, 1] == zero and A[1, 0] == zero + and A[1, 1] == zero): + pass + else: + detA = A[0, 0] * A[1, 1] - A[1, 0] * A[0, 1] + adetA = abs(detA) + B = Matrix([[A[0, 0] + A[1, 1], A[0, 1] - A[1, 0]], + [A[1, 0] - A[0, 1], A[1, 1] + A[0, 0]]], dt) + + if detA < zero: + B = Matrix([[A[0, 0] - A[1, 1], A[0, 1] + A[1, 0]], + [A[1, 0] + A[0, 1], A[1, 1] - A[0, 0]]], dt) + # here det(B) != 0 if A is not the zero matrix + adetB = abs(B[0, 0] * B[1, 1] - B[1, 0] * B[0, 1]) + k = ops.cast(1.0, dt) / ops.sqrt(adetB) + U = B * k + P = (A.transpose() @ A + adetA * Matrix.identity(dt, 2)) * k + + return U, P @func diff --git a/python/taichi/math/__init__.py b/python/taichi/math/__init__.py index b08ced0b26c53..b31f96b061de1 100644 --- a/python/taichi/math/__init__.py +++ b/python/taichi/math/__init__.py @@ -3,6 +3,6 @@ The math module supports glsl-style vectors, matrices and functions. """ from ._complex import * -from .mathimpl import * +from .mathimpl import * # pylint: disable=W0622 del mathimpl diff --git a/python/taichi/math/mathimpl.py b/python/taichi/math/mathimpl.py index fced31ec73bcf..085e7d169ff2b 100644 --- a/python/taichi/math/mathimpl.py +++ b/python/taichi/math/mathimpl.py @@ -1,9 +1,12 @@ +# pylint: disable=W0622 """ Math functions for glsl-like functions and other stuff. 
""" from math import e, pi from taichi.lang import impl +from taichi.lang.ops import (acos, asin, atan2, ceil, cos, exp, floor, log, + max, min, pow, round, sin, sqrt, tan, tanh) import taichi as ti @@ -576,7 +579,7 @@ def rot3(axis, ang): >>> from taichi.math import * >>> @ti.kernel >>> def test(): - >>> M = rot3(vec3(1, 1, 1), radians(30)) + >>> M = rot3(normalize(vec3(1, 1, 1)), radians(30)) [[0.732051, -0.366025, 0.633975], [0.633975, 0.732051, -0.366025], [-0.366025, 0.633975, 0.732051]] @@ -588,10 +591,32 @@ def rot3(axis, ang): return I + sa * K + (1.0 - ca) * K @ K +@ti.func +def length(x): + """Calculate the length of a vector. + + This function is equivalent to the `length` function in GLSL. + Args: + x (:class:`~taichi.Matrix`): The vector of which to calculate the length. + + Returns: + The Euclidean norm of the vector. + + Example:: + + >>> x = ti.Vector([1, 1, 1]) + >>> length(x) + 1.732051 + """ + return x.norm() + + __all__ = [ - "clamp", "cross", "degrees", "distance", "dot", "e", "eye", "fract", - "ivec2", "ivec3", "ivec4", "log2", "mat2", "mat3", "mat4", "mix", "mod", - "normalize", "pi", "radians", "reflect", "refract", "rot2", "rot3", - "rotate2d", "rotate3d", "sign", "smoothstep", "step", "uvec2", "uvec3", - "uvec4", "vec2", "vec3", "vec4" + "acos", "asin", "atan2", "ceil", "clamp", "cos", "cross", "degrees", + "distance", "dot", "e", "exp", "eye", "floor", "fract", "ivec2", "ivec3", + "ivec4", "length", "log", "log2", "mat2", "mat3", "mat4", "max", "min", + "mix", "mod", "normalize", "pi", "pow", "radians", "reflect", "refract", + "rot2", "rot3", "rotate2d", "rotate3d", "round", "sign", "sin", + "smoothstep", "sqrt", "step", "tan", "tanh", "uvec2", "uvec3", "uvec4", + "vec2", "vec3", "vec4" ] diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 23d5843aa70c9..33a19546be3de 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -97,12 +97,14 @@ def _get_expected_matrix_apis(): 'dynamic', 'finalize', 
'lazy_grad', 'place', 'pointer' ] user_api[ti.math] = [ - 'cconj', 'cdiv', 'cexp', 'cinv', 'clamp', 'clog', 'cmul', 'cpow', 'cross', - 'csqrt', 'degrees', 'distance', 'dot', 'e', 'eye', 'fract', 'ivec2', - 'ivec3', 'ivec4', 'log2', 'mat2', 'mat3', 'mat4', 'mix', 'mod', - 'normalize', 'pi', 'radians', 'reflect', 'refract', 'rot2', 'rot3', - 'rotate2d', 'rotate3d', 'sign', 'smoothstep', 'step', 'uvec2', 'uvec3', - 'uvec4', 'vec2', 'vec3', 'vec4' + 'acos', 'asin', 'atan2', 'cconj', 'cdiv', 'ceil', 'cexp', 'cinv', 'clamp', + 'clog', 'cmul', 'cos', 'cpow', 'cross', 'csqrt', 'degrees', 'distance', + 'dot', 'e', 'exp', 'eye', 'floor', 'fract', 'ivec2', 'ivec3', 'ivec4', + 'length', 'log', 'log2', 'mat2', 'mat3', 'mat4', 'max', 'min', 'mix', + 'mod', 'normalize', 'pi', 'pow', 'radians', 'reflect', 'refract', 'rot2', + 'rot3', 'rotate2d', 'rotate3d', 'round', 'sign', 'sin', 'smoothstep', + 'sqrt', 'step', 'tan', 'tanh', 'uvec2', 'uvec3', 'uvec4', 'vec2', 'vec3', + 'vec4' ] user_api[ti.Matrix] = _get_expected_matrix_apis() user_api[ti.MatrixField] = [ From 0795c74bf4401ac9c60795f833b51efac2589483 Mon Sep 17 00:00:00 2001 From: PGZXB <420254146@qq.com> Date: Wed, 11 May 2022 16:36:16 +0800 Subject: [PATCH 051/176] [lang] [bug] Implement Expression serializing and fix some bugs (#4931) * Serialize Expression and remove old useless ExpressionOfflineCacheKeyGenerator * Fix some bugs(reported by test_assert and test-snodes with offline_cache=True) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/analysis/gen_offline_cache_key.cpp | 343 +++++++++++++++++----- taichi/codegen/codegen_llvm.cpp | 4 +- taichi/ir/expression_printer.h | 70 ----- 3 files changed, 267 insertions(+), 150 deletions(-) diff --git a/taichi/analysis/gen_offline_cache_key.cpp b/taichi/analysis/gen_offline_cache_key.cpp index e2a336c6b6252..678076650e51a 100644 
--- a/taichi/analysis/gen_offline_cache_key.cpp +++ b/taichi/analysis/gen_offline_cache_key.cpp @@ -1,8 +1,6 @@ -#include -#include "taichi/analysis/offline_cache_util.h" -#include "taichi/common/logging.h" +#include "offline_cache_util.h" + #include "taichi/ir/expr.h" -#include "taichi/ir/expression_printer.h" #include "taichi/ir/frontend_ir.h" #include "taichi/ir/ir.h" #include "taichi/ir/mesh.h" @@ -15,7 +13,15 @@ namespace lang { namespace { +enum class ExprOpCode : std::uint8_t { + NIL, +#define PER_EXPRESSION(x) x, +#include "taichi/inc/expressions.inc.h" +#undef PER_EXPRESSION +}; + enum class StmtOpCode : std::uint8_t { + NIL, EnterBlock, ExitBlock, #define PER_STATEMENT(x) x, @@ -35,14 +41,19 @@ enum class ExternalFuncType : std::uint8_t { BC, }; -class ASTSerializer : public IRVisitor { +enum class MeshRelationAccessType { + Access, // mesh_relation_access + Size, // mesh_relation_size +}; + +class ASTSerializer : public IRVisitor, public ExpressionVisitor { + private: + using ExpressionVisitor::visit; + using IRVisitor::visit; + public: - ASTSerializer(Program *prog, - ExpressionPrinter *expr_printer, - std::ostream *os) - : prog_(prog), os_(os), expr_printer_(expr_printer) { + ASTSerializer(Program *prog, std::ostream *os) : prog_(prog), os_(os) { this->allow_undefined_visitor = true; - expr_printer_->set_ostream(os); } void set_ostream(std::ostream *os) { @@ -53,6 +64,178 @@ class ASTSerializer : public IRVisitor { return this->os_; } + void visit(Expression *expr) override { + expr->accept(this); + } + + void visit(Stmt *stmt) override { + stmt->accept(this); + } + + void visit(ExprGroup &expr_group) override { + emit(expr_group.exprs); + } + + void visit(ArgLoadExpression *expr) override { + emit(ExprOpCode::ArgLoadExpression); + emit(expr->dt); + emit(expr->arg_id); + } + + void visit(RandExpression *expr) override { + emit(ExprOpCode::RandExpression); + emit(expr->dt); + } + + void visit(UnaryOpExpression *expr) override { + 
emit(ExprOpCode::UnaryOpExpression); + emit(expr->type); + if (expr->is_cast()) { + emit(expr->cast_type); + } + emit(expr->operand); + } + + void visit(BinaryOpExpression *expr) override { + emit(ExprOpCode::BinaryOpExpression); + emit(expr->type); + emit(expr->lhs); + emit(expr->rhs); + } + + void visit(TernaryOpExpression *expr) override { + emit(ExprOpCode::TernaryOpExpression); + emit(expr->type); + emit(expr->op1); + emit(expr->op2); + emit(expr->op3); + } + + void visit(InternalFuncCallExpression *expr) override { + emit(ExprOpCode::InternalFuncCallExpression); + emit(expr->with_runtime_context); + emit(expr->func_name); + emit(expr->args); + } + + void visit(ExternalTensorExpression *expr) override { + emit(ExprOpCode::ExternalTensorExpression); + emit(expr->dt); + emit(expr->dim); + emit(expr->arg_id); + emit(expr->element_dim); + emit(expr->element_shape); + } + + void visit(GlobalVariableExpression *expr) override { + emit(ExprOpCode::GlobalVariableExpression); + emit(expr->ident); + emit(expr->dt); + emit(expr->snode); + emit(expr->has_ambient); + emit(expr->ambient_value); + emit(expr->is_primal); + emit(expr->adjoint); + } + + void visit(GlobalPtrExpression *expr) override { + emit(ExprOpCode::GlobalPtrExpression); + emit(expr->var); + emit(expr->indices.exprs); + } + + void visit(TensorElementExpression *expr) override { + emit(ExprOpCode::TensorElementExpression); + emit(expr->var); + emit(expr->indices.exprs); + emit(expr->shape); + emit(expr->stride); + } + + void visit(RangeAssumptionExpression *expr) override { + emit(ExprOpCode::RangeAssumptionExpression); + emit(expr->input); + emit(expr->base); + emit(expr->low); + emit(expr->high); + } + + void visit(LoopUniqueExpression *expr) override { + emit(ExprOpCode::LoopUniqueExpression); + emit(expr->input); + emit(expr->covers); + } + + void visit(IdExpression *expr) override { + emit(ExprOpCode::IdExpression); + emit(expr->id); + } + + void visit(AtomicOpExpression *expr) override { + 
emit(ExprOpCode::AtomicOpExpression); + emit(expr->op_type); + emit(expr->dest); + emit(expr->val); + } + + void visit(SNodeOpExpression *expr) override { + emit(ExprOpCode::SNodeOpExpression); + emit(expr->op_type); + emit(expr->snode); + std::size_t count = expr->indices.size(); + if (expr->value.expr) + ++count; + emit(count); + for (const auto &i : expr->indices.exprs) { + emit(i); + } + if (expr->value.expr) { + emit(expr->value); + } + } + + void visit(ConstExpression *expr) override { + emit(ExprOpCode::ConstExpression); + emit(expr->val); + } + + void visit(ExternalTensorShapeAlongAxisExpression *expr) override { + emit(ExprOpCode::ExternalTensorShapeAlongAxisExpression); + emit(expr->ptr); + emit(expr->axis); + } + + void visit(FuncCallExpression *expr) override { + emit(ExprOpCode::FuncCallExpression); + emit(expr->func); + emit(expr->args.exprs); + } + + void visit(MeshPatchIndexExpression *expr) override { + emit(ExprOpCode::MeshPatchIndexExpression); + } + + void visit(MeshRelationAccessExpression *expr) override { + emit(ExprOpCode::MeshRelationAccessExpression); + if (expr->neighbor_idx) { + emit(MeshRelationAccessType::Access); + emit(expr->neighbor_idx); + } else { + emit(MeshRelationAccessType::Size); + } + emit(expr->mesh); + emit(expr->to_type); + emit(expr->mesh_idx); + } + + void visit(MeshIndexConversionExpression *expr) override { + emit(ExprOpCode::MeshIndexConversionExpression); + emit(expr->mesh); + emit(expr->idx_type); + emit(expr->idx); + emit(expr->conv_type); + } + void visit(Block *block) override { emit(StmtOpCode::EnterBlock); emit(static_cast(block->statements.size())); @@ -90,6 +273,8 @@ class ASTSerializer : public IRVisitor { void visit(FrontendAssertStmt *stmt) override { emit(StmtOpCode::FrontendAssertStmt); emit(stmt->cond); + emit(stmt->text); + emit(stmt->args); } void visit(FrontendSNodeOpStmt *stmt) override { @@ -205,10 +390,7 @@ class ASTSerializer : public IRVisitor { } static void run(Program *prog, IRNode *ast, 
std::ostream *os) { - // Temporary: using ExpressionOfflineCacheKeyGenerator, which will be - // refactored - ExpressionOfflineCacheKeyGenerator generator(prog); - ASTSerializer serializer(prog, &generator, os); + ASTSerializer serializer(prog, os); ast->accept(&serializer); serializer.emit_dependencies(); } @@ -219,11 +401,11 @@ class ASTSerializer : public IRVisitor { std::ostringstream temp_oss; auto *curr_os = this->get_ostream(); this->set_ostream(&temp_oss); - expr_printer_->set_ostream(&temp_oss); std::size_t last_size{0}; do { last_size = real_funcs_.size(); - for (auto &[func, visited] : real_funcs_) { + for (auto &[func, v] : real_funcs_) { + auto &[id, visited] = v; if (!visited) { visited = true; func->ir->accept(this); // Maybe add new func @@ -231,9 +413,9 @@ class ASTSerializer : public IRVisitor { } } while (real_funcs_.size() > last_size); this->set_ostream(curr_os); - expr_printer_->set_ostream(curr_os); emit(static_cast(real_funcs_.size())); - emit(&temp_oss); + auto real_funcs_ast_string = temp_oss.str(); + emit_bytes(real_funcs_ast_string.data(), real_funcs_ast_string.size()); // Serialize snode_trees(Temporary: using offline-cache-key of SNode) // Note: The result of serializing snode_tree_roots_ is not parsable now @@ -257,9 +439,19 @@ class ASTSerializer : public IRVisitor { void emit_bytes(const char *bytes, std::size_t len) { TI_ASSERT(os_); + if (!bytes) + return; os_->write(bytes, len); } + template + void emit(const std::vector &v) { + emit(static_cast(v.size())); + for (const auto &e : v) { + emit(e); + } + } + template void emit(const std::unordered_map &map) { emit(static_cast(map.size())); @@ -284,11 +476,6 @@ class ASTSerializer : public IRVisitor { } } - void emit(std::ostream *os) { - TI_ASSERT(os_ && os); - *os_ << os->rdbuf(); - } - void emit(const std::string &str) { std::size_t size = str.size(); std::size_t offset = string_pool_.size(); @@ -297,29 +484,36 @@ class ASTSerializer : public IRVisitor { emit(offset); } - void 
emit(SNodeOpType type) { - emit_pod(type); - } - - void emit(SNode *snode) { - TI_ASSERT(snode); - TI_ASSERT(prog_); - emit(static_cast(snode->get_snode_tree_id())); - emit(static_cast(snode->id)); - auto *root = prog_->get_snode_root(snode->get_snode_tree_id()); - snode_tree_roots_.insert(root); - } - - void emit(mesh::MeshElementType type) { - emit_pod(type); + void emit(Function *func) { + TI_ASSERT(func); + auto iter = real_funcs_.find(func); + if (iter != real_funcs_.end()) { + emit(iter->second.first); + } else { + auto [iter, ok] = real_funcs_.insert({func, {real_funcs_.size(), false}}); + TI_ASSERT(ok); + emit(iter->second.first); + } } - void emit(mesh::MeshRelationType type) { - emit_pod(type); + void emit(const TypedConstant &val) { + emit(val.dt); + if (!val.dt->is_primitive(PrimitiveTypeID::unknown)) { + emit(val.stringify()); + } } - void emit(mesh::ConvType type) { - emit_pod(type); + void emit(SNode *snode) { + TI_ASSERT(prog_); + if (snode) { + emit(static_cast(snode->get_snode_tree_id())); + emit(static_cast(snode->id)); + auto *root = prog_->get_snode_root(snode->get_snode_tree_id()); + snode_tree_roots_.insert(root); + } else { + emit(std::numeric_limits::max()); + emit(std::numeric_limits::max()); + } } void emit(const mesh::MeshLocalRelation &r) { @@ -330,6 +524,7 @@ class ASTSerializer : public IRVisitor { } void emit(mesh::Mesh *mesh) { + TI_ASSERT(mesh); emit(mesh->num_patches); emit(mesh->num_elements); emit(mesh->patch_max_element_num); @@ -343,43 +538,25 @@ class ASTSerializer : public IRVisitor { emit(ident.id); } - void emit(const std::vector &identifiers) { - emit(static_cast(identifiers.size())); - for (const auto &id : identifiers) { - emit(id); - } - } - - void emit(PrimitiveTypeID type_id) { - emit_pod(type_id); - } - void emit(const DataType &type) { if (auto *p = type->cast()) { emit(p->type); } else { - TI_NOT_IMPLEMENTED; + auto type_str = type->to_string(); + emit(type_str); } } - void emit(StmtOpCode code) { - 
emit_pod(code); - } - void emit(IRNode *ir) { TI_ASSERT(ir); ir->accept(this); } void emit(const Expr &expr) { - TI_ASSERT(expr_printer_); - expr.expr->accept(expr_printer_); - } - - void emit(const std::vector &exprs) { - emit(static_cast(exprs.size())); - for (const auto &e : exprs) { - emit(e); + if (expr) { + expr.expr->accept(this); + } else { + emit(ExprOpCode::NIL); } } @@ -399,14 +576,6 @@ class ASTSerializer : public IRVisitor { emit_pod(v); } - void emit(ForLoopType type) { - emit_pod(type); - } - - void emit(SNodeAccessFlag flag) { - emit_pod(flag); - } - void emit(const MemoryAccessOptions &mem_access_options) { auto all_options = mem_access_options.get_all(); emit(static_cast(all_options.size())); @@ -419,15 +588,33 @@ class ASTSerializer : public IRVisitor { } } - void emit(ExternalFuncType type) { - emit_pod(type); +#define DEFINE_EMIT_ENUM(EnumType) \ + void emit(EnumType type) { \ + emit_pod(type); \ } + DEFINE_EMIT_ENUM(ExprOpCode); + DEFINE_EMIT_ENUM(StmtOpCode); + DEFINE_EMIT_ENUM(PrimitiveTypeID); + DEFINE_EMIT_ENUM(UnaryOpType); + DEFINE_EMIT_ENUM(BinaryOpType); + DEFINE_EMIT_ENUM(TernaryOpType); + DEFINE_EMIT_ENUM(AtomicOpType); + DEFINE_EMIT_ENUM(SNodeOpType); + DEFINE_EMIT_ENUM(ForLoopType); + DEFINE_EMIT_ENUM(SNodeAccessFlag); + DEFINE_EMIT_ENUM(MeshRelationAccessType); + DEFINE_EMIT_ENUM(ExternalFuncType); + DEFINE_EMIT_ENUM(mesh::MeshElementType); + DEFINE_EMIT_ENUM(mesh::MeshRelationType); + DEFINE_EMIT_ENUM(mesh::ConvType); + +#undef DEFINE_EMIT_ENUM + Program *prog_{nullptr}; std::ostream *os_{nullptr}; - ExpressionPrinter *expr_printer_{nullptr}; std::unordered_set snode_tree_roots_; - std::unordered_map real_funcs_; + std::unordered_map> real_funcs_; std::vector string_pool_; }; diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 5596189b42dd4..baf5ab19c3738 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -2389,8 +2389,8 @@ FunctionType CodeGenLLVM::gen() { bool 
needs_cache = false; const auto &config = prog->config; std::string kernel_key; - if (config.offline_cache && this->supports_offline_cache() && - !kernel->is_evaluator) { + if (config.offline_cache && !config.async_mode && + this->supports_offline_cache() && !kernel->is_evaluator) { kernel_key = get_hashed_offline_cache_key(&kernel->program->config, kernel); LlvmOfflineCacheFileReader reader(config.offline_cache_file_path); diff --git a/taichi/ir/expression_printer.h b/taichi/ir/expression_printer.h index 7d1b463896f61..92c882bcd19ff 100644 --- a/taichi/ir/expression_printer.h +++ b/taichi/ir/expression_printer.h @@ -256,75 +256,5 @@ class ExpressionHumanFriendlyPrinter : public ExpressionPrinter { } }; -// Temporary reuse ExpressionHumanFriendlyPrinter -class ExpressionOfflineCacheKeyGenerator - : public ExpressionHumanFriendlyPrinter { - public: - explicit ExpressionOfflineCacheKeyGenerator(Program *prog, - std::ostream *os = nullptr) - : ExpressionHumanFriendlyPrinter(os), prog_(prog) { - } - - void visit(GlobalVariableExpression *expr) override { - emit("#", expr->ident.name()); - if (expr->snode) { - emit("(snode=", this->get_hashed_key_of_snode(expr->snode), ')'); - } else { - emit("(dt=", expr->dt->to_string(), ')'); - } - } - - void visit(GlobalPtrExpression *expr) override { - if (expr->snode) { - emit(this->get_hashed_key_of_snode(expr->snode)); - } else { - expr->var->accept(this); - } - emit('['); - emit_vector(expr->indices.exprs); - emit(']'); - } - - void visit(SNodeOpExpression *expr) override { - emit(snode_op_type_name(expr->op_type)); - emit('(', this->get_hashed_key_of_snode(expr->snode), ", ["); - emit_vector(expr->indices.exprs); - emit(']'); - if (expr->value.expr) { - emit(' '); - expr->value->accept(this); - } - emit(')'); - } - - private: - const std::string &cache_snode_tree_key(int snode_tree_id, - std::string &&key) { - if (snode_tree_id >= snode_tree_key_cache_.size()) { - snode_tree_key_cache_.resize(snode_tree_id + 1); - } - return 
snode_tree_key_cache_[snode_tree_id] = std::move(key); - } - - std::string get_hashed_key_of_snode(SNode *snode) { - TI_ASSERT(snode && prog_); - auto snode_tree_id = snode->get_snode_tree_id(); - std::string res; - if (snode_tree_id < snode_tree_key_cache_.size() && - !snode_tree_key_cache_[snode_tree_id].empty()) { - res = snode_tree_key_cache_[snode_tree_id]; - } else { - auto *snode_tree_root = prog_->get_snode_root(snode_tree_id); - auto snode_tree_key = - get_hashed_offline_cache_key_of_snode(snode_tree_root); - res = cache_snode_tree_key(snode_tree_id, std::move(snode_tree_key)); - } - return res.append(std::to_string(snode->id)); - } - - Program *prog_{nullptr}; - std::vector snode_tree_key_cache_; -}; - } // namespace lang } // namespace taichi From e21a4e3ba70f0ec647e2f41b416fe91dd0e13a35 Mon Sep 17 00:00:00 2001 From: yekuang Date: Wed, 11 May 2022 17:39:13 +0800 Subject: [PATCH 052/176] [refactor] Add ArrayMetadata to store the array runtime size (#4950) * [refactor] Add ArrayMetadata to store the array runtime size * rm macros * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * revert to debug * decompose * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/backends/cuda/codegen_cuda.cpp | 20 +++++++++++--------- taichi/codegen/codegen_llvm.cpp | 5 +++-- taichi/program/context.h | 24 ++++++++++++++++++------ taichi/program/kernel.cpp | 11 ++++++++--- taichi/runtime/opengl/opengl_api.cpp | 24 ++++++++++++++---------- 5 files changed, 54 insertions(+), 30 deletions(-) diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp index c1c3dcf1807b6..856b0f182f4a5 100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -65,13 +65,15 @@ class CodeGenLLVMCUDA : public 
CodeGenLLVM { bool transferred = false; for (int i = 0; i < (int)args.size(); i++) { if (args[i].is_array) { - if (args[i].size == 0) + const auto arr_sz = context.array_runtime_sizes[i]; + if (arr_sz == 0) { continue; + } arg_buffers[i] = context.get_arg(i); if (!context.is_device_allocation[i]) { // Note: both numpy and PyTorch support arrays/tensors with zeros // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes - // args[i].size = 0. + // `arr_sz` zero. unsigned int attr_val = 0; uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( @@ -86,19 +88,18 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { // host. // See CUDA driver API `cuPointerGetAttribute` for more details. transferred = true; - CUDADriver::get_instance().malloc(&device_buffers[i], - args[i].size); + CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz); CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_buffers[i], arg_buffers[i], args[i].size); + (void *)device_buffers[i], arg_buffers[i], arr_sz); } else { device_buffers[i] = arg_buffers[i]; } // device_buffers[i] saves a raw ptr on CUDA device. ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i], - args[i].size, + arr_sz, /*is_device_allocation=*/false); - } else if (args[i].size > 0) { + } else if (arr_sz > 0) { // arg_buffers[i] is a DeviceAllocation* // TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since // it's shared by cpu and cuda. 
@@ -114,7 +115,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { // device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i] ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i], - args[i].size, + arr_sz, /*is_device_allocation=*/false); } } @@ -135,7 +136,8 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { for (int i = 0; i < (int)args.size(); i++) { if (device_buffers[i] != arg_buffers[i]) { CUDADriver::get_instance().memcpy_device_to_host( - arg_buffers[i], (void *)device_buffers[i], args[i].size); + arg_buffers[i], (void *)device_buffers[i], + context.array_runtime_sizes[i]); CUDADriver::get_instance().mem_free((void *)device_buffers[i]); } } diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index baf5ab19c3738..2e46059688961 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -2297,13 +2297,14 @@ FunctionType CodeGenLLVM::compile_module_to_executable() { // |DeviceAllocation|, CPU backend actually want to use the raw ptr here. for (int i = 0; i < (int)args.size(); i++) { if (args[i].is_array && context.is_device_allocation[i] && - args[i].size > 0) { + context.array_runtime_sizes[i] > 0) { DeviceAllocation *ptr = static_cast(context.get_arg(i)); uint64 host_ptr = (uint64)kernel->program->get_llvm_program_impl() ->get_ndarray_alloc_info_ptr(*ptr); context.set_arg(i, host_ptr); - context.set_device_allocation(i, false); + context.set_array_is_device_allocation(i, + /*is_device_allocation=*/false); } } for (auto task : offloaded_tasks_local) { diff --git a/taichi/program/context.h b/taichi/program/context.h index ad85e09a38437..2712152b0a556 100644 --- a/taichi/program/context.h +++ b/taichi/program/context.h @@ -22,7 +22,15 @@ struct RuntimeContext { uint64 args[taichi_max_num_args_total]; int32 extra_args[taichi_max_num_args_extra][taichi_max_num_indices]; int32 cpu_thread_id; - // |is_device_allocation| is true iff args[i] is a DeviceAllocation*. 
+ + // Note that I've tried to group `array_runtime_size` and + // `is_device_allocation` into a small struct. However, it caused some test + // cases to stuck. + + // `array_runtime_size` records the runtime size of the + // corresponding array arguments. + uint64 array_runtime_sizes[taichi_max_num_args_total]{0}; + // `is_device_allocation` is true iff args[i] is a DeviceAllocation*. bool is_device_allocation[taichi_max_num_args_total]{false}; // We move the pointer of result buffer from LLVMRuntime to RuntimeContext // because each real function need a place to store its result, but @@ -45,11 +53,15 @@ struct RuntimeContext { template void set_arg(int i, T v) { args[i] = taichi_union_cast_with_different_sizes(v); - set_device_allocation(i, false); + set_array_is_device_allocation(i, /*is_device_allocation=*/false); + } + + void set_array_runtime_size(int i, uint64 size) { + this->array_runtime_sizes[i] = size; } - void set_device_allocation(int i, bool is_device_allocation_) { - is_device_allocation[i] = is_device_allocation_; + void set_array_is_device_allocation(int i, bool is_device_allocation) { + this->is_device_allocation[i] = is_device_allocation; } template @@ -61,7 +73,7 @@ struct RuntimeContext { DeviceAllocation &alloc, const std::vector &shape) { args[arg_id] = taichi_union_cast_with_different_sizes(&alloc); - set_device_allocation(arg_id, true); + set_array_is_device_allocation(arg_id, /*is_device_allocation=*/true); TI_ASSERT(shape.size() <= taichi_max_num_indices); for (int i = 0; i < shape.size(); i++) { extra_args[arg_id][i] = shape[i]; @@ -73,7 +85,7 @@ struct RuntimeContext { const std::vector &shape, const std::vector &element_shape) { args[arg_id] = taichi_union_cast_with_different_sizes(&alloc); - set_device_allocation(arg_id, true); + set_array_is_device_allocation(arg_id, /*is_device_allocation=*/true); TI_ASSERT(shape.size() + element_shape.size() <= taichi_max_num_indices); for (int i = 0; i < shape.size(); i++) { extra_args[arg_id][i] 
= shape[i]; diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 863070aeec6af..3965bd9b27566 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -238,9 +238,12 @@ void Kernel::LaunchContextBuilder::set_arg_external_array( ActionArg("address", fmt::format("0x{:x}", ptr)), ActionArg("array_size_in_bytes", (int64)size)}); + // FIXME(https://github.com/taichi-dev/taichi/issues/4949): Make the Metal + // backend support Ndarray, then remove this line below. kernel_->args[arg_id].size = size; ctx_->set_arg(arg_id, ptr); - ctx_->set_device_allocation(arg_id, is_device_allocation); + ctx_->set_array_runtime_size(arg_id, size); + ctx_->set_array_is_device_allocation(arg_id, is_device_allocation); } void Kernel::LaunchContextBuilder::set_arg_external_array_with_shape( @@ -248,7 +251,8 @@ void Kernel::LaunchContextBuilder::set_arg_external_array_with_shape( uintptr_t ptr, uint64 size, const std::vector &shape) { - this->set_arg_external_array(arg_id, ptr, size, false); + this->set_arg_external_array(arg_id, ptr, size, + /*is_device_allocation=*/false); TI_ASSERT_INFO(shape.size() <= taichi_max_num_indices, "External array cannot have > {max_num_indices} indices"); for (uint64 i = 0; i < shape.size(); ++i) { @@ -260,7 +264,8 @@ void Kernel::LaunchContextBuilder::set_arg_ndarray(int arg_id, const Ndarray &arr) { intptr_t ptr = arr.get_device_allocation_ptr_as_int(); uint64 arr_size = arr.get_element_size() * arr.get_nelement(); - this->set_arg_external_array(arg_id, ptr, arr_size, true); + this->set_arg_external_array(arg_id, ptr, arr_size, + /*is_device_allocation=*/true); TI_ASSERT_INFO(arr.shape.size() <= taichi_max_num_indices, "External array cannot have > {max_num_indices} indices"); for (uint64 i = 0; i < arr.shape.size(); ++i) { diff --git a/taichi/runtime/opengl/opengl_api.cpp b/taichi/runtime/opengl/opengl_api.cpp index cbb05d0340b56..17ba414f1c5ce 100644 --- a/taichi/runtime/opengl/opengl_api.cpp +++ 
b/taichi/runtime/opengl/opengl_api.cpp @@ -232,6 +232,7 @@ void CompiledTaichiKernel::init_args(Kernel *kernel) { for (int i = 0; i < arg_count; i++) { const auto dtype_name = kernel->args[i].dt.to_string(); if (kernel->args[i].is_array) { + constexpr uint64 kUnkownRuntimeSize = 0; arr_args[i] = CompiledArrayArg( {/*dtype_enum=*/to_gl_dtype_enum(kernel->args[i].dt), dtype_name, /*field_dim=*/kernel->args[i].total_dim - @@ -240,7 +241,7 @@ void CompiledTaichiKernel::init_args(Kernel *kernel) { /*element_shape=*/kernel->args[i].element_shape, /*shape_offset_in_bytes_in_args_buf=*/taichi_opengl_extra_args_base + i * taichi_max_num_indices * sizeof(int), - /*total_size=*/kernel->args[i].size}); + kUnkownRuntimeSize}); } else { scalar_args[i] = ScalarArg( {dtype_name, /*offset_in_bytes_in_args_buf=*/i * sizeof(uint64_t)}); @@ -400,23 +401,25 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, for (auto &item : program_.arr_args) { int i = item.first; TI_ASSERT(args[i].is_array); - if (args[i].size == 0 || ctx.is_device_allocation[i]) + const auto arr_sz = ctx.array_runtime_sizes[i]; + if (arr_sz == 0 || ctx.is_device_allocation[i]) { continue; + } has_ext_arr = true; - if (args[i].size != item.second.total_size || + if (arr_sz != item.second.total_size || ext_arr_bufs_[i] == kDeviceNullAllocation) { if (ext_arr_bufs_[i] != kDeviceNullAllocation) { device_->dealloc_memory(ext_arr_bufs_[i]); } - ext_arr_bufs_[i] = device_->allocate_memory( - {args[i].size, /*host_write=*/true, /*host_read=*/true, - /*export_sharing=*/false}); - item.second.total_size = args[i].size; + ext_arr_bufs_[i] = device_->allocate_memory({arr_sz, /*host_write=*/true, + /*host_read=*/true, + /*export_sharing=*/false}); + item.second.total_size = arr_sz; } void *host_ptr = (void *)ctx.args[i]; void *baseptr = device_->map(ext_arr_bufs_[i]); if (program_.check_ext_arr_read(i)) { - std::memcpy((char *)baseptr, host_ptr, args[i].size); + std::memcpy((char *)baseptr, host_ptr, arr_sz); } 
device_->unmap(ext_arr_bufs_[i]); } @@ -503,9 +506,10 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, if (has_ext_arr) { for (auto &item : program_.arr_args) { int i = item.first; - if (args[i].size != 0 && !ctx.is_device_allocation[i]) { + const auto arr_sz = ctx.array_runtime_sizes[i]; + if (arr_sz > 0 && !ctx.is_device_allocation[i]) { uint8_t *baseptr = (uint8_t *)device_->map(ext_arr_bufs_[i]); - memcpy((void *)ctx.args[i], baseptr, args[i].size); + memcpy((void *)ctx.args[i], baseptr, arr_sz); device_->unmap(ext_arr_bufs_[i]); } } From 80f20f25279112bcdf012ece57de2033d8b046be Mon Sep 17 00:00:00 2001 From: yekuang Date: Wed, 11 May 2022 19:19:20 +0800 Subject: [PATCH 053/176] [refactor] Some renamings (#4959) --- taichi/backends/cuda/codegen_cuda.cpp | 2 +- taichi/codegen/codegen_llvm.cpp | 2 +- taichi/program/context.h | 8 ++++---- taichi/runtime/opengl/opengl_api.cpp | 10 +++++----- taichi/runtime/opengl/opengl_api.h | 2 +- taichi/runtime/vulkan/runtime.cpp | 10 +++++----- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp index 856b0f182f4a5..aa20cb43c8e77 100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -70,7 +70,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { continue; } arg_buffers[i] = context.get_arg(i); - if (!context.is_device_allocation[i]) { + if (!context.is_device_allocations[i]) { // Note: both numpy and PyTorch support arrays/tensors with zeros // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes // `arr_sz` zero. 
diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 2e46059688961..37469c154c941 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -2296,7 +2296,7 @@ FunctionType CodeGenLLVM::compile_module_to_executable() { // For taichi ndarrays, context.args saves pointer to its // |DeviceAllocation|, CPU backend actually want to use the raw ptr here. for (int i = 0; i < (int)args.size(); i++) { - if (args[i].is_array && context.is_device_allocation[i] && + if (args[i].is_array && context.is_device_allocations[i] && context.array_runtime_sizes[i] > 0) { DeviceAllocation *ptr = static_cast(context.get_arg(i)); diff --git a/taichi/program/context.h b/taichi/program/context.h index 2712152b0a556..1e630d68d1a90 100644 --- a/taichi/program/context.h +++ b/taichi/program/context.h @@ -24,14 +24,14 @@ struct RuntimeContext { int32 cpu_thread_id; // Note that I've tried to group `array_runtime_size` and - // `is_device_allocation` into a small struct. However, it caused some test + // `is_device_allocations` into a small struct. However, it caused some test // cases to stuck. // `array_runtime_size` records the runtime size of the // corresponding array arguments. uint64 array_runtime_sizes[taichi_max_num_args_total]{0}; - // `is_device_allocation` is true iff args[i] is a DeviceAllocation*. - bool is_device_allocation[taichi_max_num_args_total]{false}; + // `is_device_allocations` is true iff i-th arg is a `DeviceAllocation*`. + bool is_device_allocations[taichi_max_num_args_total]{false}; // We move the pointer of result buffer from LLVMRuntime to RuntimeContext // because each real function need a place to store its result, but // LLVMRuntime is shared among functions. 
So we moved the pointer to @@ -61,7 +61,7 @@ struct RuntimeContext { } void set_array_is_device_allocation(int i, bool is_device_allocation) { - this->is_device_allocation[i] = is_device_allocation; + this->is_device_allocations[i] = is_device_allocation; } template diff --git a/taichi/runtime/opengl/opengl_api.cpp b/taichi/runtime/opengl/opengl_api.cpp index 17ba414f1c5ce..a7f8985f0390b 100644 --- a/taichi/runtime/opengl/opengl_api.cpp +++ b/taichi/runtime/opengl/opengl_api.cpp @@ -402,11 +402,11 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, int i = item.first; TI_ASSERT(args[i].is_array); const auto arr_sz = ctx.array_runtime_sizes[i]; - if (arr_sz == 0 || ctx.is_device_allocation[i]) { + if (arr_sz == 0 || ctx.is_device_allocations[i]) { continue; } has_ext_arr = true; - if (arr_sz != item.second.total_size || + if (arr_sz != item.second.runtime_size || ext_arr_bufs_[i] == kDeviceNullAllocation) { if (ext_arr_bufs_[i] != kDeviceNullAllocation) { device_->dealloc_memory(ext_arr_bufs_[i]); @@ -414,7 +414,7 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, ext_arr_bufs_[i] = device_->allocate_memory({arr_sz, /*host_write=*/true, /*host_read=*/true, /*export_sharing=*/false}); - item.second.total_size = arr_sz; + item.second.runtime_size = arr_sz; } void *host_ptr = (void *)ctx.args[i]; void *baseptr = device_->map(ext_arr_bufs_[i]); @@ -471,7 +471,7 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, // On most devices this number is 8. But I need to look up how // to query this information so currently this is thrown from OpenGl. 
for (const auto [arg_id, bind_id] : program_.used.arr_arg_to_bind_idx) { - if (ctx.is_device_allocation[arg_id]) { + if (ctx.is_device_allocations[arg_id]) { DeviceAllocation *ptr = static_cast((void *)ctx.args[arg_id]); @@ -507,7 +507,7 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, for (auto &item : program_.arr_args) { int i = item.first; const auto arr_sz = ctx.array_runtime_sizes[i]; - if (arr_sz > 0 && !ctx.is_device_allocation[i]) { + if (arr_sz > 0 && !ctx.is_device_allocations[i]) { uint8_t *baseptr = (uint8_t *)device_->map(ext_arr_bufs_[i]); memcpy((void *)ctx.args[i], baseptr, arr_sz); device_->unmap(ext_arr_bufs_[i]); diff --git a/taichi/runtime/opengl/opengl_api.h b/taichi/runtime/opengl/opengl_api.h index 7fb5fe203e1be..1262444178eea 100644 --- a/taichi/runtime/opengl/opengl_api.h +++ b/taichi/runtime/opengl/opengl_api.h @@ -66,7 +66,7 @@ struct CompiledArrayArg { bool is_scalar{false}; std::vector element_shape; size_t shape_offset_in_bytes_in_args_buf{0}; - size_t total_size{0}; // Runtime information + size_t runtime_size{0}; // Runtime information TI_IO_DEF(field_dim, is_scalar, diff --git a/taichi/runtime/vulkan/runtime.cpp b/taichi/runtime/vulkan/runtime.cpp index bc1fca35c7c76..3a4d199824a13 100644 --- a/taichi/runtime/vulkan/runtime.cpp +++ b/taichi/runtime/vulkan/runtime.cpp @@ -79,7 +79,7 @@ class HostDeviceContextBlitter { char *device_ptr = device_base + arg.offset_in_mem; do { if (arg.is_array) { - if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) { + if (!host_ctx_->is_device_allocations[i] && ext_arr_size.at(i)) { // Only need to blit ext arrs (host array) DeviceAllocation buffer = ext_arrays.at(i); char *const device_arr_ptr = @@ -150,7 +150,7 @@ class HostDeviceContextBlitter { for (int i = 0; i < ctx_attribs_->args().size(); ++i) { const auto &arg = ctx_attribs_->args()[i]; if (arg.is_array) { - if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) { + if (!host_ctx_->is_device_allocations[i] 
&& ext_arr_size.at(i)) { require_sync = true; } } @@ -166,7 +166,7 @@ class HostDeviceContextBlitter { for (int i = 0; i < ctx_attribs_->args().size(); ++i) { const auto &arg = ctx_attribs_->args()[i]; if (arg.is_array) { - if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) { + if (!host_ctx_->is_device_allocations[i] && ext_arr_size.at(i)) { // Only need to blit ext arrs (host array) DeviceAllocation buffer = ext_arrays.at(i); char *const device_arr_ptr = @@ -455,7 +455,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { const auto &args = ti_kernel->ti_kernel_attribs().ctx_attribs.args(); for (auto &arg : args) { if (arg.is_array) { - if (host_ctx->is_device_allocation[i]) { + if (host_ctx->is_device_allocations[i]) { // NDArray if (host_ctx->args[i]) { any_arrays[i] = *(DeviceAllocation *)(host_ctx->args[i]); @@ -546,7 +546,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { // Dealloc external arrays for (auto pair : any_arrays) { if (pair.second != kDeviceNullAllocation) { - if (!host_ctx->is_device_allocation[pair.first]) { + if (!host_ctx->is_device_allocations[pair.first]) { device_->dealloc_memory(pair.second); } } From acedc0e72e7af3f9be7d7358333528ab40cd35e4 Mon Sep 17 00:00:00 2001 From: Lin Jiang <90667349+lin-hitonami@users.noreply.github.com> Date: Thu, 12 May 2022 13:38:28 +0800 Subject: [PATCH 054/176] [lang] Add reference type support on real functions (#4889) * wip * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fixes * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * add test * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix test_api Co-authored-by: pre-commit-ci[bot] 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/ast/ast_transformer.py | 6 +++++ python/taichi/lang/kernel_arguments.py | 12 ++++++--- python/taichi/lang/kernel_impl.py | 6 ++++- python/taichi/types/primitive_types.py | 11 +++++++++ taichi/analysis/data_source_analysis.cpp | 2 ++ taichi/codegen/codegen_llvm.cpp | 11 +++++++-- taichi/codegen/codegen_llvm.h | 2 ++ taichi/inc/expressions.inc.h | 1 + taichi/inc/statements.inc.h | 1 + taichi/ir/expression_printer.h | 6 +++++ taichi/ir/frontend_ir.cpp | 27 +++++++++++++++----- taichi/ir/frontend_ir.h | 21 +++++++++++++++- taichi/ir/statements.h | 20 +++++++++++++++ taichi/python/export_lang.cpp | 4 ++- taichi/transforms/ir_printer.cpp | 4 +++ taichi/transforms/lower_ast.cpp | 4 ++- taichi/transforms/type_check.cpp | 5 ++++ tests/python/test_api.py | 2 +- tests/python/test_function.py | 30 +++++++++++++++++++++++ 19 files changed, 158 insertions(+), 17 deletions(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 169d09d691272..9ffc80db38c0e 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -486,6 +486,12 @@ def transform_as_kernel(): arg.arg, kernel_arguments.decl_matrix_arg( ctx.func.arguments[i].annotation)) + elif isinstance(ctx.func.arguments[i].annotation, + primitive_types.RefType): + ctx.create_variable( + arg.arg, + kernel_arguments.decl_scalar_arg( + ctx.func.arguments[i].annotation)) else: ctx.global_vars[ arg.arg] = kernel_arguments.decl_scalar_arg( diff --git a/python/taichi/lang/kernel_arguments.py b/python/taichi/lang/kernel_arguments.py index d512f2ceb5a66..05adeb18316d7 100644 --- a/python/taichi/lang/kernel_arguments.py +++ b/python/taichi/lang/kernel_arguments.py @@ -8,7 +8,7 @@ from taichi.lang.expr import Expr from taichi.lang.matrix import Matrix, MatrixType from taichi.lang.util import cook_dtype -from taichi.types.primitive_types import u64 +from 
taichi.types.primitive_types import RefType, u64 class KernelArgument: @@ -47,9 +47,13 @@ def subscript(self, i, j): def decl_scalar_arg(dtype): + is_ref = False + if isinstance(dtype, RefType): + is_ref = True + dtype = dtype.tp dtype = cook_dtype(dtype) arg_id = impl.get_runtime().prog.decl_arg(dtype, False) - return Expr(_ti_core.make_arg_load_expr(arg_id, dtype)) + return Expr(_ti_core.make_arg_load_expr(arg_id, dtype, is_ref)) def decl_matrix_arg(matrixtype): @@ -63,8 +67,8 @@ def decl_sparse_matrix(dtype): ptr_type = cook_dtype(u64) # Treat the sparse matrix argument as a scalar since we only need to pass in the base pointer arg_id = impl.get_runtime().prog.decl_arg(ptr_type, False) - return SparseMatrixProxy(_ti_core.make_arg_load_expr(arg_id, ptr_type), - value_type) + return SparseMatrixProxy( + _ti_core.make_arg_load_expr(arg_id, ptr_type, False), value_type) def decl_ndarray_arg(dtype, dim, element_shape, layout): diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 0b820c11eac04..418b51bb8bba3 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -244,6 +244,9 @@ def func_call_rvalue(self, key, args): if not isinstance(anno, template): if id(anno) in primitive_types.type_ids: non_template_args.append(ops.cast(args[i], anno)) + elif isinstance(anno, primitive_types.RefType): + non_template_args.append( + _ti_core.make_reference(args[i].ptr)) else: non_template_args.append(args[i]) non_template_args = impl.make_expr_group(non_template_args) @@ -302,7 +305,8 @@ def extract_arguments(self): else: if not id(annotation ) in primitive_types.type_ids and not isinstance( - annotation, template): + annotation, template) and not isinstance( + annotation, primitive_types.RefType): raise TaichiSyntaxError( f'Invalid type annotation (argument {i}) of Taichi function: {annotation}' ) diff --git a/python/taichi/types/primitive_types.py b/python/taichi/types/primitive_types.py index 
726da1831a32a..3149ffc450a8b 100644 --- a/python/taichi/types/primitive_types.py +++ b/python/taichi/types/primitive_types.py @@ -141,6 +141,16 @@ # ---------------------------------------- + +class RefType: + def __init__(self, tp): + self.tp = tp + + +def ref(tp): + return RefType(tp) + + real_types = [f16, f32, f64, float] real_type_ids = [id(t) for t in real_types] @@ -173,4 +183,5 @@ 'u32', 'uint64', 'u64', + 'ref', ] diff --git a/taichi/analysis/data_source_analysis.cpp b/taichi/analysis/data_source_analysis.cpp index ec23e8be085ef..4a018afa6bf47 100644 --- a/taichi/analysis/data_source_analysis.cpp +++ b/taichi/analysis/data_source_analysis.cpp @@ -35,6 +35,8 @@ std::vector get_load_pointers(Stmt *load_stmt) { return std::vector(1, stack_pop->stack); } else if (auto external_func = load_stmt->cast()) { return external_func->arg_stmts; + } else if (auto ref = load_stmt->cast()) { + return {ref->var}; } else { return std::vector(); } diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 37469c154c941..22f701abb7cb5 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1086,6 +1086,9 @@ llvm::Value *CodeGenLLVM::bitcast_from_u64(llvm::Value *val, DataType type) { llvm::Value *CodeGenLLVM::bitcast_to_u64(llvm::Value *val, DataType type) { auto intermediate_bits = 0; + if (type.is_pointer()) { + return builder->CreatePtrToInt(val, tlctx->get_data_type()); + } if (auto cit = type->cast()) { intermediate_bits = data_type_bits(cit->get_compute_type()); } else { @@ -1109,8 +1112,8 @@ void CodeGenLLVM::visit(ArgLoadStmt *stmt) { llvm::Type *dest_ty = nullptr; if (stmt->is_ptr) { - dest_ty = - llvm::PointerType::get(tlctx->get_data_type(PrimitiveType::i32), 0); + dest_ty = llvm::PointerType::get( + tlctx->get_data_type(stmt->ret_type.ptr_removed()), 0); llvm_val[stmt] = builder->CreateIntToPtr(raw_arg, dest_ty); } else { llvm_val[stmt] = bitcast_from_u64(raw_arg, stmt->ret_type); @@ -2460,6 +2463,10 @@ 
llvm::Value *CodeGenLLVM::create_mesh_xlogue(std::unique_ptr &block) { return xlogue; } +void CodeGenLLVM::visit(ReferenceStmt *stmt) { + llvm_val[stmt] = llvm_val[stmt->var]; +} + void CodeGenLLVM::visit(FuncCallStmt *stmt) { if (!func_map.count(stmt->func)) { auto guard = get_function_creation_guard( diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index bbea19ba60dd6..51707e7ae6dfc 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -369,6 +369,8 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void visit(MeshPatchIndexStmt *stmt) override; + void visit(ReferenceStmt *stmt) override; + llvm::Value *create_xlogue(std::unique_ptr &block); llvm::Value *create_mesh_xlogue(std::unique_ptr &block); diff --git a/taichi/inc/expressions.inc.h b/taichi/inc/expressions.inc.h index faba902169142..4ec43c58357f9 100644 --- a/taichi/inc/expressions.inc.h +++ b/taichi/inc/expressions.inc.h @@ -19,3 +19,4 @@ PER_EXPRESSION(FuncCallExpression) PER_EXPRESSION(MeshPatchIndexExpression) PER_EXPRESSION(MeshRelationAccessExpression) PER_EXPRESSION(MeshIndexConversionExpression) +PER_EXPRESSION(ReferenceExpression) diff --git a/taichi/inc/statements.inc.h b/taichi/inc/statements.inc.h index b26a942860a8a..c40c89290afd8 100644 --- a/taichi/inc/statements.inc.h +++ b/taichi/inc/statements.inc.h @@ -18,6 +18,7 @@ PER_STATEMENT(FuncCallStmt) PER_STATEMENT(ReturnStmt) PER_STATEMENT(ArgLoadStmt) +PER_STATEMENT(ReferenceStmt) PER_STATEMENT(ExternalPtrStmt) PER_STATEMENT(PtrOffsetStmt) PER_STATEMENT(ConstStmt) diff --git a/taichi/ir/expression_printer.h b/taichi/ir/expression_printer.h index 92c882bcd19ff..f6bb7e607f2ef 100644 --- a/taichi/ir/expression_printer.h +++ b/taichi/ir/expression_printer.h @@ -216,6 +216,12 @@ class ExpressionHumanFriendlyPrinter : public ExpressionPrinter { emit(")"); } + void visit(ReferenceExpression *expr) override { + emit("ref("); + expr->var->accept(this); + emit(")"); + } + static 
std::string expr_to_string(Expr &expr) { std::ostringstream oss; ExpressionHumanFriendlyPrinter printer(&oss); diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 627ff3592cf26..95acab8df2f0c 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -121,7 +121,7 @@ void ArgLoadExpression::type_check(CompileConfig *) { } void ArgLoadExpression::flatten(FlattenContext *ctx) { - auto arg_load = std::make_unique(arg_id, dt); + auto arg_load = std::make_unique(arg_id, dt, is_ptr); ctx->push_back(std::move(arg_load)); stmt = ctx->back_stmt(); } @@ -485,17 +485,19 @@ void AtomicOpExpression::flatten(FlattenContext *ctx) { op_type = AtomicOpType::add; } // expand rhs - auto expr = val; - flatten_rvalue(expr, ctx); + flatten_rvalue(val, ctx); + auto src_val = val->stmt; if (dest.is()) { // local variable // emit local store stmt auto alloca = ctx->current_block->lookup_var(dest.cast()->id); - ctx->push_back(op_type, alloca, expr->stmt); + ctx->push_back(op_type, alloca, src_val); } else { TI_ASSERT(dest.is() || - dest.is()); + dest.is() || + (dest.is() && + dest.cast()->is_ptr)); flatten_lvalue(dest, ctx); - ctx->push_back(op_type, dest->stmt, expr->stmt); + ctx->push_back(op_type, dest->stmt, src_val); } stmt = ctx->back_stmt(); stmt->tb = tb; @@ -625,6 +627,16 @@ void MeshIndexConversionExpression::flatten(FlattenContext *ctx) { stmt = ctx->back_stmt(); } +void ReferenceExpression::type_check(CompileConfig *) { + ret_type = var->ret_type; +} + +void ReferenceExpression::flatten(FlattenContext *ctx) { + flatten_lvalue(var, ctx); + ctx->push_back(var->stmt); + stmt = ctx->back_stmt(); +} + Block *ASTBuilder::current_block() { if (stack_.empty()) return nullptr; @@ -945,6 +957,9 @@ void flatten_rvalue(Expr ptr, Expression::FlattenContext *ctx) { else { TI_NOT_IMPLEMENTED } + } else if (ptr.is() && + ptr.cast()->is_ptr) { + flatten_global_load(ptr, ctx); } } diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 
2e6ef5bf1c9b8..d34f7b8274c7f 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -276,14 +276,20 @@ class ArgLoadExpression : public Expression { public: int arg_id; DataType dt; + bool is_ptr; - ArgLoadExpression(int arg_id, DataType dt) : arg_id(arg_id), dt(dt) { + ArgLoadExpression(int arg_id, DataType dt, bool is_ptr = false) + : arg_id(arg_id), dt(dt), is_ptr(is_ptr) { } void type_check(CompileConfig *config) override; void flatten(FlattenContext *ctx) override; + bool is_lvalue() const override { + return is_ptr; + } + TI_DEFINE_ACCEPT_FOR_EXPRESSION }; @@ -727,6 +733,19 @@ class MeshIndexConversionExpression : public Expression { TI_DEFINE_ACCEPT_FOR_EXPRESSION }; +class ReferenceExpression : public Expression { + public: + Expr var; + void type_check(CompileConfig *config) override; + + ReferenceExpression(const Expr &expr) : var(expr) { + } + + void flatten(FlattenContext *ctx) override; + + TI_DEFINE_ACCEPT_FOR_EXPRESSION +}; + class ASTBuilder { private: enum LoopState { None, Outermost, Inner }; diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index 83e9a57f49fd0..d9099ccdcf785 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -898,6 +898,26 @@ class FuncCallStmt : public Stmt { TI_DEFINE_ACCEPT_AND_CLONE }; +/** + * A reference to a variable. + */ +class ReferenceStmt : public Stmt { + public: + Stmt *var; + bool global_side_effect{false}; + + ReferenceStmt(Stmt *var) : var(var) { + TI_STMT_REG_FIELDS; + } + + bool has_global_side_effect() const override { + return global_side_effect; + } + + TI_STMT_DEF_FIELDS(ret_type, var); + TI_DEFINE_ACCEPT_AND_CLONE +}; + /** * Exit the kernel or function with a return value. 
*/ diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 29374e70fad48..6d4095fd48b89 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -726,7 +726,9 @@ void export_lang(py::module &m) { Stmt::make); m.def("make_arg_load_expr", - Expr::make); + Expr::make); + + m.def("make_reference", Expr::make); m.def("make_external_tensor_expr", Expr::maketype_hint(), stmt->name(), stmt->var->name()); + } + private: std::string expr_to_string(Expr &expr) { return expr_to_string(expr.expr.get()); diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index 5141724e19deb..7920370e3444e 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -408,7 +408,9 @@ class LowerAST : public IRVisitor { TI_NOT_IMPLEMENTED } } else { // global variable - TI_ASSERT(dest.is()); + TI_ASSERT(dest.is() || + (dest.is() && + dest.cast()->is_ptr)); flatten_lvalue(dest, &fctx); fctx.push_back(dest->stmt, expr->stmt); } diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index 5eb1ba0e95804..fc3092b01b105 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -534,6 +534,11 @@ class TypeCheck : public IRVisitor { void visit(BitStructStoreStmt *stmt) override { // do nothing } + + void visit(ReferenceStmt *stmt) override { + stmt->ret_type = stmt->var->ret_type; + stmt->ret_type.set_is_pointer(true); + } }; namespace irpass { diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 33a19546be3de..af554afcbd266 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -80,7 +80,7 @@ def _get_expected_matrix_apis(): 'lang', 'length', 'linalg', 'log', 'loop_config', 'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange', 'no_activate', 'one', 'opengl', 'polar_decompose', 'pow', 'profiler', - 'randn', 'random', 'raw_div', 'raw_mod', 'rescale_index', 'reset', + 'randn', 'random', 
'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round', 'rsqrt', 'select', 'set_logging_level', 'simt', 'sin', 'solve', 'sparse_matrix_builder', 'sqrt', 'static', 'static_assert', 'static_print', 'stop_grad', 'svd', 'swizzle_generator', diff --git a/tests/python/test_function.py b/tests/python/test_function.py index a7d38116cf4e8..70e70692f0315 100644 --- a/tests/python/test_function.py +++ b/tests/python/test_function.py @@ -304,3 +304,33 @@ def bar(a: ti.i32) -> ti.i32: assert bar(10) == 11 * 5 assert bar(200) == 99 * 50 + + +@test_utils.test(arch=[ti.cpu, ti.gpu], debug=True) +def test_ref(): + @ti.experimental.real_func + def foo(a: ti.ref(ti.f32)): + a = 7 + + @ti.kernel + def bar(): + a = 5. + foo(a) + assert a == 7 + + bar() + + +@test_utils.test(arch=[ti.cpu, ti.gpu], debug=True) +def test_ref_atomic(): + @ti.experimental.real_func + def foo(a: ti.ref(ti.f32)): + a += a + + @ti.kernel + def bar(): + a = 5. + foo(a) + assert a == 10. + + bar() From 3c89f4dee9223d141162d0aad694c63ab2c9629a Mon Sep 17 00:00:00 2001 From: yekuang Date: Thu, 12 May 2022 13:38:40 +0800 Subject: [PATCH 055/176] [llvm] Move cache directory to dump() (#4963) * [llvm] Move cache directory to dump() * fix * fix --- taichi/llvm/llvm_offline_cache.cpp | 9 +++++++-- taichi/llvm/llvm_offline_cache.h | 7 +------ taichi/llvm/llvm_program.cpp | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index c7e625b35a673..957c636eb1526 100644 --- a/taichi/llvm/llvm_offline_cache.cpp +++ b/taichi/llvm/llvm_offline_cache.cpp @@ -1,5 +1,7 @@ #include "llvm_offline_cache.h" +#include + #include "llvm/AsmParser/Parser.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_os_ostream.h" @@ -41,9 +43,12 @@ bool LlvmOfflineCacheFileReader::get_kernel_cache( return true; } -void LlvmOfflineCacheFileWriter::dump() { +void LlvmOfflineCacheFileWriter::dump(const std::string 
&path) { + taichi::create_directories(path); for (auto &[k, v] : data_.kernels) { - std::string filename_prefix = path_ + "/" + k; + std::stringstream filename_ss; + filename_ss << path << "/" << k; + std::string filename_prefix = filename_ss.str(); { std::string filename = filename_prefix + ".ll"; std::ofstream os(filename, std::ios::out | std::ios::binary); diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index 0f98fd60227ed..c0759345aa865 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -45,10 +45,6 @@ class LlvmOfflineCacheFileReader { class LlvmOfflineCacheFileWriter { public: - LlvmOfflineCacheFileWriter(const std::string &path) : path_(path) { - taichi::create_directories(path); - } - void set_data(LlvmOfflineCache &&data) { this->mangled_ = false; this->data_ = std::move(data); @@ -59,7 +55,7 @@ class LlvmOfflineCacheFileWriter { data_.kernels[key] = std::move(kernel_cache); } - void dump(); + void dump(const std::string &path); private: void mangle_offloaded_task_name( @@ -68,7 +64,6 @@ class LlvmOfflineCacheFileWriter { std::vector &offloaded_task_list); - std::string path_; LlvmOfflineCache data_; bool mangled_{false}; }; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 2935735f852f6..8d1c0fa029cd6 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -662,9 +662,9 @@ void LlvmProgramImpl::cache_kernel( void LlvmProgramImpl::dump_cache_data_to_disk() { if (config->offline_cache && !cache_data_.kernels.empty()) { - LlvmOfflineCacheFileWriter writer(config->offline_cache_file_path); + LlvmOfflineCacheFileWriter writer{}; writer.set_data(std::move(cache_data_)); - writer.dump(); + writer.dump(config->offline_cache_file_path); } } From a7b9d2be8fb7b34e389b4450b205187edbaba617 Mon Sep 17 00:00:00 2001 From: yekuang Date: Thu, 12 May 2022 13:58:41 +0800 Subject: [PATCH 056/176] [RFC] AOT for all SNodes (#4806) * [rfc] AOT for all 
SNodes * add rfc tag * fix * fix * Update docs/rfcs/20220413-aot-for-all-snode.md Co-authored-by: Ailing * fix * soa * more contents on autodiff, add_field() * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * toc * Update docs/rfcs/20220413-aot-for-all-snode.md Co-authored-by: Yi Xu * Update docs/rfcs/20220413-aot-for-all-snode.md Co-authored-by: Yi Xu * Update docs/rfcs/20220413-aot-for-all-snode.md Co-authored-by: Yi Xu * improvements * improvements Co-authored-by: Ailing Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Yi Xu --- docs/rfcs/20220413-aot-for-all-snode.md | 408 ++++++++++++++++++++++++ misc/prtags.json | 3 +- 2 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 docs/rfcs/20220413-aot-for-all-snode.md diff --git a/docs/rfcs/20220413-aot-for-all-snode.md b/docs/rfcs/20220413-aot-for-all-snode.md new file mode 100644 index 0000000000000..7df4ae3834b8b --- /dev/null +++ b/docs/rfcs/20220413-aot-for-all-snode.md @@ -0,0 +1,408 @@ +# RFC: AOT for All SNodes + +* Author(s): [Ye Kuang](https://github.com/k-ye) +* Date: 2022-04-13 +* Relevant Issue: https://github.com/taichi-dev/taichi/issues/4777 +--- + +- [RFC: AOT for All SNodes](#rfc-aot-for-all-snodes) + - [* Relevant Issue: https://github.com/taichi-dev/taichi/issues/4777](#-relevant-issue-httpsgithubcomtaichi-devtaichiissues4777) +- [TL;DR](#tldr) +- [Background](#background) +- [Goals](#goals) + - [Non-Goals](#non-goals) +- [Detailed Design](#detailed-design) + - [A first attempt](#a-first-attempt) + - [A working design](#a-working-design) + - [Defining `shape`](#defining-shape) + - [AoS vs SoA](#aos-vs-soa) + - [Gradient and AutoDiff](#gradient-and-autodiff) + - [Python AOT API](#python-aot-api) + - [C++ AOT API](#c-aot-api) + - [Backward Compatibility](#backward-compatibility) +- [Alternatives](#alternatives) +- [FAQ](#faq) + +# TL;DR + +This RFC describes a design and API 
changes to make AOT support all kinds of SNodes and/or Taichi fields. + +# Background + +Currently, Taichi fields are defined and used in the following manner: + +```py +a = ti.field(ti.i32) +b = ti.field(ti.f32) +ti.root.pointer(ti.ij, 16).dense(ti.ij, 16).place(a, b) + +@ti.kernel +def run(): + for I in ti.grouped(a): + b[I] = a[I] * 4.2 +``` + +While this is convenient for Python users, it imposes challenges for the deployment side. + +1. Taichi fields are currently implemented as global variables. + + This would result in the Taichi kernels being "not pure" and relying on implicit information. When saving such kernels into the AOT module, it is also necessary to save all the depdendant global states. Ideally, users should be able to create Taichi fields, and pass them into Taichi kernels as parameters. + +2. SNodes types are missing from the AOT module. + + To moving towards the direction of passing fields into kernels, field and SNode types need to be saved into the AOT module as well. + +3. Fields data are not managed by the users. + + Because fields are global, the Taichi runtime have to create and manage them. By localizing the fields, decoupling them from Taichi kernels, users can manage the memory resources for these fields. + +# Goals + +* Provide a SNode API that allows SNodes and Taichi fields to be localized, so that Taichi kernels can be made *pure*. +* Supports describing the complete SNode tree type explicitly. +* Make SNode types serializable into AOT, so that AOT can use all kinds of SNodes. +* The new SNode API should offer compatibility with the existing usage. +* (Uncertain, but highly desired) Decouple the element type from the SNode type, i.e. the situation where matrix fields has to been implemented in the "scattered" way to support SoA layout. + +## Non-Goals + +* Expand the support for sparse SNodes beyond LLVM's codegen, especially SPIR-V. 
+ +# Detailed Design + +## A first attempt + +This API would clearly allow us to pass in fields into kernels: + +```py +a = ti.field(ti.i32) +b = ti.field(ti.f32) +ti.root.pointer(ti.ij, 16).dense(ti.ij, 16).place(a, b) + +@ti.kernel +def run(a: ?, b: ?): + for I in ti.grouped(a): + b[I] = a[I] * 4.2 + +run(a, b) +``` + +However, it doesn't really work for AOT, because `a` and `b` are **attributes of a tree type**. That is, you *cannot* dump `a` and `b`'s types separately. + +To understand this problem, we can define something equivalent in C++: + +```cpp +struct AB { + int32_t a; + float b; +}; + +using TreeType = PointerDense; +``` + +We cannot declare the `run` kernel as `void run(? a, ? b)`. Instead, we have to pass in a `TreeType` instance as a whole into the kernel, i.e., `void run(TreeType &tree)`. + +Internally, as you are using Taichi's SNode system to construct hierarchies, you are also constructing a `SNodeTree` type at the same time. This is done by Taichi's [`FieldsBuilder`](https://github.com/taichi-dev/taichi/blob/master/python/taichi/_snode/fields_builder.py). + +## A working design + +We will make the SNode tree and its type explicit by providing `SNodeTreeBuilder`. Each field needs to be registered into the builder via `add_field()`. `add_field()` does *not* actually do any memory allocation. Instead, it just returns a *field handle*, which can be used to retrieve a field from the tree inside the kernel. + +```py +builder = ti.SNodeTreeBuilder() + +builder.add_field(dtype=ti.f32, name='x') +builder.add_field(dtype=ti.i32, name='y') +builder.tree() + .pointer(ti.ij, 4) + .dense(ti.ij, 5) + .place('x', 'y') + +# `tree_t` stands for "tree type". +tree_t = builder.build() +``` + +Similarly, `SNodeTreeBuilder.build()` doesn't allocate memory for the tree. It only builds *the type of* a SNode tree. You can later instantiate a tree with `tree_t.instantiate()`. There are a few reasons behind this type-tree decoupling design: + +1. 
We have explicit access to the SNode tree type. This is a must for AOT, but can also be used as type annotations for enhanced language formality. +2. We can instantiate as many trees as we want from this type, and pass them to the same kernel without re-compilation. + +Inside a Taichi kernel, the entire tree can be used in the following way: + +```py +@ti.kernel +def run(tr: tree_t): + for I in ti.grouped(tr.x): + tr.x[I] = tr.y[I] + 2.0 + +tree = tree_t.instantiate() +run(tree) +``` + +The only change from the existing API is that, you will need to prepend the fields with `tree.`. The subscript still happens on a field, not the tree (i.e., `tr.x[I]` instead of `tr[I].x`). + +There will be two ways to retrieve a field from a tree: + +* By name: `add_field()` takes in a `name` parameter. After building a SNode tree, Taichi will generate an attribute for each registered field on that tree. This allows you to directly write `tr.x` to access the field named `'x'`. `name` serves as the unique identifer of the field in the tree. Note that when placing, it is the names being passed in. +* By field handle: You can also use the field handle returned by `add_field()` to access a field. Here's an example: + ```py + builder = ti.SNodeTreeBuilder() + x_handle = builder.add_field(dtype=ti.f32, name='x') + # boilerplate to generate tree type and instantiate a tree ... + + @ti.kernel + def foo(tr: tree_t): + x = ti.static(tr.get_field(x_handle)) # 1 + for i in x: + x[i] = i * 2.0 + ``` + + Note that this design requires that part of the kernel (1) being evaluated inside Python. It also pulls in the global variable `x_handle`, which kind of violates our initial goal. We could require that `x_handle` is passed into the kernel as an argument. But maybe it's fine just to view that as a trivial Python constant? + +## Defining `shape` + +Like how `ti.field()` works, `add_field` can take in a `shape` parameter. 
When doing so, the builder will automatically create a new `dense` field under the root of the tree. Note that you should *not* do another place if `shape` is defined. + +Here is an example: + +```py +builder = ti.SNodeTreeBuilder() + +builder.add_field(dtype=ti.f32, name='x', shape=(4, 8)) +# This would result an error +# builder.tree().dense(ti.ij, (4, 8)).place('x') +tree_t = builder.build() +``` + +It is equivalent to this: + +```py +builder = ti.SNodeTreeBuilder() + +builder.add_field(dtype=ti.f32, name='x') +builder.tree().dense(ti.ij, (4, 8)).place('x') +tree_t = builder.build() +``` + +## AoS vs SoA + +Two composite types require the switch between AoS vs SoA, `ti.Matrix` and `ti.Struct`. + +AoS is quite straightforward. One can just use the composite type as the `dtype` of the field. For example: + +```py +builder = ti.SNodeTreeBuilder() + +builder.add_field(dtype=ti.vec3, name='x') # ti.vec3 is a vector of 3 ti.f32's +builder.dense(ti.i, 8).place('x') +tree_t = builder.build() +``` + +For SoA, things get a bit trickier. The **current approach** is to treat each compopnent of the composite type as a standalone scalar Taichi field. In the example below, we have to manually place the underlying 3 components of `x` separately. + +```py +# Current way (as of v1.0.1) of doing SoA in Taichi +x = ti.Vector.field(3, ti.f32) +for f in x._get_field_members(): # `x` consists three scalar f32 fields + ti.root.dense(ti.ij).place(f) +``` + +This introduces confusion at several spots: + +1. Type is not purely decided by `dtype`, but also by how the field is placed. +2. It introduces the notion of "nested field", which Taichi doesn't currently have a good abstraction for. Because of this, it is quite complicated to apply certain kind of optimizations for a composite-typed field. For example, vectorized load/save consumes the same bandwidth as scalar ops on certain platforms. 
Without a good abstraction, the checking for whether a matrix field is AoS or SoA has to be spread across different passes in CHI IR. + +If we further think about the problem, SoA `x` is not really a field. Instead, it is a *grouped view* of three individual scalar fields. This view provides matrix operations, which won't make sense for each individual scalar field. + +In addition, because type is currently coupled with Taichi field definition, a Taichi field has to be implemented as individual fields in order to support the SoA scenario. Once we switch to the type builder pattern, we can control how the type is constructed first, and choose the field implementation later. + +If we want to make it explicit that this is a *field view*, we can do the following: + +```py +builder = ti.SNodeTreeBuilder() +builder.add_field(dtype=ti.f32, name='v0') +builder.add_field(dtype=ti.f32, name='v1') +builder.add_field(dtype=ti.f32, name='v2') +for v in ['v0', 'v1', 'v2']: + builder.tree().dense(ti.ij, 4).place(v) + +# Checks that +# 1. `components` and `dtype` are compatible. +# 2. If `dtype` is a vector/matrix, then all the fields in `components` are homogeneous in their SNode hierarchy. +# See https://github.com/taichi-dev/taichi/issues/3810 +builder.add_field_view(dtype=ti.vec3, name='vel', components=['v0', 'v1', 'v2']) +``` + +Matrix field view supports common matrix operations, and is equivalent to expanding each component into a local matrix variable. + +```py +# 1 +vel_soa[i, j].inverse() +# equivalent to +ti.vec3([v0[i, j], v1[i, j], v2[i, j]]).inverse() + +# 2 +vel_soa[i, j][1] += 2.0 +# equivalent to +v1[i, j] += 2.0 + +# 3 +vel_soa[i, j] = vel_soa[i, j] @ some_vec3 +# equivalent to +vel_tmp = ti.vec3([v0[i, j], v1[i, j], v2[i, j]]) +vel_tmp = vel_tmp @ some_vec3 +v0[i, j] = vel_tmp[0] +v1[i, j] = vel_tmp[1] +v2[i, j] = vel_tmp[2] +``` + +To make field view even more powerful, we can supported *nested field views*. 
For example: + +```py +vertex_t = ti.types.struct({'pos': ti.vec3, 'normal': ti.vec3}) +sphere_t = ti.types.struct({'center': vertex_t, 'radius': ti.f32}) + +builder = ti.SNodeTreeBuilder() +builder.add_field(dtype=ti.vec3, name='pos') +builder.add_field(dtype=ti.vec3, name='normal') +builder.add_field(dtype=ti.f32, name='radius') +builder.add_field_view(dtype=sphere_t, name='spheres', + components=[['pos', 'normal'], 'radius']) +### ^^^^^^^^^^^^^^^^^ Note this is nested +``` + +## Gradient and AutoDiff + +In order to support autodiff, `add_field()` still needs to take in a parameter named `needs_grad: bool`: + +```py +b = ti.SNodeTreeBuilder() +b.add_field(dtype=ti.f32, name='x', needs_grad=True) +# AOS +b.tree()....place('x', b.grad_of('x')) +# or SOA +b.tree()....place('x') +b.tree()....place(b.grad_of('x')) +``` + +If `needs_grad=True`, the primal and adjoint fields will be defined inside the same tree. You will need to use `b.grad_of(primal_name)` to access the handle of the adjoint field. The alternative would be to use `f'{primal_name}.grad'`, which feels too ad-hoc. + +Alternatively, if you don't want to place the gradient fields on your own, you could use `builder.lazy_grad()` by the end, which automatically places all the gradient fields. + +## Python AOT API + +Here's the Python AOT API to save the SNodeTree type. + +```py +builder = ti.SNodeTreeBuilder() +# ... +tree_t = builder.build() + +@ti.kernel +def foo(tr: tree_t): + # ... + +m = ti.aot.Module(arch) +m.add_snode_tree_type(tree_t, name="vel_tree") +m.add_kernel(foo) +m.save('/path/to/module') +``` + +## C++ AOT API + +```cpp +auto mod = taichi::aot::Module("/path/to/module"); +auto *tree_t = mod->get_snode_tree("vel_tree"); +taichi::Device::AllocParams alloc_params; +alloc_params.size = tree_t->get_size(); +auto *tree_mem = device->allocate_memory(alloc_params); +// By doing this, the kernel can verify that the passed in memory matches its +// signature. 
+auto *tree = taichi::instantiate_tree(tree_t, tree_mem); + +auto foo_kernel = mod->get_kernel("foo"); +foo_kernel->launch(/*args=*/{tree}); +``` + +## Backward Compatibility + +We need to make sure `ti.SNodeTreeBuilder` can still support the existing usage. Right now `ti.root` is already implemented as a "field accumulator": All fields being accumulated in root get materialized into a new SNode tree upon kernel invocation. + +Let's start with a simple example: + +```py +x = ti.field(ti.f32) +ti.root.pointer(ti.i, 4).dense(ti.i, 8).place(x) + +@ti.kernel +def foo(): + for i in x: + x[i] = i * 2.0 +``` + +The equivalent code using the new `SNodeTreeBuilder` API is shown below: + +```py +b = ti.SNodeTreeBuilder() +b.add_field(ti.f32, name='x') +b.tree().pointer(ti.i, 4).dense(ti.i, 8).place('x') +tree_t = b.build() + +tr = tree_t.instantiate() + +@ti.kernel +def foo(): + for i in tr.x: + tr.x[i] = i * 2.0 +``` + +In order to provide backward compatibility, we need some helper utils to make the following happen: + +* Maps `x@old` to `tr.x@new`. In addition, the runtime will need to know which SNode tree `x@old` belongs to. +* `x@old` returned by `ti.field()` will be a placeholder for field, until the current SNode tree of `ti.root` is built and instantiated. + +All these being considered, here's a possible solution. + +`ti.root` is just a global `SNodeTreeBuilder`. + +`ti.field()` returns a `FieldThunk` ([what is a *thunk*?](https://en.wikipedia.org/wiki/Thunk)). 
+ +```py +class FieldThunk: + def __init__(self, fid): + self.field_id = fid + self.tree = None + + def bind(self, tree): + self.tree = tree + +def field(dtype, name='', shape=None, offset=None, needs_grad=False): + name = name or random_name() + handle = ti.root.add_field(dtype, name) + ft = FieldThunk(handle) + ti.root._field_thunks.append(ft) + return ft +``` + +Upon materializing the SNodeTree: + +```py +tree_t = ti.root.build() +tree = tree_t.instantiate() +ti._runtime.global_snode_trees.append(tree) +for ft in ti.root._field_thunks: + ft.bind(tree) + +# Make `ti.root` a new SNodeTreeBuilder to allow for dynamic fields +ti.root = SNodeTreeBuilder() +``` + +When JIT compiling a Taichi kernel, it transforms `x@old` into `x.tree.get_field(x.field_id)`, where `x` is the `FieldThunk`. + +# Alternatives + +Not sure what's a better design to cover all the listed goals here. + +# FAQ + +TBD diff --git a/misc/prtags.json b/misc/prtags.json index 902804f73c0b6..b108c159eacf6 100644 --- a/misc/prtags.json +++ b/misc/prtags.json @@ -46,5 +46,6 @@ "type" : "Type system", "simt" : "SIMT programming", "release" : "Release", - "build" : "Build system" + "build" : "Build system", + "rfc" : "RFC" } From 1b6797cf5788fde8bc1e8b1f8b186380d4eae5b3 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Thu, 12 May 2022 15:26:16 +0800 Subject: [PATCH 057/176] [ci] Add new buildbot with latest driver for Linux/Vulkan test (#4953) * [ci] Add new buildbot with latest driver for Linux test * Removed unused Jenkinsfile and travis * Ref to issue * Change matrix * Change matrix format * Change indented maybe * String maybe * First remove runs-on * Minor * Use nested array * Use nested array * Use nested array * Debug path * Revert "Debug path" This reverts commit 000db2ad746f1d670e7fa7c9bdd1fad0209b8147. 
* Debug path * Revert * Remove trailing space --- .github/workflows/testing.yml | 8 ++- .travis/install.sh | 15 ------ Jenkinsfile | 93 ----------------------------------- setup.py | 2 +- 4 files changed, 8 insertions(+), 110 deletions(-) delete mode 100755 .travis/install.sh delete mode 100644 Jenkinsfile diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index a34de5b8581fb..72169e5abb467 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -242,8 +242,14 @@ jobs: build_and_test_gpu_linux: name: Build and Test (GPU) needs: check_files - runs-on: [self-hosted, cuda, vulkan, cn] timeout-minutes: 60 + strategy: + matrix: + tags: + - [self-hosted, cuda, vulkan, cn, driver470] + - [self-hosted, cuda, vulkan, cn, driver510] + + runs-on: ${{ matrix.tags }} steps: - uses: actions/checkout@v2 with: diff --git a/.travis/install.sh b/.travis/install.sh deleted file mode 100755 index 5e1ea0571f20a..0000000000000 --- a/.travis/install.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -# use brew & pyenv to build specific python on osx -if [[ $TRAVIS_OS_NAME == 'osx' ]]; then - export PATH="$HOME/.pyenv/bin:$PATH" - eval "${MATRIX_EVAL}" - echo "python version: $PYTHON_VERSION" - brew update > /dev/null - brew upgrade pyenv - # use pyenv to build python - eval "$(pyenv init -)" - pyenv install $PYTHON_VERSION - pyenv global $PYTHON_VERSION - pyenv rehash -fi diff --git a/Jenkinsfile b/Jenkinsfile deleted file mode 100644 index 50f273add48f0..0000000000000 --- a/Jenkinsfile +++ /dev/null @@ -1,93 +0,0 @@ -pipeline { - agent any - environment { - PYPI_PWD = credentials("${PYPI_PWD}") - COMMIT_SHA = "${COMMIT_SHA}" - PATH = "/opt/taichi-llvm-10.0.0/bin:/usr/local/cuda/bin/:$PATH" - CC = "clang-10" - CXX = "clang++-10" - // Local machine uses version 11.2. However, we need to define - // TI_CUDAVERSION, which eventually translates to the version number - // of the slimmed CUDA libdevice bytecode. 
Currently this slimmed - // version only covers 10. See: - // https://github.com/taichi-dev/taichi/tree/master/external/cuda_libdevice - // so we pass hack version to avoid build errors. - HACK_CUDA_VERSION = "10.0" - } - stages{ - stage('Build and Test') { - parallel { - stage('python3.6') { - agent { - node { - label "python36" - customWorkspace "taichi_py36" - } - } - environment { - UBUNTU = "10.0-devel-ubuntu18.04" - PYTHON = "python3.6" - } - steps{ - build_taichi() - } - } - stage('python3.7') { - agent { - node { - label "python37" - customWorkspace "taichi_py37" - } - } - environment { - UBUNTU = "10.0-devel-ubuntu18.04" - PYTHON = "python3.7" - } - steps{ - build_taichi() - } - } - stage('python3.8') { - agent { - node { - label "python38" - customWorkspace "taichi_py38" - } - } - environment { - UBUNTU = "10.0-devel-ubuntu18.04" - PYTHON = "python3.8" - } - steps{ - build_taichi() - } - } - stage('python3.9') { - agent { - node { - label "python39" - customWorkspace "taichi_py39" - } - } - environment { - UBUNTU = "11.0-devel-ubuntu20.04" - PYTHON = "python3.9" - } - steps{ - build_taichi() - } - } - } - } - } -} - -void build_taichi() { - sh "echo building" - sh "echo $PATH" - git 'https://github.com/taichi-dev/taichi.git' - sh label: '', script: ''' - cd ci - docker build . 
--build-arg UBUNTU=${UBUNTU} --build-arg PYTHON=${PYTHON} --build-arg TEST_OPTION="${TEST_OPTION}" --build-arg PYPI_PWD=${PYPI_PWD} --build-arg COMMIT_SHA=${COMMIT_SHA} - ''' -} diff --git a/setup.py b/setup.py index 902a1e30e7dfb..d829a59315604 100644 --- a/setup.py +++ b/setup.py @@ -66,7 +66,7 @@ def remove_tmp(taichi_dir): class EggInfo(egg_info): def finalize_options(self, *args, **kwargs): if '' not in self.distribution.package_dir: - # XXX: skbuild loses the root package dir + #4975: skbuild loses the root package dir self.distribution.package_dir[''] = package_dir return super().finalize_options(*args, **kwargs) From 7c4de0353ec622c148930c2d71858558ce9ed0cf Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Thu, 12 May 2022 16:40:21 +0800 Subject: [PATCH 058/176] [vulkan] Set kApiVersion to VK_API_VERSION_1_3 (#4970) * Change vulkan version to fix AMD crash problem. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/backends/vulkan/vulkan_utils.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/taichi/backends/vulkan/vulkan_utils.h b/taichi/backends/vulkan/vulkan_utils.h index 675bacd80587e..d23efd38a143b 100644 --- a/taichi/backends/vulkan/vulkan_utils.h +++ b/taichi/backends/vulkan/vulkan_utils.h @@ -21,8 +21,12 @@ namespace vulkan { class VulkanEnvSettings { public: + // This version number is used to create a vkInstance, it should be + // the highest API version that is designed to use. 
+ // Reference: + // https://www.khronos.org/registry/vulkan/specs/1.3-extensions/man/html/VkApplicationInfo.html static constexpr uint32_t kApiVersion() { - return VK_API_VERSION_1_2; + return VK_API_VERSION_1_3; } }; From 22d189519ea1cbef965461618643a38784963bdd Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Thu, 12 May 2022 17:27:59 +0800 Subject: [PATCH 059/176] [bug] [simt] Fix the problem that some intrinsics are never called (#4957) * [bug] [simt] Fix the problem that some intrinsics are never called * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix format * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/simt/subgroup.py | 106 ++++++++------------- python/taichi/lang/simt/warp.py | 139 +++++++++++++++------------- 2 files changed, 115 insertions(+), 130 deletions(-) diff --git a/python/taichi/lang/simt/subgroup.py b/python/taichi/lang/simt/subgroup.py index caaf87c11a7b2..5a4fcae076d71 100644 --- a/python/taichi/lang/simt/subgroup.py +++ b/python/taichi/lang/simt/subgroup.py @@ -1,24 +1,17 @@ -from taichi._lib import core as _ti_core -from taichi.lang import expr -from taichi.types import i32 +from taichi.lang import impl def barrier(): - return expr.Expr( - _ti_core.insert_internal_func_call("subgroupBarrier", - expr.make_expr_group(), False)) + return impl.call_internal("subgroupBarrier", with_runtime_context=False) def memory_barrier(): - return expr.Expr( - _ti_core.insert_internal_func_call("subgroupMemoryBarrier", - expr.make_expr_group(), False)) + return impl.call_internal("subgroupMemoryBarrier", + with_runtime_context=False) def elect(): - return expr.Expr( - _ti_core.insert_internal_func_call("subgroupElect", - expr.make_expr_group(), False)) + return impl.call_internal("subgroupElect", with_runtime_context=False) def 
all_true(cond): @@ -41,107 +34,90 @@ def broadcast_first(value): pass -def broadcast(value, index: i32): - return expr.Expr( - _ti_core.insert_internal_func_call("subgroupBroadcast", - expr.make_expr_group(value, index), - False)) +def broadcast(value, index): + return impl.call_internal("subgroupBroadcast", + value, + index, + with_runtime_context=False) def group_size(): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupSize", expr.make_expr_group(), False), - dtype=i32) + return impl.call_internal("subgroupSize", with_runtime_context=False) def invocation_id(): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInvocationId", expr.make_expr_group(), False), - dtype=i32) + return impl.call_internal("subgroupInvocationId", + with_runtime_context=False) def reduce_add(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupAdd", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupAdd", value, with_runtime_context=False) def reduce_mul(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupMul", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupMul", value, with_runtime_context=False) def reduce_min(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupMin", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupMin", value, with_runtime_context=False) def reduce_max(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupMax", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupMax", value, with_runtime_context=False) def reduce_and(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupAnd", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupAnd", value, 
with_runtime_context=False) def reduce_or(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupOr", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupOr", value, with_runtime_context=False) def reduce_xor(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupXor", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupXor", value, with_runtime_context=False) def inclusive_add(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveAdd", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveAdd", + value, + with_runtime_context=False) def inclusive_mul(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveMul", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveMul", + value, + with_runtime_context=False) def inclusive_min(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveMin", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveMin", + value, + with_runtime_context=False) def inclusive_max(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveMax", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveMax", + value, + with_runtime_context=False) def inclusive_and(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveAnd", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveAnd", + value, + with_runtime_context=False) def inclusive_or(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveOr", expr.make_expr_group(value), False), 
- dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveOr", + value, + with_runtime_context=False) def inclusive_xor(value): - return expr.Expr(_ti_core.insert_internal_func_call( - "subgroupInclusiveXor", expr.make_expr_group(value), False), - dtype=value.ptr.get_ret_type()) + return impl.call_internal("subgroupInclusiveXor", + value, + with_runtime_context=False) def exclusive_add(value): diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index 6075ce5d4eb10..ae3dae11fad90 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -1,91 +1,100 @@ -from taichi._lib import core as _ti_core -from taichi.lang import expr +from taichi.lang import impl def all_nonzero(mask, predicate): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_all_sync_i32", expr.make_expr_group(mask, predicate), False)) + return impl.call_internal("cuda_all_sync_i32", + mask, + predicate, + with_runtime_context=False) def any_nonzero(mask, predicate): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_any_sync_i32", expr.make_expr_group(mask, predicate), False)) + return impl.call_internal("cuda_any_sync_i32", + mask, + predicate, + with_runtime_context=False) def unique(mask, predicate): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_uni_sync_i32", expr.make_expr_group(mask, predicate), False)) + return impl.call_internal("cuda_uni_sync_i32", + mask, + predicate, + with_runtime_context=False) def ballot(predicate): - return expr.Expr( - _ti_core.insert_internal_func_call("cuda_ballot_i32", - expr.make_expr_group(predicate), - False)) + return impl.call_internal("cuda_ballot_i32", + predicate, + with_runtime_context=False) def shfl_sync_i32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - # lane offset is 31 for warp size 32 - "cuda_shfl_sync_i32", - expr.make_expr_group(mask, val, offset, 31), - False)) + # lane offset is 31 for warp 
size 32 + return impl.call_internal("cuda_shfl_sync_i32", + mask, + val, + offset, + 31, + with_runtime_context=False) def shfl_sync_f32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - # lane offset is 31 for warp size 32 - "cuda_shfl_sync_f32", - expr.make_expr_group(mask, val, offset, 31), - False)) - - -def shfl_down_i32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_shfl_down_sync_i32", - # lane offset is 31 for warp size 32 - expr.make_expr_group(mask, val, offset, 31), - False)) + # lane offset is 31 for warp size 32 + return impl.call_internal("cuda_shfl_sync_f32", + mask, + val, + offset, + 31, + with_runtime_context=False) def shfl_up_i32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_shfl_up_sync_i32", - # lane offset is 0 for warp size 32 - expr.make_expr_group(mask, val, offset, 0), - False)) + # lane offset is 0 for warp size 32 + return impl.call_internal("cuda_shfl_up_sync_i32", + mask, + val, + offset, + 0, + with_runtime_context=False) def shfl_up_f32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_shfl_up_sync_f32", - # lane offset is 0 for warp size 32 - expr.make_expr_group(mask, val, offset, 0), - False)) + # lane offset is 0 for warp size 32 + return impl.call_internal("cuda_shfl_up_sync_f32", + mask, + val, + offset, + 0, + with_runtime_context=False) + + +def shfl_down_i32(mask, val, offset): + # lane offset is 31 for warp size 32 + return impl.call_internal("cuda_shfl_down_sync_i32", + mask, + val, + offset, + 31, + with_runtime_context=False) def shfl_down_f32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_shfl_down_sync_f32", - # lane offset is 31 for warp size 32 - expr.make_expr_group(mask, val, offset, 31), - False)) + # lane offset is 31 for warp size 32 + return impl.call_internal("cuda_shfl_down_sync_f32", + mask, + val, + offset, + 31, + 
with_runtime_context=False) def shfl_xor_i32(mask, val, offset): - return expr.Expr( - _ti_core.insert_internal_func_call( - "cuda_shfl_xor_sync_i32", - expr.make_expr_group(mask, val, offset, 31), False)) + return impl.call_internal("cuda_shfl_xor_sync_i32", + mask, + val, + offset, + 31, + with_runtime_context=False) def match_any(): @@ -99,15 +108,11 @@ def match_all(): def active_mask(): - return expr.Expr( - _ti_core.insert_internal_func_call("cuda_active_mask", - expr.make_expr_group(), False)) + return impl.call_internal("cuda_active_mask", with_runtime_context=False) def sync(mask): - expr.Expr( - _ti_core.insert_internal_func_call("warp_barrier", - expr.make_expr_group(mask), False)) + return impl.call_internal("warp_barrier", mask, with_runtime_context=False) __all__ = [ @@ -115,9 +120,13 @@ def sync(mask): 'any_nonzero', 'unique', 'ballot', - 'shfl_i32', + 'shfl_sync_i32', + 'shfl_sync_f32', 'shfl_up_i32', + 'shfl_up_f32', 'shfl_down_i32', + 'shfl_down_f32', + 'shfl_xor_i32', 'match_any', 'match_all', 'active_mask', From f4a14f354d000ca2d41775b95d67c30b76f1a8f6 Mon Sep 17 00:00:00 2001 From: yekuang Date: Fri, 13 May 2022 11:41:02 +0800 Subject: [PATCH 060/176] [llvm] Create ModuleToFunctionConverter (#4962) * [llvm] Create ModuleToFunctionConverter * fix wild pointer * get_compute_device --- taichi/codegen/codegen_llvm.cpp | 112 +++++++++++++++++++------------- taichi/codegen/codegen_llvm.h | 34 ++++++++-- taichi/llvm/llvm_program.cpp | 5 +- taichi/llvm/llvm_program.h | 11 ++-- 4 files changed, 104 insertions(+), 58 deletions(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 22f701abb7cb5..9f5ab45cc0cc0 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1,11 +1,14 @@ #include "taichi/codegen/codegen_llvm.h" +#include + #ifdef TI_WITH_LLVM #include "taichi/analysis/offline_cache_util.h" #include "taichi/llvm/llvm_offline_cache.h" #include "taichi/ir/statements.h" #include 
"taichi/struct/struct_llvm.h" #include "taichi/util/file_sequence_writer.h" +#include "taichi/llvm/llvm_program.h" #include "llvm/IR/Module.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -18,7 +21,6 @@ TLANG_NAMESPACE_BEGIN // OffloadedTask OffloadedTask::OffloadedTask(CodeGenLLVM *codegen) : codegen(codegen) { - func = nullptr; } void OffloadedTask::begin(const std::string &name) { @@ -29,19 +31,6 @@ void OffloadedTask::end() { codegen->offloaded_tasks.push_back(*this); } -void OffloadedTask::operator()(RuntimeContext *context) { - TI_ASSERT(func); - func(context); -} - -void OffloadedTask::compile() { - TI_ASSERT(!func); - auto kernel_symbol = codegen->tlctx->lookup_function_pointer(name); - TI_ASSERT_INFO(kernel_symbol, "Function not found"); - - func = (task_fp_type)kernel_symbol; -} - // TODO(k-ye): Hide FunctionCreationGuard inside cpp file FunctionCreationGuard::FunctionCreationGuard( CodeGenLLVM *mb, @@ -2283,37 +2272,12 @@ void CodeGenLLVM::eliminate_unused_functions() { } FunctionType CodeGenLLVM::compile_module_to_executable() { - TI_AUTO_PROF - - tlctx->add_module(std::move(module)); + TI_AUTO_PROF; - for (auto &task : offloaded_tasks) { - task.compile(); - } - auto offloaded_tasks_local = offloaded_tasks; - auto kernel_name_ = kernel_name; - return [offloaded_tasks_local, kernel_name_, - kernel = this->kernel](RuntimeContext &context) { - TI_TRACE("Launching kernel {}", kernel_name_); - auto args = kernel->args; - // For taichi ndarrays, context.args saves pointer to its - // |DeviceAllocation|, CPU backend actually want to use the raw ptr here. 
- for (int i = 0; i < (int)args.size(); i++) { - if (args[i].is_array && context.is_device_allocations[i] && - context.array_runtime_sizes[i] > 0) { - DeviceAllocation *ptr = - static_cast(context.get_arg(i)); - uint64 host_ptr = (uint64)kernel->program->get_llvm_program_impl() - ->get_ndarray_alloc_info_ptr(*ptr); - context.set_arg(i, host_ptr); - context.set_array_is_device_allocation(i, - /*is_device_allocation=*/false); - } - } - for (auto task : offloaded_tasks_local) { - task(&context); - } - }; + ModuleToFunctionConverter converter{tlctx, + kernel->program->get_llvm_program_impl()}; + return converter.convert(kernel, std::move(module), + std::move(offloaded_tasks)); } FunctionCreationGuard CodeGenLLVM::get_function_creation_guard( @@ -2508,6 +2472,66 @@ void CodeGenLLVM::cache_module(const std::string &kernel_key) { prog->get_llvm_program_impl()->cache_kernel(kernel_key, this->module.get(), std::move(offloaded_task_list)); } + +ModuleToFunctionConverter::ModuleToFunctionConverter(TaichiLLVMContext *tlctx, + LlvmProgramImpl *program) + : tlctx_(tlctx), program_(program) { +} + +FunctionType ModuleToFunctionConverter::convert( + const std::string &kernel_name, + const std::vector &args, + std::unique_ptr mod, + std::vector &&tasks) const { + tlctx_->add_module(std::move(mod)); + + using TaskFunc = int32 (*)(void *); + std::vector task_funcs; + task_funcs.reserve(tasks.size()); + for (auto &task : tasks) { + auto *func_ptr = tlctx_->lookup_function_pointer(task.name); + TI_ASSERT_INFO(func_ptr, "Offloaded task function {} not found", task.name); + task_funcs.push_back((TaskFunc)(func_ptr)); + } + // Do NOT capture `this`... + return [program = this->program_, args, kernel_name, + task_funcs](RuntimeContext &context) { + TI_TRACE("Launching kernel {}", kernel_name); + // For taichi ndarrays, context.args saves pointer to its + // |DeviceAllocation|, CPU backend actually want to use the raw ptr here. 
+ for (int i = 0; i < (int)args.size(); i++) { + if (args[i].is_array && context.is_device_allocations[i] && + context.array_runtime_sizes[i] > 0) { + DeviceAllocation *ptr = + static_cast(context.get_arg(i)); + uint64 host_ptr = (uint64)program->get_ndarray_alloc_info_ptr(*ptr); + context.set_arg(i, host_ptr); + context.set_array_is_device_allocation(i, + /*is_device_allocation=*/false); + } + } + for (auto task : task_funcs) { + task(&context); + } + }; +} + +FunctionType ModuleToFunctionConverter::convert( + const Kernel *kernel, + std::unique_ptr mod, + std::vector &&tasks) const { + const auto &kargs = kernel->args; + std::vector args; + args.resize(kargs.size()); + std::transform(kargs.begin(), kargs.end(), args.begin(), + [](const auto &arg) -> ArgInfo { + ArgInfo res; + res.is_array = arg.is_array; + return res; + }); + return convert(kernel->name, args, std::move(mod), std::move(tasks)); +} + TLANG_NAMESPACE_END #endif // #ifdef TI_WITH_LLVM diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 51707e7ae6dfc..ca5df8c0be817 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -18,8 +18,6 @@ class OffloadedTask { public: std::string name; CodeGenLLVM *codegen; - using task_fp_type = int32 (*)(void *); - task_fp_type func; int block_dim{0}; int grid_dim{0}; @@ -29,10 +27,6 @@ class OffloadedTask { void begin(const std::string &name); void end(); - - void compile(); - - void operator()(RuntimeContext *context); }; class FunctionCreationGuard { @@ -400,6 +394,34 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void cache_module(const std::string &kernel_key); }; +class LlvmProgramImpl; + +// This is for CPU, we need one for CUDA (AMDGPU) as well. 
+class ModuleToFunctionConverter { + public: + struct ArgInfo { + bool is_array{false}; + }; + + explicit ModuleToFunctionConverter(TaichiLLVMContext *tlctx, + LlvmProgramImpl *program); + + virtual ~ModuleToFunctionConverter() = default; + + virtual FunctionType convert(const std::string &kernel_name, + const std::vector &args, + std::unique_ptr mod, + std::vector &&tasks) const; + + FunctionType convert(const Kernel *kernel, + std::unique_ptr mod, + std::vector &&tasks) const; + + protected: + TaichiLLVMContext *tlctx_{nullptr}; + LlvmProgramImpl *program_{nullptr}; +}; + TLANG_NAMESPACE_END #endif // #ifdef TI_WITH_LLVM diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 8d1c0fa029cd6..9f6b5359ac300 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -1,4 +1,5 @@ -#include "llvm_program.h" +#include "taichi/llvm/llvm_program.h" + #include "llvm/IR/Module.h" #include "taichi/backends/cuda/cuda_driver.h" @@ -430,7 +431,7 @@ void LlvmProgramImpl::materialize_runtime(MemoryPool *memory_pool, llvm_runtime_, (void *)assert_failed_host); } - if (arch_is_cpu(config->arch)) { + if (arch_is_cpu(config->arch) && (profiler != nullptr)) { // Profiler functions can only be called on CPU kernels runtime_jit->call("LLVMRuntime_set_profiler", llvm_runtime_, profiler); diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index f05647ae8eacd..761240f6fa56b 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include "taichi/llvm/llvm_device.h" #include "taichi/llvm/llvm_offline_cache.h" @@ -20,8 +21,6 @@ #include "taichi/program/context.h" #undef TI_RUNTIME_HOST -#include - namespace llvm { class Module; } // namespace llvm @@ -119,6 +118,10 @@ class LlvmProgramImpl : public ProgramImpl { std::vector &&offloaded_task_list); + Device *get_compute_device() override { + return device_.get(); + } + private: std::unique_ptr 
clone_struct_compiler_initial_context( bool has_multiple_snode_trees, @@ -159,10 +162,6 @@ class LlvmProgramImpl : public ProgramImpl { TI_NOT_IMPLEMENTED; } - Device *get_compute_device() override { - return device_.get(); - } - DevicePtr get_snode_tree_device_ptr(int tree_id) override; void dump_cache_data_to_disk() override; From ab96782c66a0ca1123aafcb62babac120b1dc196 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 13 May 2022 13:42:42 +0800 Subject: [PATCH 061/176] [build] Fixed Ilegal Instruction Error when importing PaddlePaddle module (#4969) * Trigger CI failure * [build] Fixed Ilegal Instruction Error when importing PaddlePaddle module * CI run: second time * CI run: third time * Log hardware info for CI build-bot --- .github/workflows/scripts/unix_test.sh | 8 +++++++- .github/workflows/scripts/win_test.ps1 | 2 +- README.md | 1 - ci/scripts/ubuntu_build_test_cpu.sh | 2 +- 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh index e7294f8387696..b50aee72775f6 100755 --- a/.github/workflows/scripts/unix_test.sh +++ b/.github/workflows/scripts/unix_test.sh @@ -24,7 +24,7 @@ if [ -z "$GPU_TEST" ]; then python3 -m pip install "torch; python_version < '3.10'" # Paddle's develop package doesn't support CI's MACOS machine at present if [[ $OSTYPE == "linux-"* ]]; then - python3 -m pip install "paddlepaddle==0.0.0; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html + python3 -m pip install "paddlepaddle==2.3.0; python_version < '3.10'" fi else ## Only GPU machine uses system python. @@ -32,7 +32,13 @@ else # pip will skip packages if already installed python3 -m pip install -r requirements_test.txt # Import Paddle's develop GPU package will occur error `Illegal Instruction`. 
+ + # Log hardware info for the current CI-bot + # There's random CI failure caused by "import paddle" + # Top suspect is an issue with MKL support for specific CPU + lscpu | grep "Model name" fi + ti diagnose ti changelog echo "wanted archs: $TI_WANTED_ARCHS" diff --git a/.github/workflows/scripts/win_test.ps1 b/.github/workflows/scripts/win_test.ps1 index 9cd94f1d66f30..33db09e393e84 100644 --- a/.github/workflows/scripts/win_test.ps1 +++ b/.github/workflows/scripts/win_test.ps1 @@ -11,7 +11,7 @@ if ("$env:TI_WANTED_ARCHS".Contains("cuda")) { pip install "torch==1.10.1+cu113; python_version < '3.10'" -f https://download.pytorch.org/whl/cu113/torch_stable.html } else { pip install "torch; python_version < '3.10'" - pip install "paddlepaddle==0.0.0; python_version < '3.10'" -f https://www.paddlepaddle.org.cn/whl/windows/cpu-mkl-avx/develop.html + pip install "paddlepaddle==2.3.0; python_version < '3.10'" } # Fail fast, give priority to the error-prone tests python tests/run_tests.py -vr2 -t1 -k "paddle" -a cpu diff --git a/README.md b/README.md index e2b0bcb172a70..17006bcf0f375 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,6 @@ - --- [![Latest Release](https://img.shields.io/github/v/release/taichi-dev/taichi?color=blue&label=Latest%20Release)](https://github.com/taichi-dev/taichi/releases/latest) diff --git a/ci/scripts/ubuntu_build_test_cpu.sh b/ci/scripts/ubuntu_build_test_cpu.sh index abfafbd2e2bb3..bd0a552d93bf8 100755 --- a/ci/scripts/ubuntu_build_test_cpu.sh +++ b/ci/scripts/ubuntu_build_test_cpu.sh @@ -23,7 +23,7 @@ cd taichi git checkout $SHA python3 -m pip install -r requirements_dev.txt -i http://repo.taichigraphics.com/repository/pypi/simple --trusted-host repo.taichigraphics.com # Paddle's paddle.fluid.core.Tensor._ptr() is only available on develop branch -python3 -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/linux/cpu-mkl/develop.html +python3 -m pip install paddlepaddle==2.3.0 
TAICHI_CMAKE_ARGS="-DTI_WITH_VULKAN:BOOL=OFF -DTI_WITH_CUDA:BOOL=OFF -DTI_WITH_OPENGL:BOOL=OFF" python3 setup.py install # Add Docker specific ENV From 9959ee3d935ed941ca04998c6f22f24b10f9a8f8 Mon Sep 17 00:00:00 2001 From: Ailing Date: Thu, 12 May 2022 22:54:22 -0700 Subject: [PATCH 062/176] [test] Add an ndarray test in C++. (#4972) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- tests/cpp/ir/ir_builder_test.cpp | 72 ++++++++++++++++++++++++++++++ tests/cpp/program/test_program.cpp | 4 +- tests/cpp/program/test_program.h | 2 +- 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/tests/cpp/ir/ir_builder_test.cpp b/tests/cpp/ir/ir_builder_test.cpp index cef117651871a..cbb082c780bf8 100644 --- a/tests/cpp/ir/ir_builder_test.cpp +++ b/tests/cpp/ir/ir_builder_test.cpp @@ -3,6 +3,9 @@ #include "taichi/ir/ir_builder.h" #include "taichi/ir/statements.h" #include "tests/cpp/program/test_program.h" +#ifdef TI_WITH_VULKAN +#include "taichi/backends/vulkan/vulkan_loader.h" +#endif namespace taichi { namespace lang { @@ -117,5 +120,74 @@ TEST(IRBuilder, ExternalPtr) { EXPECT_EQ(array[1], 1); EXPECT_EQ(array[2], 42); } + +TEST(IRBuilder, Ndarray) { + TestProgram test_prog; +#ifdef TI_WITH_VULKAN + Arch arch = taichi::lang::vulkan::is_vulkan_api_available() ? 
Arch::vulkan + : Arch::x64; +#else + Arch arch = Arch::x64; +#endif + test_prog.setup(arch); + IRBuilder builder1; + int size = 10; + + auto array = Ndarray(test_prog.prog(), PrimitiveType::i32, {size}); + array.write_int({0}, 2); + array.write_int({2}, 40); + { + auto *arg = builder1.create_arg_load(/*arg_id=*/0, get_data_type(), + /*is_ptr=*/true); + auto *zero = builder1.get_int32(0); + auto *one = builder1.get_int32(1); + auto *two = builder1.get_int32(2); + auto *a1ptr = builder1.create_external_ptr(arg, {one}); + builder1.create_global_store(a1ptr, one); // a[1] = 1 + auto *a0 = + builder1.create_global_load(builder1.create_external_ptr(arg, {zero})); + auto *a2ptr = builder1.create_external_ptr(arg, {two}); + auto *a2 = builder1.create_global_load(a2ptr); + auto *a0plusa2 = builder1.create_add(a0, a2); + builder1.create_global_store(a2ptr, a0plusa2); // a[2] = a[0] + a[2] + } + auto block1 = builder1.extract_ir(); + auto ker1 = + std::make_unique(*test_prog.prog(), std::move(block1), "ker1"); + ker1->insert_arg(get_data_type(), /*is_array=*/true); + auto launch_ctx1 = ker1->make_launch_context(); + launch_ctx1.set_arg_external_array( + /*arg_id=*/0, array.get_device_allocation_ptr_as_int(), size, + /*is_device_allocation=*/true); + (*ker1)(launch_ctx1); + EXPECT_EQ(array.read_int({0}), 2); + EXPECT_EQ(array.read_int({1}), 1); + EXPECT_EQ(array.read_int({2}), 42); + + IRBuilder builder2; + { + auto *arg0 = builder2.create_arg_load(/*arg_id=*/0, get_data_type(), + /*is_ptr=*/true); + auto *arg1 = builder2.create_arg_load(/*arg_id=*/1, PrimitiveType::i32, + /*is_ptr=*/false); + auto *one = builder2.get_int32(1); + auto *a1ptr = builder2.create_external_ptr(arg0, {one}); + builder2.create_global_store(a1ptr, arg1); // a[1] = arg1 + } + auto block2 = builder2.extract_ir(); + auto ker2 = + std::make_unique(*test_prog.prog(), std::move(block2), "ker2"); + ker2->insert_arg(get_data_type(), /*is_array=*/true); + ker2->insert_arg(get_data_type(), /*is_array=*/false); 
+ auto launch_ctx2 = ker2->make_launch_context(); + launch_ctx2.set_arg_external_array( + /*arg_id=*/0, array.get_device_allocation_ptr_as_int(), size, + /*is_device_allocation=*/true); + launch_ctx2.set_arg_int(/*arg_id=*/1, 3); + (*ker2)(launch_ctx2); + EXPECT_EQ(array.read_int({0}), 2); + EXPECT_EQ(array.read_int({1}), 3); + EXPECT_EQ(array.read_int({2}), 42); +} } // namespace lang } // namespace taichi diff --git a/tests/cpp/program/test_program.cpp b/tests/cpp/program/test_program.cpp index 3d1a0858165bd..c08eb254ee02a 100644 --- a/tests/cpp/program/test_program.cpp +++ b/tests/cpp/program/test_program.cpp @@ -3,8 +3,8 @@ namespace taichi { namespace lang { -void TestProgram::setup() { - prog_ = std::make_unique(Arch::x64); +void TestProgram::setup(Arch arch) { + prog_ = std::make_unique(arch); prog_->materialize_runtime(); prog_->add_snode_tree(std::make_unique(/*depth=*/0, SNodeType::root), /*compile_only=*/false); diff --git a/tests/cpp/program/test_program.h b/tests/cpp/program/test_program.h index 2c1a1079d6fc3..d9c2237f6b988 100644 --- a/tests/cpp/program/test_program.h +++ b/tests/cpp/program/test_program.h @@ -9,7 +9,7 @@ namespace lang { class TestProgram { public: - void setup(); + void setup(Arch arch = Arch::x64); Program *prog() { return prog_.get(); From 93daf9821e04c89d318c1595be3ee3fc7d38dce6 Mon Sep 17 00:00:00 2001 From: yekuang Date: Fri, 13 May 2022 18:29:04 +0800 Subject: [PATCH 063/176] [llvm] Make codegen produce static llvm::Module (#4975) * [llvm] Make codegen produces static llvm::Module * Update taichi/codegen/codegen_llvm.h * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * TI_WITH_LLVM * fix Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/backends/cpu/codegen_cpu.cpp | 15 ++++++++-- taichi/backends/cpu/codegen_cpu.h | 9 ++++++ taichi/backends/cuda/codegen_cuda.cpp | 33 ++++++++++++--------- 
taichi/backends/wasm/codegen_wasm.cpp | 2 -- taichi/codegen/codegen_llvm.cpp | 42 ++++++++++++++++----------- taichi/codegen/codegen_llvm.h | 28 +++++++++++++----- taichi/program/context.h | 2 +- 7 files changed, 89 insertions(+), 42 deletions(-) diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp index e0d6e6dc90c7c..61af40c075c0e 100644 --- a/taichi/backends/cpu/codegen_cpu.cpp +++ b/taichi/backends/cpu/codegen_cpu.cpp @@ -1,6 +1,5 @@ #include "taichi/backends/cpu/codegen_cpu.h" -#include "taichi/codegen/codegen_llvm.h" #include "taichi/llvm/llvm_program.h" #include "taichi/common/core.h" #include "taichi/util/io.h" @@ -12,6 +11,8 @@ TLANG_NAMESPACE_BEGIN +namespace { + class CodeGenLLVMCPU : public CodeGenLLVM { public: using IRVisitor::visit; @@ -199,8 +200,18 @@ class CodeGenLLVMCPU : public CodeGenLLVM { } }; +} // namespace + +#ifdef TI_WITH_LLVM +// static +std::unique_ptr CodeGenCPU::make_codegen_llvm(Kernel *kernel, + IRNode *ir) { + return std::make_unique(kernel, ir); +} +#endif // TI_WITH_LLVM + FunctionType CodeGenCPU::codegen() { - TI_AUTO_PROF + TI_AUTO_PROF; return CodeGenLLVMCPU(kernel, ir).gen(); } diff --git a/taichi/backends/cpu/codegen_cpu.h b/taichi/backends/cpu/codegen_cpu.h index c3d723c75eff2..d13704152ec14 100644 --- a/taichi/backends/cpu/codegen_cpu.h +++ b/taichi/backends/cpu/codegen_cpu.h @@ -2,7 +2,10 @@ #pragma once +#include + #include "taichi/codegen/codegen.h" +#include "taichi/codegen/codegen_llvm.h" TLANG_NAMESPACE_BEGIN @@ -11,6 +14,12 @@ class CodeGenCPU : public KernelCodeGen { CodeGenCPU(Kernel *kernel, IRNode *ir = nullptr) : KernelCodeGen(kernel, ir) { } + // TODO: Stop defining this macro guards in the headers +#ifdef TI_WITH_LLVM + static std::unique_ptr make_codegen_llvm(Kernel *kernel, + IRNode *ir); +#endif // TI_WITH_LLVM + FunctionType codegen() override; }; diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp index aa20cb43c8e77..ad72bf95f1413 
100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -1,4 +1,4 @@ -#include "codegen_cuda.h" +#include "taichi/backends/cuda/codegen_cuda.h" #include #include @@ -35,23 +35,30 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { return true; } - FunctionType compile_module_to_executable() override { + FunctionType gen() override { + auto compiled_res = run_compilation(); + return compile_module_to_executable(this->kernel, std::move(compiled_res)); + } + + static FunctionType compile_module_to_executable( + Kernel *kernel, + CompiledData &&compiled_data) { #ifdef TI_WITH_CUDA - auto offloaded_local = offloaded_tasks; - for (auto &task : offloaded_local) { - llvm::Function *func = module->getFunction(task.name); + auto *tlctx = + kernel->program->get_llvm_program_impl()->get_llvm_context(Arch::cuda); + for (auto &task : compiled_data.offloaded_tasks) { + llvm::Function *func = compiled_data.llvm_module->getFunction(task.name); TI_ASSERT(func); tlctx->mark_function_as_cuda_kernel(func, task.block_dim); } - auto jit = kernel->program->get_llvm_program_impl() - ->get_llvm_context(Arch::cuda) - ->jit.get(); - auto cuda_module = - jit->add_module(std::move(module), kernel->program->config.gpu_max_reg); + auto jit = tlctx->jit.get(); + auto cuda_module = jit->add_module(std::move(compiled_data.llvm_module), + kernel->program->config.gpu_max_reg); - return [offloaded_local, cuda_module, - kernel = this->kernel](RuntimeContext &context) { + return [cuda_module, kernel, + offloaded_tasks = + compiled_data.offloaded_tasks](RuntimeContext &context) { CUDAContext::get_instance().make_current(); auto args = kernel->args; std::vector arg_buffers(args.size(), nullptr); @@ -124,7 +131,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { CUDADriver::get_instance().stream_synchronize(nullptr); } - for (auto task : offloaded_local) { + for (auto task : offloaded_tasks) { TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, 
task.block_dim); cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0, diff --git a/taichi/backends/wasm/codegen_wasm.cpp b/taichi/backends/wasm/codegen_wasm.cpp index 7170eab5e1779..4d3144e8827c3 100644 --- a/taichi/backends/wasm/codegen_wasm.cpp +++ b/taichi/backends/wasm/codegen_wasm.cpp @@ -221,8 +221,6 @@ class CodeGenLLVMWASM : public CodeGenLLVM { auto offloaded_task_name = init_taichi_kernel_function(); ir->accept(this); finalize_taichi_kernel_function(); - - // compile_module_to_executable // only keep the current func TaichiLLVMContext::eliminate_unused_functions( module.get(), [offloaded_task_name](const std::string &func_name) { diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 9f5ab45cc0cc0..0656eae7c3558 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -3,17 +3,19 @@ #include #ifdef TI_WITH_LLVM + +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/IR/Module.h" +#include "llvm/Linker/Linker.h" + #include "taichi/analysis/offline_cache_util.h" -#include "taichi/llvm/llvm_offline_cache.h" #include "taichi/ir/statements.h" +#include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/llvm/llvm_program.h" #include "taichi/struct/struct_llvm.h" #include "taichi/util/file_sequence_writer.h" #include "taichi/llvm/llvm_program.h" -#include "llvm/IR/Module.h" -#include "llvm/Bitcode/BitcodeReader.h" -#include "llvm/Linker/Linker.h" - TLANG_NAMESPACE_BEGIN // TODO: sort function definitions to match declaration order in header @@ -2271,15 +2273,6 @@ void CodeGenLLVM::eliminate_unused_functions() { }); } -FunctionType CodeGenLLVM::compile_module_to_executable() { - TI_AUTO_PROF; - - ModuleToFunctionConverter converter{tlctx, - kernel->program->get_llvm_program_impl()}; - return converter.convert(kernel, std::move(module), - std::move(offloaded_tasks)); -} - FunctionCreationGuard CodeGenLLVM::get_function_creation_guard( std::vector argument_types) { return 
FunctionCreationGuard(this, argument_types); @@ -2353,7 +2346,7 @@ void CodeGenLLVM::emit_to_module() { ir->accept(this); } -FunctionType CodeGenLLVM::gen() { +CodeGenLLVM::CompiledData CodeGenLLVM::run_compilation() { bool needs_cache = false; const auto &config = prog->config; std::string kernel_key; @@ -2376,7 +2369,10 @@ FunctionType CodeGenLLVM::gen() { t.grid_dim = task.grid_dim; } kernel->set_from_offline_cache(); - return compile_module_to_executable(); + CompiledData res; + res.offloaded_tasks = std::move(this->offloaded_tasks); + res.llvm_module = std::move(this->module); + return res; } else { needs_cache = true; } @@ -2390,7 +2386,19 @@ FunctionType CodeGenLLVM::gen() { if (needs_cache) { cache_module(kernel_key); } - return compile_module_to_executable(); + CompiledData res; + res.offloaded_tasks = std::move(this->offloaded_tasks); + res.llvm_module = std::move(this->module); + return res; +} + +FunctionType CodeGenLLVM::gen() { + auto compiled_res = run_compilation(); + + ModuleToFunctionConverter converter{tlctx, + kernel->program->get_llvm_program_impl()}; + return converter.convert(kernel, std::move(compiled_res.llvm_module), + std::move(compiled_res.offloaded_tasks)); } llvm::Value *CodeGenLLVM::create_xlogue(std::unique_ptr &block) { diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index ca5df8c0be817..e6b20104640db 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -1,16 +1,17 @@ // The LLVM backend for CPUs/NVPTX/AMDGPU #pragma once -#ifdef TI_WITH_LLVM - #include #include +#ifdef TI_WITH_LLVM + #include "taichi/ir/ir.h" -#include "taichi/program/program.h" #include "taichi/llvm/llvm_codegen_utils.h" +#include "taichi/program/program.h" -TLANG_NAMESPACE_BEGIN +namespace taichi { +namespace lang { class CodeGenLLVM; @@ -119,8 +120,20 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void eliminate_unused_functions(); - virtual FunctionType compile_module_to_executable(); 
- + struct CompiledData { + std::vector offloaded_tasks; + std::unique_ptr llvm_module{nullptr}; + }; + /** + * @brief Runs the codegen and produces the compiled result. + * + * After this call, `module` and `offloaded_tasks` will be moved. + * + * @return CompiledData + */ + CompiledData run_compilation(); + + // TODO: This function relies largely on `run_compilation()`. Name it better. virtual FunctionType gen(); virtual bool supports_offline_cache() const { @@ -422,6 +435,7 @@ class ModuleToFunctionConverter { LlvmProgramImpl *program_{nullptr}; }; -TLANG_NAMESPACE_END +} // namespace lang +} // namespace taichi #endif // #ifdef TI_WITH_LLVM diff --git a/taichi/program/context.h b/taichi/program/context.h index 1e630d68d1a90..fe8e2de20c8ff 100644 --- a/taichi/program/context.h +++ b/taichi/program/context.h @@ -14,7 +14,7 @@ struct DeviceAllocation; // pointer to the LLVMRuntime struct, kernel arguments, and the thread id (if on // CPU). struct RuntimeContext { - LLVMRuntime *runtime; + LLVMRuntime *runtime{nullptr}; // args can contain: // - primitive_types // - raw ptrs: for external array, or torch-based ndarray From 055b0c75ecb543264a6dfc1b607a9429cef56ab1 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Fri, 13 May 2022 18:58:02 +0800 Subject: [PATCH 064/176] [ci] [build] Containerize Windows CPU build and test (#4933) * [ci] [build] Containerize Windows CPU build and test * Disable ninja * Avoid pybind11_add_module() * Force reinstall * Find pybind11 * Include pybind11 dir * Update include dir * Remove trailing whitespace * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Use correct pybind11 * Add path * Enable no extras for pybind11_add_module * Add no_extra * Clone in the container * Use github job container * Add runs-on * Revert back to docker based jobs * Install instead of develop * [ci] [build] Containerize Windows CPU build and test * Disable ninja * Avoid pybind11_add_module() * Force reinstall * 
Find pybind11 * Include pybind11 dir * Update include dir * Remove trailing whitespace * Use correct pybind11 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add path * Enable no extras for pybind11_add_module * Add no_extra * Clone in the container * Use github job container * Add runs-on * Revert back to docker based jobs * Install instead of develop * Use tar in jobs * Update cmake * Skip clone * Manual fixing white space * Remove comments Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../workflows/scripts/win_build_test_cpu.ps1 | 72 +++++++++++++++++++ .github/workflows/testing.yml | 42 +++++++++++ cmake/PythonNumpyPybind11.cmake | 1 - cmake/TaichiCore.cmake | 3 +- 4 files changed, 116 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/scripts/win_build_test_cpu.ps1 diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 new file mode 100644 index 0000000000000..07b257c47c66f --- /dev/null +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -0,0 +1,72 @@ +# Build script for windows CPU +# TODO unify this with the other Win scripts + +param ( + [switch]$clone = $false, + [switch]$install = $false, + [string]$libsDir = "C:\" +) + +$ErrorActionPreference = "Stop" + +$RepoURL = 'https://github.com/taichi-dev/taichi' + +function WriteInfo($text) { + Write-Host -ForegroundColor Green "[BUILD] $text" +} + +$libsDir = (Resolve-Path $libsDir).Path +if (-not (Test-Path $libsDir)) { + New-Item -ItemType Directory -Path $libsDir +} +Set-Location $libsDir + +if (-not (Test-Path "taichi_llvm")) { + WriteInfo("Download and extract LLVM") + curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO + 7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm +} +if (-not (Test-Path "taichi_clang")) { + WriteInfo("Download 
and extract Clang") + curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO + 7z x clang-10.0.0-win.zip -otaichi_clang +} + +WriteInfo("Setting the env vars") +$env:LLVM_DIR = "C://taichi_llvm" + +#TODO enable build test +$env:TAICHI_CMAKE_ARGS = "-DTI_WITH_OPENGL:BOOL=OFF -DTI_WITH_CC:BOOL=OFF -DTI_WITH_VULKAN:BOOL=OFF -DTI_WITH_CUDA:BOOL=OFF -DTI_BUILD_TESTS:BOOL=OFF" + +#TODO: For now we need to hard code the compiler path from build tools 2019 +$env:TAICHI_CMAKE_ARGS +=' -DCMAKE_CXX_COMPILER=C:/Program\ Files\ (x86)/Microsoft\ Visual\ Studio/2019/BuildTools/vc/Tools/Llvm/x64/bin/clang++.exe -DCMAKE_C_COMPILER=C:/Program\ Files\ (x86)/Microsoft\ Visual\ Studio/2019/BuildTools/vc/Tools/Llvm/x64/bin/clang.exe' +$env:TAICHI_CMAKE_ARGS += " -DCLANG_EXECUTABLE=C:\\taichi_clang\\bin\\clang++.exe" +$env:TAICHI_CMAKE_ARGS += " -DLLVM_AS_EXECUTABLE=C:\\taichi_llvm\\bin\\llvm-as.exe -DTI_WITH_VULKAN:BOOL=OFF" + +WriteInfo("Checking clang compiler") +clang --version + +WriteInfo("Enter the repository") +Set-Location .\taichi + +WriteInfo("Setting up Python environment") +conda activate py37 +python -m pip install -r requirements_dev.txt +python -m pip install -r requirements_test.txt + +# These have to be re-installed to avoid strange certificate issue +# on CPU docker environment +python -m pip install --upgrade --force-reinstall numpy +python -m pip install --upgrade --force-reinstall cmake +python -m pip install --upgrade --force-reinstall wheel +if (-not $?) { exit 1 } + +WriteInfo("Building Taichi") +python setup.py install +if (-not $?) 
{ exit 1 } +WriteInfo("Build finished") + +$env:TI_ENABLE_PADDLE = "0" +WriteInfo("Testing Taichi") +python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cpu +WriteInfo("Test finished") diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 72169e5abb467..9a0034b6b49dd 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -239,6 +239,48 @@ jobs: TI_WANTED_ARCHS: ${{ matrix.wanted_archs }} TI_CI: 1 + build_and_test_cpu_windows: + name: Build and Test Windows (CPU) + needs: check_files + timeout-minutes: 90 + runs-on: windows-2019 + permissions: + packages: read + contents: read + steps: + - uses: actions/checkout@v2 + with: + submodules: 'recursive' + + - name: Get docker images + shell: bash + run: | + if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then + exit 0 + fi + echo $CR_PAT | docker login ghcr.io -u ${{ github.actor }} --password-stdin + docker pull ghcr.io/taichi-dev/taichidev-cpu-windows:v0.0.1 + env: + CR_PAT: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and Test + shell: bash + run: | + if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then + exit 0 + fi + docker create --name taichi_build_test \ + ghcr.io/taichi-dev/taichidev-cpu-windows:v0.0.1 \ + C:/taichi/.github/workflows/scripts/win_build_test_cpu.ps1 + tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx | docker cp - taichi_build_test:C:/ + docker start -a taichi_build_test + + - name: clean docker container + shell: bash + if: always() + run: | + docker rm taichi_build_test -f + build_and_test_gpu_linux: name: Build and Test (GPU) needs: check_files diff --git a/cmake/PythonNumpyPybind11.cmake b/cmake/PythonNumpyPybind11.cmake index 65a231e04f64b..39086dd8b5db2 100644 --- a/cmake/PythonNumpyPybind11.cmake +++ b/cmake/PythonNumpyPybind11.cmake @@ -10,7 +10,6 @@ message(" include: ${PYTHON_INCLUDE_DIR}") message(" library: ${PYTHON_LIBRARY}") message(" numpy include: 
${NUMPY_INCLUDE_DIR}") - include_directories(${NUMPY_INCLUDE_DIR}) find_package(pybind11 CONFIG REQUIRED) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 1a41bc2020552..59178429d2e09 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -467,7 +467,8 @@ if(NOT TI_EMSCRIPTENED) # Cannot compile Python source code with Android, but TI_EXPORT_CORE should be set and # Android should only use the isolated library ignoring those source code. if (NOT ANDROID) - pybind11_add_module(${CORE_WITH_PYBIND_LIBRARY_NAME} ${TAICHI_PYBIND_SOURCE}) + # NO_EXTRAS is required here to avoid llvm symbol error during build + pybind11_add_module(${CORE_WITH_PYBIND_LIBRARY_NAME} NO_EXTRAS ${TAICHI_PYBIND_SOURCE}) else() add_library(${CORE_WITH_PYBIND_LIBRARY_NAME} SHARED) endif () From 3fbff4bb8adc4f00922e05f87242b7d737c7051b Mon Sep 17 00:00:00 2001 From: yekuang Date: Fri, 13 May 2022 20:17:36 +0800 Subject: [PATCH 065/176] [llvm] Make cache writer support BC format (#4978) --- taichi/llvm/llvm_offline_cache.cpp | 47 ++++++++++++++++++++---------- taichi/llvm/llvm_offline_cache.h | 8 ++++- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index 957c636eb1526..fbcf59d90018a 100644 --- a/taichi/llvm/llvm_offline_cache.cpp +++ b/taichi/llvm/llvm_offline_cache.cpp @@ -3,9 +3,11 @@ #include #include "llvm/AsmParser/Parser.h" +#include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_os_ostream.h" #include "llvm/IR/Module.h" + #include "taichi/ir/transforms.h" namespace taichi { @@ -43,28 +45,41 @@ bool LlvmOfflineCacheFileReader::get_kernel_cache( return true; } -void LlvmOfflineCacheFileWriter::dump(const std::string &path) { +void LlvmOfflineCacheFileWriter::dump(const std::string &path, Format format) { taichi::create_directories(path); for (auto &[k, v] : data_.kernels) { std::stringstream filename_ss; filename_ss << path << 
"/" << k; std::string filename_prefix = filename_ss.str(); + + auto write_llvm_module = + [&filename_prefix]( + const std::string &suffix, + std::function writer) { + const std::string filename = filename_prefix + suffix; + std::ofstream os(filename, std::ios::out | std::ios::binary); + TI_ERROR_IF(!os.is_open(), "File {} open failed", filename); + llvm::raw_os_ostream llvm_os{os}; + writer(llvm_os); + }; { - std::string filename = filename_prefix + ".ll"; - std::ofstream os(filename, std::ios::out | std::ios::binary); - TI_ERROR_IF(!os.is_open(), "File {} open failed", filename); - llvm::SMDiagnostic err; - llvm::LLVMContext ctx; - llvm::raw_os_ostream llvm_os(os); - if (v.module) { - mangle_offloaded_task_name(k, v.module, v.offloaded_task_list); - v.module->print(llvm_os, nullptr); - } else if (v.owned_module) { - mangle_offloaded_task_name(k, v.owned_module.get(), - v.offloaded_task_list); - v.owned_module->print(llvm_os, nullptr); - } else - TI_ASSERT(false); + auto *mod = v.module; + if (!mod) { + mod = v.owned_module.get(); + } + TI_ASSERT(mod != nullptr); + + mangle_offloaded_task_name(k, mod, v.offloaded_task_list); + if (format & Format::LL) { + write_llvm_module(".ll", [mod](llvm::raw_os_ostream &os) { + mod->print(os, /*AAW=*/nullptr); + }); + } + if (format & Format::BC) { + write_llvm_module(".bc", [mod](llvm::raw_os_ostream &os) { + llvm::WriteBitcodeToFile(*mod, os); + }); + } } { std::string filename = filename_prefix + "_otnl.txt"; diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index c0759345aa865..7dce67ab90867 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -15,6 +15,7 @@ struct LlvmOfflineCache { int block_dim{0}; int grid_dim{0}; }; + struct KernelCacheData { std::string kernel_key; std::unique_ptr owned_module{nullptr}; @@ -45,6 +46,11 @@ class LlvmOfflineCacheFileReader { class LlvmOfflineCacheFileWriter { public: + enum Format { + LL = 0x01, + BC = 0x10, + }; + void 
set_data(LlvmOfflineCache &&data) { this->mangled_ = false; this->data_ = std::move(data); @@ -55,7 +61,7 @@ class LlvmOfflineCacheFileWriter { data_.kernels[key] = std::move(kernel_cache); } - void dump(const std::string &path); + void dump(const std::string &path, Format format = Format::LL); private: void mangle_offloaded_task_name( From c189fc6ad21c44167a298bde4086153a01443223 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Fri, 13 May 2022 21:35:47 +0800 Subject: [PATCH 066/176] [Build] Improve Windows build script (#4955) * Improve Windows build script * Switch to clean up intermediates --- build.ps1 | 125 +++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 124 insertions(+), 1 deletion(-) diff --git a/build.ps1 b/build.ps1 index 9857d6d0928db..f45ec10c70f94 100644 --- a/build.ps1 +++ b/build.ps1 @@ -1,3 +1,126 @@ +param( + # Debug, Release, RelWithDebInfo, MinSizeRel + [string] $BuildType = "Release", + [string] $LlvmDir = "", + [string] $ClangDir = "", + # Install python package in user-space. + [switch] $UserSpace = $false, + # Clean up compilation intermediates instead of building Taichi. Note that + # downloaded artifacts (like LLVM and Clang) will not be removed. + [switch] $Clean = $false +) + +$ErrorActionPreference = "Stop" + +if ($Clean) { + & python setup.py clean + exit +} + +$TempDir = "${pwd}/tmp" +$DownloadDir = "${TempDir}/download" + +function EnsureDir($Dir) { + if (-not (Test-Path $Dir)) { + New-Item $Dir -ItemType Directory + } +} +function DownloadFile($Uri, $DstFileName) { + EnsureDir $TempDir + EnsureDir $DownloadDir + # Download only if the file is in absence. + $DstPath = "$DownloadDir/$DstFileName" + if (-not (Test-Path $DstPath)) { + Invoke-WebRequest -MaximumRetryCount 10 -RetryIntervalSec 5 $Uri -OutFile $DstPath + } +} +function DownloadArchiveAndExpand($Uri, $ArchiveName) { + DownloadFile $Uri "$ArchiveName.zip"; + # Expand archive only if we haven't done it before. 
+ $ExpandDir = "$TempDir/$ArchiveName"; + if (-not (Test-Path $ExpandDir)) { + Expand-Archive "$DownloadDir/$ArchiveName.zip" -DestinationPath $ExpandDir + } +} + + + +# Select build type, by default it's `Release`. +switch ($BuildType) { + "Debug" { $env:DEBUG = 1; } + "Release" {} + "RelWithDebInfo" { $env:RELWITHDEBINFO = 1; } + "MinSizeRel" { $env:MINSIZEREL = 1; } + Default { + Write-Error "Unknown build type '$BuildType'" + } +} + +# Prepare LLVM. +if ($env:LLVM_DIR) { + # Compatible with previous building process, where `LLVM_DIR` and + # `LLVM_AS_EXECUTABLE` are set externally. + $LlvmDir = $env:LLVM_DIR; +} +if (-not $LlvmDir) { + DownloadArchiveAndExpand -Uri "https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip" -ArchiveName "taichi-llvm" + $LlvmDir = "$TempDir/taichi-llvm" +} +if (-not $LlvmDir -or -not (Test-Path $LlvmDir)) { + throw "LLVM cannot be found in local environment and the script failed to download a prebuilt archive. " + + "Please follow the instructions at 'https://docs.taichi-lang.org/lang/articles/dev_install' to manually configure LLVM for Taichi." +} else { + $LlvmDir = (Resolve-Path $LlvmDir).Path; + $env:LLVM_DIR = $LlvmDir + Write-Host "Using LLVM at '$LlvmDir'." +} + +#Prepare Clang. +if (-not $ClangDir) { + DownloadArchiveAndExpand -Uri "https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip" -ArchiveName "taichi-clang" + $ClangDir = "$TempDir/taichi-clang" +} +if (-not $ClangDir -or -not (Test-Path $ClangDir)) { + throw "Clang cannot be found in local environment and the script failed to download a prebuilt archive. " + + "Please follow the instructions at 'https://docs.taichi-lang.org/lang/articles/dev_install' to manually configure Clang for Taichi." +} else { + $ClangDir = (Resolve-Path $ClangDir).Path; + Write-Host "Using Clang at '$ClangDir'." 
+} + +$CMakeArgs = @{ + "CLANG_EXECUTABLE" = "$ClangDir/bin/clang++.exe"; + "LLVM_AS_EXECUTABLE" = "$LlvmDir/bin/llvm-as.exe"; +} + +# Build Vulkan backend if Vulkan SDK is installed. +if ($env:VK_SDK_PATH) { + Write-Host "Found existing Vulkan SDK isntalltion at '$env:VK_SDK_PATH', Vulkan backend will be built." + $env:VULKAN_SDK = $env:VK_SDK_PATH; + $CMakeArgs["TI_WITH_VULKAN:BOOL"] = "ON"; +} + +# Chain up the cmake arguments. +Write-Host "Will build Taichi ($BuildType) with the following CMake args:" +$env:TAICHI_CMAKE_ARGS = "" +foreach ($Pair in $CMakeArgs.GetEnumerator()) { + $Key = $Pair | Select-Object -ExpandProperty Key + $Value = ($Pair | Select-Object -ExpandProperty Value) -replace "\\", "/" + Write-Host " $Key = $Value" + $env:TAICHI_CMAKE_ARGS += " -D$Key=`"$Value`"" +} + +# Install in userspace? +$BuildExpr = "python setup.py develop"; +if ($UserSpace) { + Write-Host "Taichi Python package will be installed in user-space." + $BuildExpr += " --user" +} + +Write-Host + +# Do the job. 
$stopwatch = [system.diagnostics.stopwatch]::startNew() -python setup.py develop +Write-Host $BuildExpr +Invoke-Expression $BuildExpr $stopwatch.Elapsed From 9e89c5836c1f8b1951ef355a60ea150c03e69d98 Mon Sep 17 00:00:00 2001 From: yekuang Date: Sat, 14 May 2022 18:55:54 +0800 Subject: [PATCH 067/176] [refactor] Improve serializer and cleanup utils (#4980) * [refactor] Improve serializer and cleanup utils * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/common/cleanup.cpp | 19 +++++++++++++++++++ taichi/common/cleanup.h | 24 ++++++++++++++++++++++++ taichi/common/serialization.h | 4 ++-- tests/cpp/common/serialization_test.cpp | 23 +++++++++++++++++++++++ 4 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 taichi/common/cleanup.cpp create mode 100644 taichi/common/cleanup.h diff --git a/taichi/common/cleanup.cpp b/taichi/common/cleanup.cpp new file mode 100644 index 0000000000000..c4d73684da2d0 --- /dev/null +++ b/taichi/common/cleanup.cpp @@ -0,0 +1,19 @@ +#include "taichi/common/cleanup.h" + +namespace taichi { + +RaiiCleanup::RaiiCleanup(Func fn) : fn_(std::move(fn)) { +} + +RaiiCleanup::~RaiiCleanup() { + if (fn_) { + fn_(); + fn_ = nullptr; + } +} + +RaiiCleanup make_cleanup(RaiiCleanup::Func fn) { + return RaiiCleanup{std::move(fn)}; +} + +} // namespace taichi diff --git a/taichi/common/cleanup.h b/taichi/common/cleanup.h new file mode 100644 index 0000000000000..f65eaccee3eef --- /dev/null +++ b/taichi/common/cleanup.h @@ -0,0 +1,24 @@ +#pragma once + +#include + +namespace taichi { + +class RaiiCleanup { + public: + using Func = std::function; + + explicit RaiiCleanup(Func fn); + ~RaiiCleanup(); + RaiiCleanup(const RaiiCleanup &) = delete; + RaiiCleanup &operator=(const RaiiCleanup &) = delete; + RaiiCleanup(RaiiCleanup &&) = default; + RaiiCleanup &operator=(RaiiCleanup &&) = default; + + 
private: + Func fn_; +}; + +RaiiCleanup make_cleanup(RaiiCleanup::Func fn); + +} // namespace taichi diff --git a/taichi/common/serialization.h b/taichi/common/serialization.h index 324a9e4916031..3896958fb7f3f 100644 --- a/taichi/common/serialization.h +++ b/taichi/common/serialization.h @@ -588,7 +588,7 @@ class BinarySerializer : public Serializer { void handle_associative_container(const M &val) { if constexpr (writing) { this->process(val.size()); - for (auto iter : val) { + for (auto &iter : val) { auto first = iter.first; this->process(first); this->process(iter.second); @@ -601,7 +601,7 @@ class BinarySerializer : public Serializer { for (std::size_t i = 0; i < n; i++) { typename M::value_type record; this->process(record); - wval.insert(record); + wval.insert(std::move(record)); } } } diff --git a/tests/cpp/common/serialization_test.cpp b/tests/cpp/common/serialization_test.cpp index 2f8cb15afb2ba..5e049ee42cbab 100644 --- a/tests/cpp/common/serialization_test.cpp +++ b/tests/cpp/common/serialization_test.cpp @@ -122,6 +122,29 @@ TEST(Serialization, Basic) { ts.print(); } +struct MoveOnlyObj { + int foo{0}; + std::string bar; + std::unique_ptr ptr{nullptr}; + + TI_IO_DEF(foo, bar); +}; + +TEST(Serialization, MoveOnly) { + std::unordered_map m; + m["1"] = MoveOnlyObj{42, "abc", nullptr}; + m["2"] = MoveOnlyObj{100, "def", nullptr}; + + BinIoPair bp; + const auto actual = bp.run(m); + EXPECT_EQ(actual.size(), m.size()); + const auto &exp_item1 = m.at("1"); + const auto &act_item1 = actual.at("1"); + EXPECT_EQ(act_item1.foo, exp_item1.foo); + EXPECT_EQ(act_item1.bar, exp_item1.bar); + EXPECT_EQ(act_item1.ptr, nullptr); +} + } // namespace } // namespace lang } // namespace taichi From 6bbaf52d01fbe0ad487d83f4ed6c18db29e6681e Mon Sep 17 00:00:00 2001 From: yekuang Date: Sat, 14 May 2022 22:11:19 +0800 Subject: [PATCH 068/176] [llvm] Support both BC and LL cache format (#4979) * [llvm] Support both BC and LL cache format * [pre-commit.ci] auto fixes from 
pre-commit.com hooks for more information, see https://pre-commit.ci * rm * fix fs * fix Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- cmake/TaichiTests.cmake | 1 + taichi/jit/jit_session.cpp | 1 + taichi/llvm/llvm_context.cpp | 39 +++++-- taichi/llvm/llvm_context.h | 30 ++++- taichi/llvm/llvm_offline_cache.cpp | 34 ++++-- taichi/llvm/llvm_offline_cache.h | 23 ++-- tests/cpp/llvm/llvm_offline_cache_test.cpp | 130 +++++++++++++++++++++ 7 files changed, 230 insertions(+), 28 deletions(-) create mode 100644 tests/cpp/llvm/llvm_offline_cache_test.cpp diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 5f6adb66d1c07..b3085731ba612 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -17,6 +17,7 @@ file(GLOB_RECURSE TAICHI_TESTS_SOURCE "tests/cpp/codegen/*.cpp" "tests/cpp/common/*.cpp" "tests/cpp/ir/*.cpp" + "tests/cpp/llvm/*.cpp", "tests/cpp/program/*.cpp" "tests/cpp/struct/*.cpp" "tests/cpp/transforms/*.cpp") diff --git a/taichi/jit/jit_session.cpp b/taichi/jit/jit_session.cpp index dd9547589949b..83e021a288008 100644 --- a/taichi/jit/jit_session.cpp +++ b/taichi/jit/jit_session.cpp @@ -10,6 +10,7 @@ TLANG_NAMESPACE_BEGIN std::unique_ptr create_llvm_jit_session_cpu( LlvmProgramImpl *llvm_prog, Arch arch); + std::unique_ptr create_llvm_jit_session_cuda( LlvmProgramImpl *llvm_prog, Arch arch); diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp index f71bf3715de7e..278013095231d 100644 --- a/taichi/llvm/llvm_context.cpp +++ b/taichi/llvm/llvm_context.cpp @@ -185,30 +185,47 @@ TaichiLLVMContext::clone_module_to_this_thread_context(llvm::Module *module) { return clone_module_to_context(module, this_context); } -std::unique_ptr module_from_bitcode_file(std::string bitcode_path, - llvm::LLVMContext *ctx) { - TI_AUTO_PROF - std::ifstream ifs(bitcode_path, std::ios::binary); - TI_ERROR_IF(!ifs, "Bitcode file ({}) not found.", bitcode_path); +std::unique_ptr 
LlvmModuleBitcodeLoader::load( + llvm::LLVMContext *ctx) const { + TI_AUTO_PROF; + std::ifstream ifs(bitcode_path_, std::ios::binary); + TI_ERROR_IF(!ifs, "Bitcode file ({}) not found.", bitcode_path_); std::string bitcode(std::istreambuf_iterator(ifs), (std::istreambuf_iterator())); auto runtime = - parseBitcodeFile(llvm::MemoryBufferRef(bitcode, "runtime_bitcode"), *ctx); + parseBitcodeFile(llvm::MemoryBufferRef(bitcode, buffer_id_), *ctx); if (!runtime) { auto error = runtime.takeError(); TI_WARN("Bitcode loading error message:"); llvm::errs() << error << "\n"; - TI_ERROR("Bitcode {} load failure.", bitcode_path); + TI_ERROR("Failed to load bitcode={}", bitcode_path_); + return nullptr; } - for (auto &f : *(runtime.get())) - TaichiLLVMContext::mark_inline(&f); + if (inline_funcs_) { + for (auto &f : *(runtime.get())) { + TaichiLLVMContext::mark_inline(&f); + } + } - bool module_broken = llvm::verifyModule(*runtime.get(), &llvm::errs()); - TI_ERROR_IF(module_broken, "Module broken"); + const bool module_broken = llvm::verifyModule(*runtime.get(), &llvm::errs()); + if (module_broken) { + TI_ERROR("Broken bitcode={}", bitcode_path_); + return nullptr; + } return std::move(runtime.get()); } +std::unique_ptr module_from_bitcode_file( + const std::string &bitcode_path, + llvm::LLVMContext *ctx) { + LlvmModuleBitcodeLoader loader; + return loader.set_bitcode_path(bitcode_path) + .set_buffer_id("runtime_bitcode") + .set_inline_funcs(true) + .load(ctx); +} + // The goal of this function is to rip off huge libdevice functions that are not // going to be used later, at an early stage. 
Although the LLVM optimizer will // ultimately remove unused functions during a global DCE pass, we don't even diff --git a/taichi/llvm/llvm_context.h b/taichi/llvm/llvm_context.h index dd2510d873570..20dc820af46a3 100644 --- a/taichi/llvm/llvm_context.h +++ b/taichi/llvm/llvm_context.h @@ -163,8 +163,34 @@ class TaichiLLVMContext { std::unordered_map> snode_tree_funcs_; }; -std::unique_ptr module_from_bitcode_file(std::string bitcode_path, - llvm::LLVMContext *ctx); +class LlvmModuleBitcodeLoader { + public: + LlvmModuleBitcodeLoader &set_bitcode_path(const std::string &bitcode_path) { + bitcode_path_ = bitcode_path; + return *this; + } + + LlvmModuleBitcodeLoader &set_buffer_id(const std::string &buffer_id) { + buffer_id_ = buffer_id; + return *this; + } + + LlvmModuleBitcodeLoader &set_inline_funcs(bool inline_funcs) { + inline_funcs_ = inline_funcs; + return *this; + } + + std::unique_ptr load(llvm::LLVMContext *ctx) const; + + private: + std::string bitcode_path_; + std::string buffer_id_; + bool inline_funcs_{false}; +}; + +std::unique_ptr module_from_bitcode_file( + const std::string &bitcode_path, + llvm::LLVMContext *ctx); } // namespace lang } // namespace taichi diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index fbcf59d90018a..0500459570bfd 100644 --- a/taichi/llvm/llvm_offline_cache.cpp +++ b/taichi/llvm/llvm_offline_cache.cpp @@ -3,32 +3,49 @@ #include #include "llvm/AsmParser/Parser.h" +#include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_os_ostream.h" #include "llvm/IR/Module.h" #include "taichi/ir/transforms.h" +#include "taichi/llvm/llvm_context.h" namespace taichi { namespace lang { +namespace { +using Format = LlvmOfflineCache::Format; +} // namespace bool LlvmOfflineCacheFileReader::get_kernel_cache( LlvmOfflineCache::KernelCacheData &res, const std::string &key, llvm::LLVMContext &llvm_ctx) { res.kernel_key = key; - 
std::string filename_prefix = path_ + "/" + key; - { - std::string filename = filename_prefix + ".ll"; + const std::string filename_prefix = path_ + "/" + key; + if (format_ & Format::BC) { + LlvmModuleBitcodeLoader loader; + res.owned_module = loader.set_bitcode_path(filename_prefix + ".bc") + .set_buffer_id(key) + .set_inline_funcs(false) + .load(&llvm_ctx); + } else if (format_ & Format::LL) { + const std::string filename = filename_prefix + ".ll"; llvm::SMDiagnostic err; res.owned_module = llvm::parseAssemblyFile(filename, err, llvm_ctx); - res.module = res.owned_module.get(); - if (!res.module) - return false; + } else { + TI_ERROR("Unknown LLVM format={}", format_); + return false; } + + res.module = res.owned_module.get(); + if (!res.module) { + return false; + } + { - std::string filename = filename_prefix + "_otnl.txt"; + const std::string filename = filename_prefix + "_otnl.txt"; std::ifstream in(filename, std::ios::in | std::ios::binary); if (!in.is_open()) return false; @@ -45,7 +62,8 @@ bool LlvmOfflineCacheFileReader::get_kernel_cache( return true; } -void LlvmOfflineCacheFileWriter::dump(const std::string &path, Format format) { +void LlvmOfflineCacheFileWriter::dump(const std::string &path, + LlvmOfflineCache::Format format) { taichi::create_directories(path); for (auto &[k, v] : data_.kernels) { std::stringstream filename_ss; diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index 7dce67ab90867..42f89c362179e 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -10,6 +10,11 @@ namespace taichi { namespace lang { struct LlvmOfflineCache { + enum Format { + LL = 0x01, + BC = 0x10, + }; + struct OffloadedTaskCacheData { std::string name; int block_dim{0}; @@ -33,7 +38,10 @@ struct LlvmOfflineCache { class LlvmOfflineCacheFileReader { public: - LlvmOfflineCacheFileReader(const std::string &path) : path_(path) { + LlvmOfflineCacheFileReader( + const std::string &path, + 
LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL) + : path_(path), format_(format) { } bool get_kernel_cache(LlvmOfflineCache::KernelCacheData &res, @@ -42,15 +50,11 @@ class LlvmOfflineCacheFileReader { private: std::string path_; + LlvmOfflineCache::Format format_; }; class LlvmOfflineCacheFileWriter { public: - enum Format { - LL = 0x01, - BC = 0x10, - }; - void set_data(LlvmOfflineCache &&data) { this->mangled_ = false; this->data_ = std::move(data); @@ -61,7 +65,12 @@ class LlvmOfflineCacheFileWriter { data_.kernels[key] = std::move(kernel_cache); } - void dump(const std::string &path, Format format = Format::LL); + void dump(const std::string &path, + LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL); + + void set_no_mangle() { + mangled_ = true; + } private: void mangle_offloaded_task_name( diff --git a/tests/cpp/llvm/llvm_offline_cache_test.cpp b/tests/cpp/llvm/llvm_offline_cache_test.cpp new file mode 100644 index 0000000000000..b2d044807ef87 --- /dev/null +++ b/tests/cpp/llvm/llvm_offline_cache_test.cpp @@ -0,0 +1,130 @@ +#include "gtest/gtest.h" + +#include "taichi/common/platform_macros.h" + +#ifdef TI_WITH_LLVM + +#if defined(TI_PLATFORM_LINUX) || defined(TI_PLATFORM_WINDOWS) +#if __has_include() +#include +namespace fs = std::filesystem; +#elif __has_include() +#include +namespace fs = std::experimental::filesystem; +#else +error "Missing the header." 
+#endif // __has_include() + +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Verifier.h" + +#include "taichi/backends/arch.h" +#include "taichi/llvm/llvm_context.h" +#include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/llvm/llvm_program.h" +#include "taichi/program/compile_config.h" +#include "taichi/program/program.h" + +namespace taichi { +namespace lang { +namespace { + +constexpr char kKernelName[] = "foo"; +constexpr char kTaskName[] = "my_add"; +constexpr int kBlockDim = 1; +constexpr int kGridDim = 1; + +using Format = LlvmOfflineCache::Format; + +class LlvmOfflineCacheTest : public testing::TestWithParam { + protected: + void SetUp() override { + const auto arch = host_arch(); + config_.packed = false; + config_.print_kernel_llvm_ir = false; + prog_ = std::make_unique(arch); + tlctx_ = prog_->get_llvm_program_impl()->get_llvm_context(arch); + } + + static std::unique_ptr make_module( + llvm::LLVMContext &llvm_ctx) { + auto mod = std::make_unique("my_mod", llvm_ctx); + auto builder = std::make_unique>(llvm_ctx); + auto *const int32_ty = llvm::Type::getInt32Ty(llvm_ctx); + auto *const func_ty = + llvm::FunctionType::get(int32_ty, {int32_ty, int32_ty}, + /*isVarArg=*/false); + auto *const func = llvm::Function::Create( + func_ty, llvm::Function::ExternalLinkage, kTaskName, mod.get()); + std::vector args; + for (auto &a : func->args()) { + args.push_back(&a); + } + auto *entry_block = llvm::BasicBlock::Create(llvm_ctx, "entry", func); + builder->SetInsertPoint(entry_block); + auto *ret_val = builder->CreateAdd(args[0], args[1], "add"); + builder->CreateRet(ret_val); + + llvm::verifyFunction(*func); + return mod; + } + + CompileConfig config_; + // Program is *absolutely unnecessary* in this test. However, it is by far the + // easiest approach in Taichi to use LLVM infra (e.g. JIT session). 
+ std::unique_ptr prog_{nullptr}; + TaichiLLVMContext *tlctx_{nullptr}; +}; + +TEST_P(LlvmOfflineCacheTest, ReadWrite) { + const auto llvm_fmt = GetParam(); + fs::path tmp_dir{fs::temp_directory_path() /= std::tmpnam(nullptr)}; + const auto tmp_dir_str{tmp_dir.u8string()}; + const bool dir_ok = fs::create_directories(tmp_dir); + ASSERT_TRUE(dir_ok); + { + auto llvm_ctx = std::make_unique(); + + LlvmOfflineCacheFileWriter writer; + LlvmOfflineCache::KernelCacheData kcache; + kcache.kernel_key = kKernelName; + kcache.owned_module = make_module(*llvm_ctx); + kcache.module = kcache.owned_module.get(); + kcache.offloaded_task_list.push_back( + LlvmOfflineCache::OffloadedTaskCacheData{kTaskName, kBlockDim, + kGridDim}); + writer.add_kernel_cache(kKernelName, std::move(kcache)); + writer.set_no_mangle(); + writer.dump(tmp_dir_str, llvm_fmt); + } + + { + auto *llvm_ctx = tlctx_->get_this_thread_context(); + LlvmOfflineCacheFileReader reader{tmp_dir_str, llvm_fmt}; + LlvmOfflineCache::KernelCacheData kcache; + const bool ok = reader.get_kernel_cache(kcache, kKernelName, *llvm_ctx); + ASSERT_TRUE(ok); + + ASSERT_NE(kcache.owned_module, nullptr); + kcache.module->dump(); + tlctx_->add_module(std::move(kcache.owned_module)); + using FuncType = int (*)(int, int); + FuncType my_add = (FuncType)tlctx_->lookup_function_pointer(kTaskName); + const auto res = my_add(40, 2); + EXPECT_EQ(res, 42); + } + fs::remove_all(tmp_dir); +} + +INSTANTIATE_TEST_SUITE_P(Format, + LlvmOfflineCacheTest, + testing::Values(Format::LL, Format::BC)); + +} // namespace +} // namespace lang +} // namespace taichi + +#endif // #if defined(TI_PLATFORM_LINUX) || defined(TI_PLATFORM_WINDOWS) +#endif // #ifdef TI_WITH_LLVM From 368f3b35d80dab573cbfd193fbf7d9697a9e0c70 Mon Sep 17 00:00:00 2001 From: PGZXB <420254146@qq.com> Date: Mon, 16 May 2022 11:17:09 +0800 Subject: [PATCH 069/176] [misc] Add ASTSerializer::visit(ReferenceExpression *) (#4984) --- taichi/analysis/gen_offline_cache_key.cpp | 5 +++++ 1 file 
changed, 5 insertions(+) diff --git a/taichi/analysis/gen_offline_cache_key.cpp b/taichi/analysis/gen_offline_cache_key.cpp index 678076650e51a..bab660c68409b 100644 --- a/taichi/analysis/gen_offline_cache_key.cpp +++ b/taichi/analysis/gen_offline_cache_key.cpp @@ -236,6 +236,11 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { emit(expr->conv_type); } + void visit(ReferenceExpression *expr) override { + emit(ExprOpCode::ReferenceExpression); + emit(expr->var); + } + void visit(Block *block) override { emit(StmtOpCode::EnterBlock); emit(static_cast(block->statements.size())); From e047ec56bcd1f2233f3b9a955eebafff74e315e3 Mon Sep 17 00:00:00 2001 From: PGZXB <420254146@qq.com> Date: Mon, 16 May 2022 11:25:17 +0800 Subject: [PATCH 070/176] [bug] Fix infinite recursion of get_offline_cache_key_of_snode_impl() (#4983) * Fix infinite recursion of get_offline_cache_key_of_snode_impl * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix some comments * Fix Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/analysis/offline_cache_util.cpp | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index 140a3915df67a..bcffd829f3940 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -71,9 +71,16 @@ static std::vector get_offline_cache_key_of_compile_config( static void get_offline_cache_key_of_snode_impl( SNode *snode, - BinaryOutputSerializer &serializer) { + BinaryOutputSerializer &serializer, + std::unordered_set &visited) { + if (auto iter = visited.find(snode->id); iter != visited.end()) { + serializer(snode->id); // Use snode->id as placeholder to identify a snode + return; + } + + visited.insert(snode->id); for (auto &c : snode->ch) { - get_offline_cache_key_of_snode_impl(c.get(), 
serializer); + get_offline_cache_key_of_snode_impl(c.get(), serializer, visited); } for (int i = 0; i < taichi_max_num_indices; ++i) { auto &extractor = snode->extractors[i]; @@ -106,21 +113,21 @@ static void get_offline_cache_key_of_snode_impl( } if (snode->grad_info && !snode->grad_info->is_primal()) { if (auto *grad_snode = snode->grad_info->grad_snode()) { - get_offline_cache_key_of_snode_impl(grad_snode, serializer); + get_offline_cache_key_of_snode_impl(grad_snode, serializer, visited); } } if (snode->exp_snode) { - get_offline_cache_key_of_snode_impl(snode->exp_snode, serializer); + get_offline_cache_key_of_snode_impl(snode->exp_snode, serializer, visited); } serializer(snode->bit_offset); serializer(snode->placing_shared_exp); serializer(snode->owns_shared_exponent); for (auto s : snode->exponent_users) { - get_offline_cache_key_of_snode_impl(s, serializer); + get_offline_cache_key_of_snode_impl(s, serializer, visited); } if (snode->currently_placing_exp_snode) { get_offline_cache_key_of_snode_impl(snode->currently_placing_exp_snode, - serializer); + serializer, visited); } if (snode->currently_placing_exp_snode_dtype) { serializer(snode->currently_placing_exp_snode_dtype->to_string()); @@ -138,7 +145,10 @@ std::string get_hashed_offline_cache_key_of_snode(SNode *snode) { BinaryOutputSerializer serializer; serializer.initialize(); - get_offline_cache_key_of_snode_impl(snode, serializer); + { + std::unordered_set visited; + get_offline_cache_key_of_snode_impl(snode, serializer, visited); + } serializer.finalize(); picosha2::hash256_one_by_one hasher; From 17f905632f59c5ba0b66762550d1b0c827fedd39 Mon Sep 17 00:00:00 2001 From: YuZhang Date: Mon, 16 May 2022 18:27:35 +0800 Subject: [PATCH 071/176] [cuda] Add block and grid level intrinsic for cuda backend (#4977) * Add block/grid level intrinsics * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Add test * [pre-commit.ci] auto fixes from pre-commit.com 
hooks for more information, see https://pre-commit.ci * Fix syntax Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/simt/__init__.py | 4 +-- python/taichi/lang/simt/block.py | 5 +++ python/taichi/lang/simt/grid.py | 5 +++ tests/python/test_simt.py | 54 ++++++++++++++++++++++++++++- 4 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 python/taichi/lang/simt/block.py create mode 100644 python/taichi/lang/simt/grid.py diff --git a/python/taichi/lang/simt/__init__.py b/python/taichi/lang/simt/__init__.py index bde183ffbf9a4..b7a0be5dea542 100644 --- a/python/taichi/lang/simt/__init__.py +++ b/python/taichi/lang/simt/__init__.py @@ -1,3 +1,3 @@ -from taichi.lang.simt import subgroup, warp +from taichi.lang.simt import block, grid, subgroup, warp -__all__ = ['warp', 'subgroup'] +__all__ = ['warp', 'subgroup', 'block', 'grid'] diff --git a/python/taichi/lang/simt/block.py b/python/taichi/lang/simt/block.py new file mode 100644 index 0000000000000..427664473dd5d --- /dev/null +++ b/python/taichi/lang/simt/block.py @@ -0,0 +1,5 @@ +from taichi.lang import impl + + +def sync(): + return impl.call_internal("block_barrier", with_runtime_context=False) diff --git a/python/taichi/lang/simt/grid.py b/python/taichi/lang/simt/grid.py new file mode 100644 index 0000000000000..62bf5e3d8cab8 --- /dev/null +++ b/python/taichi/lang/simt/grid.py @@ -0,0 +1,5 @@ +from taichi.lang import impl + + +def memfence(): + return impl.call_internal("grid_memfence", with_runtime_context=False) diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index a29221d3735d6..1b5cc39996a6d 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -296,7 +296,7 @@ def foo(): @test_utils.test(arch=ti.cuda) -def test_sync(): +def test_warp_sync(): a = ti.field(dtype=ti.u32, shape=32) @ti.kernel @@ -314,6 +314,58 @@ def foo(): assert a[i] == i % 16 + 16 +@test_utils.test(arch=ti.cuda) +def 
test_block_sync(): + N = 1024 + a = ti.field(dtype=ti.u32, shape=N) + + @ti.kernel + def foo(): + ti.loop_config(block_dim=N) + for i in range(N): + # Make the 0-th thread runs slower intentionally + for j in range(N - i): + a[i] = j + ti.simt.block.sync() + if i > 0: + a[i] = a[0] + + foo() + + for i in range(N): + assert a[i] == N - 1 + + +# TODO: replace this with a stronger test case +@test_utils.test(arch=ti.cuda) +def test_grid_memfence(): + + N = 1000 + BLOCK_SIZE = 1 + a = ti.field(dtype=ti.u32, shape=N) + + @ti.kernel + def foo(): + + block_counter = 0 + ti.loop_config(block_dim=BLOCK_SIZE) + for i in range(N): + + a[i] = 1 + ti.simt.grid.memfence() + + # Execute a prefix sum after all blocks finish + actual_order_of_block = ti.atomic_add(block_counter, 1) + if actual_order_of_block == N - 1: + for j in range(1, N): + a[j] += a[j - 1] + + foo() + + for i in range(N): + assert a[i] == i + 1 + + # Higher level primitives test def _test_subgroup_reduce(op, group_op, np_op, size, initial_value, dtype): field = ti.field(dtype, (size)) From ea40be94a0308eb4ef88087039b71402405d2474 Mon Sep 17 00:00:00 2001 From: Chuandong Yan <90600320+chuandongyan@users.noreply.github.com> Date: Mon, 16 May 2022 20:36:47 +0800 Subject: [PATCH 072/176] [Workflow] Update release_test.sh (#4960) Co-authored-by: Chengchen(Rex) Wang <14366016+rexwangcc@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- ci/scripts/release_test.sh | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/ci/scripts/release_test.sh b/ci/scripts/release_test.sh index 8122383e3dfae..f008fe95504de 100644 --- a/ci/scripts/release_test.sh +++ b/ci/scripts/release_test.sh @@ -80,6 +80,11 @@ function taichi::utils::pause { read -p "Press enter to continue" } +function taichi::utils::pkill { + sleep 5 + pkill -f "$1" +} + function taichi::test::ggui { local WORKDIR=${1} local PATTERN="*_ggui.py" @@ 
-96,9 +101,10 @@ function taichi::test::ggui { # run tests for match in $(find ./ -name "${PATTERN}"); do - python "${match}" + python "${match}" & + taichi::utils::pkill "${match}" taichi::utils::line - taichi::utils::pause + # taichi::utils::pause done # go back to workdir @@ -121,9 +127,10 @@ function taichi::test::difftaichi { # run tests for match in $(find ./ -name "${PATTERN}"); do - python "${match}" + python "${match}" & + taichi::utils::pkill "${match}" taichi::utils::line - taichi::utils::pause + # taichi::utils::pause done # go back to workdir @@ -147,12 +154,14 @@ function taichi::test::taichi_elements { # install dependencies python "download_ply.py" + # run tests cd "${REPO}/demo" for match in $(find ./ -name "${PATTERN}"); do - python "${match}" + python "${match}" & + taichi::utils::pkill "${match}" taichi::utils::line - taichi::utils::pause + # taichi::utils::pause done # run special tests @@ -186,6 +195,7 @@ function taichi::test::stannum { # run tests pytest -v -s ./ + taichi::utils::line # go back to workdir cd "${WORKDIR}" @@ -210,8 +220,9 @@ function taichi::test::sandyfluid { pip install -r requirements.txt # run tests - python src/main.py - + python src/main.py & + taichi::utils::pkill "src/main.py" + taichi::utils::line # go back to workdir cd "${WORKDIR}" } @@ -230,7 +241,9 @@ function taichi::test::voxel_editor { cd "${REPO}" # run tests - python voxel_editor.py + python voxel_editor.py & + taichi::utils::pkill "voxel_editor.py" + taichi::utils::line # go back to workdir cd "${WORKDIR}" From 09de043b1e5fc522d642e7fb14a9dfed3283c8f3 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Mon, 16 May 2022 23:44:49 +0800 Subject: [PATCH 073/176] Provision of prebuilt LLVM 10 for VS2022 (#4987) --- CMakeLists.txt | 15 +++++- build.ps1 | 54 ++++++++++++++++--- .../lang/articles/contribution/dev_install.md | 8 +-- 3 files changed, 67 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 494535cb4187c..0be966f98e101 100644 
--- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,7 +2,7 @@ # The Taichi Programming Language #********************************************************************* -cmake_minimum_required(VERSION 3.13) +cmake_minimum_required(VERSION 3.15) project(taichi) @@ -70,6 +70,19 @@ if (USE_MOLD) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=mold") endif() +if (WIN32) + # For `Debug` configs MSVC links to a debuggable runtime by default which has + # symbol conflicts with the prebuilt LLVM in `Release`. We shoule be providing + # prebuilt LLVMs for both `Debug` and `Release` but LLVM 10 cannot be built by + # MSVC in `Debug` config because MSVC would try to fill uninitialize memory + # with `0xCC` but it too breaks `LLVMTableGen` which is depended on by almost + # every component in LLVM. + # + # FIXME: (penguinliong) This is fixed in later releases of LLVM so maybe + # someday we can distribute `Debug` libraries, if it's ever needed. + SET(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) +endif() + # No support of Python for Android build if (NOT ANDROID) include(cmake/PythonNumpyPybind11.cmake) diff --git a/build.ps1 b/build.ps1 index f45ec10c70f94..fdd90f4544b0c 100644 --- a/build.ps1 +++ b/build.ps1 @@ -3,6 +3,7 @@ param( [string] $BuildType = "Release", [string] $LlvmDir = "", [string] $ClangDir = "", + [string] $VisualStudioVersion = "", # Install python package in user-space. [switch] $UserSpace = $false, # Clean up compilation intermediates instead of building Taichi. Note that @@ -45,12 +46,53 @@ function DownloadArchiveAndExpand($Uri, $ArchiveName) { +# Identify Visual Studio version. +if (-not $VisualStudioVersion) { + $VisualStudioVersion = (Get-CimInstance MSFT_VSInstance).Version.Split('.')[0] + Write-Host "Identified Visual Studio version from installation." 
+} +switch ($VisualStudioVersion) { + "2019" { $VisualStudioVersion = "16" } + "2022" { $VisualStudioVersion = "17" } +} +switch ($VisualStudioVersion) { + "16" { + Write-Host "Using MSVC from Visual Studio 2019." + $PrebuiltLlvmUri = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip" + $LlvmArchiveName = "taichi-llvm-10.0.0-msvc2019" + } + "17" { + Write-Host "Using MSVC from Visual Studio 2022." + $PrebuiltLlvmUri = "https://github.com/taichi-dev/taichi_assets/releases/download/llvm10_msvc2022/taichi-llvm-10.0.0-msvc2022.zip" + $LlvmArchiveName = "taichi-llvm-10.0.0-msvc2022" + } + default { + Write-Error "Unsupported Visual Studio Version" + } +} + # Select build type, by default it's `Release`. switch ($BuildType) { - "Debug" { $env:DEBUG = 1; } - "Release" {} - "RelWithDebInfo" { $env:RELWITHDEBINFO = 1; } - "MinSizeRel" { $env:MINSIZEREL = 1; } + "Debug" { + $env:DEBUG = 1; + $env:RELWITHDEBINFO = 0; + $env:MINSIZEREL = 0; + } + "Release" { + $env:DEBUG = 0; + $env:RELWITHDEBINFO = 0; + $env:MINSIZEREL = 0; + } + "RelWithDebInfo" { + $env:DEBUG = 0; + $env:RELWITHDEBINFO = 1; + $env:MINSIZEREL = 0; + } + "MinSizeRel" { + $env:DEBUG = 0; + $env:RELWITHDEBINFO = 0; + $env:MINSIZEREL = 1; + } Default { Write-Error "Unknown build type '$BuildType'" } @@ -63,8 +105,8 @@ if ($env:LLVM_DIR) { $LlvmDir = $env:LLVM_DIR; } if (-not $LlvmDir) { - DownloadArchiveAndExpand -Uri "https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip" -ArchiveName "taichi-llvm" - $LlvmDir = "$TempDir/taichi-llvm" + DownloadArchiveAndExpand -Uri $PrebuiltLlvmUri -ArchiveName $LlvmArchiveName + $LlvmDir = "$TempDir/$LlvmArchiveName" } if (-not $LlvmDir -or -not (Test-Path $LlvmDir)) { throw "LLVM cannot be found in local environment and the script failed to download a prebuilt archive. 
" + diff --git a/docs/lang/articles/contribution/dev_install.md b/docs/lang/articles/contribution/dev_install.md index 106a6d7fc395a..ba85f89e2e04e 100644 --- a/docs/lang/articles/contribution/dev_install.md +++ b/docs/lang/articles/contribution/dev_install.md @@ -168,7 +168,7 @@ We provide pre-built, customized LLVM binaries. For now, Taichi supports LLVM 10 {label: 'LLVM 10.0.0 for Linux', value: 'llvm_linux'}, {label: 'LLVM 10.0.0 for macOS (without M1 chip)', value: 'llvm_macos_sans_m1'}, {label: 'LLVM 10.0.0 for macOS (with M1 chip)', value: 'llvm_macos_m1'}, - {label: 'LLVM 10.0.0 for Windows MSVC 2019', value: 'llvm_windows'}, + {label: 'LLVM 10.0.0 for Windows', value: 'llvm_windows'}, ]}> @@ -182,6 +182,7 @@ We provide pre-built, customized LLVM binaries. For now, Taichi supports LLVM 10 LLVM 10.0.0 for Windows MSVC 2019 + LLVM 10.0.0 for Windows MSVC 2022 @@ -269,12 +270,13 @@ llvm-config --version # You should get 10.0.0 # LLVM 10.0.0 + MSVC 2019 -cmake .. -G "Visual Studio 16 2019" -A x64 -DLLVM_ENABLE_RTTI:BOOL=ON -DBUILD_SHARED_LIBS:BOOL=OFF -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DLLVM_ENABLE_ASSERTIONS=ON -Thost=x64 -DLLVM_BUILD_TESTS:BOOL=OFF -DCMAKE_INSTALL_PREFIX=installed +cmake .. -G "Visual Studio 16 2019" -A x64 -DLLVM_ENABLE_RTTI:BOOL=ON -DBUILD_SHARED_LIBS:BOOL=OFF -DCMAKE_BUILD_TYPE=Release -DLLVM_TARGETS_TO_BUILD="X86;NVPTX" -DLLVM_ENABLE_ASSERTIONS=ON -Thost=x64 -DLLVM_BUILD_TESTS:BOOL=OFF -DCMAKE_INSTALL_PREFIX=installed -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreadedDLL -DCMAKE_CXX_STANDARD=17 +cmake --build . --target=INSTALL --config=Release ``` 1. Use Visual Studio 2017+ to build **LLVM.sln**. 2. Ensure that you use the **Release** configuration. After building the `INSTALL` project (under folde **CMakePredefinedTargets** in the Solution Explorer window). -3. If you use MSVC 2019, ensure that you use **C++17** for the `INSTALL` project. +3. 
If you use MSVC 2019+, ensure that you use **C++17** for the `INSTALL` project. 4. When the build completes, add an environment variable `LLVM_DIR` with value `/build/installed/lib/cmake/llvm`. From 70d45babc998bc8fcbab620aa4659940af527a98 Mon Sep 17 00:00:00 2001 From: yekuang Date: Tue, 17 May 2022 09:49:45 +0800 Subject: [PATCH 074/176] [llvm] Use serializer for LLVM cache (#4982) * [llvm] Use serializer for LLVM cache * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * ctor * fix * fix to pointer * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix order * wip * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/codegen_llvm.cpp | 56 ++++++---- taichi/codegen/codegen_llvm.h | 3 + taichi/llvm/llvm_offline_cache.cpp | 116 +++++++++++++-------- taichi/llvm/llvm_offline_cache.h | 29 ++++-- tests/cpp/llvm/llvm_offline_cache_test.cpp | 21 +++- tests/python/test_offline_cache.py | 69 ++++++------ 6 files changed, 190 insertions(+), 104 deletions(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 0656eae7c3558..d1b880f8821a0 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -2354,28 +2354,13 @@ CodeGenLLVM::CompiledData CodeGenLLVM::run_compilation() { this->supports_offline_cache() && !kernel->is_evaluator) { kernel_key = get_hashed_offline_cache_key(&kernel->program->config, kernel); - LlvmOfflineCacheFileReader reader(config.offline_cache_file_path); - LlvmOfflineCache::KernelCacheData cache_data; - auto *tlctx = - this->prog->get_llvm_program_impl()->get_llvm_context(config.arch); - auto &llvm_ctx = *tlctx->get_this_thread_context(); - - if (reader.get_kernel_cache(cache_data, kernel_key, llvm_ctx)) { - this->module = 
std::move(cache_data.owned_module); - for (auto &task : cache_data.offloaded_task_list) { - auto &t = this->offloaded_tasks.emplace_back(this); - t.name = std::move(task.name); - t.block_dim = task.block_dim; - t.grid_dim = task.grid_dim; - } - kernel->set_from_offline_cache(); - CompiledData res; - res.offloaded_tasks = std::move(this->offloaded_tasks); - res.llvm_module = std::move(this->module); + CompiledData res; + const bool ok = maybe_read_compilation_from_cache(kernel_key, &res); + if (ok) { return res; - } else { - needs_cache = true; } + + needs_cache = true; } if (!kernel->lowered()) { @@ -2392,6 +2377,37 @@ CodeGenLLVM::CompiledData CodeGenLLVM::run_compilation() { return res; } +bool CodeGenLLVM::maybe_read_compilation_from_cache( + const std::string &kernel_key, + CompiledData *data) { + const auto &config = prog->config; + auto reader = + LlvmOfflineCacheFileReader::make(config.offline_cache_file_path); + if (!reader) { + return false; + } + + LlvmOfflineCache::KernelCacheData cache_data; + auto *tlctx = + this->prog->get_llvm_program_impl()->get_llvm_context(config.arch); + auto &llvm_ctx = *tlctx->get_this_thread_context(); + + if (!reader->get_kernel_cache(cache_data, kernel_key, llvm_ctx)) { + return false; + } + this->module = std::move(cache_data.owned_module); + for (auto &task : cache_data.offloaded_task_list) { + auto &t = this->offloaded_tasks.emplace_back(this); + t.name = std::move(task.name); + t.block_dim = task.block_dim; + t.grid_dim = task.grid_dim; + } + kernel->set_from_offline_cache(); + data->offloaded_tasks = std::move(this->offloaded_tasks); + data->llvm_module = std::move(this->module); + return true; +} + FunctionType CodeGenLLVM::gen() { auto compiled_res = run_compilation(); diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index e6b20104640db..2613f86d0b803 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -404,6 +404,9 @@ class CodeGenLLVM : public IRVisitor, public 
LLVMModuleBuilder { ~CodeGenLLVM() override = default; private: + bool maybe_read_compilation_from_cache(const std::string &kernel_key, + CompiledData *data); + void cache_module(const std::string &kernel_key); }; diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index 0500459570bfd..8b34198ab2c3b 100644 --- a/taichi/llvm/llvm_offline_cache.cpp +++ b/taichi/llvm/llvm_offline_cache.cpp @@ -8,6 +8,7 @@ #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_os_ostream.h" #include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "taichi/ir/transforms.h" #include "taichi/llvm/llvm_context.h" @@ -15,53 +16,85 @@ namespace taichi { namespace lang { namespace { + using Format = LlvmOfflineCache::Format; +constexpr char kMetadataFilename[] = "metadata"; + } // namespace +// static +std::unique_ptr LlvmOfflineCacheFileReader::make( + const std::string &path, + LlvmOfflineCache::Format format) { + std::stringstream tcb_ss; + tcb_ss << path << "/" << kMetadataFilename << ".tcb"; + const auto tcb_path = tcb_ss.str(); + { + // No the best way to check for filepath existence, but whatever... 
See + // https://stackoverflow.com/questions/12774207/fastest-way-to-check-if-a-file-exists-using-standard-c-c11-14-17-c + std::ifstream fs(tcb_path, std::ios::in | std::ios::binary); + if (!fs.good()) { + TI_DEBUG("LLVM cache {} does not exist", path); + return nullptr; + } + } + LlvmOfflineCache data; + read_from_binary_file(data, tcb_path); + return std::unique_ptr( + new LlvmOfflineCacheFileReader(path, std::move(data), format)); +} + +LlvmOfflineCacheFileReader::LlvmOfflineCacheFileReader( + const std::string &path, + LlvmOfflineCache &&data, + LlvmOfflineCache::Format format) + : path_(path), data_(std::move(data)), format_(format) { +} + bool LlvmOfflineCacheFileReader::get_kernel_cache( LlvmOfflineCache::KernelCacheData &res, const std::string &key, llvm::LLVMContext &llvm_ctx) { - res.kernel_key = key; - const std::string filename_prefix = path_ + "/" + key; - if (format_ & Format::BC) { - LlvmModuleBitcodeLoader loader; - res.owned_module = loader.set_bitcode_path(filename_prefix + ".bc") - .set_buffer_id(key) - .set_inline_funcs(false) - .load(&llvm_ctx); - } else if (format_ & Format::LL) { - const std::string filename = filename_prefix + ".ll"; - llvm::SMDiagnostic err; - res.owned_module = llvm::parseAssemblyFile(filename, err, llvm_ctx); - } else { - TI_ERROR("Unknown LLVM format={}", format_); + auto itr = data_.kernels.find(key); + if (itr == data_.kernels.end()) { + TI_DEBUG("Cannot find kernel={}", key); return false; } - res.module = res.owned_module.get(); - if (!res.module) { - return false; + auto &kernel_data = itr->second; + if (kernel_data.owned_module == nullptr) { + const std::string filename_prefix = path_ + "/" + key; + kernel_data.owned_module = load_module(filename_prefix, key, llvm_ctx); + TI_ASSERT(kernel_data.owned_module != nullptr); + kernel_data.module = kernel_data.owned_module.get(); } - { - const std::string filename = filename_prefix + "_otnl.txt"; - std::ifstream in(filename, std::ios::in | std::ios::binary); - if 
(!in.is_open()) - return false; - while (true) { - std::string line; - std::getline(in, line, '\n'); - if (line.empty()) - break; - std::istringstream iss(line); - auto &task = res.offloaded_task_list.emplace_back(); - iss >> task.name >> task.block_dim >> task.grid_dim; - } - } + res.kernel_key = key; + res.owned_module = llvm::CloneModule(*kernel_data.module); + res.module = res.owned_module.get(); + res.offloaded_task_list = kernel_data.offloaded_task_list; return true; } +std::unique_ptr LlvmOfflineCacheFileReader::load_module( + const std::string &path_prefix, + const std::string &key, + llvm::LLVMContext &llvm_ctx) const { + if (format_ & Format::BC) { + LlvmModuleBitcodeLoader loader; + return loader.set_bitcode_path(path_prefix + ".bc") + .set_buffer_id(key) + .set_inline_funcs(false) + .load(&llvm_ctx); + } else if (format_ & Format::LL) { + const std::string filename = path_prefix + ".ll"; + llvm::SMDiagnostic err; + return llvm::parseAssemblyFile(filename, err, llvm_ctx); + } + TI_ERROR("Unknown LLVM format={}", format_); + return nullptr; +} + void LlvmOfflineCacheFileWriter::dump(const std::string &path, LlvmOfflineCache::Format format) { taichi::create_directories(path); @@ -99,15 +132,16 @@ void LlvmOfflineCacheFileWriter::dump(const std::string &path, }); } } - { - std::string filename = filename_prefix + "_otnl.txt"; - std::ofstream os(filename, std::ios::out | std::ios::binary); - TI_ERROR_IF(!os.is_open(), "File {} open failed", filename); - for (const auto &task : v.offloaded_task_list) { - os << task.name << ' ' << task.block_dim << ' ' << task.grid_dim - << '\n'; - } - } + } + { + std::stringstream prefix_ss; + prefix_ss << path << "/" << kMetadataFilename; + const std::string file_prefix = prefix_ss.str(); + write_to_binary_file(data_, file_prefix + ".tcb"); + // For debugging + TextSerializer ts; + ts.serialize_to_json("cache", data_); + ts.write_to_file(file_prefix + ".json"); } } diff --git a/taichi/llvm/llvm_offline_cache.h 
b/taichi/llvm/llvm_offline_cache.h index 42f89c362179e..fe5666e98a5e7 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -1,6 +1,7 @@ #pragma once #include "taichi/common/core.h" +#include "taichi/common/serialization.h" #include "taichi/program/kernel.h" #include "taichi/util/io.h" @@ -19,37 +20,51 @@ struct LlvmOfflineCache { std::string name; int block_dim{0}; int grid_dim{0}; + + TI_IO_DEF(name, block_dim, grid_dim); }; struct KernelCacheData { std::string kernel_key; + std::vector offloaded_task_list; + std::unique_ptr owned_module{nullptr}; llvm::Module *module{nullptr}; - std::vector offloaded_task_list; KernelCacheData() = default; KernelCacheData(KernelCacheData &&) = default; KernelCacheData &operator=(KernelCacheData &&) = default; ~KernelCacheData() = default; + + TI_IO_DEF(kernel_key, offloaded_task_list); }; std::unordered_map kernels; + + TI_IO_DEF(kernels); }; class LlvmOfflineCacheFileReader { public: - LlvmOfflineCacheFileReader( - const std::string &path, - LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL) - : path_(path), format_(format) { - } - bool get_kernel_cache(LlvmOfflineCache::KernelCacheData &res, const std::string &key, llvm::LLVMContext &llvm_ctx); + static std::unique_ptr make( + const std::string &path, + LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL); + private: + LlvmOfflineCacheFileReader(const std::string &path, + LlvmOfflineCache &&data, + LlvmOfflineCache::Format format); + + std::unique_ptr load_module(const std::string &path_prefix, + const std::string &key, + llvm::LLVMContext &llvm_ctx) const; + std::string path_; + LlvmOfflineCache data_; LlvmOfflineCache::Format format_; }; diff --git a/tests/cpp/llvm/llvm_offline_cache_test.cpp b/tests/cpp/llvm/llvm_offline_cache_test.cpp index b2d044807ef87..38fc95dcab209 100644 --- a/tests/cpp/llvm/llvm_offline_cache_test.cpp +++ b/tests/cpp/llvm/llvm_offline_cache_test.cpp @@ -1,6 +1,7 @@ #include "gtest/gtest.h" 
#include "taichi/common/platform_macros.h" +#include "taichi/common/cleanup.h" #ifdef TI_WITH_LLVM @@ -81,6 +82,7 @@ class LlvmOfflineCacheTest : public testing::TestWithParam { TEST_P(LlvmOfflineCacheTest, ReadWrite) { const auto llvm_fmt = GetParam(); fs::path tmp_dir{fs::temp_directory_path() /= std::tmpnam(nullptr)}; + auto cleanup = make_cleanup([tmp_dir]() { fs::remove_all(tmp_dir); }); const auto tmp_dir_str{tmp_dir.u8string()}; const bool dir_ok = fs::create_directories(tmp_dir); ASSERT_TRUE(dir_ok); @@ -100,12 +102,16 @@ TEST_P(LlvmOfflineCacheTest, ReadWrite) { writer.dump(tmp_dir_str, llvm_fmt); } + auto *llvm_ctx = tlctx_->get_this_thread_context(); + auto reader = LlvmOfflineCacheFileReader::make(tmp_dir_str, llvm_fmt); { - auto *llvm_ctx = tlctx_->get_this_thread_context(); - LlvmOfflineCacheFileReader reader{tmp_dir_str, llvm_fmt}; LlvmOfflineCache::KernelCacheData kcache; - const bool ok = reader.get_kernel_cache(kcache, kKernelName, *llvm_ctx); + const bool ok = reader->get_kernel_cache(kcache, kKernelName, *llvm_ctx); ASSERT_TRUE(ok); + EXPECT_EQ(kcache.kernel_key, kKernelName); + EXPECT_EQ(kcache.offloaded_task_list.size(), 1); + const auto &task0 = kcache.offloaded_task_list.front(); + EXPECT_EQ(task0.name, kTaskName); ASSERT_NE(kcache.owned_module, nullptr); kcache.module->dump(); @@ -114,8 +120,13 @@ TEST_P(LlvmOfflineCacheTest, ReadWrite) { FuncType my_add = (FuncType)tlctx_->lookup_function_pointer(kTaskName); const auto res = my_add(40, 2); EXPECT_EQ(res, 42); - } - fs::remove_all(tmp_dir); + }; + { + // Do it twice. No file IO this time. 
+ LlvmOfflineCache::KernelCacheData kcache; + const bool ok = reader->get_kernel_cache(kcache, kKernelName, *llvm_ctx); + ASSERT_TRUE(ok); + }; } INSTANTIATE_TEST_SUITE_P(Format, diff --git a/tests/python/test_offline_cache.py b/tests/python/test_offline_cache.py index b89081b841009..99c6203ff80df 100644 --- a/tests/python/test_offline_cache.py +++ b/tests/python/test_offline_cache.py @@ -14,7 +14,14 @@ v for v in supported_archs_offline_cache if v in test_utils.expected_archs() ] -cache_files_num_per_kernel = 2 + + +def get_expected_num_cache_files(num_kernels: int) -> int: + if num_kernels == 0: + return 0 + NUM_CACHE_FILES_PER_KERNEL = 1 + # metadata.{json, tcb} + return 2 + NUM_CACHE_FILES_PER_KERNEL * num_kernels def tmp_offline_cache_file_path(): @@ -121,20 +128,20 @@ def _test_offline_cache_for_a_kernel(curr_arch, kernel, args, result): **current_thread_ext_options()) res1 = kernel(*args) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 1 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(1) res2 = kernel(*args) assert res1 == test_utils.approx(result) and res1 == test_utils.approx( res2) ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 1 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(1) @_test_offline_cache_dec @@ -146,20 +153,21 @@ def _test_closing_offline_cache_for_a_kernel(curr_arch, kernel, args, result): offline_cache_file_path=tmp_offline_cache_file_path()) res1 = kernel(*args) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) 
ti.init(arch=curr_arch, enable_fallback=False, offline_cache_file_path=tmp_offline_cache_file_path()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) res2 = kernel(*args) + assert res1 == test_utils.approx(result) and res1 == test_utils.approx( res2) ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -209,18 +217,18 @@ def compute_y(): **current_thread_ext_options()) helper() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 8 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(8) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 8 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(8) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -244,7 +252,7 @@ def np_kernel(a, b): np_mat3 = mat3.to_numpy() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) @@ -254,7 +262,7 @@ def np_kernel(a, b): enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 1 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(1) assert (kernel(mat1, 
mat1).to_numpy() == np_kernel(np_mat1, np_mat1)).all() assert (kernel(mat1, mat2).to_numpy() == np_kernel(np_mat1, np_mat2)).all() @@ -263,7 +271,7 @@ def np_kernel(a, b): ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 1 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(1) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -286,7 +294,7 @@ def helper(): assert y[None] == test_utils.approx(7.28) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) @@ -296,12 +304,12 @@ def helper(): enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 4 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(4) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 4 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(4) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -322,7 +330,7 @@ def helper(): assert a[4][9] == 9 assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) @@ -332,12 +340,12 @@ def helper(): enable_fallback=False, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 2 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(2) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 2 * cache_files_num_per_kernel + ) - count_of_cache_file == 
get_expected_num_cache_files(2) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -354,19 +362,19 @@ def helper(): **current_thread_ext_options()) helper() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, **current_thread_ext_options()) - assert len(listdir( - tmp_offline_cache_file_path())) - count_of_cache_file == len( - simple_kernels_to_test) * cache_files_num_per_kernel + assert len(listdir(tmp_offline_cache_file_path()) + ) - count_of_cache_file == get_expected_num_cache_files( + len(simple_kernels_to_test)) helper() ti.reset() - assert len(listdir( - tmp_offline_cache_file_path())) - count_of_cache_file == len( - simple_kernels_to_test) * cache_files_num_per_kernel + assert len(listdir(tmp_offline_cache_file_path()) + ) - count_of_cache_file == get_expected_num_cache_files( + len(simple_kernels_to_test)) @pytest.mark.parametrize('curr_arch', supported_archs_offline_cache) @@ -383,7 +391,7 @@ def helper(): c += i assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 0 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(0) ti.init(arch=curr_arch, enable_fallback=False, default_fp=ti.f32, @@ -395,13 +403,12 @@ def helper(): default_fp=ti.f64, **current_thread_ext_options()) assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 1 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(1) helper() ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file == 2 * cache_files_num_per_kernel - + ) - count_of_cache_file == get_expected_num_cache_files(2) ti.init(arch=curr_arch, enable_fallback=False, default_fp=ti.f32, @@ -411,4 +418,4 @@ def helper(): ti.reset() assert len(listdir(tmp_offline_cache_file_path()) - ) - count_of_cache_file 
== 2 * cache_files_num_per_kernel + ) - count_of_cache_file == get_expected_num_cache_files(2) From 6764b88af0e2af03603b637f568224c09ca7536f Mon Sep 17 00:00:00 2001 From: Justin <62801799+Justinterest@users.noreply.github.com> Date: Tue, 17 May 2022 14:22:12 +0800 Subject: [PATCH 075/176] [Doc] Fix docs deploy netlify test configuration (#4991) * fix docs deploy netlify test configuration * check netlify change to run docs preview --- netlify.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/netlify.toml b/netlify.toml index 02de37e049ec1..f5cd94992176f 100644 --- a/netlify.toml +++ b/netlify.toml @@ -1,6 +1,6 @@ [build] - command = "git clone https://github.com/taichi-dev/docs.taichi.graphics.git; rm -rf docs.taichi.graphics/website/docs/lang; cp -rf docs/lang docs.taichi.graphics/website/docs/lang; git clone https://github.com/taichi-dev/docstring-gen docsgen; export DOCSTRING_GEN_PATH=\"$(pwd)/docsgen\"; export TAICHI_PATH=\"$(pwd)/python/taichi\"; export TAICHI_WEBSITE=\"$(pwd)/docs.taichi.graphics\"; pip install sphinx-autoapi==1.8.4 gitpython pydata-sphinx-theme==0.7.2; cd $DOCSTRING_GEN_PATH/experimental; export current_version=master; make clean; make version; make apideploy; cd $TAICHI_WEBSITE/website; npm install --global yarn@1.22; yarn install; yarn build; yarn run apiversion;" + command = "git clone https://github.com/taichi-dev/docs.taichi.graphics.git; git clone https://github.com/taichi-dev/docstring-gen docsgen; export DOCSTRING_GEN_PATH=\"$(pwd)/docsgen\"; export TAICHI_PATH=\"$(pwd)/python/taichi\"; export TAICHI_MAIN=\"$(pwd)\"; export TAICHI_WEBSITE=\"$(pwd)/docs.taichi.graphics\"; pip install sphinx-autoapi==1.8.4 gitpython pydata-sphinx-theme==0.7.2; cd $DOCSTRING_GEN_PATH/experimental; export current_version=master; make clean; make version; make apideploy; cd $TAICHI_WEBSITE/website; npm install --global yarn@1.22; yarn install; ./sync_docs.sh dev; yarn build;" publish = "docs.taichi.graphics/website/build" # 
Cancel the build if there're no changes detected in docs/ folder. - ignore = "git remote add upstream https://github.com/taichi-dev/taichi.git; git fetch upstream master; git diff --quiet $COMMIT_REF upstream/master -- docs/ python/" + ignore = "git remote add upstream https://github.com/taichi-dev/taichi.git; git fetch upstream master; git diff --quiet $COMMIT_REF upstream/master -- docs/ python/ netlify.toml" From ac1f06f36f292d800a0f0b6b2f3b38ea66867fd4 Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Tue, 17 May 2022 14:43:05 +0800 Subject: [PATCH 076/176] [Doc] Updated URL (#4990) * Updated URL * Updated URL * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .github/ISSUE_TEMPLATE/bug_report.md | 2 +- .github/ISSUE_TEMPLATE/config.yml | 2 +- .github/pull_request_template.md | 4 ++-- CMakeLists.txt | 2 +- CONTRIBUTING.md | 4 ++-- README.md | 6 +++--- benchmarks/microbenchmarks/_items.py | 2 +- build.ps1 | 4 ++-- python/taichi/_lib/utils.py | 3 +-- python/taichi/_logging.py | 4 ++-- python/taichi/lang/impl.py | 6 +++--- python/taichi/lang/kernel_impl.py | 4 ++-- python/taichi/lang/matrix.py | 4 ++-- python/taichi/lang/misc.py | 4 ++-- python/taichi/lang/snode.py | 2 +- python/taichi/profiler/kernel_metrics.py | 2 +- python/taichi/profiler/kernel_profiler.py | 4 ++-- python/taichi/types/annotations.py | 2 +- taichi/analysis/verify.cpp | 2 +- taichi/backends/cuda/cupti_toolkit.cpp | 4 ++-- taichi/gui/x11.cpp | 2 +- taichi/ir/snode.h | 2 +- taichi/system/traceback.cpp | 2 +- taichi/transforms/reverse_segments.cpp | 2 +- tests/python/test_fields_builder.py | 2 +- tests/python/test_simple_matrix_slice.py | 2 +- 26 files changed, 39 insertions(+), 40 deletions(-) diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 
ba245f01619f8..8800d1b66d44d 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -7,7 +7,7 @@ assignees: '' --- - + **Describe the bug** A clear and concise description of what the bug is, ideally within 20 words. diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml index eaddd6150ca49..e5c9d52124c3d 100644 --- a/.github/ISSUE_TEMPLATE/config.yml +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -2,7 +2,7 @@ blank_issues_enabled: true contact_links: - name: Contributor Guideline - url: https://docs.taichi-lang.org/lang/articles/contributor_guide + url: https://docs.taichi-lang.org/docs/contributor_guide about: Please check this out if you'd like to contribute by opening a PR :) - name: Taichi Forum url: https://forum.taichi.graphics diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index b7f0ebaabc84d..4dd4981dfb48a 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -4,11 +4,11 @@ Related issue = # Thank you for your contribution! If it is your first time contributing to Taichi, please read our Contributor Guidelines: - https://docs.taichi-lang.org/lang/articles/contributor_guide + https://docs.taichi-lang.org/docs/contributor_guide - Please always prepend your PR title with tags such as [CUDA], [Lang], [Doc], [Example]. For a complete list of valid PR tags, please check out https://github.com/taichi-dev/taichi/blob/master/misc/prtags.json. - Use upper-case tags (e.g., [Metal]) for PRs that change public APIs. Otherwise, please use lower-case tags (e.g., [metal]). -- More details: https://docs.taichi-lang.org/lang/articles/contributor_guide#pr-title-format-and-tags +- More details: https://docs.taichi-lang.org/docs/contributor_guide#pr-title-format-and-tags - Please fill in the issue number that this PR relates to. 
- If your PR fixes the issue **completely**, use the `close` or `fixes` prefix so that GitHub automatically closes the issue when the PR is merged. For example, diff --git a/CMakeLists.txt b/CMakeLists.txt index 0be966f98e101..9cee806f27ea3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,7 @@ cmake_minimum_required(VERSION 3.15) project(taichi) if (NOT DEFINED TI_VERSION_MAJOR) - message(WARNING "It seems that you are running cmake manually, which may cause issues. Please use setup.py to build taichi from source, see https://docs.taichi-lang.org/lang/articles/dev_install for more details.") + message(WARNING "It seems that you are running cmake manually, which may cause issues. Please use setup.py to build taichi from source, see https://docs.taichi-lang.org/docs/dev_install for more details.") set(TI_VERSION_MAJOR 0) set(TI_VERSION_MINOR 0) set(TI_VERSION_PATCH 0) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b0d3ad75cb08a..0ed8205c7f1b9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,12 +1,12 @@ # Contributing Guide -Thank you for your interest in contributing to Taichi! Please check out the [Contribution Guidelines](https://docs.taichi-lang.org/lang/articles/contributor_guide) for how to make a contribution. +Thank you for your interest in contributing to Taichi! Please check out the [Contribution Guidelines](https://docs.taichi-lang.org/docs/contributor_guide) for how to make a contribution. All contributors are expected to follow the [code of conduct](https://github.com/taichi-dev/taichi/blob/master/CODE_OF_CONDUCT.md). ## Developer installation -Taichi is developed mainly in C++17 and Python3. Please check out the [Developer Installation](https://docs.taichi-lang.org/lang/articles/dev_install) to build Taichi from source. Note that Taichi is LLVM-10.0.0 dependent and that we recommend installing [our pre-built LLVM libraries](https://docs.taichi-lang.org/lang/articles/dev_install#install-llvm) for your platform. 
+Taichi is developed mainly in C++17 and Python3. Please check out the [Developer Installation](https://docs.taichi-lang.org/docs/dev_install) to build Taichi from source. Note that Taichi is LLVM-10.0.0 dependent and that we recommend installing [our pre-built LLVM libraries](https://docs.taichi-lang.org/docs/dev_install#install-llvm) for your platform. ## Contribution opportunities diff --git a/README.md b/README.md index 17006bcf0f375..2102cb3d840dd 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,10 @@ The language has broad applications spanning real-time physical simulation, numb ## Why Taichi? - Built around Python: Taichi shares almost the same syntax with Python, allowing you to write algorithms with minimal language barrier. It is also well integrated into the Python ecosystem, including NumPy and PyTorch. -- Flexibility: Taichi provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/lang/articles/sparse)). +- Flexibility: Taichi provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/docs/sparse)). - Performance: With the `@ti.kernel` decorator, Taichi's JIT compiler automatically compiles your Python functions into efficient GPU or CPU machine code for parallel execution. - Portability: Write your code once and run it everywhere. Currently, Taichi supports most mainstream GPU APIs, such as CUDA and Vulkan. -- ... and many more features! 
A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/lang/articles/differentiable_programming), [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), etc. +- ... and many more features! A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/docs/differentiable_programming), [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), etc. ## Getting Started @@ -123,7 +123,7 @@ See [Get started](https://docs.taichi-lang.org) for more information. ### Build from source -If you wish to try our our experimental features or build Taichi for your own environments, see [Developer installation](https://docs.taichi-lang.org/lang/articles/dev_install). +If you wish to try our our experimental features or build Taichi for your own environments, see [Developer installation](https://docs.taichi-lang.org/docs/dev_install). ## Documentation diff --git a/benchmarks/microbenchmarks/_items.py b/benchmarks/microbenchmarks/_items.py index c8b8651f322ea..ed24fe8cd264d 100644 --- a/benchmarks/microbenchmarks/_items.py +++ b/benchmarks/microbenchmarks/_items.py @@ -67,7 +67,7 @@ def __init__(self): class MathOps(BenchmarkItem): name = 'math_op' - #reference: https://docs.taichi-lang.org/lang/articles/operator + #reference: https://docs.taichi-lang.org/docs/operator def __init__(self): self._items = { # Trigonometric diff --git a/build.ps1 b/build.ps1 index fdd90f4544b0c..9b12f770a79d8 100644 --- a/build.ps1 +++ b/build.ps1 @@ -110,7 +110,7 @@ if (-not $LlvmDir) { } if (-not $LlvmDir -or -not (Test-Path $LlvmDir)) { throw "LLVM cannot be found in local environment and the script failed to download a prebuilt archive. " + - "Please follow the instructions at 'https://docs.taichi-lang.org/lang/articles/dev_install' to manually configure LLVM for Taichi." 
+ "Please follow the instructions at 'https://docs.taichi-lang.org/docs/dev_install' to manually configure LLVM for Taichi." } else { $LlvmDir = (Resolve-Path $LlvmDir).Path; $env:LLVM_DIR = $LlvmDir @@ -124,7 +124,7 @@ if (-not $ClangDir) { } if (-not $ClangDir -or -not (Test-Path $ClangDir)) { throw "Clang cannot be found in local environment and the script failed to download a prebuilt archive. " + - "Please follow the instructions at 'https://docs.taichi-lang.org/lang/articles/dev_install' to manually configure Clang for Taichi." + "Please follow the instructions at 'https://docs.taichi-lang.org/docs/dev_install' to manually configure Clang for Taichi." } else { $ClangDir = (Resolve-Path $ClangDir).Path; Write-Host "Using Clang at '$ClangDir'." diff --git a/python/taichi/_lib/utils.py b/python/taichi/_lib/utils.py index 542dde54e6c9c..364fd86345278 100644 --- a/python/taichi/_lib/utils.py +++ b/python/taichi/_lib/utils.py @@ -46,8 +46,7 @@ def import_ti_core(): if isinstance(e, ImportError): print(Fore.YELLOW + "Share object taichi_core import failed, " "check this page for possible solutions:\n" - "https://docs.taichi-lang.org/lang/articles/install" + - Fore.RESET) + "https://docs.taichi-lang.org/docs/install" + Fore.RESET) if get_os_name() == 'win': # pylint: disable=E1101 e.msg += '\nConsider installing Microsoft Visual C++ Redistributable: https://aka.ms/vs/16/release/vc_redist.x64.exe' diff --git a/python/taichi/_logging.py b/python/taichi/_logging.py index b58c1b5b76f66..596d32aae4504 100644 --- a/python/taichi/_logging.py +++ b/python/taichi/_logging.py @@ -36,7 +36,7 @@ def set_logging_level(level): also be effective. For example if `level` is set to 'warn', then the levels below it, which are 'error' and 'critical' in this case, will also be effective. - See also https://docs.taichi-lang.org/lang/articles/utilities#logging. + See also https://docs.taichi-lang.org/docs/utilities#logging. Args: level (str): Logging level. 
@@ -53,7 +53,7 @@ def is_logging_effective(level): All levels below current level will be effective. The default level is 'info'. - See also https://docs.taichi-lang.org/lang/articles/utilities#logging. + See also https://docs.taichi-lang.org/docs/utilities#logging. Args: level (str): The string represents logging level. \ diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index d9e02ee2e6131..90cfc7446d3af 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -482,7 +482,7 @@ def __repr__(self): root = _Root() """Root of the declared Taichi :func:`~taichi.lang.impl.field`s. -See also https://docs.taichi-lang.org/lang/articles/layout +See also https://docs.taichi-lang.org/docs/layout Example:: @@ -524,7 +524,7 @@ def field(dtype, shape=None, name="", offset=None, needs_grad=False): actually defined. The data in a Taichi field can be directly accessed by a Taichi :func:`~taichi.lang.kernel_impl.kernel`. - See also https://docs.taichi-lang.org/lang/articles/field + See also https://docs.taichi-lang.org/docs/field Args: dtype (DataType): data type of the field. @@ -779,7 +779,7 @@ def static(x, *xs): `static()` is what enables the so-called metaprogramming in Taichi. It is in many ways similar to ``constexpr`` in C++. - See also https://docs.taichi-lang.org/lang/articles/meta. + See also https://docs.taichi-lang.org/docs/meta. Args: x (Any): an expression to be evaluated diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 418b51bb8bba3..1f689717f42a2 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -892,7 +892,7 @@ def kernel(fn): Kernel's gradient kernel would be generated automatically by the AutoDiff system. - See also https://docs.taichi-lang.org/lang/articles/syntax#kernel. + See also https://docs.taichi-lang.org/docs/syntax#kernel. 
Args: fn (Callable): the Python function to be decorated @@ -941,7 +941,7 @@ def data_oriented(cls): To allow for modularized code, Taichi provides this decorator so that Taichi kernels can be defined inside a class. - See also https://docs.taichi-lang.org/lang/articles/odop + See also https://docs.taichi-lang.org/docs/odop Example:: diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 2cd17d3064e7d..a79d28ce118e7 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -134,7 +134,7 @@ def _linearize_entry_id(self, *args): 'If you want to *iterate through matrix elements*, use a static range:\n' ' for i in ti.static(range(3)):\n' ' print(i, "-th component is", vec[i])\n' - 'See https://docs.taichi-lang.org/lang/articles/meta#when-to-use-tistatic-with-for-loops for more details.' + 'See https://docs.taichi-lang.org/docs/meta#when-to-use-tistatic-with-for-loops for more details.' 'Or turn on ti.init(..., dynamic_index=True) to support indexing with variables!' ) assert 0 <= args[0] < self.n, \ @@ -455,7 +455,7 @@ def __init__(self, arr, dt=None, suppress_warning=False, is_ref=False): ' So the compilation time could be extremely long if the matrix size is too big.' ' You may use a field to store a large matrix like this, e.g.:\n' f' x = ti.field(ti.f32, ({self.n}, {self.m})).\n' - ' See https://docs.taichi-lang.org/lang/articles/field#matrix-size' + ' See https://docs.taichi-lang.org/docs/field#matrix-size' ' for more details.', UserWarning, stacklevel=2) diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index 0bfebb2da5973..0426dc62b2c2d 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -341,7 +341,7 @@ def init(arch=None, * ``cpu_max_num_threads`` (int): Sets the number of threads used by the CPU thread pool. * ``debug`` (bool): Enables the debug mode, under which Taichi does a few more things like boundary checks. 
* ``print_ir`` (bool): Prints the CHI IR of the Taichi kernels. - * ``packed`` (bool): Enables the packed memory layout. See https://docs.taichi-lang.org/lang/articles/layout. + * ``packed`` (bool): Enables the packed memory layout. See https://docs.taichi-lang.org/docs/layout. """ # Check version for users every 7 days if not disabled by users. _version_check.start_version_check_thread() @@ -476,7 +476,7 @@ def no_activate(*args): def block_local(*args): """Hints Taichi to cache the fields and to enable the BLS optimization. - Please visit https://docs.taichi-lang.org/lang/articles/performance + Please visit https://docs.taichi-lang.org/docs/performance for how BLS is used. Args: diff --git a/python/taichi/lang/snode.py b/python/taichi/lang/snode.py index 3b0c7460f3958..81c046af9f3a4 100644 --- a/python/taichi/lang/snode.py +++ b/python/taichi/lang/snode.py @@ -11,7 +11,7 @@ class SNode: For more information on Taichi's SNode system, please check out these references: - * https://docs.taichi-lang.org/lang/articles/sparse + * https://docs.taichi-lang.org/docs/sparse * https://yuanming.taichi.graphics/publication/2019-taichi/taichi-lang.pdf Arg: diff --git a/python/taichi/profiler/kernel_metrics.py b/python/taichi/profiler/kernel_metrics.py index 2e5bbd7036959..dd47ad98fcd57 100644 --- a/python/taichi/profiler/kernel_metrics.py +++ b/python/taichi/profiler/kernel_metrics.py @@ -44,7 +44,7 @@ class CuptiMetric: >>> ti.profiler.print_kernel_profiler_info('trace') Note: - For details about using CUPTI in Taichi, please visit https://docs.taichi-lang.org/lang/articles/profiler#advanced-mode. + For details about using CUPTI in Taichi, please visit https://docs.taichi-lang.org/docs/profiler#advanced-mode. 
""" def __init__(self, name='', diff --git a/python/taichi/profiler/kernel_profiler.py b/python/taichi/profiler/kernel_profiler.py index 5dea618b1a286..6bdf5de97f069 100644 --- a/python/taichi/profiler/kernel_profiler.py +++ b/python/taichi/profiler/kernel_profiler.py @@ -46,7 +46,7 @@ class KernelProfiler: This mode is only available for the CUDA backend with CUPTI toolkit, i.e. you need ``ti.init(kernel_profiler=True, arch=ti.cuda)``. Note: - For details about using CUPTI in Taichi, please visit https://docs.taichi-lang.org/lang/articles/profiler#advanced-mode. + For details about using CUPTI in Taichi, please visit https://docs.taichi-lang.org/docs/profiler#advanced-mode. """ def __init__(self): self._profiling_mode = False @@ -388,7 +388,7 @@ def print_kernel_profiler_info(mode='count'): Currently the result of `KernelProfiler` could be incorrect on OpenGL backend due to its lack of support for `ti.sync()`. - For advanced mode of `KernelProfiler`, please visit https://docs.taichi-lang.org/lang/articles/profiler#advanced-mode. + For advanced mode of `KernelProfiler`, please visit https://docs.taichi-lang.org/docs/profiler#advanced-mode. """ get_default_kernel_profiler().print_info(mode) diff --git a/python/taichi/types/annotations.py b/python/taichi/types/annotations.py index 4476fe628d5f0..491da6b529ee7 100644 --- a/python/taichi/types/annotations.py +++ b/python/taichi/types/annotations.py @@ -2,7 +2,7 @@ class Template: """Type annotation for template kernel parameter. Useful for passing parameters to kernels by reference. - See also https://docs.taichi-lang.org/lang/articles/meta. + See also https://docs.taichi-lang.org/docs/meta. Args: tensor (Any): unused diff --git a/taichi/analysis/verify.cpp b/taichi/analysis/verify.cpp index cc9df7fde4860..31fbbea3ac052 100644 --- a/taichi/analysis/verify.cpp +++ b/taichi/analysis/verify.cpp @@ -49,7 +49,7 @@ class IRVerifier : public BasicStmtVisitor { found, "IR broken: stmt {} {} cannot have operand {} {}." 
" If you are using autodiff, please check" - " https://docs.taichi-lang.org/lang/articles/" + " https://docs.taichi-lang.org/docs/" "differences_between_taichi_and_python_programs" " If it doesn't help, please report this bug by opening an issue at" " https://github.com/taichi-dev/taichi to help us improve." diff --git a/taichi/backends/cuda/cupti_toolkit.cpp b/taichi/backends/cuda/cupti_toolkit.cpp index 02698e9857e70..dda7a27b0f707 100644 --- a/taichi/backends/cuda/cupti_toolkit.cpp +++ b/taichi/backends/cuda/cupti_toolkit.cpp @@ -39,7 +39,7 @@ bool check_cupti_availability() { "7.0 , fallback to default kernel profiler"); TI_WARN( "See also: " - "https://docs.taichi-lang.org/lang/articles/profiler"); + "https://docs.taichi-lang.org/docs/profiler"); return false; } return true; @@ -106,7 +106,7 @@ bool check_cupti_privileges() { "================================================================="); TI_WARN( "See also: " - "https://docs.taichi-lang.org/lang/articles/profiler"); + "https://docs.taichi-lang.org/docs/profiler"); return false; } // For other errors , CuptiToolkit::init_cupti() will send error message. diff --git a/taichi/gui/x11.cpp b/taichi/gui/x11.cpp index da82e5b08fcf5..97e268fe16d33 100644 --- a/taichi/gui/x11.cpp +++ b/taichi/gui/x11.cpp @@ -151,7 +151,7 @@ void GUI::create_window() { "Taichi fails to create a window." " This is probably due to the lack of an X11 GUI environment." " Consider using the `ti.GUI(show_gui=False)` option, see" - " https://docs.taichi-lang.org/lang/articles/gui_system"); + " https://docs.taichi-lang.org/docs/gui_system"); visual = DefaultVisual(display, 0); window = XCreateSimpleWindow((Display *)display, RootWindow((Display *)display, 0), diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h index b6698574c8c32..a3e7f55987f0f 100644 --- a/taichi/ir/snode.h +++ b/taichi/ir/snode.h @@ -119,7 +119,7 @@ class SNode { std::string name; // Product of the |shape| of all the activated axes identified by // |extractors|. 
- // See https://docs.taichi-lang.org/lang/articles/internal for terms + // See https://docs.taichi-lang.org/docs/internal for terms // like cell and container. int64 num_cells_per_container{1}; int total_num_bits{0}; diff --git a/taichi/system/traceback.cpp b/taichi/system/traceback.cpp index 5ba9ae8a44975..e2e0e4d189f86 100644 --- a/taichi/system/traceback.cpp +++ b/taichi/system/traceback.cpp @@ -370,7 +370,7 @@ void print_traceback() { fmt::print( fg(fmt::color::orange), "\nInternal error occurred. Check out this page for possible solutions:\n" - "https://docs.taichi-lang.org/lang/articles/install\n"); + "https://docs.taichi-lang.org/docs/install\n"); } TI_NAMESPACE_END diff --git a/taichi/transforms/reverse_segments.cpp b/taichi/transforms/reverse_segments.cpp index c6148746a7c47..4065b0278af96 100644 --- a/taichi/transforms/reverse_segments.cpp +++ b/taichi/transforms/reverse_segments.cpp @@ -70,7 +70,7 @@ void reverse_segments(IRNode *root) { "Mixed usage of for-loops and statements without looping. \n" "Please split them into two kernels " "and check the documentation for more details:\n" - "https://docs.taichi-lang.org/lang/articles/" + "https://docs.taichi-lang.org/docs/" "differentiable_programming"); } for (auto &sblock : statement_blocks) { diff --git a/tests/python/test_fields_builder.py b/tests/python/test_fields_builder.py index 0e95bdb93d8e1..590fc2986e27a 100644 --- a/tests/python/test_fields_builder.py +++ b/tests/python/test_fields_builder.py @@ -139,7 +139,7 @@ def assign_field_multiple_struct_for(): # We currently only consider data types that all platforms support. -# See https://docs.taichi-lang.org/lang/articles/type#primitive-types for more details. +# See https://docs.taichi-lang.org/docs/type#primitive-types for more details. 
@pytest.mark.parametrize('test_1d_size', [1, 10, 100]) @pytest.mark.parametrize('field_type', [ti.f32, ti.i32]) @test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal]) diff --git a/tests/python/test_simple_matrix_slice.py b/tests/python/test_simple_matrix_slice.py index 68f1461b6b1de..4bd213e148cf3 100644 --- a/tests/python/test_simple_matrix_slice.py +++ b/tests/python/test_simple_matrix_slice.py @@ -62,7 +62,7 @@ def test_one_col_slice() -> ti.types.matrix(1, 3, dtype=ti.i32): r'If you want to \*iterate through matrix elements\*, use a static range:\n' r' for i in ti.static\(range\(3\)\):\n' r' print\(i, "-th component is", vec\[i\]\)\n' - r'See https://docs.taichi-lang.org/lang/articles/meta#when-to-use-tistatic-with-for-loops for more details.' + r'See https://docs.taichi-lang.org/docs/meta#when-to-use-tistatic-with-for-loops for more details.' r'Or turn on ti.init\(..., dynamic_index=True\) to support indexing with variables!' ): test_one_col_slice() From ed22d64f18eece247f8206f7fbba7236043c0e4b Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Tue, 17 May 2022 14:52:18 +0800 Subject: [PATCH 077/176] [Doc] Update trouble shooting URL in bug report template (#4988) From 84b6c8917f9f1ea20b90b17527a4f005e83f5f50 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Tue, 17 May 2022 18:05:01 +0800 Subject: [PATCH 078/176] [Lang] [type] Refactor quantized_types module and make quant APIs public (#4985) * [Type] Refactor quantized_types module and make quant APIs public * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix pylint Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- misc/benchmark_bit_struct_stores.py | 2 +- misc/visualize_quant_types.py | 6 +- python/taichi/__init__.py | 2 - python/taichi/lang/matrix.py | 1 - python/taichi/lang/misc.py | 3 - python/taichi/lang/struct.py | 2 +- python/taichi/types/__init__.py | 2 +- python/taichi/types/quantized_types.py | 218 
+++++++++--------- tests/python/test_bit_array.py | 4 +- tests/python/test_bit_array_vectorization.py | 6 +- tests/python/test_bit_struct.py | 23 +- tests/python/test_cast.py | 4 +- tests/python/test_custom_float.py | 14 +- tests/python/test_custom_float_exponents.py | 45 ++-- tests/python/test_custom_float_shared_exp.py | 64 ++--- .../test_custom_float_time_integration.py | 15 +- tests/python/test_custom_int.py | 2 +- tests/python/test_custom_type_atomics.py | 18 +- tests/python/test_matrix_different_type.py | 8 +- tests/python/test_snode_layout_inspection.py | 5 +- tests/python/test_struct_for.py | 2 +- 21 files changed, 217 insertions(+), 229 deletions(-) diff --git a/misc/benchmark_bit_struct_stores.py b/misc/benchmark_bit_struct_stores.py index 748b41a06bb9b..dcf1b9a8a85b9 100644 --- a/misc/benchmark_bit_struct_stores.py +++ b/misc/benchmark_bit_struct_stores.py @@ -7,7 +7,7 @@ n = 1024 * 1024 * 256 if quant: - ci16 = ti.types.quantized_types.quant.int(16, True) + ci16 = ti.types.quant.int(16, True) x = ti.field(dtype=ci16) y = ti.field(dtype=ci16) diff --git a/misc/visualize_quant_types.py b/misc/visualize_quant_types.py index 6f51d5ab03ada..62fb507a456a4 100644 --- a/misc/visualize_quant_types.py +++ b/misc/visualize_quant_types.py @@ -7,9 +7,9 @@ ti.init() -f19 = ti.types.quantized_types.quant.float(exp=6, frac=13, signed=True) -f16 = ti.types.quantized_types.quant.float(exp=5, frac=11, signed=True) -fixed16 = ti.types.quantized_types.quant.fixed(frac=16, range=2) +f19 = ti.types.quant.float(exp=6, frac=13, signed=True) +f16 = ti.types.quant.float(exp=5, frac=11, signed=True) +fixed16 = ti.types.quant.fixed(frac=16, range=2) vf19 = ti.Vector.field(2, dtype=f19) bs_vf19 = ti.root.bit_struct(num_bits=32) diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py index 2d799e7362953..e89a221b99575 100644 --- a/python/taichi/__init__.py +++ b/python/taichi/__init__.py @@ -37,8 +37,6 @@ 'imresize': 'tools.imresize', 'imshow': 'tools.imshow', 'imwrite': 
'tools.imwrite', - 'quant': 'types.quantized_types.quant', - 'type_factory': 'types.quantized_types.type_factory', 'ext_arr': 'types.ndarray', 'any_arr': 'types.ndarray' } diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index a79d28ce118e7..c5160c22d9dd1 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1435,7 +1435,6 @@ def __init__(self, _vars, n, m): def get_scalar_field(self, *indices): """Creates a ScalarField using a specific field member. - Only used for quant. Args: indices (Tuple[Int]): Specified indices of the field member. diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index 0426dc62b2c2d..1af43927d945b 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -170,9 +170,6 @@ timeline_clear = lambda: impl.get_runtime().prog.timeline_clear() # pylint: disable=unnecessary-lambda timeline_save = lambda fn: impl.get_runtime().prog.timeline_save(fn) # pylint: disable=unnecessary-lambda -# Legacy API -type_factory_ = _ti_core.get_type_factory_instance() - extension = _ti_core.Extension """An instance of Taichi extension. diff --git a/python/taichi/lang/struct.py b/python/taichi/lang/struct.py index b2965f3f4c7bc..cb750b8eee578 100644 --- a/python/taichi/lang/struct.py +++ b/python/taichi/lang/struct.py @@ -472,7 +472,7 @@ def _initialize_host_accessors(self): v._initialize_host_accessors() def get_member_field(self, key): - """Creates a ScalarField using a specific field member. Only used for quant. + """Creates a ScalarField using a specific field member. Args: key (str): Specified key of the field member. diff --git a/python/taichi/types/__init__.py b/python/taichi/types/__init__.py index 20b00ca94e0fc..7360e9786f1ec 100644 --- a/python/taichi/types/__init__.py +++ b/python/taichi/types/__init__.py @@ -7,9 +7,9 @@ - ndarray: for arbitrary arrays. 
- quantized: for quantized types, see "https://yuanming.taichi.graphics/publication/2021-quantaichi/quantaichi.pdf" """ +from taichi.types import quantized_types as quant from taichi.types.annotations import * from taichi.types.compound_types import * from taichi.types.ndarray_type import * from taichi.types.primitive_types import * -from taichi.types.quantized_types import * from taichi.types.utils import * diff --git a/python/taichi/types/quantized_types.py b/python/taichi/types/quantized_types.py index 19a6dec44a2cb..7bb9297b284a0 100644 --- a/python/taichi/types/quantized_types.py +++ b/python/taichi/types/quantized_types.py @@ -1,129 +1,117 @@ +""" +This module defines generators of quantized types. +For more details, read https://yuanming.taichi.graphics/publication/2021-quantaichi/quantaichi.pdf. +""" from taichi._lib.utils import ti_core as _ti_core from taichi.lang import impl from taichi.types.primitive_types import i32 +_type_factory = _ti_core.get_type_factory_instance() -class TypeFactory: - """A Python-side TypeFactory wrapper.""" - def __init__(self): - self.core = _ti_core.get_type_factory_instance() - - def custom_int(self, bits, signed=True, compute_type=None): - """Generates a custom int type. - - Args: - bits (int): Number of bits. - signed (bool): Signed or unsigned. - compute_type (DataType): Type for computation. - - Returns: - DataType: The specified type. - """ - if compute_type is None: - compute_type = impl.get_runtime().default_ip - if isinstance(compute_type, _ti_core.DataType): - compute_type = compute_type.get_ptr() - return self.core.get_custom_int_type(bits, signed, compute_type) - - def custom_float(self, - significand_type, - exponent_type=None, - compute_type=None, - scale=1.0): - """Generates a custom float type. - - Args: - significand_type (DataType): Type of significand. - exponent_type (DataType): Type of exponent. - compute_type (DataType): Type for computation. - scale (float): Scaling factor. 
- - Returns: - DataType: The specified type. - """ - if compute_type is None: - compute_type = impl.get_runtime().default_fp - if isinstance(compute_type, _ti_core.DataType): - compute_type = compute_type.get_ptr() - return self.core.get_custom_float_type(significand_type, + +def _custom_int(bits, signed=True, compute_type=None): + """Generates a custom int type. + + Args: + bits (int): Number of bits. + signed (bool): Signed or unsigned. + compute_type (DataType): Type for computation. + + Returns: + DataType: The specified type. + """ + if compute_type is None: + compute_type = impl.get_runtime().default_ip + if isinstance(compute_type, _ti_core.DataType): + compute_type = compute_type.get_ptr() + return _type_factory.get_custom_int_type(bits, signed, compute_type) + + +def _custom_float(significand_type, + exponent_type=None, + compute_type=None, + scale=1.0): + """Generates a custom float type. + + Args: + significand_type (DataType): Type of significand. + exponent_type (DataType): Type of exponent. + compute_type (DataType): Type for computation. + scale (float): Scaling factor. + + Returns: + DataType: The specified type. + """ + if compute_type is None: + compute_type = impl.get_runtime().default_fp + if isinstance(compute_type, _ti_core.DataType): + compute_type = compute_type.get_ptr() + return _type_factory.get_custom_float_type(significand_type, exponent_type, compute_type, scale=scale) -# Unstable API -type_factory = TypeFactory() +def int(bits, signed=False, compute=None): # pylint: disable=W0622 + """Generates a quantized type for integers. + Args: + bits (int): Number of bits. + signed (bool): Signed or unsigned. + compute (DataType): Type for computation. -class Quant: - """Generator of quantized types. + Returns: + DataType: The specified type. 
+ """ + if compute is None: + compute = impl.get_runtime().default_ip + return _custom_int(bits, signed, compute) + + +def fixed(frac, signed=True, num_range=1.0, compute=None): + """Generates a quantized type for fixed-point real numbers. - For more details, read https://yuanming.taichi.graphics/publication/2021-quantaichi/quantaichi.pdf. + Args: + frac (int): Number of bits. + signed (bool): Signed or unsigned. + num_range (float): Range of the number. + compute (DataType): Type for computation. + + Returns: + DataType: The specified type. + """ + # TODO: handle cases with frac > 32 + frac_type = int(bits=frac, signed=signed, compute=i32) + if signed: + scale = num_range / 2**(frac - 1) + else: + scale = num_range / 2**frac + if compute is None: + compute = impl.get_runtime().default_fp + return _custom_float(frac_type, None, compute, scale) + + +def float(exp, frac, signed=True, compute=None): # pylint: disable=W0622 + """Generates a quantized type for floating-point real numbers. + + Args: + exp (int): Number of exponent bits. + frac (int): Number of fraction bits. + signed (bool): Signed or unsigned. + compute (DataType): Type for computation. + + Returns: + DataType: The specified type. """ - @staticmethod - def int(bits, signed=False, compute=None): - """Generates a quantized type for integers. - - Args: - bits (int): Number of bits. - signed (bool): Signed or unsigned. - compute (DataType): Type for computation. - - Returns: - DataType: The specified type. - """ - if compute is None: - compute = impl.get_runtime().default_ip - return type_factory.custom_int(bits, signed, compute) - - @staticmethod - def fixed(frac, signed=True, num_range=1.0, compute=None): - """Generates a quantized type for fixed-point real numbers. - - Args: - frac (int): Number of bits. - signed (bool): Signed or unsigned. - num_range (float): Range of the number. - compute (DataType): Type for computation. - - Returns: - DataType: The specified type. 
- """ - # TODO: handle cases with frac > 32 - frac_type = Quant.int(bits=frac, signed=signed, compute=i32) - if signed: - scale = num_range / 2**(frac - 1) - else: - scale = num_range / 2**frac - if compute is None: - compute = impl.get_runtime().default_fp - return type_factory.custom_float(frac_type, None, compute, scale) - - @staticmethod - def float(exp, frac, signed=True, compute=None): - """Generates a quantized type for floating-point real numbers. - - Args: - exp (int): Number of exponent bits. - frac (int): Number of fraction bits. - signed (bool): Signed or unsigned. - compute (DataType): Type for computation. - - Returns: - DataType: The specified type. - """ - # Exponent is always unsigned - exp_type = Quant.int(bits=exp, signed=False, compute=i32) - # TODO: handle cases with frac > 32 - frac_type = Quant.int(bits=frac, signed=signed, compute=i32) - if compute is None: - compute = impl.get_runtime().default_fp - return type_factory.custom_float(significand_type=frac_type, - exponent_type=exp_type, - compute_type=compute) - - -# Unstable API -quant = Quant - -__all__ = [] + # Exponent is always unsigned + exp_type = int(bits=exp, signed=False, compute=i32) + # TODO: handle cases with frac > 32 + frac_type = int(bits=frac, signed=signed, compute=i32) + if compute is None: + compute = impl.get_runtime().default_fp + return _custom_float(significand_type=frac_type, + exponent_type=exp_type, + compute_type=compute) + + +__all__ = ['int', 'fixed', 'float'] diff --git a/tests/python/test_bit_array.py b/tests/python/test_bit_array.py index d5426ab3c9f8c..3c3f76fc30d87 100644 --- a/tests/python/test_bit_array.py +++ b/tests/python/test_bit_array.py @@ -6,7 +6,7 @@ @test_utils.test(require=ti.extension.quant, debug=True) def test_1D_bit_array(): - cu1 = ti.types.quantized_types.quant.int(1, False) + cu1 = ti.types.quant.int(1, False) x = ti.field(dtype=cu1) @@ -30,7 +30,7 @@ def verify_val(): @test_utils.test(require=ti.extension.quant, debug=True) def 
test_2D_bit_array(): - ci1 = ti.types.quantized_types.quant.int(1, False) + ci1 = ti.types.quant.int(1, False) x = ti.field(dtype=ci1) diff --git a/tests/python/test_bit_array_vectorization.py b/tests/python/test_bit_array_vectorization.py index 02afb38ec2f3c..44adc9e1943df 100644 --- a/tests/python/test_bit_array_vectorization.py +++ b/tests/python/test_bit_array_vectorization.py @@ -8,7 +8,7 @@ debug=True, cfg_optimization=False) def test_vectorized_struct_for(): - cu1 = ti.types.quantized_types.quant.int(1, False) + cu1 = ti.types.quant.int(1, False) x = ti.field(dtype=cu1) y = ti.field(dtype=cu1) @@ -49,7 +49,7 @@ def verify(): @test_utils.test(require=ti.extension.quant) def test_offset_load(): - ci1 = ti.types.quantized_types.quant.int(1, False) + ci1 = ti.types.quant.int(1, False) x = ti.field(dtype=ci1) y = ti.field(dtype=ci1) @@ -109,7 +109,7 @@ def verify(dx: ti.template(), dy: ti.template()): @test_utils.test(require=ti.extension.quant, debug=True) def test_evolve(): - ci1 = ti.types.quantized_types.quant.int(1, False) + ci1 = ti.types.quant.int(1, False) x = ti.field(dtype=ci1) y = ti.field(dtype=ci1) diff --git a/tests/python/test_bit_struct.py b/tests/python/test_bit_struct.py index ef1e6ce49c263..a7333802d86e5 100644 --- a/tests/python/test_bit_struct.py +++ b/tests/python/test_bit_struct.py @@ -7,8 +7,8 @@ @test_utils.test(require=ti.extension.quant_basic, debug=True) def test_simple_array(): - ci13 = ti.types.quantized_types.quant.int(13, True) - cu19 = ti.types.quantized_types.quant.int(19, False) + ci13 = ti.types.quant.int(13, True) + cu19 = ti.types.quant.int(19, False) x = ti.field(dtype=ci13) y = ti.field(dtype=cu19) @@ -42,9 +42,9 @@ def verify_val(): exclude=[ti.metal], debug=True) def test_custom_int_load_and_store(): - ci13 = ti.types.quantized_types.quant.int(13, True) - cu14 = ti.types.quantized_types.quant.int(14, False) - ci5 = ti.types.quantized_types.quant.int(5, True) + ci13 = ti.types.quant.int(13, True) + cu14 = 
ti.types.quant.int(14, False) + ci5 = ti.types.quant.int(5, True) x = ti.field(dtype=ci13) y = ti.field(dtype=cu14) @@ -83,7 +83,7 @@ def verify_val(idx: ti.i32): @test_utils.test(require=ti.extension.quant_basic) def test_custom_int_full_struct(): - cit = ti.types.quantized_types.quant.int(32, True) + cit = ti.types.quant.int(32, True) x = ti.field(dtype=cit) ti.root.dense(ti.i, 1).bit_struct(num_bits=32).place(x) @@ -99,12 +99,9 @@ def test_single_bit_struct(physical_type, compute_type, custom_bits, test_case): ti.init(arch=ti.cpu, debug=True) - cit1 = ti.types.quantized_types.quant.int(custom_bits[0], True, - compute_type) - cit2 = ti.types.quantized_types.quant.int(custom_bits[1], False, - compute_type) - cit3 = ti.types.quantized_types.quant.int(custom_bits[2], True, - compute_type) + cit1 = ti.types.quant.int(custom_bits[0], True, compute_type) + cit2 = ti.types.quant.int(custom_bits[1], False, compute_type) + cit3 = ti.types.quant.int(custom_bits[2], True, compute_type) a = ti.field(dtype=cit1) b = ti.field(dtype=cit2) @@ -151,7 +148,7 @@ def test_bit_struct_struct_for(): block_size = 16 N = 64 cell = ti.root.pointer(ti.i, N // block_size) - fixed32 = ti.types.quantized_types.quant.fixed(frac=32, num_range=1024) + fixed32 = ti.types.quant.fixed(frac=32, num_range=1024) x = ti.field(dtype=fixed32) cell.dense(ti.i, block_size).bit_struct(32).place(x) diff --git a/tests/python/test_cast.py b/tests/python/test_cast.py index 0bcca774a2b3a..06d48ce73e438 100644 --- a/tests/python/test_cast.py +++ b/tests/python/test_cast.py @@ -145,8 +145,8 @@ def test_custom_int_extension(): x = ti.field(dtype=ti.i32, shape=2) y = ti.field(dtype=ti.u32, shape=2) - ci5 = ti.types.quantized_types.quant.int(5, True, ti.i16) - cu7 = ti.types.quantized_types.quant.int(7, False, ti.u16) + ci5 = ti.types.quant.int(5, True, ti.i16) + cu7 = ti.types.quant.int(7, False, ti.u16) a = ti.field(dtype=ci5) b = ti.field(dtype=cu7) diff --git a/tests/python/test_custom_float.py 
b/tests/python/test_custom_float.py index aae23362038ab..5043e4d552df5 100644 --- a/tests/python/test_custom_float.py +++ b/tests/python/test_custom_float.py @@ -8,7 +8,7 @@ @test_utils.test(require=ti.extension.quant_basic) def test_custom_float(): - cft = ti.types.quantized_types.quant.fixed(frac=32, num_range=2) + cft = ti.types.quant.fixed(frac=32, num_range=2) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -29,7 +29,7 @@ def foo(): @test_utils.test(require=ti.extension.quant_basic) def test_custom_matrix_rotation(): - cft = ti.types.quantized_types.quant.fixed(frac=16, num_range=1.2) + cft = ti.types.quant.fixed(frac=16, num_range=1.2) x = ti.Matrix.field(2, 2, dtype=cft) @@ -57,9 +57,8 @@ def rotate_18_degrees(): @test_utils.test(require=ti.extension.quant_basic) def test_custom_float_implicit_cast(): - ci13 = ti.types.quantized_types.quant.int(bits=13) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=ci13, scale=0.1) + ci13 = ti.types.quant.int(bits=13) + cft = ti.types.quant._custom_float(significand_type=ci13, scale=0.1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -74,9 +73,8 @@ def foo(): @test_utils.test(require=ti.extension.quant_basic) def test_cache_read_only(): - ci15 = ti.types.quantized_types.quant.int(bits=15) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=ci15, scale=0.1) + ci15 = ti.types.quant.int(bits=15) + cft = ti.types.quant._custom_float(significand_type=ci15, scale=0.1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) diff --git a/tests/python/test_custom_float_exponents.py b/tests/python/test_custom_float_exponents.py index 6ede9f03a6498..1411d566ae232 100644 --- a/tests/python/test_custom_float_exponents.py +++ b/tests/python/test_custom_float_exponents.py @@ -8,10 +8,11 @@ @test_utils.test(require=ti.extension.quant) def test_custom_float_unsigned(): - cu13 = ti.types.quantized_types.quant.int(13, False) - exp = 
ti.types.quantized_types.quant.int(6, False) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=cu13, exponent_type=exp, scale=1) + cu13 = ti.types.quant.int(13, False) + exp = ti.types.quant.int(6, False) + cft = ti.types.quant._custom_float(significand_type=cu13, + exponent_type=exp, + scale=1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -30,10 +31,11 @@ def test_custom_float_unsigned(): @test_utils.test(require=ti.extension.quant) def test_custom_float_signed(): - cu13 = ti.types.quantized_types.quant.int(13, True) - exp = ti.types.quantized_types.quant.int(6, False) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=cu13, exponent_type=exp, scale=1) + cu13 = ti.types.quant.int(13, True) + exp = ti.types.quant.int(6, False) + cft = ti.types.quant._custom_float(significand_type=cu13, + exponent_type=exp, + scale=1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -61,10 +63,11 @@ def test_custom_float_signed(): @pytest.mark.parametrize('digits_bits', [23, 24]) @test_utils.test(require=ti.extension.quant) def test_custom_float_precision(digits_bits): - cu24 = ti.types.quantized_types.quant.int(digits_bits, True) - exp = ti.types.quantized_types.quant.int(8, False) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=cu24, exponent_type=exp, scale=1) + cu24 = ti.types.quant.int(digits_bits, True) + exp = ti.types.quant.int(8, False) + cft = ti.types.quant._custom_float(significand_type=cu24, + exponent_type=exp, + scale=1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -85,10 +88,11 @@ def test_custom_float_precision(digits_bits): @pytest.mark.parametrize('signed', [True, False]) @test_utils.test(require=ti.extension.quant) def test_custom_float_truncation(signed): - cit = ti.types.quantized_types.quant.int(2, signed) - exp = ti.types.quantized_types.quant.int(5, False) - cft = 
ti.types.quantized_types.type_factory.custom_float( - significand_type=cit, exponent_type=exp, scale=1) + cit = ti.types.quant.int(2, signed) + exp = ti.types.quant.int(5, False) + cft = ti.types.quant._custom_float(significand_type=cit, + exponent_type=exp, + scale=1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -116,10 +120,11 @@ def test_custom_float_truncation(signed): @test_utils.test(require=ti.extension.quant) def test_custom_float_atomic_demotion(): - cit = ti.types.quantized_types.quant.int(2, True) - exp = ti.types.quantized_types.quant.int(5, False) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit, exponent_type=exp, scale=1) + cit = ti.types.quant.int(2, True) + exp = ti.types.quant.int(5, False) + cft = ti.types.quant._custom_float(significand_type=cit, + exponent_type=exp, + scale=1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) diff --git a/tests/python/test_custom_float_shared_exp.py b/tests/python/test_custom_float_shared_exp.py index 02e9da00b8dec..6f56ab96b290f 100644 --- a/tests/python/test_custom_float_shared_exp.py +++ b/tests/python/test_custom_float_shared_exp.py @@ -8,13 +8,15 @@ @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_shared_exponents(exponent_bits): - exp = ti.types.quantized_types.quant.int(exponent_bits, False) - cit1 = ti.types.quantized_types.quant.int(10, False) - cit2 = ti.types.quantized_types.quant.int(14, False) - cft1 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit1, exponent_type=exp, scale=1) - cft2 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit2, exponent_type=exp, scale=1) + exp = ti.types.quant.int(exponent_bits, False) + cit1 = ti.types.quant.int(10, False) + cit2 = ti.types.quant.int(14, False) + cft1 = ti.types.quant._custom_float(significand_type=cit1, + exponent_type=exp, + scale=1) + cft2 = 
ti.types.quant._custom_float(significand_type=cit2, + exponent_type=exp, + scale=1) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) @@ -76,13 +78,15 @@ def foo(x: ti.f32, y: ti.f32): @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_shared_exponent_add(exponent_bits): - exp = ti.types.quantized_types.quant.int(exponent_bits, False) - cit1 = ti.types.quantized_types.quant.int(10, False) - cit2 = ti.types.quantized_types.quant.int(14, False) - cft1 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit1, exponent_type=exp, scale=1) - cft2 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit2, exponent_type=exp, scale=1) + exp = ti.types.quant.int(exponent_bits, False) + cit1 = ti.types.quant.int(10, False) + cit2 = ti.types.quant.int(14, False) + cft1 = ti.types.quant._custom_float(significand_type=cit1, + exponent_type=exp, + scale=1) + cft2 = ti.types.quant._custom_float(significand_type=cit2, + exponent_type=exp, + scale=1) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) @@ -114,13 +118,15 @@ def foo(x: ti.f32, y: ti.f32): @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_shared_exponent_borrow(exponent_bits): - exp = ti.types.quantized_types.quant.int(exponent_bits, False) - cit1 = ti.types.quantized_types.quant.int(10, False) - cit2 = ti.types.quantized_types.quant.int(14, False) - cft1 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit1, exponent_type=exp, scale=1) - cft2 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit2, exponent_type=exp, scale=1) + exp = ti.types.quant.int(exponent_bits, False) + cit1 = ti.types.quant.int(10, False) + cit2 = ti.types.quant.int(14, False) + cft1 = 
ti.types.quant._custom_float(significand_type=cit1, + exponent_type=exp, + scale=1) + cft2 = ti.types.quant._custom_float(significand_type=cit2, + exponent_type=exp, + scale=1) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) @@ -145,13 +151,15 @@ def inc(): @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_negative(exponent_bits): - exp = ti.types.quantized_types.quant.int(exponent_bits, False) - cit1 = ti.types.quantized_types.quant.int(10, False) - cit2 = ti.types.quantized_types.quant.int(14, True) - cft1 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit1, exponent_type=exp, scale=1) - cft2 = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit2, exponent_type=exp, scale=1) + exp = ti.types.quant.int(exponent_bits, False) + cit1 = ti.types.quant.int(10, False) + cit2 = ti.types.quant.int(14, True) + cft1 = ti.types.quant._custom_float(significand_type=cit1, + exponent_type=exp, + scale=1) + cft2 = ti.types.quant._custom_float(significand_type=cit2, + exponent_type=exp, + scale=1) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) diff --git a/tests/python/test_custom_float_time_integration.py b/tests/python/test_custom_float_time_integration.py index 00906efec1a8e..2cad8267dcb5b 100644 --- a/tests/python/test_custom_float_time_integration.py +++ b/tests/python/test_custom_float_time_integration.py @@ -14,10 +14,11 @@ def test_custom_float_time_integration(use_cft, use_exponent, use_shared_exp): if use_cft: if use_exponent: - exp = ti.types.quantized_types.quant.int(6, False) - cit = ti.types.quantized_types.quant.int(13, True) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit, exponent_type=exp, scale=1) + exp = ti.types.quant.int(6, False) + cit = ti.types.quant.int(13, True) + cft = 
ti.types.quant._custom_float(significand_type=cit, + exponent_type=exp, + scale=1) x = ti.Vector.field(2, dtype=cft) if use_shared_exp: ti.root.bit_struct(num_bits=32).place(x, shared_exponent=True) @@ -25,9 +26,9 @@ def test_custom_float_time_integration(use_cft, use_exponent, use_shared_exp): ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(0)) ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(1)) else: - cit = ti.types.quantized_types.quant.int(16, True) - cft = ti.types.quantized_types.type_factory.custom_float( - significand_type=cit, scale=1 / 2**14) + cit = ti.types.quant.int(16, True) + cft = ti.types.quant._custom_float(significand_type=cit, + scale=1 / 2**14) x = ti.Vector.field(2, dtype=cft) ti.root.bit_struct(num_bits=32).place(x) else: diff --git a/tests/python/test_custom_int.py b/tests/python/test_custom_int.py index b75d366b8d064..1fd0077e9280f 100644 --- a/tests/python/test_custom_int.py +++ b/tests/python/test_custom_int.py @@ -4,7 +4,7 @@ @test_utils.test(require=ti.extension.quant_basic) def test_custom_int_implicit_cast(): - ci13 = ti.types.quantized_types.quant.int(13, True) + ci13 = ti.types.quant.int(13, True) x = ti.field(dtype=ci13) ti.root.bit_struct(num_bits=32).place(x) diff --git a/tests/python/test_custom_type_atomics.py b/tests/python/test_custom_type_atomics.py index 1e810963b6c2a..d1c030e25de66 100644 --- a/tests/python/test_custom_type_atomics.py +++ b/tests/python/test_custom_type_atomics.py @@ -9,9 +9,9 @@ exclude=[ti.metal], debug=True) def test_custom_int_atomics(): - ci13 = ti.types.quantized_types.quant.int(13, True) - ci5 = ti.types.quantized_types.quant.int(5, True) - cu2 = ti.types.quantized_types.quant.int(2, False) + ci13 = ti.types.quant.int(13, True) + ci5 = ti.types.quant.int(5, True) + cu2 = ti.types.quant.int(2, False) x = ti.field(dtype=ci13) y = ti.field(dtype=ci5) @@ -44,7 +44,7 @@ def foo(): @test_utils.test(require=[ti.extension.quant_basic, ti.extension.data64], debug=True) def 
test_custom_int_atomics_b64(): - ci13 = ti.types.quantized_types.quant.int(13, True) + ci13 = ti.types.quant.int(13, True) x = ti.field(dtype=ci13) @@ -68,12 +68,10 @@ def foo(): @test_utils.test(require=ti.extension.quant_basic, debug=True) def test_custom_float_atomics(): - ci13 = ti.types.quantized_types.quant.int(13, True) - ci19 = ti.types.quantized_types.quant.int(19, False) - cft13 = ti.types.quantized_types.type_factory.custom_float( - significand_type=ci13, scale=0.1) - cft19 = ti.types.quantized_types.type_factory.custom_float( - significand_type=ci19, scale=0.1) + ci13 = ti.types.quant.int(13, True) + ci19 = ti.types.quant.int(19, False) + cft13 = ti.types.quant._custom_float(significand_type=ci13, scale=0.1) + cft19 = ti.types.quant._custom_float(significand_type=ci19, scale=0.1) x = ti.field(dtype=cft13) y = ti.field(dtype=cft19) diff --git a/tests/python/test_matrix_different_type.py b/tests/python/test_matrix_different_type.py index 5fd00d78eeb2f..c1e899b8ea77d 100644 --- a/tests/python/test_matrix_different_type.py +++ b/tests/python/test_matrix_different_type.py @@ -70,10 +70,10 @@ def verify(): @test_utils.test(require=ti.extension.quant_basic) def test_custom_type(): - cit1 = ti.types.quantized_types.quant.int(bits=10, signed=True) - cft1 = ti.types.quantized_types.type_factory.custom_float(cit1, scale=0.1) - cit2 = ti.types.quantized_types.quant.int(bits=22, signed=False) - cft2 = ti.types.quantized_types.type_factory.custom_float(cit2, scale=0.1) + cit1 = ti.types.quant.int(bits=10, signed=True) + cft1 = ti.types.quant._custom_float(cit1, scale=0.1) + cit2 = ti.types.quant.int(bits=22, signed=False) + cft2 = ti.types.quant._custom_float(cit2, scale=0.1) type_list = [[cit1, cft2], [cft1, cit2]] a = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) b = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) diff --git a/tests/python/test_snode_layout_inspection.py b/tests/python/test_snode_layout_inspection.py index 
ced2699a38623..5282665712c2d 100644 --- a/tests/python/test_snode_layout_inspection.py +++ b/tests/python/test_snode_layout_inspection.py @@ -41,10 +41,9 @@ def test_primitives(): @test_utils.test(arch=ti.cpu) def test_bit_struct(): - cit = ti.types.quantized_types.quant.int(16, False) + cit = ti.types.quant.int(16, False) x = ti.field(dtype=cit) - y = ti.field(dtype=ti.types.quantized_types.type_factory.custom_float( - significand_type=cit)) + y = ti.field(dtype=ti.types.quant._custom_float(significand_type=cit)) z = ti.field(dtype=ti.f32) n1 = ti.root.dense(ti.i, 32) diff --git a/tests/python/test_struct_for.py b/tests/python/test_struct_for.py index 95ce85ab2efcd..4e4b557c0711b 100644 --- a/tests/python/test_struct_for.py +++ b/tests/python/test_struct_for.py @@ -267,7 +267,7 @@ def count() -> int: def test_struct_for_quant(): n = 8 - ci13 = ti.types.quantized_types.quant.int(13, True) + ci13 = ti.types.quant.int(13, True) x = ti.field(dtype=ci13) ti.root.dense(ti.i, n).bit_struct(num_bits=32).place(x) From b236c6a82c5e693ff5060d8dd360d96c1db2851c Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Tue, 17 May 2022 22:45:50 +0800 Subject: [PATCH 079/176] Update README.md Tests if netlify is still working --- README.md | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 2102cb3d840dd..271e6df806007 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,9 @@ import taichi as ti ``` -## What is Taichi? +## What is Taichi Lang? -Taichi is an open-source, imperative, parallel programming language for high-performance numerical computation. It is embedded in Python and uses just-in-time (JIT) compiler frameworks, for example LLVM, to offload the compute-intensive Python code to the native GPU or CPU instructions. +Taichi Lang is an open-source, imperative, parallel programming language for high-performance numerical computation. 
It is embedded in Python and uses just-in-time (JIT) compiler frameworks, for example LLVM, to offload the compute-intensive Python code to the native GPU or CPU instructions. @@ -34,10 +34,10 @@ The language has broad applications spanning real-time physical simulation, numb ## Why Taichi? -- Built around Python: Taichi shares almost the same syntax with Python, allowing you to write algorithms with minimal language barrier. It is also well integrated into the Python ecosystem, including NumPy and PyTorch. -- Flexibility: Taichi provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/docs/sparse)). -- Performance: With the `@ti.kernel` decorator, Taichi's JIT compiler automatically compiles your Python functions into efficient GPU or CPU machine code for parallel execution. -- Portability: Write your code once and run it everywhere. Currently, Taichi supports most mainstream GPU APIs, such as CUDA and Vulkan. +- Built around Python: Taichi Lang shares almost the same syntax with Python, allowing you to write algorithms with minimal language barrier. It is also well integrated into the Python ecosystem, including NumPy and PyTorch. +- Flexibility: Taichi Lang provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/docs/sparse)). +- Performance: With the `@ti.kernel` decorator, Taichi Lang's JIT compiler automatically compiles your Python functions into efficient GPU or CPU machine code for parallel execution. +- Portability: Write your code once and run it everywhere. 
Currently, Taichi Lang supports most mainstream GPU APIs, such as CUDA and Vulkan. - ... and many more features! A cross-platform, Vulkan-based 3D visualizer, [differentiable programming](https://docs.taichi-lang.org/docs/differentiable_programming), [quantized computation](https://github.com/taichi-dev/quantaichi) (experimental), etc. ## Getting Started @@ -115,7 +115,7 @@ for i in range(1000000): gui.show() ``` -*If Taichi is properly installed, you should get the animation below 🎉:* +*If Taichi Lang is properly installed, you should get the animation below 🎉:* @@ -123,7 +123,7 @@ See [Get started](https://docs.taichi-lang.org) for more information. ### Build from source -If you wish to try our our experimental features or build Taichi for your own environments, see [Developer installation](https://docs.taichi-lang.org/docs/dev_install). +If you wish to try our our experimental features or build Taichi Lang for your own environments, see [Developer installation](https://docs.taichi-lang.org/docs/dev_install). ## Documentation @@ -133,7 +133,7 @@ If you wish to try our our experimental features or build Taichi for your own en ## Contributing -Kudos to all of our amazing contributors! Taichi thrives through open-source. In that spirit, we welcome all kinds of contributions from the community. If you would like to participate, check out the [Contribution Guidelines](CONTRIBUTING.md) first. +Kudos to all of our amazing contributors! Taichi Lang thrives through open-source. In that spirit, we welcome all kinds of contributions from the community. If you would like to participate, check out the [Contribution Guidelines](CONTRIBUTING.md) first. @@ -141,7 +141,7 @@ Kudos to all of our amazing contributors! Taichi thrives through open-source. In ## License -Taichi is distributed under the terms of Apache License (Version 2.0). +Taichi Lang is distributed under the terms of Apache License (Version 2.0). 
See [Apache License](https://github.com/taichi-dev/taichi/blob/master/LICENSE) for details. @@ -165,8 +165,8 @@ You can also join our community from Slack or WeChat. Drop us a message at Date: Wed, 18 May 2022 10:28:35 +0800 Subject: [PATCH 080/176] [test] Fix a few mis-configured ndarray tests (#5000) --- tests/python/test_ndarray.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index acf000aeb5449..8579c04b4acf0 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -84,8 +84,11 @@ def test_matrix_ndarray(n, m, dtype, shape): @pytest.mark.parametrize('dtype', [ti.f32, ti.f64]) +@test_utils.test(arch=supported_archs_taichi_ndarray) def test_default_fp_ndarray(dtype): - ti.init(arch=supported_archs_taichi_ndarray, default_fp=dtype) + arch = ti.lang.impl.current_cfg().arch + ti.reset() + ti.init(arch=arch, default_fp=dtype) x = ti.Vector.ndarray(2, float, ()) @@ -93,8 +96,11 @@ def test_default_fp_ndarray(dtype): @pytest.mark.parametrize('dtype', [ti.i32, ti.i64]) +@test_utils.test(arch=supported_archs_taichi_ndarray) def test_default_ip_ndarray(dtype): - ti.init(arch=supported_archs_taichi_ndarray, default_ip=dtype) + arch = ti.lang.impl.current_cfg().arch + ti.reset() + ti.init(arch=arch, default_ip=dtype) x = ti.Vector.ndarray(2, int, ()) @@ -254,8 +260,8 @@ def _test_ndarray_deepcopy(): assert y[4][1, 0] == 9 +@test_utils.test(arch=[ti.cuda], ndarray_use_cached_allocator=True) def test_ndarray_cuda_caching_allocator(): - ti.init(arch=ti.cuda, ndarray_use_cached_allocator=True) n = 8 a = ti.ndarray(ti.i32, shape=(n)) a.fill(2) From 19c89b9d06202db2b340997316e6ebfa5adb9c1a Mon Sep 17 00:00:00 2001 From: Justin <62801799+Justinterest@users.noreply.github.com> Date: Wed, 18 May 2022 11:46:44 +0800 Subject: [PATCH 081/176] [Doc] Fix netlify cache & sync doc without pr content (#5003) --- netlify.toml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-) diff --git a/netlify.toml b/netlify.toml index f5cd94992176f..0e411ae8242f3 100644 --- a/netlify.toml +++ b/netlify.toml @@ -1,5 +1,5 @@ [build] - command = "git clone https://github.com/taichi-dev/docs.taichi.graphics.git; git clone https://github.com/taichi-dev/docstring-gen docsgen; export DOCSTRING_GEN_PATH=\"$(pwd)/docsgen\"; export TAICHI_PATH=\"$(pwd)/python/taichi\"; export TAICHI_MAIN=\"$(pwd)\"; export TAICHI_WEBSITE=\"$(pwd)/docs.taichi.graphics\"; pip install sphinx-autoapi==1.8.4 gitpython pydata-sphinx-theme==0.7.2; cd $DOCSTRING_GEN_PATH/experimental; export current_version=master; make clean; make version; make apideploy; cd $TAICHI_WEBSITE/website; npm install --global yarn@1.22; yarn install; ./sync_docs.sh dev; yarn build;" + command = "git clone https://github.com/taichi-dev/docs.taichi.graphics.git; git clone https://github.com/taichi-dev/docstring-gen docsgen; export DOCSTRING_GEN_PATH=\"$(pwd)/docsgen\"; export TAICHI_PATH=\"$(pwd)/python/taichi\"; export TAICHI_MAIN=\"$(pwd)\"; export TAICHI_WEBSITE=\"$(pwd)/docs.taichi.graphics\"; pip install sphinx-autoapi==1.8.4 gitpython pydata-sphinx-theme==0.7.2; cd $DOCSTRING_GEN_PATH/experimental; export current_version=master; make clean; make version; make apideploy; cd $TAICHI_WEBSITE/website; git fetch origin master; git reset --hard origin/master; npm install --global yarn@1.22; yarn install; ./sync_version.sh; yarn build;" publish = "docs.taichi.graphics/website/build" # Cancel the build if there're no changes detected in docs/ folder. From 2c7c1cc63fb639b1738ff4a8295b0e4029131739 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 18 May 2022 10:03:51 +0800 Subject: [PATCH 082/176] [refactor] Program owns allocated ndarrays. The end goal of this refactor is let Ndarray be a simple wrapper around (DeviceAllocation, dtype, shape) without having to worry about memory allocation/deallocation. 
But its current implementation heavily couples with Program*, so an intermediate state would be: - If created from Program, Ndarray handles deviceallocation in ctor/dtor. - We'll add another ctor simply constructing Ndarray from (DeviceAllocation, dtype, shape) and update the codebase to it. ghstack-source-id: bdfd24154428a3fd92ca05688333509d0402e53a Pull Request resolved: https://github.com/taichi-dev/taichi/pull/4996 --- python/taichi/lang/_ndarray.py | 4 ++-- taichi/backends/opengl/opengl_program.cpp | 4 ---- taichi/backends/opengl/opengl_program.h | 2 -- taichi/backends/vulkan/vulkan_program.cpp | 9 +++------ taichi/backends/vulkan/vulkan_program.h | 3 --- taichi/llvm/llvm_program.cpp | 4 ---- taichi/llvm/llvm_program.h | 2 -- taichi/program/ndarray.cpp | 6 +++--- taichi/program/ndarray.h | 13 +++++-------- taichi/program/program.cpp | 6 ++++++ taichi/program/program.h | 6 ++---- taichi/program/program_impl.h | 4 ---- taichi/python/export_lang.cpp | 8 +++++++- taichi/runtime/opengl/opengl_api.cpp | 2 +- taichi/runtime/opengl/opengl_kernel_launcher.h | 2 +- tests/python/test_torch_io.py | 2 +- 16 files changed, 31 insertions(+), 46 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 9c04d03a68e87..6ce862f91edbf 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -16,8 +16,8 @@ class Ndarray: def __init__(self, dtype, arr_shape): self.host_accessor = None self.dtype = cook_dtype(dtype) - self.arr = _ti_core.Ndarray(impl.get_runtime().prog, cook_dtype(dtype), - arr_shape) + self.arr = impl.get_runtime().prog.create_ndarray( + cook_dtype(dtype), arr_shape) @property def element_shape(self): diff --git a/taichi/backends/opengl/opengl_program.cpp b/taichi/backends/opengl/opengl_program.cpp index 8ef7262ae9ee5..0b6defe956507 100644 --- a/taichi/backends/opengl/opengl_program.cpp +++ b/taichi/backends/opengl/opengl_program.cpp @@ -40,10 +40,6 @@ DeviceAllocation 
OpenglProgramImpl::allocate_memory_ndarray( /*export_sharing=*/false}); } -std::shared_ptr OpenglProgramImpl::get_device_shared() { - return opengl_runtime_->device; -} - void OpenglProgramImpl::compile_snode_tree_types(SNodeTree *tree) { // TODO: support materializing multiple snode trees opengl::OpenglStructCompiler scomp; diff --git a/taichi/backends/opengl/opengl_program.h b/taichi/backends/opengl/opengl_program.h index 9f55d6678fae6..5e0434823bf22 100644 --- a/taichi/backends/opengl/opengl_program.h +++ b/taichi/backends/opengl/opengl_program.h @@ -43,8 +43,6 @@ class OpenglProgramImpl : public ProgramImpl { DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size, uint64 *result_buffer) override; - std::shared_ptr get_device_shared() override; - std::unique_ptr make_aot_module_builder() override; void destroy_snode_tree(SNodeTree *snode_tree) override { diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp index 6ec32b704056e..b2d0c6a8d80a3 100644 --- a/taichi/backends/vulkan/vulkan_program.cpp +++ b/taichi/backends/vulkan/vulkan_program.cpp @@ -178,15 +178,12 @@ std::unique_ptr VulkanProgramImpl::make_aot_module_builder() { DeviceAllocation VulkanProgramImpl::allocate_memory_ndarray( std::size_t alloc_size, uint64 *result_buffer) { - auto &ndarray = - ref_ndarry_.emplace_back(get_compute_device()->allocate_memory_unique( - {alloc_size, /*host_write=*/false, /*host_read=*/false, - /*export_sharing=*/false})); - return *ndarray; + return get_compute_device()->allocate_memory( + {alloc_size, /*host_write=*/false, /*host_read=*/false, + /*export_sharing=*/false}); } VulkanProgramImpl::~VulkanProgramImpl() { - ref_ndarry_.clear(); vulkan_runtime_.reset(); embedded_device_.reset(); } diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h index f18cd07d23715..8285fdf7e8947 100644 --- a/taichi/backends/vulkan/vulkan_program.h +++ b/taichi/backends/vulkan/vulkan_program.h @@ 
-89,9 +89,6 @@ class VulkanProgramImpl : public ProgramImpl { std::unique_ptr vulkan_runtime_{nullptr}; std::unique_ptr snode_tree_mgr_{nullptr}; std::vector aot_compiled_snode_structs_; - - // This is a hack until NDArray is properlly owned by programs - std::vector> ref_ndarry_; }; } // namespace lang } // namespace taichi diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 9f6b5359ac300..721074ecb9050 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -615,10 +615,6 @@ DeviceAllocation LlvmProgramImpl::allocate_memory_ndarray( result_buffer}); } -std::shared_ptr LlvmProgramImpl::get_device_shared() { - return device_; -} - uint64_t *LlvmProgramImpl::get_ndarray_alloc_info_ptr( const DeviceAllocation &alloc) { if (config->arch == Arch::cuda) { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index 761240f6fa56b..7ceb33e9fe559 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -107,8 +107,6 @@ class LlvmProgramImpl : public ProgramImpl { uint64_t *get_ndarray_alloc_info_ptr(const DeviceAllocation &alloc); - std::shared_ptr get_device_shared() override; - void fill_ndarray(const DeviceAllocation &alloc, std::size_t size, uint32_t data); diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index a54923d353ade..f87c17bd27e23 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -22,7 +22,7 @@ Ndarray::Ndarray(Program *prog, 1, std::multiplies<>())), element_size_(data_type_size(dtype)), - device_(prog->get_device_shared()), + prog_(prog), prog_impl_(prog->get_llvm_program_impl()), rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, @@ -39,8 +39,8 @@ Ndarray::Ndarray(Program *prog, } Ndarray::~Ndarray() { - if (device_) { - device_->dealloc_memory(ndarray_alloc_); + if (prog_) { + ndarray_alloc_.device->dealloc_memory(ndarray_alloc_); } } diff --git 
a/taichi/program/ndarray.h b/taichi/program/ndarray.h index dafec26cbd1b9..ae7b6bee2c358 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -42,20 +42,17 @@ class Ndarray { ~Ndarray(); private: + void buffer_fill(uint32_t val); + DeviceAllocation ndarray_alloc_{kDeviceNullAllocation}; // Invariant: // data_ptr_ is not nullptr iff arch is a llvm backend uint64_t *data_ptr_{nullptr}; std::size_t nelement_{1}; std::size_t element_size_{1}; - // Ndarrays manage their own |DeviceAllocation| so this must be shared with - // |OpenGlRuntime|. Without the ownership, when the program exits |device_| - // might be destructed earlier than Ndarray object, leaving a segfault when - // you try to deallocate in Ndarray destructor. - // Note that we might consider changing this logic later if we implement - // dynamic tensor rematerialization. - std::shared_ptr device_{nullptr}; - void buffer_fill(uint32_t val); + + Program *prog_{nullptr}; + // TODO: maybe remove these? LlvmProgramImpl *prog_impl_{nullptr}; NdarrayRwAccessorsBank *rw_accessors_bank_{nullptr}; }; diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index f873f8e8debf2..04256d09b9f72 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -553,6 +553,12 @@ std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) { result_buffer); } +Ndarray *Program::create_ndarray(const DataType type, + const std::vector &shape) { + ndarrays_.emplace_back(std::make_unique(this, type, shape)); + return ndarrays_.back().get(); +} + Program::~Program() { if (!finalized_) finalize(); diff --git a/taichi/program/program.h b/taichi/program/program.h index 8ceffa659c047..cbc0d9aef7f96 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -311,16 +311,13 @@ class TI_DLL_EXPORT Program { return program_impl_->get_graphics_device(); } - std::shared_ptr get_device_shared() { - return program_impl_->get_device_shared(); - } - // TODO: do we still need 
result_buffer? DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size, uint64 *result_buffer) { return program_impl_->allocate_memory_ndarray(alloc_size, result_buffer); } + Ndarray *create_ndarray(const DataType type, const std::vector &shape); ASTBuilder *current_ast_builder() { return current_callable ? ¤t_callable->context->builder() : nullptr; } @@ -351,6 +348,7 @@ class TI_DLL_EXPORT Program { bool finalized_{false}; std::unique_ptr memory_pool_{nullptr}; + std::vector> ndarrays_; }; } // namespace lang diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h index 86bd361d8157b..c168bf1246d2a 100644 --- a/taichi/program/program_impl.h +++ b/taichi/program/program_impl.h @@ -80,10 +80,6 @@ class ProgramImpl { return nullptr; } - virtual std::shared_ptr get_device_shared() { - return nullptr; - } - virtual DevicePtr get_snode_tree_device_ptr(int tree_id) { return kDeviceNullPtr; } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 6d4095fd48b89..9cb0d59ebea59 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -416,6 +416,13 @@ void export_lang(py::module &m) { [](Program *program, const std::string &name) { return Expr::make(program->get_next_global_id(name)); }) + .def( + "create_ndarray", + [&](Program *program, const DataType &dt, + const std::vector &shape) -> Ndarray * { + return program->create_ndarray(dt, shape); + }, + py::return_value_policy::reference) .def("global_var_expr_from_snode", [](Program *program, SNode *snode) { return Expr::make( snode, program->get_next_global_id()); @@ -495,7 +502,6 @@ void export_lang(py::module &m) { }); py::class_(m, "Ndarray") - .def(py::init &>()) .def("data_ptr", &Ndarray::get_data_ptr_as_int) .def("device_allocation_ptr", &Ndarray::get_device_allocation_ptr_as_int) .def("element_size", &Ndarray::get_element_size) diff --git a/taichi/runtime/opengl/opengl_api.cpp b/taichi/runtime/opengl/opengl_api.cpp index 
a7f8985f0390b..d4c0c93667592 100644 --- a/taichi/runtime/opengl/opengl_api.cpp +++ b/taichi/runtime/opengl/opengl_api.cpp @@ -543,7 +543,7 @@ DeviceCompiledTaichiKernel::DeviceCompiledTaichiKernel( OpenGlRuntime::OpenGlRuntime() { initialize_opengl(); - device = std::make_shared(); + device = std::make_unique(); impl = std::make_unique(); diff --git a/taichi/runtime/opengl/opengl_kernel_launcher.h b/taichi/runtime/opengl/opengl_kernel_launcher.h index 183a9f63ce073..a9b88e8a0d642 100644 --- a/taichi/runtime/opengl/opengl_kernel_launcher.h +++ b/taichi/runtime/opengl/opengl_kernel_launcher.h @@ -17,7 +17,7 @@ class GLBuffer; class DeviceCompiledTaichiKernel; struct OpenGlRuntime { - std::shared_ptr device{nullptr}; + std::unique_ptr device{nullptr}; std::unique_ptr impl{nullptr}; std::vector> saved_arg_bufs; OpenGlRuntime(); diff --git a/tests/python/test_torch_io.py b/tests/python/test_torch_io.py index ba457c97c25f3..1f05bfdcb1f3e 100644 --- a/tests/python/test_torch_io.py +++ b/tests/python/test_torch_io.py @@ -273,7 +273,7 @@ def test_torch(arr: ti.types.ndarray()): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(arch=[ti.cpu, ti.cuda, ti.opengl, ti.vulkan]) def test_torch_view(): @ti.kernel def copy(x: ti.types.ndarray(), y: ti.types.ndarray()): From 237dcd15fca18501663fa61356f45046b9c0efc6 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 18 May 2022 10:03:52 +0800 Subject: [PATCH 083/176] [test] Add test for Ndarray from DeviceAllocation ghstack-source-id: 7d7c5486bce0a491170b52ba3ae809b4853d5447 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/4997 --- tests/cpp/aot/aot_save_load_test.cpp | 155 +++++++++++++++++++++++++++ 1 file changed, 155 insertions(+) diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index 798af91e54926..0784140ca0a6a 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ 
b/tests/cpp/aot/aot_save_load_test.cpp @@ -3,6 +3,7 @@ #include "taichi/ir/statements.h" #include "taichi/inc/constants.h" #include "taichi/program/program.h" +#include "tests/cpp/program/test_program.h" #ifdef TI_WITH_VULKAN #include "taichi/backends/vulkan/aot_module_loader_impl.h" #include "taichi/backends/device.h" @@ -104,7 +105,74 @@ using namespace lang; aot_builder->dump(".", ""); } +[[maybe_unused]] static void save_ndarray_kernels(Arch arch) { + TestProgram test_prog; + test_prog.setup(arch); + auto aot_builder = test_prog.prog()->make_aot_module_builder(arch); + IRBuilder builder1, builder2; + + { + auto *arg = builder1.create_arg_load(/*arg_id=*/0, get_data_type(), + /*is_ptr=*/true); + auto *zero = builder1.get_int32(0); + auto *one = builder1.get_int32(1); + auto *two = builder1.get_int32(2); + auto *a1ptr = builder1.create_external_ptr(arg, {one}); + builder1.create_global_store(a1ptr, one); // a[1] = 1 + auto *a0 = + builder1.create_global_load(builder1.create_external_ptr(arg, {zero})); + auto *a2ptr = builder1.create_external_ptr(arg, {two}); + auto *a2 = builder1.create_global_load(a2ptr); + auto *a0plusa2 = builder1.create_add(a0, a2); + builder1.create_global_store(a2ptr, a0plusa2); // a[2] = a[0] + a[2] + } + auto block = builder1.extract_ir(); + auto ker1 = + std::make_unique(*test_prog.prog(), std::move(block), "ker1"); + ker1->insert_arg(get_data_type(), /*is_array=*/true); + { + auto *arg0 = builder2.create_arg_load(/*arg_id=*/0, get_data_type(), + /*is_ptr=*/true); + auto *arg1 = builder2.create_arg_load(/*arg_id=*/1, get_data_type(), + /*is_ptr=*/false); + auto *one = builder2.get_int32(1); + auto *a1ptr = builder2.create_external_ptr(arg0, {one}); + builder2.create_global_store(a1ptr, arg1); // a[1] = arg1 + } + auto block2 = builder2.extract_ir(); + auto ker2 = + std::make_unique(*test_prog.prog(), std::move(block2), "ker2"); + ker2->insert_arg(get_data_type(), /*is_array=*/true); + ker2->insert_arg(get_data_type(), 
/*is_array=*/false); + + aot_builder->add("ker1", ker1.get()); + aot_builder->add("ker2", ker2.get()); + aot_builder->dump(".", ""); +} + #ifdef TI_WITH_VULKAN +[[maybe_unused]] static void write_devalloc( + taichi::lang::vulkan::VkRuntime *vulkan_runtime, + taichi::lang::DeviceAllocation &alloc, + const void *data, + size_t size) { + char *const device_arr_ptr = + reinterpret_cast(vulkan_runtime->get_ti_device()->map(alloc)); + std::memcpy(device_arr_ptr, data, size); + vulkan_runtime->get_ti_device()->unmap(alloc); +} + +[[maybe_unused]] static void load_devalloc( + taichi::lang::vulkan::VkRuntime *vulkan_runtime, + taichi::lang::DeviceAllocation &alloc, + void *data, + size_t size) { + char *const device_arr_ptr = + reinterpret_cast(vulkan_runtime->get_ti_device()->map(alloc)); + std::memcpy(data, device_arr_ptr, size); + vulkan_runtime->get_ti_device()->unmap(alloc); +} + TEST(AotSaveLoad, Vulkan) { // Otherwise will segfault on macOS VM, // where Vulkan is installed but no devices are present @@ -177,4 +245,91 @@ TEST(AotSaveLoad, Vulkan) { auto x_field = vk_module->get_field("place"); EXPECT_NE(x_field, nullptr); } + +TEST(AotSaveLoad, VulkanNdarray) { + // Otherwise will segfault on macOS VM, + // where Vulkan is installed but no devices are present + if (!vulkan::is_vulkan_api_available()) { + return; + } + + save_ndarray_kernels(Arch::vulkan); + + // API based on proposal https://github.com/taichi-dev/taichi/issues/3642 + // Initialize Vulkan program + taichi::uint64 *result_buffer{nullptr}; + taichi::lang::RuntimeContext host_ctx; + auto memory_pool = + std::make_unique(Arch::vulkan, nullptr); + result_buffer = (taichi::uint64 *)memory_pool->allocate( + sizeof(taichi::uint64) * taichi_result_buffer_entries, 8); + host_ctx.result_buffer = result_buffer; + + // Create Taichi Device for computation + lang::vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = + taichi::lang::vulkan::VulkanEnvSettings::kApiVersion(); + auto 
embedded_device = + std::make_unique(evd_params); + + // Create Vulkan runtime + vulkan::VkRuntime::Params params; + params.host_result_buffer = result_buffer; + params.device = embedded_device->device(); + auto vulkan_runtime = + std::make_unique(std::move(params)); + + // Run AOT module loader + vulkan::AotModuleParams mod_params; + mod_params.module_path = "."; + mod_params.runtime = vulkan_runtime.get(); + + std::unique_ptr vk_module = + aot::Module::load(Arch::vulkan, mod_params); + EXPECT_TRUE(vk_module); + + // Retrieve kernels/fields/etc from AOT module + auto root_size = vk_module->get_root_size(); + EXPECT_EQ(root_size, 0); + vulkan_runtime->add_root_buffer(root_size); + + auto ker1 = vk_module->get_kernel("ker1"); + EXPECT_TRUE(ker1); + + const int size = 10; + taichi::lang::Device::AllocParams alloc_params; + alloc_params.host_write = true; + alloc_params.size = size * sizeof(int); + alloc_params.usage = taichi::lang::AllocUsage::Storage; + DeviceAllocation devalloc_arr_ = + embedded_device->device()->allocate_memory(alloc_params); + host_ctx.set_arg_devalloc(0, devalloc_arr_, {10}); + + int src[size] = {0}; + src[0] = 2; + src[2] = 40; + write_devalloc(vulkan_runtime.get(), devalloc_arr_, src, sizeof(src)); + ker1->launch(&host_ctx); + vulkan_runtime->synchronize(); + + int dst[size] = {33}; + load_devalloc(vulkan_runtime.get(), devalloc_arr_, dst, sizeof(dst)); + EXPECT_EQ(dst[0], 2); + EXPECT_EQ(dst[1], 1); + EXPECT_EQ(dst[2], 42); + + auto ker2 = vk_module->get_kernel("ker2"); + EXPECT_TRUE(ker2); + + host_ctx.set_arg(1, 3); + ker2->launch(&host_ctx); + vulkan_runtime->synchronize(); + load_devalloc(vulkan_runtime.get(), devalloc_arr_, dst, sizeof(dst)); + EXPECT_EQ(dst[0], 2); + EXPECT_EQ(dst[1], 3); + EXPECT_EQ(dst[2], 42); + + // Deallocate + embedded_device->device()->dealloc_memory(devalloc_arr_); +} #endif From 62332fceefe1c8f0a4b8e4b2fc1c8a4ab51df91d Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 18 May 2022 10:03:52 +0800 Subject: 
[PATCH 084/176] [refactor] Construct ndarray from existing DeviceAllocation. The end goal is make this the only ctor for Ndarray class. ghstack-source-id: a7294096285b3879a84ffbb41196a136b2b605a8 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/4998 --- taichi/program/ndarray.cpp | 21 +++++++++++++++++++++ taichi/program/ndarray.h | 16 +++++++++++++++- tests/cpp/aot/aot_save_load_test.cpp | 3 ++- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index f87c17bd27e23..aff9e412f05b0 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -38,6 +38,20 @@ Ndarray::Ndarray(Program *prog, #endif } +Ndarray::Ndarray(DeviceAllocation &devalloc, + const DataType type, + const std::vector &shape) + : ndarray_alloc_(devalloc), + dtype(type), + shape(shape), + num_active_indices(shape.size()), + nelement_(std::accumulate(std::begin(shape), + std::end(shape), + 1, + std::multiplies<>())), + element_size_(data_type_size(dtype)) { +} + Ndarray::~Ndarray() { if (prog_) { ndarray_alloc_.device->dealloc_memory(ndarray_alloc_); @@ -104,5 +118,12 @@ void Ndarray::buffer_fill(uint32_t val) { TI_ERROR("Llvm disabled"); #endif } + +void set_runtime_ctx_ndarray(RuntimeContext &ctx, + int arg_id, + Ndarray &ndarray) { + ctx.set_arg_devalloc(arg_id, ndarray.ndarray_alloc_, ndarray.shape); +} + } // namespace lang } // namespace taichi diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index ae7b6bee2c358..863f33511898a 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -16,10 +16,21 @@ class NdarrayRwAccessorsBank; class Ndarray { public: + /* Constructs a Ndarray managed by Program. + * Memory allocation and deallocation is handled by Program. + * TODO: Ideally Ndarray shouldn't worry about memory alloc/dealloc at all. 
+ */ explicit Ndarray(Program *prog, const DataType type, const std::vector &shape); + /* Constructs a Ndarray from an existing DeviceAllocation + * It doesn't handle the allocation and deallocation. + */ + explicit Ndarray(DeviceAllocation &devalloc, + const DataType type, + const std::vector &shape); + DeviceAllocation ndarray_alloc_{kDeviceNullAllocation}; DataType dtype; // Invariant: Since ndarray indices are flattened for vector/matrix, this is // always true: @@ -44,7 +55,6 @@ class Ndarray { private: void buffer_fill(uint32_t val); - DeviceAllocation ndarray_alloc_{kDeviceNullAllocation}; // Invariant: // data_ptr_ is not nullptr iff arch is a llvm backend uint64_t *data_ptr_{nullptr}; @@ -57,5 +67,9 @@ class Ndarray { NdarrayRwAccessorsBank *rw_accessors_bank_{nullptr}; }; +// TODO: move this as a method inside RuntimeContext once Ndarray is decoupled +// with Program +void set_runtime_ctx_ndarray(RuntimeContext &ctx, int arg_id, Ndarray &ndarray); + } // namespace lang } // namespace taichi diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index 0784140ca0a6a..2ab9f8a3102d3 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -303,7 +303,8 @@ TEST(AotSaveLoad, VulkanNdarray) { alloc_params.usage = taichi::lang::AllocUsage::Storage; DeviceAllocation devalloc_arr_ = embedded_device->device()->allocate_memory(alloc_params); - host_ctx.set_arg_devalloc(0, devalloc_arr_, {10}); + Ndarray arr = Ndarray(devalloc_arr_, PrimitiveType::i32, {size}); + taichi::lang::set_runtime_ctx_ndarray(host_ctx, 0, arr); int src[size] = {0}; src[0] = 2; From 058f43620b7f0a7f7f279b02f16a25c47e02550d Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 18 May 2022 10:46:30 +0800 Subject: [PATCH 085/176] [refactor] Free ndarray's memory when python GC triggers Previously Program manages lifetime of all allocated ndarrays. So when you call `del ndarray` in python, its memory was not freed. 
This PR changes the behavior that `ndarray` memory gets deallocated when python GC triggers, or its containing `Program` gets destructed, whichever happens first. There're some quirks around how we handle the async python GC and manual `ti.reset()`. Thanks to k-ye, we now added a `generation` number to track the containing program instance of ndarrays so that memory deallocation happens correctly. ghstack-source-id: 4fdef9c8285e2188c4afaffe6febdd45e5164b15 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/4999 --- python/taichi/lang/_ndarray.py | 13 +++++++++++++ python/taichi/lang/impl.py | 4 ++++ taichi/program/program.cpp | 14 ++++++++++++-- taichi/program/program.h | 3 ++- taichi/python/export_lang.cpp | 1 + tests/python/test_ndarray.py | 22 ++++++++++++++++++++++ 6 files changed, 54 insertions(+), 3 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 6ce862f91edbf..8dc2a275ca8b2 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -18,6 +18,19 @@ def __init__(self, dtype, arr_shape): self.dtype = cook_dtype(dtype) self.arr = impl.get_runtime().prog.create_ndarray( cook_dtype(dtype), arr_shape) + self._gen = impl.get_runtime().generation + + def __del__(self): + # - impl.get_runtime().prog == None: + # ti.reset() is called but ti.init() isn't re-initialized yet. + # At this point all ndarrays allocated in the previous program + # are freed along with program destruction. + # - impl.get_generation() != self.gen + # This ndarray was created from previous prog which was destructed. + # So its memory was freed already. 
+ if impl.get_runtime().prog is not None and impl.get_runtime( + ).generation == self._gen: + impl.get_runtime().prog.delete_ndarray(self.arr) @property def element_shape(self): diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 90cfc7446d3af..4e768d4457389 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -1,4 +1,5 @@ import numbers +from itertools import count from types import FunctionType, MethodType from typing import Iterable @@ -223,6 +224,8 @@ def __exit__(self, exc_type, exc_val, exc_tb): class PyTaichi: + _gen = count(0) + def __init__(self, kernels=None): self.materialized = False self.prog = None @@ -239,6 +242,7 @@ def __init__(self, kernels=None): self.grad_replaced = False self.kernels = kernels or [] self._signal_handler_registry = None + self.generation = next(self._gen) def get_num_compiled_functions(self): return len(self.compiled_functions) + len(self.compiled_grad_functions) diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 04256d09b9f72..a72a71719359d 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -508,6 +508,8 @@ void Program::finalize() { } } + ndarrays_.clear(); + synchronize(); memory_pool_->terminate(); @@ -555,8 +557,16 @@ std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) { Ndarray *Program::create_ndarray(const DataType type, const std::vector &shape) { - ndarrays_.emplace_back(std::make_unique(this, type, shape)); - return ndarrays_.back().get(); + // TODO: allocate DeviceAllocation first and then create Ndarray + auto arr = std::make_unique(this, type, shape); + auto arr_ptr = arr.get(); + ndarrays_.insert({arr_ptr, std::move(arr)}); + return arr_ptr; +} + +void Program::delete_ndarray(Ndarray *ndarray) { + TI_ASSERT(ndarrays_.count(ndarray)); + ndarrays_.erase(ndarray); } Program::~Program() { diff --git a/taichi/program/program.h b/taichi/program/program.h index cbc0d9aef7f96..2d8ee0bcdba52 100644 --- 
a/taichi/program/program.h +++ b/taichi/program/program.h @@ -318,6 +318,7 @@ class TI_DLL_EXPORT Program { } Ndarray *create_ndarray(const DataType type, const std::vector &shape); + void delete_ndarray(Ndarray *ndarray); ASTBuilder *current_ast_builder() { return current_callable ? ¤t_callable->context->builder() : nullptr; } @@ -348,7 +349,7 @@ class TI_DLL_EXPORT Program { bool finalized_{false}; std::unique_ptr memory_pool_{nullptr}; - std::vector> ndarrays_; + std::unordered_map> ndarrays_; }; } // namespace lang diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 9cb0d59ebea59..1d3ad87c8b27c 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -423,6 +423,7 @@ void export_lang(py::module &m) { return program->create_ndarray(dt, shape); }, py::return_value_policy::reference) + .def("delete_ndarray", &Program::delete_ndarray) .def("global_var_expr_from_snode", [](Program *program, SNode *snode) { return Expr::make( snode, program->get_next_global_id()); diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index 8579c04b4acf0..5034320a15935 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -341,6 +341,15 @@ def test_ndarray_numpy_io(): _test_ndarray_numpy_io() +@test_utils.test(arch=supported_archs_taichi_ndarray) +def test_ndarray_reset(): + n = 8 + c = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n)) + del c + d = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n)) + ti.reset() + + def _test_ndarray_matrix_numpy_io(layout): n = 5 m = 2 @@ -613,3 +622,16 @@ def init(d: ti.i32, arr: ti.types.ndarray()): y = ti.ndarray(dtype=ti.f32, shape=(n2, n2)) init(3, y) assert (y.to_numpy() == (np.ones(shape=(n2, n2)) * 3)).all() + + +@test_utils.test(arch=supported_archs_taichi_ndarray) +def test_generation(): + curr_arch = ti.lang.impl.current_cfg().arch + n1 = 4 + x = ti.ndarray(dtype=ti.f32, shape=(n1, n1)) + prev_gen = x._gen + ti.reset() # gen++ + ti.init(curr_arch) # calls 
ti.reset(), gen++ + y = ti.ndarray(dtype=ti.f32, shape=(n1, )) + assert y._gen > prev_gen + del x From 9606e037afd1a883449e9f068d7f1c70badab58a Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 18 May 2022 13:59:45 +0800 Subject: [PATCH 086/176] [refactor] Move ndarray fast fill methods to Program This PR gets rid of `LlvmProgramImpl*` member inside `Ndarray` class, which is a step closer towards decoupling `Ndarray` and memory management. ghstack-source-id: 181d28eba3ded5c95d8c70c95a293505bbfebf01 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5002 --- python/taichi/lang/_ndarray.py | 6 +++--- taichi/program/ndarray.cpp | 23 ----------------------- taichi/program/ndarray.h | 6 ------ taichi/program/program.cpp | 11 +++++++++++ taichi/program/program.h | 4 ++++ taichi/python/export_lang.cpp | 17 ++++++++++++++--- 6 files changed, 32 insertions(+), 35 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 8dc2a275ca8b2..3b3d96a0eef16 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -83,11 +83,11 @@ def fill(self, val): ).arch != _ti_core.Arch.x64: self._fill_by_kernel(val) elif self.dtype == primitive_types.f32: - self.arr.fill_float(val) + impl.get_runtime().prog.fill_float(self.arr, val) elif self.dtype == primitive_types.i32: - self.arr.fill_int(val) + impl.get_runtime().prog.fill_int(self.arr, val) elif self.dtype == primitive_types.u32: - self.arr.fill_uint(val) + impl.get_runtime().prog.fill_uint(self.arr, val) else: self._fill_by_kernel(val) diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index aff9e412f05b0..e6424d2fcc076 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -23,7 +23,6 @@ Ndarray::Ndarray(Program *prog, std::multiplies<>())), element_size_(data_type_size(dtype)), prog_(prog), - prog_impl_(prog->get_llvm_program_impl()), rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { ndarray_alloc_ = 
prog->allocate_memory_ndarray(nelement_ * element_size_, prog->result_buffer); @@ -77,18 +76,6 @@ std::size_t Ndarray::get_nelement() const { return nelement_; } -void Ndarray::fill_float(float val) { - buffer_fill(reinterpret_cast(val)); -} - -void Ndarray::fill_int(int32_t val) { - buffer_fill(reinterpret_cast(val)); -} - -void Ndarray::fill_uint(uint32_t val) { - buffer_fill(reinterpret_cast(val)); -} - int64 Ndarray::read_int(const std::vector &i) { return rw_accessors_bank_->get(this).read_int(i); } @@ -109,16 +96,6 @@ void Ndarray::write_float(const std::vector &i, float64 val) { rw_accessors_bank_->get(this).write_float(i, val); } -void Ndarray::buffer_fill(uint32_t val) { - // This is a temporary solution to bypass device api - // should be moved to commandList when available in CUDA -#ifdef TI_WITH_LLVM - prog_impl_->fill_ndarray(ndarray_alloc_, nelement_, val); -#else - TI_ERROR("Llvm disabled"); -#endif -} - void set_runtime_ctx_ndarray(RuntimeContext &ctx, int arg_id, Ndarray &ndarray) { diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index 863f33511898a..50381ca0f21cc 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -42,9 +42,6 @@ class Ndarray { intptr_t get_device_allocation_ptr_as_int() const; std::size_t get_element_size() const; std::size_t get_nelement() const; - void fill_float(float val); - void fill_int(int32_t val); - void fill_uint(uint32_t val); int64 read_int(const std::vector &i); uint64 read_uint(const std::vector &i); float64 read_float(const std::vector &i); @@ -53,8 +50,6 @@ class Ndarray { ~Ndarray(); private: - void buffer_fill(uint32_t val); - // Invariant: // data_ptr_ is not nullptr iff arch is a llvm backend uint64_t *data_ptr_{nullptr}; @@ -63,7 +58,6 @@ class Ndarray { Program *prog_{nullptr}; // TODO: maybe remove these? 
- LlvmProgramImpl *prog_impl_{nullptr}; NdarrayRwAccessorsBank *rw_accessors_bank_{nullptr}; }; diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index a72a71719359d..e2a9515111a6a 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -569,6 +569,17 @@ void Program::delete_ndarray(Ndarray *ndarray) { ndarrays_.erase(ndarray); } +void Program::fill_ndarray_fast(Ndarray *ndarray, uint32_t val) { +// This is a temporary solution to bypass device api. +// Should be moved to CommandList once available in CUDA. +#ifdef TI_WITH_LLVM + get_llvm_program_impl()->fill_ndarray(ndarray->ndarray_alloc_, + ndarray->get_nelement(), val); +#else + TI_ERROR("Not supported"); +#endif +} + Program::~Program() { if (!finalized_) finalize(); diff --git a/taichi/program/program.h b/taichi/program/program.h index 2d8ee0bcdba52..a85fd41484d49 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -318,7 +318,11 @@ class TI_DLL_EXPORT Program { } Ndarray *create_ndarray(const DataType type, const std::vector &shape); + void delete_ndarray(Ndarray *ndarray); + + void fill_ndarray_fast(Ndarray *ndarray, uint32_t val); + ASTBuilder *current_ast_builder() { return current_callable ? 
¤t_callable->context->builder() : nullptr; } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 1d3ad87c8b27c..a7c9796e74edf 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -424,6 +424,20 @@ void export_lang(py::module &m) { }, py::return_value_policy::reference) .def("delete_ndarray", &Program::delete_ndarray) + .def("fill_float", + [](Program *program, Ndarray *ndarray, float val) { + program->fill_ndarray_fast(ndarray, + reinterpret_cast(val)); + }) + .def("fill_int", + [](Program *program, Ndarray *ndarray, int32_t val) { + program->fill_ndarray_fast(ndarray, + reinterpret_cast(val)); + }) + .def("fill_uint", + [](Program *program, Ndarray *ndarray, uint32_t val) { + program->fill_ndarray_fast(ndarray, val); + }) .def("global_var_expr_from_snode", [](Program *program, SNode *snode) { return Expr::make( snode, program->get_next_global_id()); @@ -507,9 +521,6 @@ void export_lang(py::module &m) { .def("device_allocation_ptr", &Ndarray::get_device_allocation_ptr_as_int) .def("element_size", &Ndarray::get_element_size) .def("nelement", &Ndarray::get_nelement) - .def("fill_float", &Ndarray::fill_float) - .def("fill_int", &Ndarray::fill_int) - .def("fill_uint", &Ndarray::fill_uint) .def("read_int", &Ndarray::read_int) .def("read_uint", &Ndarray::read_uint) .def("read_float", &Ndarray::read_float) From b42a09fc3375ee07f2dabf73f3fe8d8fcd1ad05f Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Wed, 18 May 2022 13:59:46 +0800 Subject: [PATCH 087/176] [refactor] Get rid of data_ptr_ in Ndarray ghstack-source-id: d795592d21f4a3da4a6c8ccffdce1dbc40ad99aa Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5004 --- python/taichi/lang/_ndarray.py | 9 --------- taichi/program/ndarray.cpp | 13 ------------- taichi/program/ndarray.h | 3 --- taichi/python/export_lang.cpp | 1 - 4 files changed, 26 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 
3b3d96a0eef16..f9546df9c242f 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -41,15 +41,6 @@ def element_shape(self): """ raise NotImplementedError() - @property - def _data_handle(self): - """Gets the pointer to underlying data. - - Returns: - int: The pointer to underlying data. - """ - return self.arr.data_ptr() - @python_scope def __setitem__(self, key, value): """Sets ndarray element in Python scope. diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index e6424d2fcc076..46209a56abfc8 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -26,15 +26,6 @@ Ndarray::Ndarray(Program *prog, rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, prog->result_buffer); -#ifdef TI_WITH_LLVM - if (arch_is_cpu(prog->config.arch) || prog->config.arch == Arch::cuda) { - // For the LLVM backends, device allocation is a physical pointer. - data_ptr_ = prog->get_llvm_program_impl()->get_ndarray_alloc_info_ptr( - ndarray_alloc_); - } -#else - TI_ERROR("Llvm disabled"); -#endif } Ndarray::Ndarray(DeviceAllocation &devalloc, @@ -57,10 +48,6 @@ Ndarray::~Ndarray() { } } -intptr_t Ndarray::get_data_ptr_as_int() const { - return reinterpret_cast(data_ptr_); -} - intptr_t Ndarray::get_device_allocation_ptr_as_int() const { // taichi's own ndarray's ptr points to its |DeviceAllocation| on the // specified device. 
Note that torch-based ndarray's ptr is a raw ptr but diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index 50381ca0f21cc..66ff63f3001c9 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -50,9 +50,6 @@ class Ndarray { ~Ndarray(); private: - // Invariant: - // data_ptr_ is not nullptr iff arch is a llvm backend - uint64_t *data_ptr_{nullptr}; std::size_t nelement_{1}; std::size_t element_size_{1}; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index a7c9796e74edf..abaac031b12fa 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -517,7 +517,6 @@ void export_lang(py::module &m) { }); py::class_(m, "Ndarray") - .def("data_ptr", &Ndarray::get_data_ptr_as_int) .def("device_allocation_ptr", &Ndarray::get_device_allocation_ptr_as_int) .def("element_size", &Ndarray::get_element_size) .def("nelement", &Ndarray::get_nelement) From 9cb36ac4d118fde5ef89323e0cbfd201944d2a8b Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Wed, 18 May 2022 18:14:09 +0800 Subject: [PATCH 088/176] [Doc] Branding updates. Also tests netlify. (#4994) * Branding updates. Also tests netlify. * Minor editorial updates to trigger netlify preview. * Minor updates to re-trigger CI/CD --- README.md | 6 +++--- docs/lang/articles/contribution/contributor_guide.md | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 271e6df806007..f5dd0cad86015 100644 --- a/README.md +++ b/README.md @@ -32,7 +32,7 @@ The language has broad applications spanning real-time physical simulation, numb [...More](#demos) -## Why Taichi? +## Why Taichi Lang? - Built around Python: Taichi Lang shares almost the same syntax with Python, allowing you to write algorithms with minimal language barrier. It is also well integrated into the Python ecosystem, including NumPy and PyTorch. 
- Flexibility: Taichi Lang provides a set of generic data containers known as *SNode* (/ˈsnoʊd/), an effective mechanism for composing hierarchical, multi-dimensional fields. This can cover many use patterns in numerical simulation (e.g. [spatially sparse computing](https://docs.taichi-lang.org/docs/sparse)). @@ -63,7 +63,7 @@ The language has broad applications spanning real-time physical simulation, numb - WebAssembly (experiemental) -Use Python's package installer **pip** to install Taichi: +Use Python's package installer **pip** to install Taichi Lang: ```bash pip install --upgrade taichi @@ -150,7 +150,7 @@ See [Apache License](https://github.com/taichi-dev/taichi/blob/master/LICENSE) f ### Join our discussions - [GitHub Discussions](https://github.com/taichi-dev/taichi/discussions) -- [Taichi 中文论坛](https://forum.taichi.graphics/) +- [太极编程语言中文论坛](https://forum.taichi.graphics/) ### Report an issue diff --git a/docs/lang/articles/contribution/contributor_guide.md b/docs/lang/articles/contribution/contributor_guide.md index d2b47947b354b..2597f5eb673d5 100644 --- a/docs/lang/articles/contribution/contributor_guide.md +++ b/docs/lang/articles/contribution/contributor_guide.md @@ -294,6 +294,7 @@ Here, we do not want to repeat some best practices summarized in the following G - [How to have your PR merged quickly](https://testing.googleblog.com/2017/06/code-health-too-many-comments-on-your.html) ## Compilation Warnings + Taichi enforces warning-free codes by turning on `-Werror` (treat warning as error) by default. It is highly recommended to resolve a warning as soon as it raises. On the other hand, real world issues could be way more complicated than what the compiler expected. So we prepared the following HOWTOs to help resolve some common problems. You are also more than welcome to open up an issue or consult the reviewer inplace for further discussions. 
@@ -311,7 +312,7 @@ target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMe ``` ### How to deal with warnings raised when compiling third-party libraries or targets -Ideally, third-party submodules should be built completely independent of Taichi project except for the topological dependency. Unfortunately, due to the design of CMake system, CMake variables from Taichi and its submodules could get messed up in certain circumstances. Refer to the following two steps to mute warnings from third-party targets. +Ideally, third-party submodules should be built completely independent of Taichi project except for the topological dependency. Unfortunately, due to the design of the CMake system, CMake variables from Taichi and its submodules could be mixed together under certain circumstances. Refer to the following two steps to mute warnings from third-party targets. 1. Separate submodule's `CMAKE_CXX_FLAGS` from that configured in Taichi. 2. Remove "-Wall" option from submodule's `CMAKE_CXX_FLAGS`. From 21ca242bd4e79ed7010a142c8b1c7c089d51c983 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Wed, 18 May 2022 18:17:34 +0800 Subject: [PATCH 089/176] [AOT] Supported inclusion of taichi as subdirectory for AOT modules (#5007) * Support building taichi as CMake subdirectory * Fixes for export-less integration on Android --- CMakeLists.txt | 10 +++++++--- cmake/TaichiCXXFlags.cmake | 6 ++++-- cmake/TaichiCore.cmake | 4 ++-- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 9cee806f27ea3..ceeeda040a349 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -83,8 +83,10 @@ if (WIN32) SET(CMAKE_MSVC_RUNTIME_LIBRARY MultiThreadedDLL) endif() -# No support of Python for Android build -if (NOT ANDROID) +# No support of Python for Android build; or in any case taichi is integrated +# in another project as submodule. 
+option(TI_WITH_PYTHON "Build with Python language binding" ON) +if (TI_WITH_PYTHON AND NOT ANDROID) include(cmake/PythonNumpyPybind11.cmake) endif() include(cmake/TaichiCXXFlags.cmake) @@ -161,7 +163,9 @@ if (${CLANG_VERSION_MAJOR} VERSION_GREATER ${CLANG_HIGHEST_VERSION}) endif() endif() -add_subdirectory(taichi/runtime/llvm) +if (TI_WITH_LLVM) + add_subdirectory(taichi/runtime/llvm) +endif() configure_file(taichi/common/version.h.in ${CMAKE_SOURCE_DIR}/taichi/common/version.h) configure_file(taichi/common/commit_hash.h.in ${CMAKE_SOURCE_DIR}/taichi/common/commit_hash.h) diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index c5b647e597499..a2e555d22b44d 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -52,8 +52,10 @@ else() # Due to limited CI coverage, -Werror is only turned on with Clang-compiler for now. if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") - # [Global] CXX compilation option to treat all warnings as errors. - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") + if (NOT ANDROID) # (penguinliong) Blocking builds on Android. + # [Global] CXX compilation option to treat all warnings as errors. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Werror ") + endif() endif() # [Global] By default, CXX compiler will throw a warning if it decides to ignore an attribute, for example "[[ maybe unused ]]". 
diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 59178429d2e09..1bcafcf015530 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -252,7 +252,7 @@ if (APPLE) endif() # TODO: replace these includes per target basis -include_directories(${CMAKE_SOURCE_DIR}) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) include_directories(external/include) include_directories(external/spdlog/include) include_directories(external/glad/include) @@ -462,7 +462,7 @@ endforeach () message("PYTHON_LIBRARIES: " ${PYTHON_LIBRARIES}) -if(NOT TI_EMSCRIPTENED) +if(TI_WITH_PYTHON AND NOT TI_EMSCRIPTENED) set(CORE_WITH_PYBIND_LIBRARY_NAME taichi_core) # Cannot compile Python source code with Android, but TI_EXPORT_CORE should be set and # Android should only use the isolated library ignoring those source code. From a0a805972d1b73f28cebcf45591e6514c8b5989e Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Wed, 18 May 2022 20:22:21 +0800 Subject: [PATCH 090/176] [misc] Version bump: v1.0.2 -> v1.0.3 (#5008) --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 570c796513fb7..e946d6bb7143f 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -v1.0.2 +v1.0.3 From d27217ac88eb4876ab059aaf472594e783693c4c Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Wed, 18 May 2022 22:57:27 +0800 Subject: [PATCH 091/176] [Lang] [type] Fix parameter name 'range' for ti.types.quant.fixed (#5006) --- python/taichi/types/quantized_types.py | 8 ++++---- tests/python/test_bit_struct.py | 2 +- tests/python/test_custom_float.py | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/python/taichi/types/quantized_types.py b/python/taichi/types/quantized_types.py index 7bb9297b284a0..16732d737751f 100644 --- a/python/taichi/types/quantized_types.py +++ b/python/taichi/types/quantized_types.py @@ -68,13 +68,13 @@ def int(bits, signed=False, compute=None): # pylint: disable=W0622 return _custom_int(bits, signed, 
compute) -def fixed(frac, signed=True, num_range=1.0, compute=None): +def fixed(frac, signed=True, range=1.0, compute=None): # pylint: disable=W0622 """Generates a quantized type for fixed-point real numbers. Args: frac (int): Number of bits. signed (bool): Signed or unsigned. - num_range (float): Range of the number. + range (float): Range of the number. compute (DataType): Type for computation. Returns: @@ -83,9 +83,9 @@ def fixed(frac, signed=True, num_range=1.0, compute=None): # TODO: handle cases with frac > 32 frac_type = int(bits=frac, signed=signed, compute=i32) if signed: - scale = num_range / 2**(frac - 1) + scale = range / 2**(frac - 1) else: - scale = num_range / 2**frac + scale = range / 2**frac if compute is None: compute = impl.get_runtime().default_fp return _custom_float(frac_type, None, compute, scale) diff --git a/tests/python/test_bit_struct.py b/tests/python/test_bit_struct.py index a7333802d86e5..03610eabf6e4b 100644 --- a/tests/python/test_bit_struct.py +++ b/tests/python/test_bit_struct.py @@ -148,7 +148,7 @@ def test_bit_struct_struct_for(): block_size = 16 N = 64 cell = ti.root.pointer(ti.i, N // block_size) - fixed32 = ti.types.quant.fixed(frac=32, num_range=1024) + fixed32 = ti.types.quant.fixed(frac=32, range=1024) x = ti.field(dtype=fixed32) cell.dense(ti.i, block_size).bit_struct(32).place(x) diff --git a/tests/python/test_custom_float.py b/tests/python/test_custom_float.py index 5043e4d552df5..251a045cbcbf1 100644 --- a/tests/python/test_custom_float.py +++ b/tests/python/test_custom_float.py @@ -8,7 +8,7 @@ @test_utils.test(require=ti.extension.quant_basic) def test_custom_float(): - cft = ti.types.quant.fixed(frac=32, num_range=2) + cft = ti.types.quant.fixed(frac=32, range=2) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -29,7 +29,7 @@ def foo(): @test_utils.test(require=ti.extension.quant_basic) def test_custom_matrix_rotation(): - cft = ti.types.quant.fixed(frac=16, num_range=1.2) + cft = 
ti.types.quant.fixed(frac=16, range=1.2) x = ti.Matrix.field(2, 2, dtype=cft) From 66f41a99bcb8596f7307ffee3db3edd1b22acb8f Mon Sep 17 00:00:00 2001 From: Zeyu Li Date: Thu, 19 May 2022 09:13:13 +0800 Subject: [PATCH 092/176] [SIMT] Add match_any warp intrinsics (#4921) * add match_any warp intrinsic del f32 reset * alter predicate to value * update warp.py to sync with PR4957 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/simt/warp.py | 8 +++++--- taichi/runtime/llvm/runtime.cpp | 4 ++-- tests/python/test_simt.py | 21 +++++++++++++++++++-- 3 files changed, 26 insertions(+), 7 deletions(-) diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index ae3dae11fad90..37b3adefaab75 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -97,9 +97,11 @@ def shfl_xor_i32(mask, val, offset): with_runtime_context=False) -def match_any(): - # TODO - pass +def match_any(mask, value): + return impl.call_internal("cuda_match_any_sync_i32", + mask, + value, + with_runtime_context=False) def match_all(): diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index a3ff2bfee596f..004f748ee225a 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -1084,11 +1084,11 @@ int32 cuda_ballot_sync_i32(u32 mask, int32 predicate) { return cuda_ballot_sync(mask, (bool)predicate); } -i32 cuda_match_any_sync_i32(i32 mask, i32 value) { +uint32 cuda_match_any_sync_i32(u32 mask, i32 value) { return 0; } -i32 cuda_match_any_sync_i64(i32 mask, i64 value) { +uint32 cuda_match_any_sync_i64(u32 mask, i64 value) { #if ARCH_cuda u32 ret; asm volatile("match.any.sync.b64 %0, %1, %2;" diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 1b5cc39996a6d..69d804a8deeb3 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -269,8 +269,25 @@ def foo(): @test_utils.test(arch=ti.cuda) def 
test_match_any(): - # TODO - pass + a = ti.field(dtype=ti.i32, shape=32) + b = ti.field(dtype=ti.u32, shape=32) + + @ti.kernel + def foo(): + ti.loop_config(block_dim=32) + for i in range(16): + a[i] = 0 + a[i + 16] = 1 + + for i in range(32): + b[i] = ti.simt.warp.match_any(ti.u32(0xFFFFFFFF), a[i]) + + foo() + + for i in range(16): + assert b[i] == 65535 + for i in range(16): + assert b[i + 16] == (2**32 - 2**16) @test_utils.test(arch=ti.cuda) From 9db035175fc592f538f50c4bd558c2a60a2200e5 Mon Sep 17 00:00:00 2001 From: yanqingzhang Date: Thu, 19 May 2022 09:13:53 +0800 Subject: [PATCH 093/176] [doc] Update community section (#4943) * [doc] Update community section add active events and communication * fix typo * refine docs Co-authored-by: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> * Update README.md Co-authored-by: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Co-authored-by: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- README.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f5dd0cad86015..949fe62c312dd 100644 --- a/README.md +++ b/README.md @@ -147,8 +147,13 @@ See [Apache License](https://github.com/taichi-dev/taichi/blob/master/LICENSE) f ## Community +### Event + +Voxel Challenge 2022 is open for [submissions](https://github.com/taichi-dev/voxel-challenge/issues/11) until 18th May. Find out more [here](https://github.com/taichi-dev/community/tree/main/events/voxel-challenge). 
+ ### Join our discussions +- [Slack Channel](https://join.slack.com/t/taichicommunity/shared_invite/zt-14ic8j6no-Fd~wKNpfskXLfqDr58Tddg) - [GitHub Discussions](https://github.com/taichi-dev/taichi/discussions) - [太极编程语言中文论坛](https://forum.taichi.graphics/) @@ -159,7 +164,8 @@ See [Apache License](https://github.com/taichi-dev/taichi/blob/master/LICENSE) f ### Contact us -You can also join our community from Slack or WeChat. Drop us a message at contact@taichi.graphics first, and we'll follow up. +- [Slack](https://taichicommunity.slack.com/join/shared_invite/zt-14ic8j6no-Fd~wKNpfskXLfqDr58Tddg#/shared-invite/email) +- WeChat: Drop us a message at community@taichi.graphics first, and we'll follow up. ## Reference From c535c53c4b638a2a4070abb21117d3e633205dd3 Mon Sep 17 00:00:00 2001 From: yekuang Date: Thu, 19 May 2022 10:11:53 +0800 Subject: [PATCH 094/176] [llvm] Add serializable LlvmLaunchArgInfo (#4992) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/codegen_llvm.cpp | 20 ++++----------- taichi/codegen/codegen_llvm.h | 7 ++---- taichi/llvm/launch_arg_info.cpp | 22 ++++++++++++++++ taichi/llvm/launch_arg_info.h | 29 ++++++++++++++++++++++ taichi/llvm/llvm_offline_cache.cpp | 7 +++--- taichi/llvm/llvm_offline_cache.h | 9 ++++--- taichi/llvm/llvm_program.cpp | 4 ++- taichi/llvm/llvm_program.h | 3 +++ tests/cpp/llvm/llvm_offline_cache_test.cpp | 7 ++++++ 9 files changed, 81 insertions(+), 27 deletions(-) create mode 100644 taichi/llvm/launch_arg_info.cpp create mode 100644 taichi/llvm/launch_arg_info.h diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index d1b880f8821a0..0444be59f9b4a 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -7,14 +7,13 @@ #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/IR/Module.h" #include "llvm/Linker/Linker.h" - #include "taichi/analysis/offline_cache_util.h" #include "taichi/ir/statements.h" +#include 
"taichi/llvm/launch_arg_info.h" #include "taichi/llvm/llvm_offline_cache.h" #include "taichi/llvm/llvm_program.h" #include "taichi/struct/struct_llvm.h" #include "taichi/util/file_sequence_writer.h" -#include "taichi/llvm/llvm_program.h" TLANG_NAMESPACE_BEGIN @@ -2353,13 +2352,11 @@ CodeGenLLVM::CompiledData CodeGenLLVM::run_compilation() { if (config.offline_cache && !config.async_mode && this->supports_offline_cache() && !kernel->is_evaluator) { kernel_key = get_hashed_offline_cache_key(&kernel->program->config, kernel); - CompiledData res; const bool ok = maybe_read_compilation_from_cache(kernel_key, &res); if (ok) { return res; } - needs_cache = true; } @@ -2494,6 +2491,7 @@ void CodeGenLLVM::cache_module(const std::string &kernel_key) { task_cache.grid_dim = task.grid_dim; } prog->get_llvm_program_impl()->cache_kernel(kernel_key, this->module.get(), + infer_launch_args(kernel), std::move(offloaded_task_list)); } @@ -2504,7 +2502,7 @@ ModuleToFunctionConverter::ModuleToFunctionConverter(TaichiLLVMContext *tlctx, FunctionType ModuleToFunctionConverter::convert( const std::string &kernel_name, - const std::vector &args, + const std::vector &args, std::unique_ptr mod, std::vector &&tasks) const { tlctx_->add_module(std::move(mod)); @@ -2544,16 +2542,8 @@ FunctionType ModuleToFunctionConverter::convert( const Kernel *kernel, std::unique_ptr mod, std::vector &&tasks) const { - const auto &kargs = kernel->args; - std::vector args; - args.resize(kargs.size()); - std::transform(kargs.begin(), kargs.end(), args.begin(), - [](const auto &arg) -> ArgInfo { - ArgInfo res; - res.is_array = arg.is_array; - return res; - }); - return convert(kernel->name, args, std::move(mod), std::move(tasks)); + return convert(kernel->name, infer_launch_args(kernel), std::move(mod), + std::move(tasks)); } TLANG_NAMESPACE_END diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 2613f86d0b803..bb83a3151b636 100644 --- a/taichi/codegen/codegen_llvm.h +++ 
b/taichi/codegen/codegen_llvm.h @@ -7,6 +7,7 @@ #ifdef TI_WITH_LLVM #include "taichi/ir/ir.h" +#include "taichi/llvm/launch_arg_info.h" #include "taichi/llvm/llvm_codegen_utils.h" #include "taichi/program/program.h" @@ -415,17 +416,13 @@ class LlvmProgramImpl; // This is for CPU, we need one for CUDA (AMDGPU) as well. class ModuleToFunctionConverter { public: - struct ArgInfo { - bool is_array{false}; - }; - explicit ModuleToFunctionConverter(TaichiLLVMContext *tlctx, LlvmProgramImpl *program); virtual ~ModuleToFunctionConverter() = default; virtual FunctionType convert(const std::string &kernel_name, - const std::vector &args, + const std::vector &args, std::unique_ptr mod, std::vector &&tasks) const; diff --git a/taichi/llvm/launch_arg_info.cpp b/taichi/llvm/launch_arg_info.cpp new file mode 100644 index 0000000000000..c9006dff04a16 --- /dev/null +++ b/taichi/llvm/launch_arg_info.cpp @@ -0,0 +1,22 @@ +#include "taichi/llvm/launch_arg_info.h" + +#include "taichi/program/kernel.h" + +namespace taichi { +namespace lang { + +bool LlvmLaunchArgInfo::operator==(const LlvmLaunchArgInfo &other) const { + return is_array == other.is_array; +} + +std::vector infer_launch_args(const Kernel *kernel) { + std::vector res; + res.reserve(kernel->args.size()); + for (const auto &a : kernel->args) { + res.push_back(LlvmLaunchArgInfo{a.is_array}); + } + return res; +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/launch_arg_info.h b/taichi/llvm/launch_arg_info.h new file mode 100644 index 0000000000000..1227c8b8f4911 --- /dev/null +++ b/taichi/llvm/launch_arg_info.h @@ -0,0 +1,29 @@ +#pragma once + +#include + +#include "taichi/common/core.h" +#include "taichi/common/serialization.h" + +namespace taichi { +namespace lang { + +// TODO: It would be better if this can be unified with Callable::Arg. However, +// Callable::Arg is not easily serializable. 
+struct LlvmLaunchArgInfo { + bool is_array{false}; + + TI_IO_DEF(is_array); + + bool operator==(const LlvmLaunchArgInfo &other) const; + bool operator!=(const LlvmLaunchArgInfo &other) const { + return !(*this == other); + } +}; + +class Kernel; + +std::vector infer_launch_args(const Kernel *kernel); + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index 8b34198ab2c3b..542f02aebc2c7 100644 --- a/taichi/llvm/llvm_offline_cache.cpp +++ b/taichi/llvm/llvm_offline_cache.cpp @@ -1,15 +1,15 @@ #include "llvm_offline_cache.h" +#include #include #include "llvm/AsmParser/Parser.h" #include "llvm/Bitcode/BitcodeReader.h" #include "llvm/Bitcode/BitcodeWriter.h" +#include "llvm/IR/Module.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/raw_os_ostream.h" -#include "llvm/IR/Module.h" #include "llvm/Transforms/Utils/Cloning.h" - #include "taichi/ir/transforms.h" #include "taichi/llvm/llvm_context.h" @@ -70,9 +70,10 @@ bool LlvmOfflineCacheFileReader::get_kernel_cache( } res.kernel_key = key; + res.args = kernel_data.args; + res.offloaded_task_list = kernel_data.offloaded_task_list; res.owned_module = llvm::CloneModule(*kernel_data.module); res.module = res.owned_module.get(); - res.offloaded_task_list = kernel_data.offloaded_task_list; return true; } diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index fe5666e98a5e7..c82837e42521e 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -1,12 +1,14 @@ #pragma once +#include + +#include "llvm/IR/Module.h" #include "taichi/common/core.h" #include "taichi/common/serialization.h" +#include "taichi/llvm/launch_arg_info.h" #include "taichi/program/kernel.h" #include "taichi/util/io.h" -#include "llvm/IR/Module.h" - namespace taichi { namespace lang { @@ -26,6 +28,7 @@ struct LlvmOfflineCache { struct KernelCacheData { std::string kernel_key; + std::vector args; std::vector 
offloaded_task_list; std::unique_ptr owned_module{nullptr}; @@ -36,7 +39,7 @@ struct LlvmOfflineCache { KernelCacheData &operator=(KernelCacheData &&) = default; ~KernelCacheData() = default; - TI_IO_DEF(kernel_key, offloaded_task_list); + TI_IO_DEF(kernel_key, args, offloaded_task_list); }; std::unordered_map kernels; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 721074ecb9050..0ca7e9977c269 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -646,6 +646,7 @@ void LlvmProgramImpl::fill_ndarray(const DeviceAllocation &alloc, void LlvmProgramImpl::cache_kernel( const std::string &kernel_key, llvm::Module *module, + std::vector &&args, std::vector &&offloaded_task_list) { if (cache_data_.kernels.find(kernel_key) != cache_data_.kernels.end()) { @@ -654,7 +655,8 @@ void LlvmProgramImpl::cache_kernel( auto &kernel_cache = cache_data_.kernels[kernel_key]; kernel_cache.kernel_key = kernel_key; kernel_cache.owned_module = llvm::CloneModule(*module); - kernel_cache.offloaded_task_list = offloaded_task_list; + kernel_cache.args = std::move(args); + kernel_cache.offloaded_task_list = std::move(offloaded_task_list); } void LlvmProgramImpl::dump_cache_data_to_disk() { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index 7ceb33e9fe559..5274c9a1f836c 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -10,6 +10,7 @@ #include "taichi/program/compile_config.h" #include "taichi/common/logging.h" #include "taichi/llvm/llvm_context.h" +#include "taichi/llvm/launch_arg_info.h" #include "taichi/runtime/runtime.h" #include "taichi/system/threading.h" #include "taichi/struct/struct.h" @@ -113,6 +114,7 @@ class LlvmProgramImpl : public ProgramImpl { void cache_kernel(const std::string &kernel_key, llvm::Module *module, + std::vector &&args, std::vector &&offloaded_task_list); @@ -158,6 +160,7 @@ class LlvmProgramImpl : public ProgramImpl { std::unique_ptr make_aot_module_builder() 
override { TI_NOT_IMPLEMENTED; + return nullptr; } DevicePtr get_snode_tree_device_ptr(int tree_id) override; diff --git a/tests/cpp/llvm/llvm_offline_cache_test.cpp b/tests/cpp/llvm/llvm_offline_cache_test.cpp index 38fc95dcab209..ab8282bc71bc7 100644 --- a/tests/cpp/llvm/llvm_offline_cache_test.cpp +++ b/tests/cpp/llvm/llvm_offline_cache_test.cpp @@ -86,6 +86,10 @@ TEST_P(LlvmOfflineCacheTest, ReadWrite) { const auto tmp_dir_str{tmp_dir.u8string()}; const bool dir_ok = fs::create_directories(tmp_dir); ASSERT_TRUE(dir_ok); + const std::vector arg_infos = { + LlvmLaunchArgInfo{/*is_array=*/false}, + LlvmLaunchArgInfo{/*is_array=*/true}, + }; { auto llvm_ctx = std::make_unique(); @@ -97,6 +101,7 @@ TEST_P(LlvmOfflineCacheTest, ReadWrite) { kcache.offloaded_task_list.push_back( LlvmOfflineCache::OffloadedTaskCacheData{kTaskName, kBlockDim, kGridDim}); + kcache.args = arg_infos; writer.add_kernel_cache(kKernelName, std::move(kcache)); writer.set_no_mangle(); writer.dump(tmp_dir_str, llvm_fmt); @@ -126,6 +131,8 @@ TEST_P(LlvmOfflineCacheTest, ReadWrite) { LlvmOfflineCache::KernelCacheData kcache; const bool ok = reader->get_kernel_cache(kcache, kKernelName, *llvm_ctx); ASSERT_TRUE(ok); + const auto &actual_arg_infos = kcache.args; + EXPECT_EQ(actual_arg_infos, arg_infos); }; } From 8c936ef4ad0f8c4c924ef70fc3e59fa43b99f12c Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 19 May 2022 13:35:27 +0800 Subject: [PATCH 095/176] [bug] Fixed numerical error for Atomic-Sub between unsigned values with different number of bits (#5011) --- taichi/ir/frontend_ir.cpp | 5 +++++ tests/python/test_atomic.py | 41 +++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+) diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 95acab8df2f0c..745e82e6dc752 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -481,6 +481,11 @@ void AtomicOpExpression::type_check(CompileConfig *) { void AtomicOpExpression::flatten(FlattenContext *ctx) 
{ // replace atomic sub with negative atomic add if (op_type == AtomicOpType::sub) { + if (val->ret_type != ret_type) { + val.set(Expr::make(UnaryOpType::cast_value, val, + ret_type)); + } + val.set(Expr::make(UnaryOpType::neg, val)); op_type = AtomicOpType::add; } diff --git a/tests/python/test_atomic.py b/tests/python/test_atomic.py index 73896fcfa0dc5..0223e71e6f465 100644 --- a/tests/python/test_atomic.py +++ b/tests/python/test_atomic.py @@ -223,6 +223,47 @@ def test(): assert ret[None] == 1 +@test_utils.test(arch=[ti.cpu, ti.cuda]) +def test_atomic_sub_with_type_promotion(): + # Test Case 1 + @ti.kernel + def test_u16_sub_u8() -> ti.uint16: + x: ti.uint16 = 1000 + y: ti.uint8 = 255 + + ti.atomic_sub(x, y) + return x + + res = test_u16_sub_u8() + assert res == 745 + + # Test Case 2 + @ti.kernel + def test_u8_sub_u16() -> ti.uint8: + x: ti.uint8 = 255 + y: ti.uint16 = 100 + + ti.atomic_sub(x, y) + return x + + res = test_u8_sub_u16() + assert res == 155 + + # Test Case 3 + A = ti.field(ti.uint8, shape=()) + B = ti.field(ti.uint16, shape=()) + + @ti.kernel + def test_with_field(): + v: ti.uint16 = 1000 + v -= A[None] + B[None] = v + + A[None] = 255 + test_with_field() + assert B[None] == 745 + + @test_utils.test() def test_atomic_sub_expr_evaled(): c = ti.field(ti.i32) From 7157b13fa50d8d01b198b70ee1a28a7ae351e784 Mon Sep 17 00:00:00 2001 From: pengyu <6712304+FantasyVR@users.noreply.github.com> Date: Thu, 19 May 2022 14:19:53 +0800 Subject: [PATCH 096/176] [refactor] Move get ndarray data ptr to program (#5012) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/program/program.cpp | 15 +++++++++++++++ taichi/program/program.h | 2 ++ taichi/python/export_lang.cpp | 4 ++++ 3 files changed, 21 insertions(+) diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index e2a9515111a6a..19ce2b8cdf00a 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -569,6 +569,21 @@ void 
Program::delete_ndarray(Ndarray *ndarray) { ndarrays_.erase(ndarray); } +intptr_t Program::get_ndarray_data_ptr_as_int(Ndarray *ndarray) { + uint64_t *data_ptr{nullptr}; +#ifdef TI_WITH_LLVM + if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) { + // For the LLVM backends, device allocation is a physical pointer. + data_ptr = get_llvm_program_impl()->get_ndarray_alloc_info_ptr( + ndarray->ndarray_alloc_); + } +#else + TI_ERROR("Llvm disabled"); +#endif + + return reinterpret_cast(data_ptr); +} + void Program::fill_ndarray_fast(Ndarray *ndarray, uint32_t val) { // This is a temporary solution to bypass device api. // Should be moved to CommandList once available in CUDA. diff --git a/taichi/program/program.h b/taichi/program/program.h index a85fd41484d49..06f00f4cbe761 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -321,6 +321,8 @@ class TI_DLL_EXPORT Program { void delete_ndarray(Ndarray *ndarray); + intptr_t get_ndarray_data_ptr_as_int(Ndarray *ndarray); + void fill_ndarray_fast(Ndarray *ndarray, uint32_t val); ASTBuilder *current_ast_builder() { diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index abaac031b12fa..b237fad6c9d6d 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -424,6 +424,10 @@ void export_lang(py::module &m) { }, py::return_value_policy::reference) .def("delete_ndarray", &Program::delete_ndarray) + .def("get_ndarray_data_ptr_as_int", + [](Program *program, Ndarray *ndarray) { + return program->get_ndarray_data_ptr_as_int(ndarray); + }) .def("fill_float", [](Program *program, Ndarray *ndarray, float val) { program->fill_ndarray_fast(ndarray, From 5dffaa0faeb780c11034d1b2feb3011d5dfa05f9 Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Fri, 20 May 2022 20:46:54 +0800 Subject: [PATCH 097/176] [ci] [build] Enable ccache for windows docker (#5001) * Enable ccache for windows docker * only run windows docker job * copy ccache_folder * trigger CI * Re-enable all 
jobs * remove dumb text --- .github/workflows/scripts/win_build_test_cpu.ps1 | 15 +++++++++++++++ .github/workflows/testing.yml | 15 ++++++++++++--- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 index 07b257c47c66f..a551cadb2b543 100644 --- a/.github/workflows/scripts/win_build_test_cpu.ps1 +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -49,6 +49,20 @@ clang --version WriteInfo("Enter the repository") Set-Location .\taichi +# Get sccache +WriteInfo("Downloading sccache") +$env:CCACHE_DIR="${pwd}/ccache_cache" +$env:CCACHE_MAXSIZE="128M" +$env:CCACHE_LOGFILE="${pwd}/ccache_error.log" +WriteInfo("ccache dir: $Env:CCACHE_DIR") +md "$Env:CCACHE_DIR" -ea 0 +if (-not (Test-Path "ccache-4.5.1-windows-64")) { + curl.exe --retry 10 --retry-delay 5 https://github.com/ccache/ccache/releases/download/v4.5.1/ccache-4.5.1-windows-64.zip -LO + 7z x ccache-4.5.1-windows-64.zip + $env:PATH += ";${pwd}/ccache-4.5.1-windows-64" +} +ccache -v -s + WriteInfo("Setting up Python environment") conda activate py37 python -m pip install -r requirements_dev.txt @@ -65,6 +79,7 @@ WriteInfo("Building Taichi") python setup.py install if (-not $?) 
{ exit 1 } WriteInfo("Build finished") +ccache -s -v $env:TI_ENABLE_PADDLE = "0" WriteInfo("Testing Taichi") diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml index 9a0034b6b49dd..bb9f95a340c4e 100644 --- a/.github/workflows/testing.yml +++ b/.github/workflows/testing.yml @@ -252,12 +252,19 @@ jobs: with: submodules: 'recursive' + - name: Get sccache cache + uses: actions/cache@v2 + if: needs.check_files.outputs.run_job == 'true' + with: + path: ccache_cache + key: ccache-win64-cpu-${{ github.sha }} + restore-keys: | + ccache-win64-cpu- + - name: Get docker images shell: bash + if: needs.check_files.outputs.run_job == 'true' run: | - if [[ ${{needs.check_files.outputs.run_job}} == false ]]; then - exit 0 - fi echo $CR_PAT | docker login ghcr.io -u ${{ github.actor }} --password-stdin docker pull ghcr.io/taichi-dev/taichidev-cpu-windows:v0.0.1 env: @@ -274,6 +281,8 @@ jobs: C:/taichi/.github/workflows/scripts/win_build_test_cpu.ps1 tar -cf - ../${{ github.event.repository.name }} --mode u=+rwx,g=+rwx,o=+rwx | docker cp - taichi_build_test:C:/ docker start -a taichi_build_test + rm -rf ccache_cache + docker cp taichi_build_test:C:/taichi/ccache_cache ccache_cache - name: clean docker container shell: bash From 15b278a2649c6697f4937364e1cfa30e7d37edab Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Fri, 20 May 2022 08:27:28 +0800 Subject: [PATCH 098/176] [test] Unify kernel setup for ndarray related tests We'll reuse these two kernels for cgraph tests as well so let's clean it up first. 
ghstack-source-id: acd772f092ac9044197ac2e1f16100ba4ba9005d Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5014 --- tests/cpp/aot/aot_save_load_test.cpp | 39 ++-------------------- tests/cpp/ir/ir_builder_test.cpp | 37 ++------------------- tests/cpp/ir/ndarray_kernel.cpp | 48 ++++++++++++++++++++++++++++ tests/cpp/ir/ndarray_kernel.h | 14 ++++++++ 4 files changed, 68 insertions(+), 70 deletions(-) create mode 100644 tests/cpp/ir/ndarray_kernel.cpp create mode 100644 tests/cpp/ir/ndarray_kernel.h diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index 2ab9f8a3102d3..3a095988128ec 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -3,6 +3,7 @@ #include "taichi/ir/statements.h" #include "taichi/inc/constants.h" #include "taichi/program/program.h" +#include "tests/cpp/ir/ndarray_kernel.h" #include "tests/cpp/program/test_program.h" #ifdef TI_WITH_VULKAN #include "taichi/backends/vulkan/aot_module_loader_impl.h" @@ -109,42 +110,8 @@ using namespace lang; TestProgram test_prog; test_prog.setup(arch); auto aot_builder = test_prog.prog()->make_aot_module_builder(arch); - IRBuilder builder1, builder2; - - { - auto *arg = builder1.create_arg_load(/*arg_id=*/0, get_data_type(), - /*is_ptr=*/true); - auto *zero = builder1.get_int32(0); - auto *one = builder1.get_int32(1); - auto *two = builder1.get_int32(2); - auto *a1ptr = builder1.create_external_ptr(arg, {one}); - builder1.create_global_store(a1ptr, one); // a[1] = 1 - auto *a0 = - builder1.create_global_load(builder1.create_external_ptr(arg, {zero})); - auto *a2ptr = builder1.create_external_ptr(arg, {two}); - auto *a2 = builder1.create_global_load(a2ptr); - auto *a0plusa2 = builder1.create_add(a0, a2); - builder1.create_global_store(a2ptr, a0plusa2); // a[2] = a[0] + a[2] - } - auto block = builder1.extract_ir(); - auto ker1 = - std::make_unique(*test_prog.prog(), std::move(block), "ker1"); - 
ker1->insert_arg(get_data_type(), /*is_array=*/true); - { - auto *arg0 = builder2.create_arg_load(/*arg_id=*/0, get_data_type(), - /*is_ptr=*/true); - auto *arg1 = builder2.create_arg_load(/*arg_id=*/1, get_data_type(), - /*is_ptr=*/false); - auto *one = builder2.get_int32(1); - auto *a1ptr = builder2.create_external_ptr(arg0, {one}); - builder2.create_global_store(a1ptr, arg1); // a[1] = arg1 - } - auto block2 = builder2.extract_ir(); - auto ker2 = - std::make_unique(*test_prog.prog(), std::move(block2), "ker2"); - ker2->insert_arg(get_data_type(), /*is_array=*/true); - ker2->insert_arg(get_data_type(), /*is_array=*/false); - + auto ker1 = setup_kernel1(test_prog.prog()); + auto ker2 = setup_kernel2(test_prog.prog()); aot_builder->add("ker1", ker1.get()); aot_builder->add("ker2", ker2.get()); aot_builder->dump(".", ""); diff --git a/tests/cpp/ir/ir_builder_test.cpp b/tests/cpp/ir/ir_builder_test.cpp index cbb082c780bf8..2445592436f7b 100644 --- a/tests/cpp/ir/ir_builder_test.cpp +++ b/tests/cpp/ir/ir_builder_test.cpp @@ -3,6 +3,7 @@ #include "taichi/ir/ir_builder.h" #include "taichi/ir/statements.h" #include "tests/cpp/program/test_program.h" +#include "tests/cpp/ir/ndarray_kernel.h" #ifdef TI_WITH_VULKAN #include "taichi/backends/vulkan/vulkan_loader.h" #endif @@ -136,25 +137,7 @@ TEST(IRBuilder, Ndarray) { auto array = Ndarray(test_prog.prog(), PrimitiveType::i32, {size}); array.write_int({0}, 2); array.write_int({2}, 40); - { - auto *arg = builder1.create_arg_load(/*arg_id=*/0, get_data_type(), - /*is_ptr=*/true); - auto *zero = builder1.get_int32(0); - auto *one = builder1.get_int32(1); - auto *two = builder1.get_int32(2); - auto *a1ptr = builder1.create_external_ptr(arg, {one}); - builder1.create_global_store(a1ptr, one); // a[1] = 1 - auto *a0 = - builder1.create_global_load(builder1.create_external_ptr(arg, {zero})); - auto *a2ptr = builder1.create_external_ptr(arg, {two}); - auto *a2 = builder1.create_global_load(a2ptr); - auto *a0plusa2 = 
builder1.create_add(a0, a2); - builder1.create_global_store(a2ptr, a0plusa2); // a[2] = a[0] + a[2] - } - auto block1 = builder1.extract_ir(); - auto ker1 = - std::make_unique(*test_prog.prog(), std::move(block1), "ker1"); - ker1->insert_arg(get_data_type(), /*is_array=*/true); + auto ker1 = setup_kernel1(test_prog.prog()); auto launch_ctx1 = ker1->make_launch_context(); launch_ctx1.set_arg_external_array( /*arg_id=*/0, array.get_device_allocation_ptr_as_int(), size, @@ -164,21 +147,7 @@ TEST(IRBuilder, Ndarray) { EXPECT_EQ(array.read_int({1}), 1); EXPECT_EQ(array.read_int({2}), 42); - IRBuilder builder2; - { - auto *arg0 = builder2.create_arg_load(/*arg_id=*/0, get_data_type(), - /*is_ptr=*/true); - auto *arg1 = builder2.create_arg_load(/*arg_id=*/1, PrimitiveType::i32, - /*is_ptr=*/false); - auto *one = builder2.get_int32(1); - auto *a1ptr = builder2.create_external_ptr(arg0, {one}); - builder2.create_global_store(a1ptr, arg1); // a[1] = arg1 - } - auto block2 = builder2.extract_ir(); - auto ker2 = - std::make_unique(*test_prog.prog(), std::move(block2), "ker2"); - ker2->insert_arg(get_data_type(), /*is_array=*/true); - ker2->insert_arg(get_data_type(), /*is_array=*/false); + auto ker2 = setup_kernel2(test_prog.prog()); auto launch_ctx2 = ker2->make_launch_context(); launch_ctx2.set_arg_external_array( /*arg_id=*/0, array.get_device_allocation_ptr_as_int(), size, diff --git a/tests/cpp/ir/ndarray_kernel.cpp b/tests/cpp/ir/ndarray_kernel.cpp new file mode 100644 index 0000000000000..282fe24acba01 --- /dev/null +++ b/tests/cpp/ir/ndarray_kernel.cpp @@ -0,0 +1,48 @@ +#include "tests/cpp/ir/ndarray_kernel.h" + +namespace taichi { +namespace lang { + +std::unique_ptr setup_kernel1(Program *prog) { + IRBuilder builder1; + { + auto *arg = builder1.create_arg_load(/*arg_id=*/0, get_data_type(), + /*is_ptr=*/true); + auto *zero = builder1.get_int32(0); + auto *one = builder1.get_int32(1); + auto *two = builder1.get_int32(2); + auto *a1ptr = 
builder1.create_external_ptr(arg, {one}); + builder1.create_global_store(a1ptr, one); // a[1] = 1 + auto *a0 = + builder1.create_global_load(builder1.create_external_ptr(arg, {zero})); + auto *a2ptr = builder1.create_external_ptr(arg, {two}); + auto *a2 = builder1.create_global_load(a2ptr); + auto *a0plusa2 = builder1.create_add(a0, a2); + builder1.create_global_store(a2ptr, a0plusa2); // a[2] = a[0] + a[2] + } + auto block = builder1.extract_ir(); + auto ker1 = std::make_unique(*prog, std::move(block), "ker1"); + ker1->insert_arg(get_data_type(), /*is_array=*/true); + return ker1; +} + +std::unique_ptr setup_kernel2(Program *prog) { + IRBuilder builder2; + + { + auto *arg0 = builder2.create_arg_load(/*arg_id=*/0, get_data_type(), + /*is_ptr=*/true); + auto *arg1 = builder2.create_arg_load(/*arg_id=*/1, get_data_type(), + /*is_ptr=*/false); + auto *one = builder2.get_int32(1); + auto *a1ptr = builder2.create_external_ptr(arg0, {one}); + builder2.create_global_store(a1ptr, arg1); // a[1] = arg1 + } + auto block2 = builder2.extract_ir(); + auto ker2 = std::make_unique(*prog, std::move(block2), "ker2"); + ker2->insert_arg(get_data_type(), /*is_array=*/true); + ker2->insert_arg(get_data_type(), /*is_array=*/false); + return ker2; +} +} // namespace lang +} // namespace taichi diff --git a/tests/cpp/ir/ndarray_kernel.h b/tests/cpp/ir/ndarray_kernel.h new file mode 100644 index 0000000000000..9ee8d32159615 --- /dev/null +++ b/tests/cpp/ir/ndarray_kernel.h @@ -0,0 +1,14 @@ +#pragma once +#include "taichi/ir/ir_builder.h" +#include "taichi/ir/statements.h" +#include "taichi/inc/constants.h" +#include "taichi/program/program.h" + +namespace taichi { +namespace lang { + +std::unique_ptr setup_kernel1(Program *prog); + +std::unique_ptr setup_kernel2(Program *prog); +} // namespace lang +} // namespace taichi From 963b1c3b001514cf66acefd66d9355be23605e38 Mon Sep 17 00:00:00 2001 From: Ailing Zhang Date: Fri, 20 May 2022 19:19:07 +0800 Subject: [PATCH 099/176] [aot] Build and 
run graph without serialization This PR servces as the base PR with a minimal example of building and running a Graph. Runtime values for graph arguments can be either scalars or ndarrays. For detailed proposal please see #4786. Things handled in this PR: - Maximize common code/runtime shared by the two workflows below: 1. build -> compile -> run 2. build -> compile -> serialize -> deserilize -> run - Graph arguments are annotated with dtype and element shape for ndarray (temporary until we have vec3 types in C++) Things that we've discussed but not included in this PR: - C API: I'll leave that for a unified C API PR in the future. - bind IValues to graph: easy, will add later. ghstack-source-id: f459afccdde56b59ab0ecc860ed11d761a20fe0a Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5015 --- taichi/aot/graph_data.h | 92 ++++++++++++++ taichi/aot/module_loader.h | 22 +--- .../vulkan/aot_module_loader_impl.cpp | 21 +--- .../backends/vulkan/aot_module_loader_impl.h | 16 +++ taichi/backends/vulkan/vulkan_program.cpp | 11 ++ taichi/backends/vulkan/vulkan_program.h | 2 + taichi/program/graph.cpp | 101 ++++++++++++++++ taichi/program/graph.h | 113 ++++++++++++++++++ taichi/program/kernel.cpp | 4 + taichi/program/kernel.h | 2 + taichi/program/ndarray.cpp | 6 +- taichi/program/ndarray.h | 3 +- taichi/program/program.h | 4 + taichi/program/program_impl.h | 8 ++ tests/cpp/aot/aot_save_load_test.cpp | 3 +- tests/cpp/program/graph_test.cpp | 52 ++++++++ 16 files changed, 412 insertions(+), 48 deletions(-) create mode 100644 taichi/aot/graph_data.h create mode 100644 taichi/program/graph.cpp create mode 100644 taichi/program/graph.h create mode 100644 tests/cpp/program/graph_test.cpp diff --git a/taichi/aot/graph_data.h b/taichi/aot/graph_data.h new file mode 100644 index 0000000000000..5da03b4d89d41 --- /dev/null +++ b/taichi/aot/graph_data.h @@ -0,0 +1,92 @@ +#pragma once +#include +#include +#include +#include "taichi/aot/module_data.h" + +namespace taichi { 
+namespace lang { +class AotModuleBuilder; +class Ndarray; +namespace aot { +// Currently only scalar and ndarray are supported. +enum class ArgKind { SCALAR, NDARRAY, UNKNOWN }; + +/** + * Symbolic argument used in building `Dispatch` nodes in the `Graph`. + */ +struct Arg { + std::string name; + // TODO: real element dtype = dtype + element_shape + std::string dtype_name; + ArgKind tag; + std::vector element_shape; + + TI_IO_DEF(name, dtype_name, tag, element_shape); +}; + +/** + * Runtime value used in graph execution. + */ +struct IValue { + public: + uint64 val; + ArgKind tag; + + static IValue create(const Ndarray &ndarray) { + return IValue(reinterpret_cast(&ndarray), ArgKind::NDARRAY); + } + + template ::value, void>> + static IValue create(T v) { + return IValue(taichi_union_cast_with_different_sizes(v), + ArgKind::SCALAR); + } + + private: + IValue(uint64 val, ArgKind tag) : val(val), tag(tag) { + } +}; + +class TI_DLL_EXPORT Kernel { + public: + // Rule of 5 to make MSVC happy + Kernel() = default; + virtual ~Kernel() = default; + Kernel(const Kernel &) = delete; + Kernel &operator=(const Kernel &) = delete; + Kernel(Kernel &&) = default; + Kernel &operator=(Kernel &&) = default; + + /** + * @brief Launches the kernel to the device + * + * This does not manage the device to host synchronization. 
+ * + * @param ctx Host context + */ + virtual void launch(RuntimeContext *ctx) = 0; + + virtual void save_to_module(AotModuleBuilder *builder) { + TI_NOT_IMPLEMENTED; + } +}; + +struct CompiledDispatch { + std::string kernel_name; + std::vector symbolic_args; + Kernel *compiled_kernel{nullptr}; + + TI_IO_DEF(kernel_name, symbolic_args); +}; + +struct CompiledGraph { + std::vector dispatches; + + TI_IO_DEF(dispatches); +}; + +} // namespace aot +} // namespace lang +} // namespace taichi diff --git a/taichi/aot/module_loader.h b/taichi/aot/module_loader.h index 0551152cfae4e..b63fd9e676277 100644 --- a/taichi/aot/module_loader.h +++ b/taichi/aot/module_loader.h @@ -10,7 +10,7 @@ #include "taichi/aot/module_data.h" #include "taichi/backends/device.h" #include "taichi/ir/snode.h" -#include "taichi/aot/module_data.h" +#include "taichi/aot/graph_data.h" namespace taichi { namespace lang { @@ -30,26 +30,6 @@ class TI_DLL_EXPORT Field { Field &operator=(Field &&) = default; }; -class TI_DLL_EXPORT Kernel { - public: - // Rule of 5 to make MSVC happy - Kernel() = default; - virtual ~Kernel() = default; - Kernel(const Kernel &) = delete; - Kernel &operator=(const Kernel &) = delete; - Kernel(Kernel &&) = default; - Kernel &operator=(Kernel &&) = default; - - /** - * @brief Launches the kernel to the device - * - * This does not manage the device to host synchronization. 
- * - * @param ctx Host context - */ - virtual void launch(RuntimeContext *ctx) = 0; -}; - class TI_DLL_EXPORT KernelTemplateArg { public: using ArgUnion = std::variant; diff --git a/taichi/backends/vulkan/aot_module_loader_impl.cpp b/taichi/backends/vulkan/aot_module_loader_impl.cpp index 5f87eb4fc9ca8..149b172ac4c2c 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.cpp +++ b/taichi/backends/vulkan/aot_module_loader_impl.cpp @@ -9,9 +9,6 @@ namespace taichi { namespace lang { namespace vulkan { namespace { - -using KernelHandle = VkRuntime::KernelHandle; - class FieldImpl : public aot::Field { public: explicit FieldImpl(VkRuntime *runtime, const aot::CompiledFieldData &field) @@ -23,21 +20,6 @@ class FieldImpl : public aot::Field { aot::CompiledFieldData field_; }; -class KernelImpl : public aot::Kernel { - public: - explicit KernelImpl(VkRuntime *runtime, KernelHandle handle) - : runtime_(runtime), handle_(handle) { - } - - void launch(RuntimeContext *ctx) override { - runtime_->launch_kernel(handle_, ctx); - } - - private: - VkRuntime *const runtime_; - const KernelHandle handle_; -}; - class AotModuleImpl : public aot::Module { public: explicit AotModuleImpl(const AotModuleParams ¶ms) @@ -109,8 +91,7 @@ class AotModuleImpl : public aot::Module { TI_DEBUG("Failed to load kernel {}", name); return nullptr; } - auto handle = runtime_->register_taichi_kernel(kparams); - return std::make_unique(runtime_, handle); + return std::make_unique(runtime_, std::move(kparams)); } std::unique_ptr make_new_kernel_template( diff --git a/taichi/backends/vulkan/aot_module_loader_impl.h b/taichi/backends/vulkan/aot_module_loader_impl.h index b188281cb749d..23990b56a5c68 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.h +++ b/taichi/backends/vulkan/aot_module_loader_impl.h @@ -16,6 +16,22 @@ namespace vulkan { class VkRuntime; +class KernelImpl : public aot::Kernel { + public: + explicit KernelImpl(VkRuntime *runtime, VkRuntime::RegisterParams &¶ms) + : 
runtime_(runtime), params_(std::move(params)) { + } + + void launch(RuntimeContext *ctx) override { + auto handle = runtime_->register_taichi_kernel(params_); + runtime_->launch_kernel(handle, ctx); + } + + private: + VkRuntime *const runtime_; + const VkRuntime::RegisterParams params_; +}; + struct TI_DLL_EXPORT AotModuleParams { std::string module_path; VkRuntime *runtime{nullptr}; diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp index b2d0c6a8d80a3..cddb446ae469f 100644 --- a/taichi/backends/vulkan/vulkan_program.cpp +++ b/taichi/backends/vulkan/vulkan_program.cpp @@ -2,6 +2,7 @@ #include "taichi/backends/vulkan/aot_module_builder_impl.h" #include "taichi/backends/vulkan/snode_tree_manager.h" +#include "taichi/backends/vulkan/aot_module_loader_impl.h" #if !defined(ANDROID) && !defined(TI_EMSCRIPTENED) #include "GLFW/glfw3.h" @@ -183,6 +184,16 @@ DeviceAllocation VulkanProgramImpl::allocate_memory_ndarray( /*export_sharing=*/false}); } +std::unique_ptr VulkanProgramImpl::make_aot_kernel( + Kernel &kernel) { + spirv::lower(&kernel); + std::vector compiled_structs; + VkRuntime::RegisterParams kparams = + run_codegen(&kernel, get_compute_device(), compiled_structs); + return std::make_unique(vulkan_runtime_.get(), + std::move(kparams)); +} + VulkanProgramImpl::~VulkanProgramImpl() { vulkan_runtime_.reset(); embedded_device_.reset(); diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h index 8285fdf7e8947..b8d7a820fb7be 100644 --- a/taichi/backends/vulkan/vulkan_program.h +++ b/taichi/backends/vulkan/vulkan_program.h @@ -82,6 +82,8 @@ class VulkanProgramImpl : public ProgramImpl { return snode_tree_mgr_->get_snode_tree_device_ptr(tree_id); } + std::unique_ptr make_aot_kernel(Kernel &kernel) override; + ~VulkanProgramImpl(); private: diff --git a/taichi/program/graph.cpp b/taichi/program/graph.cpp new file mode 100644 index 0000000000000..3fa1c703c7064 --- /dev/null +++ 
b/taichi/program/graph.cpp @@ -0,0 +1,101 @@ +#include "taichi/program/graph.h" +#include "taichi/program/kernel.h" +#include "taichi/aot/module_builder.h" +#include "spdlog/fmt/fmt.h" + +#include + +namespace taichi { +namespace lang { + +void Dispatch::compile( + std::vector &compiled_dispatches) { + if (compiled_kernel_) + return; + compiled_kernel_ = kernel_->compile_to_aot_kernel(); + aot::CompiledDispatch dispatch{kernel_->get_name(), symbolic_args_, + compiled_kernel_.get()}; + compiled_dispatches.push_back(std::move(dispatch)); +} + +void Sequential::compile( + std::vector &compiled_dispatches) { + // In the future we can do more across-kernel optimization here. + for (Node *n : sequence_) { + n->compile(compiled_dispatches); + } +} + +void Sequential::append(Node *node) { + sequence_.push_back(node); +} + +void Sequential::dispatch(Kernel *kernel, const std::vector &args) { + Node *n = owning_graph_->new_dispatch_node(kernel, args); + sequence_.push_back(n); +} + +Graph::Graph(std::string name) : name_(name) { + seq_ = std::make_unique(this); +} +Node *Graph::new_dispatch_node(Kernel *kernel, + const std::vector &args) { + all_nodes_.push_back(std::make_unique(kernel, args)); + return all_nodes_.back().get(); +} + +Sequential *Graph::new_sequential_node() { + all_nodes_.push_back(std::make_unique(this)); + return static_cast(all_nodes_.back().get()); +} + +void Graph::compile() { + seq()->compile(compiled_graph_.dispatches); +} + +Sequential *Graph::seq() const { + return seq_.get(); +} + +void Graph::dispatch(Kernel *kernel, const std::vector &args) { + seq()->dispatch(kernel, args); +} + +void Graph::run( + const std::unordered_map &args) const { + RuntimeContext ctx; + for (const auto &dispatch : compiled_graph_.dispatches) { + memset(&ctx, 0, sizeof(RuntimeContext)); + + TI_ASSERT(dispatch.compiled_kernel); + // Populate args metadata into RuntimeContext + const auto &symbolic_args_ = dispatch.symbolic_args; + for (int i = 0; i < symbolic_args_.size(); 
++i) { + auto &symbolic_arg = symbolic_args_[i]; + auto found = args.find(symbolic_arg.name); + TI_ERROR_IF(found == args.end(), "Missing runtime value for {}", + symbolic_arg.name); + const aot::IValue &ival = found->second; + if (ival.tag == aot::ArgKind::NDARRAY) { + Ndarray *arr = reinterpret_cast(ival.val); + TI_ERROR_IF(ival.tag != aot::ArgKind::NDARRAY, + "Required a ndarray for argument {}", symbolic_arg.name); + auto ndarray_elem_shape = std::vector( + arr->shape.end() - symbolic_arg.element_shape.size(), + arr->shape.end()); + TI_ERROR_IF(ndarray_elem_shape != symbolic_arg.element_shape, + "Mismatched shape information for argument {}", + symbolic_arg.name); + set_runtime_ctx_ndarray(&ctx, i, arr); + } else { + TI_ERROR_IF(ival.tag != aot::ArgKind::SCALAR, + "Required a scalar for argument {}", symbolic_arg.name); + ctx.set_arg(i, ival.val); + } + } + + dispatch.compiled_kernel->launch(&ctx); + } +} +} // namespace lang +} // namespace taichi diff --git a/taichi/program/graph.h b/taichi/program/graph.h new file mode 100644 index 0000000000000..1999b9b015e74 --- /dev/null +++ b/taichi/program/graph.h @@ -0,0 +1,113 @@ +#pragma once + +#include +#include +#include + +#include "taichi/program/ndarray.h" +#include "taichi/program/program.h" +#include "taichi/ir/type.h" +#include "taichi/aot/graph_data.h" +#include "taichi/aot/module_builder.h" + +namespace taichi { +namespace lang { +class Kernel; +class Graph; + +class Node { + public: + Node() = default; + virtual ~Node() = default; + Node(const Node &) = delete; + Node &operator=(const Node &) = delete; + Node(Node &&) = default; + Node &operator=(Node &&) = default; + + virtual void compile( + std::vector &compiled_dispatches) = 0; +}; +class Dispatch : public Node { + public: + explicit Dispatch(Kernel *kernel, const std::vector &args) + : kernel_(kernel), symbolic_args_(args) { + } + + void compile( + std::vector &compiled_dispatches) override; + + private: + mutable bool serialized_{false}; + Kernel 
*kernel_{nullptr}; + std::unique_ptr compiled_kernel_{nullptr}; + std::vector symbolic_args_; +}; + +class Sequential : public Node { + public: + explicit Sequential(Graph *graph) : owning_graph_(graph) { + } + + void append(Node *node); + + void dispatch(Kernel *kernel, const std::vector &args); + + void compile( + std::vector &compiled_dispatches) override; + + private: + std::vector sequence_; + Graph *owning_graph_{nullptr}; +}; + +/* + * Graph class works as both builder and runner. + * + * Two typical workflows using Graph: + * - build graph -> compile -> run + * - build graph -> compile -> serialize -> deserialize -> run + * + * Thus Graph can be constructed in two ways, either as an empty object + * or from an `aot::CompiledGraph` loaded from aot module. + * + * Currently Graph only supports sequential launches without returning value + * to host. + */ +class Graph { + public: + explicit Graph(std::string name); + + explicit Graph(std::string name, const aot::CompiledGraph &compiled) + : name_(name), compiled_graph_(compiled) { + } + + // TODO: compile() can take in Arch argument + void compile(); + + void run(const std::unordered_map &args) const; + + Node *new_dispatch_node(Kernel *kernel, const std::vector &args); + + Sequential *new_sequential_node(); + + void dispatch(Kernel *kernel, const std::vector &args); + + Sequential *seq() const; + + aot::CompiledGraph compiled_graph() const { + return compiled_graph_; + } + + std::string name() const { + return name_; + } + + private: + std::string name_; + std::unique_ptr seq_{nullptr}; + std::vector> all_nodes_; + aot::CompiledGraph compiled_graph_; +}; + +} // namespace lang +} // namespace taichi diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 3965bd9b27566..19f9ee8370ec6 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -64,6 +64,10 @@ void Kernel::compile() { compiled_ = program->compile(*this); } +std::unique_ptr Kernel::compile_to_aot_kernel() { + return 
program->make_aot_kernel(*this); +} + void Kernel::lower(bool to_executable) { TI_ASSERT(!lowered_); TI_ASSERT(supports_lowering(arch)); diff --git a/taichi/program/kernel.h b/taichi/program/kernel.h index d29aac4399f4a..0d98252b8c499 100644 --- a/taichi/program/kernel.h +++ b/taichi/program/kernel.h @@ -6,6 +6,7 @@ #include "taichi/backends/arch.h" #include "taichi/program/callable.h" #include "taichi/program/ndarray.h" +#include "taichi/aot/graph_data.h" TLANG_NAMESPACE_BEGIN @@ -86,6 +87,7 @@ class TI_DLL_EXPORT Kernel : public Callable { void compile(); + std::unique_ptr compile_to_aot_kernel(); /** * Lowers |ir| to CHI IR level * diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index 46209a56abfc8..ae19dde3ea623 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -83,10 +83,10 @@ void Ndarray::write_float(const std::vector &i, float64 val) { rw_accessors_bank_->get(this).write_float(i, val); } -void set_runtime_ctx_ndarray(RuntimeContext &ctx, +void set_runtime_ctx_ndarray(RuntimeContext *ctx, int arg_id, - Ndarray &ndarray) { - ctx.set_arg_devalloc(arg_id, ndarray.ndarray_alloc_, ndarray.shape); + Ndarray *ndarray) { + ctx->set_arg_devalloc(arg_id, ndarray->ndarray_alloc_, ndarray->shape); } } // namespace lang diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index 66ff63f3001c9..31181fc55705a 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -60,7 +60,6 @@ class Ndarray { // TODO: move this as a method inside RuntimeContext once Ndarray is decoupled // with Program -void set_runtime_ctx_ndarray(RuntimeContext &ctx, int arg_id, Ndarray &ndarray); - +void set_runtime_ctx_ndarray(RuntimeContext *ctx, int arg_id, Ndarray *ndarray); } // namespace lang } // namespace taichi diff --git a/taichi/program/program.h b/taichi/program/program.h index 06f00f4cbe761..4ff8227b483a1 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -199,6 +199,10 @@ class TI_DLL_EXPORT 
Program { // future. FunctionType compile(Kernel &kernel, OffloadedStmt *offloaded = nullptr); + std::unique_ptr make_aot_kernel(Kernel &kernel) { + return program_impl_->make_aot_kernel(kernel); + } + void check_runtime_error(); Kernel &get_snode_reader(SNode *snode); diff --git a/taichi/program/program_impl.h b/taichi/program/program_impl.h index c168bf1246d2a..f0ae8d243c69a 100644 --- a/taichi/program/program_impl.h +++ b/taichi/program/program_impl.h @@ -8,6 +8,7 @@ #include "taichi/program/snode_expr_utils.h" #include "taichi/program/kernel_profiler.h" #include "taichi/backends/device.h" +#include "taichi/aot/graph_data.h" namespace taichi { namespace lang { @@ -66,6 +67,13 @@ class ProgramImpl { */ virtual std::unique_ptr make_aot_module_builder() = 0; + /** + * Compile a taichi::lang::Kernel to taichi::lang::aot::Kernel. + */ + virtual std::unique_ptr make_aot_kernel(Kernel &kernel) { + TI_NOT_IMPLEMENTED; + } + /** * Dump Offline-cache data to disk */ diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index 3a095988128ec..b64b49b2350fb 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -271,8 +271,7 @@ TEST(AotSaveLoad, VulkanNdarray) { DeviceAllocation devalloc_arr_ = embedded_device->device()->allocate_memory(alloc_params); Ndarray arr = Ndarray(devalloc_arr_, PrimitiveType::i32, {size}); - taichi::lang::set_runtime_ctx_ndarray(host_ctx, 0, arr); - + taichi::lang::set_runtime_ctx_ndarray(&host_ctx, 0, &arr); int src[size] = {0}; src[0] = 2; src[2] = 40; diff --git a/tests/cpp/program/graph_test.cpp b/tests/cpp/program/graph_test.cpp new file mode 100644 index 0000000000000..2ac688f63cad8 --- /dev/null +++ b/tests/cpp/program/graph_test.cpp @@ -0,0 +1,52 @@ +#include "gtest/gtest.h" +#include "taichi/ir/ir_builder.h" +#include "taichi/ir/statements.h" +#include "taichi/inc/constants.h" +#include "taichi/program/program.h" +#include "tests/cpp/program/test_program.h" 
+#include "taichi/program/graph.h" +#include "tests/cpp/ir/ndarray_kernel.h" +#ifdef TI_WITH_VULKAN +#include "taichi/backends/vulkan/vulkan_loader.h" +#endif + +using namespace taichi; +using namespace lang; +#ifdef TI_WITH_VULKAN +TEST(GraphTest, SimpleGraphRun) { + // Otherwise will segfault on macOS VM, + // where Vulkan is installed but no devices are present + if (!vulkan::is_vulkan_api_available()) { + return; + } + TestProgram test_prog; + test_prog.setup(Arch::vulkan); + + const int size = 10; + + auto ker1 = setup_kernel1(test_prog.prog()); + auto ker2 = setup_kernel2(test_prog.prog()); + + auto g = std::make_unique("test"); + auto seq = g->seq(); + auto arr_arg = aot::Arg{ + "arr", PrimitiveType::i32.to_string(), aot::ArgKind::NDARRAY, {}}; + seq->dispatch(ker1.get(), {arr_arg}); + seq->dispatch(ker2.get(), + {arr_arg, aot::Arg{"x", PrimitiveType::i32.to_string(), + aot::ArgKind::SCALAR}}); + g->compile(); + + auto array = Ndarray(test_prog.prog(), PrimitiveType::i32, {size}); + array.write_int({0}, 2); + array.write_int({2}, 40); + std::unordered_map args; + args.insert({"arr", aot::IValue::create(array)}); + args.insert({"x", aot::IValue::create(2)}); + + g->run(args); + EXPECT_EQ(array.read_int({0}), 2); + EXPECT_EQ(array.read_int({1}), 2); + EXPECT_EQ(array.read_int({2}), 42); +} +#endif From f1bc90906814b40b0e31b601901d1147f1af2e94 Mon Sep 17 00:00:00 2001 From: yekuang Date: Sat, 21 May 2022 11:32:39 +0800 Subject: [PATCH 100/176] [Llvm] Add AOT builder and loader (#5013) * [Llvm] Add AOT builder and loader * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * check nullptr Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/aot/module_builder.h | 9 +- .../backends/cpu/aot_module_builder_impl.cpp | 44 +++++++++ taichi/backends/cpu/aot_module_builder_impl.h | 24 +++++ .../backends/cpu/aot_module_loader_impl.cpp | 97 +++++++++++++++++++ 
taichi/backends/cpu/aot_module_loader_impl.h | 21 ++++ taichi/llvm/llvm_program.cpp | 9 ++ taichi/llvm/llvm_program.h | 5 +- 7 files changed, 203 insertions(+), 6 deletions(-) create mode 100644 taichi/backends/cpu/aot_module_builder_impl.cpp create mode 100644 taichi/backends/cpu/aot_module_builder_impl.h create mode 100644 taichi/backends/cpu/aot_module_loader_impl.cpp create mode 100644 taichi/backends/cpu/aot_module_loader_impl.h diff --git a/taichi/aot/module_builder.h b/taichi/aot/module_builder.h index 1b05af1875038..29c4869d509b3 100644 --- a/taichi/aot/module_builder.h +++ b/taichi/aot/module_builder.h @@ -49,7 +49,10 @@ class AotModuleBuilder { DataType dt, std::vector shape, int row_num, - int column_num) = 0; + int column_num) { + TI_NOT_IMPLEMENTED; + } + virtual void add_ndarray_per_backend(const std::string &identifier, bool is_scalar, DataType dt, @@ -61,7 +64,9 @@ class AotModuleBuilder { virtual void add_per_backend_tmpl(const std::string &identifier, const std::string &key, - Kernel *kernel) = 0; + Kernel *kernel) { + TI_NOT_IMPLEMENTED; + } static bool all_fields_are_dense_in_container(const SNode *container); }; diff --git a/taichi/backends/cpu/aot_module_builder_impl.cpp b/taichi/backends/cpu/aot_module_builder_impl.cpp new file mode 100644 index 0000000000000..a65b4c01db57e --- /dev/null +++ b/taichi/backends/cpu/aot_module_builder_impl.cpp @@ -0,0 +1,44 @@ +#include "taichi/backends/cpu/aot_module_builder_impl.h" + +#include + +#include "taichi/backends/cpu/codegen_cpu.h" + +#include "taichi/llvm/launch_arg_info.h" + +namespace taichi { +namespace lang { +namespace cpu { + +void AotModuleBuilderImpl::dump(const std::string &output_dir, + const std::string &filename) const { + LlvmOfflineCacheFileWriter writer; + writer.set_data(std::move(cache_)); + writer.dump(output_dir); +} + +void AotModuleBuilderImpl::add_per_backend(const std::string &identifier, + Kernel *kernel) { + auto cgen = CodeGenCPU::make_codegen_llvm(kernel, /*ir=*/nullptr); + 
auto compiled = cgen->run_compilation(); + LlvmOfflineCache::KernelCacheData kcache; + kcache.kernel_key = identifier; + kcache.module = compiled.llvm_module.get(); + kcache.owned_module = std::move(compiled.llvm_module); + const auto &tasks = compiled.offloaded_tasks; + kcache.args = infer_launch_args(kernel); + kcache.offloaded_task_list.resize(tasks.size()); + std::transform(tasks.begin(), tasks.end(), kcache.offloaded_task_list.begin(), + [](const auto &t) -> LlvmOfflineCache::OffloadedTaskCacheData { + LlvmOfflineCache::OffloadedTaskCacheData res; + res.name = t.name; + res.block_dim = t.block_dim; + res.grid_dim = t.grid_dim; + return res; + }); + cache_.kernels[identifier] = std::move(kcache); +} + +} // namespace cpu +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cpu/aot_module_builder_impl.h b/taichi/backends/cpu/aot_module_builder_impl.h new file mode 100644 index 0000000000000..b800398f49f79 --- /dev/null +++ b/taichi/backends/cpu/aot_module_builder_impl.h @@ -0,0 +1,24 @@ +#pragma once + +#include "taichi/aot/module_builder.h" +#include "taichi/llvm/llvm_offline_cache.h" + +namespace taichi { +namespace lang { +namespace cpu { + +class AotModuleBuilderImpl : public AotModuleBuilder { + public: + void dump(const std::string &output_dir, + const std::string &filename) const override; + + protected: + void add_per_backend(const std::string &identifier, Kernel *kernel) override; + + private: + mutable LlvmOfflineCache cache_; +}; + +} // namespace cpu +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cpu/aot_module_loader_impl.cpp b/taichi/backends/cpu/aot_module_loader_impl.cpp new file mode 100644 index 0000000000000..0026bc41e299d --- /dev/null +++ b/taichi/backends/cpu/aot_module_loader_impl.cpp @@ -0,0 +1,97 @@ +#include "taichi/backends/cpu/aot_module_loader_impl.h" + +#include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/llvm/llvm_program.h" +#include "taichi/codegen/codegen_llvm.h" + 
+namespace taichi { +namespace lang { +namespace cpu { +namespace { + +class KernelImpl : public aot::Kernel { + public: + explicit KernelImpl(FunctionType fn) : fn_(fn) { + } + + void launch(RuntimeContext *ctx) override { + fn_(*ctx); + } + + private: + FunctionType fn_; +}; + +class AotModuleImpl : public aot::Module { + public: + explicit AotModuleImpl(const AotModuleParams ¶ms) + : program_(params.program), + cache_reader_(LlvmOfflineCacheFileReader::make(params.module_path)) { + TI_ASSERT(program_ != nullptr); + } + + Arch arch() const override { + return Arch::x64; + } + + uint64_t version() const override { + return 0; + } + + size_t get_root_size() const override { + return 0; + } + + private: + std::unique_ptr make_new_kernel( + const std::string &name) override { + TI_ASSERT(cache_reader_ != nullptr); + auto *tlctx = program_->get_llvm_context(program_->config->arch); + LlvmOfflineCache::KernelCacheData loaded; + auto ok = cache_reader_->get_kernel_cache( + loaded, name, *tlctx->get_this_thread_context()); + TI_ERROR_IF(!ok, "Failed to load kernel={}", name); + + const auto &tasks = loaded.offloaded_task_list; + std::vector offloaded_tasks; + offloaded_tasks.reserve(tasks.size()); + for (const auto &t : tasks) { + OffloadedTask ot{/*codegen=*/nullptr}; + ot.name = t.name; + ot.block_dim = t.block_dim; + ot.grid_dim = t.grid_dim; + offloaded_tasks.push_back(std::move(ot)); + } + ModuleToFunctionConverter converter{tlctx, program_}; + auto fn = + converter.convert(name, loaded.args, std::move(loaded.owned_module), + std::move(offloaded_tasks)); + return std::make_unique(fn); + } + + std::unique_ptr make_new_kernel_template( + const std::string &name) override { + TI_NOT_IMPLEMENTED; + return nullptr; + } + + std::unique_ptr make_new_field(const std::string &name) override { + TI_NOT_IMPLEMENTED; + return nullptr; + } + + LlvmProgramImpl *const program_{nullptr}; + std::unique_ptr cache_reader_{nullptr}; +}; + +} // namespace + +std::unique_ptr 
make_aot_module(std::any mod_params) { + auto mod = std::make_unique( + std::any_cast(mod_params)); + return mod; +} + +} // namespace cpu +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cpu/aot_module_loader_impl.h b/taichi/backends/cpu/aot_module_loader_impl.h new file mode 100644 index 0000000000000..8791c1c273078 --- /dev/null +++ b/taichi/backends/cpu/aot_module_loader_impl.h @@ -0,0 +1,21 @@ +#pragma once + +#include "taichi/aot/module_loader.h" + +namespace taichi { +namespace lang { + +class LlvmProgramImpl; + +namespace cpu { + +struct TI_DLL_EXPORT AotModuleParams { + std::string module_path; + LlvmProgramImpl *program{nullptr}; +}; + +TI_DLL_EXPORT std::unique_ptr make_aot_module(std::any mod_params); + +} // namespace cpu +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 0ca7e9977c269..026a2c876b127 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -12,6 +12,7 @@ #include "taichi/codegen/codegen.h" #include "taichi/ir/statements.h" #include "taichi/ir/transforms.h" +#include "taichi/backends/cpu/aot_module_builder_impl.h" #include "taichi/backends/cpu/cpu_device.h" #include "taichi/backends/cuda/cuda_device.h" @@ -336,6 +337,14 @@ void LlvmProgramImpl::print_list_manager_info(void *list_manager, size_MB); } +std::unique_ptr LlvmProgramImpl::make_aot_module_builder() { + if (config->arch == Arch::x64) { + return std::make_unique(); + } + TI_NOT_IMPLEMENTED; + return nullptr; +} + void LlvmProgramImpl::materialize_runtime(MemoryPool *memory_pool, KernelProfilerBase *profiler, uint64 **result_buffer_ptr) { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index 5274c9a1f836c..c9029bbcd85f0 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -158,10 +158,7 @@ class LlvmProgramImpl : public ProgramImpl { void print_list_manager_info(void *list_manager, uint64 *result_buffer); - 
std::unique_ptr make_aot_module_builder() override { - TI_NOT_IMPLEMENTED; - return nullptr; - } + std::unique_ptr make_aot_module_builder() override; DevicePtr get_snode_tree_device_ptr(int tree_id) override; From 605102022176dd481e4d9f10d58bee1f0cdbbc83 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Sat, 21 May 2022 13:40:50 +0800 Subject: [PATCH 101/176] [ci] Fix nightly macos (#5018) --- .github/workflows/scripts/unix_test.sh | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh index b50aee72775f6..7954f053d9903 100755 --- a/.github/workflows/scripts/unix_test.sh +++ b/.github/workflows/scripts/unix_test.sh @@ -34,9 +34,14 @@ else # Import Paddle's develop GPU package will occur error `Illegal Instruction`. # Log hardware info for the current CI-bot - # There's random CI failure caused by "import paddle" + # There's random CI failure caused by "import paddle" (Linux) # Top suspect is an issue with MKL support for specific CPU - lscpu | grep "Model name" + echo "CI-bot CPU info:" + if [[ $OSTYPE == "linux-"* ]]; then + lscpu | grep "Model name" + elif [[ $OSTYPE == "darwin"* ]]; then + sysctl -a | grep machdep.cpu + fi fi ti diagnose From eba326f037cc71dcfa4c7f0f738bb8c639e48fb3 Mon Sep 17 00:00:00 2001 From: Ailing Date: Sat, 21 May 2022 15:36:18 +0800 Subject: [PATCH 102/176] [bug] Revert freeing ndarray memory when python GC triggers (#5019) --- python/taichi/lang/_ndarray.py | 13 ------------- python/taichi/lang/impl.py | 4 ---- taichi/program/program.cpp | 14 ++------------ taichi/program/program.h | 4 +--- taichi/python/export_lang.cpp | 1 - tests/python/test_ndarray.py | 22 ---------------------- 6 files changed, 3 insertions(+), 55 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index f9546df9c242f..bc418f1798da7 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -18,19 +18,6 @@ def 
__init__(self, dtype, arr_shape): self.dtype = cook_dtype(dtype) self.arr = impl.get_runtime().prog.create_ndarray( cook_dtype(dtype), arr_shape) - self._gen = impl.get_runtime().generation - - def __del__(self): - # - impl.get_runtime().prog == None: - # ti.reset() is called but ti.init() isn't re-initialized yet. - # At this point all ndarrays allocated in the previous program - # are freed along with program destruction. - # - impl.get_generation() != self.gen - # This ndarray was created from previous prog which was destructed. - # So its memory was freed already. - if impl.get_runtime().prog is not None and impl.get_runtime( - ).generation == self._gen: - impl.get_runtime().prog.delete_ndarray(self.arr) @property def element_shape(self): diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 4e768d4457389..90cfc7446d3af 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -1,5 +1,4 @@ import numbers -from itertools import count from types import FunctionType, MethodType from typing import Iterable @@ -224,8 +223,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): class PyTaichi: - _gen = count(0) - def __init__(self, kernels=None): self.materialized = False self.prog = None @@ -242,7 +239,6 @@ def __init__(self, kernels=None): self.grad_replaced = False self.kernels = kernels or [] self._signal_handler_registry = None - self.generation = next(self._gen) def get_num_compiled_functions(self): return len(self.compiled_functions) + len(self.compiled_grad_functions) diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 19ce2b8cdf00a..94c95357d04b1 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -508,8 +508,6 @@ void Program::finalize() { } } - ndarrays_.clear(); - synchronize(); memory_pool_->terminate(); @@ -557,16 +555,8 @@ std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) { Ndarray *Program::create_ndarray(const DataType type, const std::vector 
&shape) { - // TODO: allocate DeviceAllocation first and then create Ndarray - auto arr = std::make_unique(this, type, shape); - auto arr_ptr = arr.get(); - ndarrays_.insert({arr_ptr, std::move(arr)}); - return arr_ptr; -} - -void Program::delete_ndarray(Ndarray *ndarray) { - TI_ASSERT(ndarrays_.count(ndarray)); - ndarrays_.erase(ndarray); + ndarrays_.emplace_back(std::make_unique(this, type, shape)); + return ndarrays_.back().get(); } intptr_t Program::get_ndarray_data_ptr_as_int(Ndarray *ndarray) { diff --git a/taichi/program/program.h b/taichi/program/program.h index 4ff8227b483a1..e411cad9c620c 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -323,8 +323,6 @@ class TI_DLL_EXPORT Program { Ndarray *create_ndarray(const DataType type, const std::vector &shape); - void delete_ndarray(Ndarray *ndarray); - intptr_t get_ndarray_data_ptr_as_int(Ndarray *ndarray); void fill_ndarray_fast(Ndarray *ndarray, uint32_t val); @@ -359,7 +357,7 @@ class TI_DLL_EXPORT Program { bool finalized_{false}; std::unique_ptr memory_pool_{nullptr}; - std::unordered_map> ndarrays_; + std::vector> ndarrays_; }; } // namespace lang diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index b237fad6c9d6d..850056d49642b 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -423,7 +423,6 @@ void export_lang(py::module &m) { return program->create_ndarray(dt, shape); }, py::return_value_policy::reference) - .def("delete_ndarray", &Program::delete_ndarray) .def("get_ndarray_data_ptr_as_int", [](Program *program, Ndarray *ndarray) { return program->get_ndarray_data_ptr_as_int(ndarray); diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index 5034320a15935..8579c04b4acf0 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -341,15 +341,6 @@ def test_ndarray_numpy_io(): _test_ndarray_numpy_io() -@test_utils.test(arch=supported_archs_taichi_ndarray) -def test_ndarray_reset(): - n 
= 8 - c = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n)) - del c - d = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n)) - ti.reset() - - def _test_ndarray_matrix_numpy_io(layout): n = 5 m = 2 @@ -622,16 +613,3 @@ def init(d: ti.i32, arr: ti.types.ndarray()): y = ti.ndarray(dtype=ti.f32, shape=(n2, n2)) init(3, y) assert (y.to_numpy() == (np.ones(shape=(n2, n2)) * 3)).all() - - -@test_utils.test(arch=supported_archs_taichi_ndarray) -def test_generation(): - curr_arch = ti.lang.impl.current_cfg().arch - n1 = 4 - x = ti.ndarray(dtype=ti.f32, shape=(n1, n1)) - prev_gen = x._gen - ti.reset() # gen++ - ti.init(curr_arch) # calls ti.reset(), gen++ - y = ti.ndarray(dtype=ti.f32, shape=(n1, )) - assert y._gen > prev_gen - del x From 25644e3b6fc162a2d4c15d5049d16a560de03526 Mon Sep 17 00:00:00 2001 From: Zeyu Li Date: Sat, 21 May 2022 15:40:46 +0800 Subject: [PATCH 103/176] [SIMT] Add match_all warp intrinsics (#4961) * add match_all warp intrinsic by ptx * add args to match_all in warp.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update warp.py to sync with PR4957 * update llvm_context.cpp: add more details about match_all_sync intrinsic * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update test_simt.py Initialize a with1 Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/simt/warp.py | 8 +++++--- taichi/llvm/llvm_context.cpp | 13 +++++++++++++ taichi/runtime/llvm/runtime.cpp | 12 ++++++++++++ tests/python/test_simt.py | 25 +++++++++++++++++++++++-- 4 files changed, 53 insertions(+), 5 deletions(-) diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index 37b3adefaab75..720b504577ade 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -104,9 +104,11 @@ def match_any(mask, value): with_runtime_context=False) -def match_all(): - # TODO - pass 
+def match_all(mask, val): + return impl.call_internal("cuda_match_all_sync_i32", + mask, + val, + with_runtime_context=False) def active_mask(): diff --git a/taichi/llvm/llvm_context.cpp b/taichi/llvm/llvm_context.cpp index 278013095231d..fa98d12245694 100644 --- a/taichi/llvm/llvm_context.cpp +++ b/taichi/llvm/llvm_context.cpp @@ -404,6 +404,19 @@ std::unique_ptr TaichiLLVMContext::clone_module( patch_intrinsic("cuda_match_any_sync_i32", Intrinsic::nvvm_match_any_sync_i32); + // LLVM 10.0.0 seems to have a bug on this intrinsic function + /* + nvvm_match_all_sync_i32 + Args: + 1. u32 mask + 2. i32 value + 3. i32 *pred + */ + /* + patch_intrinsic("cuda_match_all_sync_i32p", + Intrinsic::nvvm_math_all_sync_i32); + */ + // LLVM 10.0.0 seems to have a bug on this intrinsic function /* patch_intrinsic("cuda_match_any_sync_i64", diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp index 004f748ee225a..045cd9f274efd 100644 --- a/taichi/runtime/llvm/runtime.cpp +++ b/taichi/runtime/llvm/runtime.cpp @@ -1088,6 +1088,18 @@ uint32 cuda_match_any_sync_i32(u32 mask, i32 value) { return 0; } +u32 cuda_match_all_sync_i32(u32 mask, i32 value) { +#if ARCH_cuda + u32 ret; + asm volatile("match.all.sync.b32 %0, %1, %2;" + : "=r"(ret) + : "r"(value), "r"(mask)); + return ret; +#else + return 0; +#endif +} + uint32 cuda_match_any_sync_i64(u32 mask, i64 value) { #if ARCH_cuda u32 ret; diff --git a/tests/python/test_simt.py b/tests/python/test_simt.py index 69d804a8deeb3..accf5b1dbc9a9 100644 --- a/tests/python/test_simt.py +++ b/tests/python/test_simt.py @@ -292,8 +292,29 @@ def foo(): @test_utils.test(arch=ti.cuda) def test_match_all(): - # TODO - pass + a = ti.field(dtype=ti.i32, shape=32) + b = ti.field(dtype=ti.u32, shape=32) + c = ti.field(dtype=ti.u32, shape=32) + + @ti.kernel + def foo(): + ti.loop_config(block_dim=32) + for i in range(32): + a[i] = 1 + for i in range(32): + b[i] = ti.simt.warp.match_all(ti.u32(0xFFFFFFFF), a[i]) + + a[0] = 2 + for i 
in range(32): + c[i] = ti.simt.warp.match_all(ti.u32(0xFFFFFFFF), a[i]) + + foo() + + for i in range(32): + assert b[i] == (2**32 - 1) + + for i in range(32): + assert c[i] == 0 @test_utils.test(arch=ti.cuda) From fba92cf76f93668033678e91eb219ba9c9f4a1ef Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Sat, 21 May 2022 17:37:53 +0800 Subject: [PATCH 104/176] [AOT] Support importing external Vulkan buffers (#5020) --- taichi/backends/vulkan/vulkan_device.cpp | 16 ++++++++++++++++ taichi/backends/vulkan/vulkan_device.h | 4 ++++ 2 files changed, 20 insertions(+) diff --git a/taichi/backends/vulkan/vulkan_device.cpp b/taichi/backends/vulkan/vulkan_device.cpp index a739758e69f88..ff00fb18e6648 100644 --- a/taichi/backends/vulkan/vulkan_device.cpp +++ b/taichi/backends/vulkan/vulkan_device.cpp @@ -1682,6 +1682,22 @@ vkapi::IVkFramebuffer VulkanDevice::get_framebuffer( return framebuffer; } +DeviceAllocation VulkanDevice::import_vkbuffer(vkapi::IVkBuffer buffer) { + AllocationInternal alloc_int{}; + alloc_int.external = true; + alloc_int.buffer = buffer; + alloc_int.mapped = nullptr; + alloc_int.addr = 0; + + DeviceAllocation alloc; + alloc.device = this; + alloc.alloc_id = alloc_cnt_++; + + allocations_[alloc.alloc_id] = alloc_int; + + return alloc; +} + DeviceAllocation VulkanDevice::import_vk_image(vkapi::IVkImage image, vkapi::IVkImageView view, VkFormat format) { diff --git a/taichi/backends/vulkan/vulkan_device.h b/taichi/backends/vulkan/vulkan_device.h index 5a4b9eeeefff8..15d5cde4d88a8 100644 --- a/taichi/backends/vulkan/vulkan_device.h +++ b/taichi/backends/vulkan/vulkan_device.h @@ -604,6 +604,9 @@ class TI_DLL_EXPORT VulkanDevice : public GraphicsDevice { std::tuple get_vk_image( const DeviceAllocation &alloc) const; + + DeviceAllocation import_vkbuffer(vkapi::IVkBuffer buffer); + DeviceAllocation import_vk_image(vkapi::IVkImage image, vkapi::IVkImageView view, VkFormat format); @@ -642,6 +645,7 @@ class TI_DLL_EXPORT VulkanDevice : public GraphicsDevice { // 
Memory allocation struct AllocationInternal { + bool external{false}; VmaAllocationInfo alloc_info; vkapi::IVkBuffer buffer; void *mapped{nullptr}; From 504f619b79f29aeed3243c3553487669e9789a3f Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Mon, 23 May 2022 18:08:51 +0800 Subject: [PATCH 105/176] [Bug] [type] Fix frontend type check for reading a whole bit_struct (#5027) --- taichi/ir/frontend_ir.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 745e82e6dc752..e5fcb6d18cf83 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -318,7 +318,8 @@ void GlobalVariableExpression::flatten(FlattenContext *ctx) { void GlobalPtrExpression::type_check(CompileConfig *) { // Currently, dimension compatibility check happens in Python if (snode != nullptr) { - ret_type = snode->dt; + TI_ASSERT(snode->dt->is()); + ret_type = snode->dt->cast()->get_physical_type(); } else if (var.is()) { ret_type = var.cast()->snode->dt->get_compute_type(); From 1532d9f29d1b4d8b5b1f53e08f14fb818e704355 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Mon, 23 May 2022 22:57:15 +0800 Subject: [PATCH 106/176] fix fast_gui rgba bug (#5031) --- python/taichi/_kernels.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/taichi/_kernels.py b/python/taichi/_kernels.py index 8043cc5e0de9a..dc7d88b43dc9c 100644 --- a/python/taichi/_kernels.py +++ b/python/taichi/_kernels.py @@ -71,10 +71,11 @@ def vector_to_fast_image(img: template(), out: ndarray_type.ndarray()): r, g, b = 0, 0, 0 color = img[i, img.shape[1] - 1 - j] if static(img.dtype in [f16, f32, f64]): - r, g, b = min(255, max(0, int(color * 255))) + r, g, b = min(255, max(0, int(color * 255)))[:3] else: static_assert(img.dtype == u8) - r, g, b = color + r, g, b = color[:3] + idx = j * img.shape[0] + i # We use i32 for |out| since OpenGL and Metal doesn't support u8 types if static(get_os_name() != 'osx'): From 
bdc95450bd8b836c9d58022c4b5f96cfe6616537 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Tue, 24 May 2022 12:28:52 +0800 Subject: [PATCH 107/176] [doc] Update OS names (#5030) --- docs/lang/articles/contribution/dev_install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/lang/articles/contribution/dev_install.md b/docs/lang/articles/contribution/dev_install.md index ba85f89e2e04e..2d3294e6a47be 100644 --- a/docs/lang/articles/contribution/dev_install.md +++ b/docs/lang/articles/contribution/dev_install.md @@ -37,7 +37,7 @@ Installation instructions vary depending on which operating system (OS) you are From 686deb1bd66f1ceaf30319cd128652089aef7934 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Tue, 24 May 2022 12:29:09 +0800 Subject: [PATCH 108/176] [ci] Disable win cpu docker job test (#5033) * Disable win cpu docker job test * Revert changes on naming --- .github/workflows/scripts/win_build_test_cpu.ps1 | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 index a551cadb2b543..0296d98aa4d57 100644 --- a/.github/workflows/scripts/win_build_test_cpu.ps1 +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -81,7 +81,8 @@ if (-not $?) { exit 1 } WriteInfo("Build finished") ccache -s -v -$env:TI_ENABLE_PADDLE = "0" -WriteInfo("Testing Taichi") -python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cpu -WriteInfo("Test finished") +# We skip the test for the moment due to the long job execution time. +#$env:TI_ENABLE_PADDLE = "0" +#WriteInfo("Testing Taichi") +#python tests/run_tests.py -vr2 -t4 -k "not torch and not paddle" -a cpu +#WriteInfo("Test finished") From 97e9b3920d4948db2399737ad6a3dd8f23bc9a3f Mon Sep 17 00:00:00 2001 From: ailzhang Date: Mon, 23 May 2022 13:30:32 +0800 Subject: [PATCH 109/176] [aot] Serialize built graph, deserialize and run. 
related: #4786 [Update]: based on an offline discussion with k-ye, I've split the original `Graph` class into `GraphBuilder` and `CompiledGraph` classes in C++. Note that the implementation didn't follow exactly the builder design pattern as our builder is slightly simpler as shown below. The complexity in our problem is more in the need of serialization and deserialization for the same graph representation intead of its construction process. So IMHO it's good enough to separate the GraphBuilder and Runner(`CompiledGraph`) as we discussed. Please feel free to correct me if I'm wrong! ``` GraphBuilder | compile() | | CompiledGraph <---- serialize/deserialize ----> file | | run() ``` This PR demonstrates a minimal example of serializing a built graph, deserializing and running it. ghstack-source-id: 7dda7cc11ef3a946f31d75783a8cfd1836e47ba5 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5024 --- taichi/aot/graph_data.cpp | 48 +++++++++ taichi/aot/graph_data.h | 10 +- taichi/aot/module_builder.cpp | 16 +++ taichi/aot/module_builder.h | 12 +++ taichi/aot/module_loader.h | 7 +- .../vulkan/aot_module_builder_impl.cpp | 9 ++ .../backends/vulkan/aot_module_builder_impl.h | 2 + .../vulkan/aot_module_loader_impl.cpp | 16 +++ .../backends/vulkan/aot_module_loader_impl.h | 23 +--- taichi/backends/vulkan/vulkan_graph_data.h | 29 +++++ taichi/program/graph.cpp | 101 ----------------- taichi/program/graph_builder.cpp | 65 +++++++++++ taichi/program/{graph.h => graph_builder.h} | 46 ++------ tests/cpp/aot/aot_save_load_test.cpp | 102 ++++++++++++++++++ tests/cpp/program/graph_test.cpp | 10 +- 15 files changed, 327 insertions(+), 169 deletions(-) create mode 100644 taichi/aot/graph_data.cpp create mode 100644 taichi/backends/vulkan/vulkan_graph_data.h delete mode 100644 taichi/program/graph.cpp create mode 100644 taichi/program/graph_builder.cpp rename taichi/program/{graph.h => graph_builder.h} (58%) diff --git a/taichi/aot/graph_data.cpp 
b/taichi/aot/graph_data.cpp new file mode 100644 index 0000000000000..6b988356f6831 --- /dev/null +++ b/taichi/aot/graph_data.cpp @@ -0,0 +1,48 @@ +#include "taichi/aot/graph_data.h" +#include "taichi/program/ndarray.h" +#define TI_RUNTIME_HOST +#include "taichi/program/context.h" +#undef TI_RUNTIME_HOST + +namespace taichi { +namespace lang { +namespace aot { +void CompiledGraph::run( + const std::unordered_map &args) const { + RuntimeContext ctx; + for (const auto &dispatch : dispatches) { + memset(&ctx, 0, sizeof(RuntimeContext)); + + TI_ASSERT(dispatch.compiled_kernel); + // Populate args metadata into RuntimeContext + const auto &symbolic_args_ = dispatch.symbolic_args; + for (int i = 0; i < symbolic_args_.size(); ++i) { + auto &symbolic_arg = symbolic_args_[i]; + auto found = args.find(symbolic_arg.name); + TI_ERROR_IF(found == args.end(), "Missing runtime value for {}", + symbolic_arg.name); + const aot::IValue &ival = found->second; + if (ival.tag == aot::ArgKind::NDARRAY) { + Ndarray *arr = reinterpret_cast(ival.val); + TI_ERROR_IF(ival.tag != aot::ArgKind::NDARRAY, + "Required a ndarray for argument {}", symbolic_arg.name); + auto ndarray_elem_shape = std::vector( + arr->shape.end() - symbolic_arg.element_shape.size(), + arr->shape.end()); + TI_ERROR_IF(ndarray_elem_shape != symbolic_arg.element_shape, + "Mismatched shape information for argument {}", + symbolic_arg.name); + set_runtime_ctx_ndarray(&ctx, i, arr); + } else { + TI_ERROR_IF(ival.tag != aot::ArgKind::SCALAR, + "Required a scalar for argument {}", symbolic_arg.name); + ctx.set_arg(i, ival.val); + } + } + + dispatch.compiled_kernel->launch(&ctx); + } +} +} // namespace aot +} // namespace lang +} // namespace taichi diff --git a/taichi/aot/graph_data.h b/taichi/aot/graph_data.h index 5da03b4d89d41..12e96062f59d6 100644 --- a/taichi/aot/graph_data.h +++ b/taichi/aot/graph_data.h @@ -4,10 +4,14 @@ #include #include "taichi/aot/module_data.h" +template +T taichi_union_cast_with_different_sizes(G 
g); + namespace taichi { namespace lang { class AotModuleBuilder; class Ndarray; +struct RuntimeContext; namespace aot { // Currently only scalar and ndarray are supported. enum class ArgKind { SCALAR, NDARRAY, UNKNOWN }; @@ -67,10 +71,6 @@ class TI_DLL_EXPORT Kernel { * @param ctx Host context */ virtual void launch(RuntimeContext *ctx) = 0; - - virtual void save_to_module(AotModuleBuilder *builder) { - TI_NOT_IMPLEMENTED; - } }; struct CompiledDispatch { @@ -84,6 +84,8 @@ struct CompiledDispatch { struct CompiledGraph { std::vector dispatches; + void run(const std::unordered_map &args) const; + TI_IO_DEF(dispatches); }; diff --git a/taichi/aot/module_builder.cpp b/taichi/aot/module_builder.cpp index b194d2ee384c6..d5f7668a0a430 100644 --- a/taichi/aot/module_builder.cpp +++ b/taichi/aot/module_builder.cpp @@ -52,5 +52,21 @@ void AotModuleBuilder::load(const std::string &output_dir) { TI_ERROR("Aot loader not supported"); } +void AotModuleBuilder::dump_graph(std::string output_dir) const { + const std::string graph_file = fmt::format("{}/graphs.tcb", output_dir); + write_to_binary_file(graphs_, graph_file); +} + +void AotModuleBuilder::add_graph(const std::string &name, + const aot::CompiledGraph &graph) { + if (graphs_.count(name) != 0) { + TI_ERROR("Graph {} already exists", name); + } + // Handle adding kernels separately. 
+ for (const auto &dispatch : graph.dispatches) { + add_compiled_kernel(dispatch.compiled_kernel); + } + graphs_[name] = graph; +} } // namespace lang } // namespace taichi diff --git a/taichi/aot/module_builder.h b/taichi/aot/module_builder.h index 29c4869d509b3..02cdb9a83ded4 100644 --- a/taichi/aot/module_builder.h +++ b/taichi/aot/module_builder.h @@ -7,6 +7,7 @@ #include "taichi/backends/device.h" #include "taichi/ir/snode.h" #include "taichi/aot/module_data.h" +#include "taichi/aot/graph_data.h" namespace taichi { namespace lang { @@ -37,6 +38,8 @@ class AotModuleBuilder { virtual void dump(const std::string &output_dir, const std::string &filename) const = 0; + void add_graph(const std::string &name, const aot::CompiledGraph &graph); + protected: /** * Intended to be overriden by each backend's implementation. @@ -62,13 +65,22 @@ class AotModuleBuilder { TI_NOT_IMPLEMENTED; } + virtual void add_compiled_kernel(aot::Kernel *kernel) { + TI_NOT_IMPLEMENTED; + } + virtual void add_per_backend_tmpl(const std::string &identifier, const std::string &key, Kernel *kernel) { TI_NOT_IMPLEMENTED; } + void dump_graph(std::string output_dir) const; + static bool all_fields_are_dense_in_container(const SNode *container); + + private: + std::unordered_map graphs_; }; } // namespace lang diff --git a/taichi/aot/module_loader.h b/taichi/aot/module_loader.h index b63fd9e676277..caf4c857ce156 100644 --- a/taichi/aot/module_loader.h +++ b/taichi/aot/module_loader.h @@ -16,7 +16,7 @@ namespace taichi { namespace lang { struct RuntimeContext; - +class Graph; namespace aot { class TI_DLL_EXPORT Field { @@ -90,11 +90,16 @@ class TI_DLL_EXPORT Module { KernelTemplate *get_kernel_template(const std::string &name); Field *get_field(const std::string &name); + virtual std::unique_ptr get_graph(std::string name) { + TI_NOT_IMPLEMENTED; + } + protected: virtual std::unique_ptr make_new_kernel(const std::string &name) = 0; virtual std::unique_ptr make_new_kernel_template( const std::string 
&name) = 0; virtual std::unique_ptr make_new_field(const std::string &name) = 0; + std::unordered_map graphs_; private: std::unordered_map> loaded_kernels_; diff --git a/taichi/backends/vulkan/aot_module_builder_impl.cpp b/taichi/backends/vulkan/aot_module_builder_impl.cpp index 60cc5f7aa266f..ed03800098bd1 100644 --- a/taichi/backends/vulkan/aot_module_builder_impl.cpp +++ b/taichi/backends/vulkan/aot_module_builder_impl.cpp @@ -5,6 +5,7 @@ #include "taichi/aot/module_data.h" #include "taichi/codegen/spirv/spirv_codegen.h" +#include "taichi/backends/vulkan/vulkan_graph_data.h" namespace taichi { namespace lang { @@ -135,6 +136,8 @@ void AotModuleBuilderImpl::dump(const std::string &output_dir, const std::string json_path = fmt::format("{}/metadata.json", output_dir); converted.dump_json(json_path); + + dump_graph(output_dir); } void AotModuleBuilderImpl::add_per_backend(const std::string &identifier, @@ -147,6 +150,12 @@ void AotModuleBuilderImpl::add_per_backend(const std::string &identifier, ti_aot_data_.spirv_codes.push_back(compiled.task_spirv_source_codes); } +void AotModuleBuilderImpl::add_compiled_kernel(aot::Kernel *kernel) { + const auto register_params = static_cast(kernel)->params(); + ti_aot_data_.kernels.push_back(register_params.kernel_attribs); + ti_aot_data_.spirv_codes.push_back(register_params.task_spirv_source_codes); +} + void AotModuleBuilderImpl::add_field_per_backend(const std::string &identifier, const SNode *rep_snode, bool is_scalar, diff --git a/taichi/backends/vulkan/aot_module_builder_impl.h b/taichi/backends/vulkan/aot_module_builder_impl.h index 0accfcc203343..40dc4157c06f4 100644 --- a/taichi/backends/vulkan/aot_module_builder_impl.h +++ b/taichi/backends/vulkan/aot_module_builder_impl.h @@ -36,6 +36,8 @@ class AotModuleBuilderImpl : public AotModuleBuilder { const std::string &key, Kernel *kernel) override; + void add_compiled_kernel(aot::Kernel *kernel) override; + std::string write_spv_file(const std::string &output_dir, const 
TaskAttributes &k, const std::vector &source_code) const; diff --git a/taichi/backends/vulkan/aot_module_loader_impl.cpp b/taichi/backends/vulkan/aot_module_loader_impl.cpp index 149b172ac4c2c..4ea34de89fc7c 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.cpp +++ b/taichi/backends/vulkan/aot_module_loader_impl.cpp @@ -4,6 +4,7 @@ #include #include "taichi/runtime/vulkan/runtime.h" +#include "taichi/aot/graph_data.h" namespace taichi { namespace lang { @@ -39,6 +40,21 @@ class AotModuleImpl : public aot::Module { } ti_aot_data_.spirv_codes.push_back(spirv_sources_codes); } + + const std::string graph_path = + fmt::format("{}/graphs.tcb", params.module_path); + read_from_binary_file(graphs_, graph_path); + } + + std::unique_ptr get_graph(std::string name) override { + TI_ERROR_IF(graphs_.count(name) == 0, "Cannot find graph {}", name); + std::vector dispatches; + for (auto &dispatch : graphs_[name].dispatches) { + dispatches.push_back({dispatch.kernel_name, dispatch.symbolic_args, + get_kernel(dispatch.kernel_name)}); + } + aot::CompiledGraph graph{dispatches}; + return std::make_unique(std::move(graph)); } size_t get_root_size() const override { diff --git a/taichi/backends/vulkan/aot_module_loader_impl.h b/taichi/backends/vulkan/aot_module_loader_impl.h index 23990b56a5c68..16230e1411306 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.h +++ b/taichi/backends/vulkan/aot_module_loader_impl.h @@ -7,31 +7,14 @@ #include "taichi/backends/vulkan/aot_utils.h" #include "taichi/runtime/vulkan/runtime.h" #include "taichi/codegen/spirv/kernel_utils.h" - +#include "taichi/aot/module_builder.h" #include "taichi/aot/module_loader.h" +#include "taichi/backends/vulkan/aot_module_builder_impl.h" +#include "taichi/backends/vulkan/vulkan_graph_data.h" namespace taichi { namespace lang { namespace vulkan { - -class VkRuntime; - -class KernelImpl : public aot::Kernel { - public: - explicit KernelImpl(VkRuntime *runtime, VkRuntime::RegisterParams &¶ms) - : 
runtime_(runtime), params_(std::move(params)) { - } - - void launch(RuntimeContext *ctx) override { - auto handle = runtime_->register_taichi_kernel(params_); - runtime_->launch_kernel(handle, ctx); - } - - private: - VkRuntime *const runtime_; - const VkRuntime::RegisterParams params_; -}; - struct TI_DLL_EXPORT AotModuleParams { std::string module_path; VkRuntime *runtime{nullptr}; diff --git a/taichi/backends/vulkan/vulkan_graph_data.h b/taichi/backends/vulkan/vulkan_graph_data.h new file mode 100644 index 0000000000000..6fa3cafc1e3e0 --- /dev/null +++ b/taichi/backends/vulkan/vulkan_graph_data.h @@ -0,0 +1,29 @@ +#pragma once +#include "taichi/runtime/vulkan/runtime.h" + +namespace taichi { +namespace lang { +namespace vulkan { +class KernelImpl : public aot::Kernel { + public: + explicit KernelImpl(VkRuntime *runtime, VkRuntime::RegisterParams &¶ms) + : runtime_(runtime), params_(std::move(params)) { + handle_ = runtime_->register_taichi_kernel(params_); + } + + void launch(RuntimeContext *ctx) override { + runtime_->launch_kernel(handle_, ctx); + } + + const VkRuntime::RegisterParams ¶ms() { + return params_; + } + + private: + VkRuntime *const runtime_; + VkRuntime::KernelHandle handle_; + const VkRuntime::RegisterParams params_; +}; +} // namespace vulkan +} // namespace lang +} // namespace taichi diff --git a/taichi/program/graph.cpp b/taichi/program/graph.cpp deleted file mode 100644 index 3fa1c703c7064..0000000000000 --- a/taichi/program/graph.cpp +++ /dev/null @@ -1,101 +0,0 @@ -#include "taichi/program/graph.h" -#include "taichi/program/kernel.h" -#include "taichi/aot/module_builder.h" -#include "spdlog/fmt/fmt.h" - -#include - -namespace taichi { -namespace lang { - -void Dispatch::compile( - std::vector &compiled_dispatches) { - if (compiled_kernel_) - return; - compiled_kernel_ = kernel_->compile_to_aot_kernel(); - aot::CompiledDispatch dispatch{kernel_->get_name(), symbolic_args_, - compiled_kernel_.get()}; - 
compiled_dispatches.push_back(std::move(dispatch)); -} - -void Sequential::compile( - std::vector &compiled_dispatches) { - // In the future we can do more across-kernel optimization here. - for (Node *n : sequence_) { - n->compile(compiled_dispatches); - } -} - -void Sequential::append(Node *node) { - sequence_.push_back(node); -} - -void Sequential::dispatch(Kernel *kernel, const std::vector &args) { - Node *n = owning_graph_->new_dispatch_node(kernel, args); - sequence_.push_back(n); -} - -Graph::Graph(std::string name) : name_(name) { - seq_ = std::make_unique(this); -} -Node *Graph::new_dispatch_node(Kernel *kernel, - const std::vector &args) { - all_nodes_.push_back(std::make_unique(kernel, args)); - return all_nodes_.back().get(); -} - -Sequential *Graph::new_sequential_node() { - all_nodes_.push_back(std::make_unique(this)); - return static_cast(all_nodes_.back().get()); -} - -void Graph::compile() { - seq()->compile(compiled_graph_.dispatches); -} - -Sequential *Graph::seq() const { - return seq_.get(); -} - -void Graph::dispatch(Kernel *kernel, const std::vector &args) { - seq()->dispatch(kernel, args); -} - -void Graph::run( - const std::unordered_map &args) const { - RuntimeContext ctx; - for (const auto &dispatch : compiled_graph_.dispatches) { - memset(&ctx, 0, sizeof(RuntimeContext)); - - TI_ASSERT(dispatch.compiled_kernel); - // Populate args metadata into RuntimeContext - const auto &symbolic_args_ = dispatch.symbolic_args; - for (int i = 0; i < symbolic_args_.size(); ++i) { - auto &symbolic_arg = symbolic_args_[i]; - auto found = args.find(symbolic_arg.name); - TI_ERROR_IF(found == args.end(), "Missing runtime value for {}", - symbolic_arg.name); - const aot::IValue &ival = found->second; - if (ival.tag == aot::ArgKind::NDARRAY) { - Ndarray *arr = reinterpret_cast(ival.val); - TI_ERROR_IF(ival.tag != aot::ArgKind::NDARRAY, - "Required a ndarray for argument {}", symbolic_arg.name); - auto ndarray_elem_shape = std::vector( - arr->shape.end() - 
symbolic_arg.element_shape.size(), - arr->shape.end()); - TI_ERROR_IF(ndarray_elem_shape != symbolic_arg.element_shape, - "Mismatched shape information for argument {}", - symbolic_arg.name); - set_runtime_ctx_ndarray(&ctx, i, arr); - } else { - TI_ERROR_IF(ival.tag != aot::ArgKind::SCALAR, - "Required a scalar for argument {}", symbolic_arg.name); - ctx.set_arg(i, ival.val); - } - } - - dispatch.compiled_kernel->launch(&ctx); - } -} -} // namespace lang -} // namespace taichi diff --git a/taichi/program/graph_builder.cpp b/taichi/program/graph_builder.cpp new file mode 100644 index 0000000000000..76c579d70e74b --- /dev/null +++ b/taichi/program/graph_builder.cpp @@ -0,0 +1,65 @@ +#include "taichi/program/graph_builder.h" +#include "taichi/program/ndarray.h" +#include "taichi/program/program.h" + +namespace taichi { +namespace lang { +void Dispatch::compile( + std::vector &compiled_dispatches) { + if (!compiled_kernel_) { + compiled_kernel_ = kernel_->compile_to_aot_kernel(); + } + aot::CompiledDispatch dispatch{kernel_->get_name(), symbolic_args_, + compiled_kernel_.get()}; + compiled_dispatches.push_back(std::move(dispatch)); +} + +void Sequential::compile( + std::vector &compiled_dispatches) { + // In the future we can do more across-kernel optimization here. 
+ for (Node *n : sequence_) { + n->compile(compiled_dispatches); + } +} + +void Sequential::append(Node *node) { + sequence_.push_back(node); +} + +void Sequential::dispatch(Kernel *kernel, const std::vector &args) { + Node *n = owning_graph_->new_dispatch_node(kernel, args); + sequence_.push_back(n); +} + +GraphBuilder::GraphBuilder() { + seq_ = std::make_unique(this); +} + +Node *GraphBuilder::new_dispatch_node(Kernel *kernel, + const std::vector &args) { + all_nodes_.push_back(std::make_unique(kernel, args)); + return all_nodes_.back().get(); +} + +Sequential *GraphBuilder::new_sequential_node() { + all_nodes_.push_back(std::make_unique(this)); + return static_cast(all_nodes_.back().get()); +} + +std::unique_ptr GraphBuilder::compile() { + std::vector dispatches; + seq()->compile(dispatches); + aot::CompiledGraph graph{dispatches}; + return std::make_unique(std::move(graph)); +} + +Sequential *GraphBuilder::seq() const { + return seq_.get(); +} + +void GraphBuilder::dispatch(Kernel *kernel, const std::vector &args) { + seq()->dispatch(kernel, args); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/program/graph.h b/taichi/program/graph_builder.h similarity index 58% rename from taichi/program/graph.h rename to taichi/program/graph_builder.h index 1999b9b015e74..129e5adc7b94b 100644 --- a/taichi/program/graph.h +++ b/taichi/program/graph_builder.h @@ -2,18 +2,14 @@ #include #include -#include -#include "taichi/program/ndarray.h" -#include "taichi/program/program.h" #include "taichi/ir/type.h" #include "taichi/aot/graph_data.h" -#include "taichi/aot/module_builder.h" namespace taichi { namespace lang { class Kernel; -class Graph; +class GraphBuilder; class Node { public: @@ -27,6 +23,7 @@ class Node { virtual void compile( std::vector &compiled_dispatches) = 0; }; + class Dispatch : public Node { public: explicit Dispatch(Kernel *kernel, const std::vector &args) @@ -45,7 +42,7 @@ class Dispatch : public Node { class Sequential : public Node { 
public: - explicit Sequential(Graph *graph) : owning_graph_(graph) { + explicit Sequential(GraphBuilder *graph) : owning_graph_(graph) { } void append(Node *node); @@ -57,34 +54,15 @@ class Sequential : public Node { private: std::vector sequence_; - Graph *owning_graph_{nullptr}; + GraphBuilder *owning_graph_{nullptr}; }; -/* - * Graph class works as both builder and runner. - * - * Two typical workflows using Graph: - * - build graph -> compile -> run - * - build graph -> compile -> serialize -> deserialize -> run - * - * Thus Graph can be constructed in two ways, either as an empty object - * or from an `aot::CompiledGraph` loaded from aot module. - * - * Currently Graph only supports sequential launches without returning value - * to host. - */ -class Graph { +class GraphBuilder { public: - explicit Graph(std::string name); - - explicit Graph(std::string name, const aot::CompiledGraph &compiled) - : name_(name), compiled_graph_(compiled) { - } + explicit GraphBuilder(); // TODO: compile() can take in Arch argument - void compile(); - - void run(const std::unordered_map &args) const; + std::unique_ptr compile(); Node *new_dispatch_node(Kernel *kernel, const std::vector &args); @@ -94,19 +72,9 @@ class Graph { Sequential *seq() const; - aot::CompiledGraph compiled_graph() const { - return compiled_graph_; - } - - std::string name() const { - return name_; - } - private: - std::string name_; std::unique_ptr seq_{nullptr}; std::vector> all_nodes_; - aot::CompiledGraph compiled_graph_; }; } // namespace lang diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index b64b49b2350fb..4ed35ada03585 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -5,6 +5,8 @@ #include "taichi/program/program.h" #include "tests/cpp/ir/ndarray_kernel.h" #include "tests/cpp/program/test_program.h" +#include "taichi/aot/graph_data.h" +#include "taichi/program/graph_builder.h" #ifdef TI_WITH_VULKAN #include 
"taichi/backends/vulkan/aot_module_loader_impl.h" #include "taichi/backends/device.h" @@ -299,4 +301,104 @@ TEST(AotSaveLoad, VulkanNdarray) { // Deallocate embedded_device->device()->dealloc_memory(devalloc_arr_); } + +[[maybe_unused]] static void save_graph() { + TestProgram test_prog; + test_prog.setup(Arch::vulkan); + auto aot_builder = test_prog.prog()->make_aot_module_builder(Arch::vulkan); + auto ker1 = setup_kernel1(test_prog.prog()); + auto ker2 = setup_kernel2(test_prog.prog()); + + auto g_builder = std::make_unique(); + auto seq = g_builder->seq(); + auto arr_arg = aot::Arg{ + "arr", PrimitiveType::i32.to_string(), aot::ArgKind::NDARRAY, {}}; + seq->dispatch(ker1.get(), {arr_arg}); + seq->dispatch(ker2.get(), + {arr_arg, aot::Arg{"x", PrimitiveType::i32.to_string(), + aot::ArgKind::SCALAR}}); + auto graph = g_builder->compile(); + + aot_builder->add_graph("test", *graph); + aot_builder->dump(".", ""); +} + +TEST(AotLoadGraph, Vulkan) { + // Otherwise will segfault on macOS VM, + // where Vulkan is installed but no devices are present + if (!vulkan::is_vulkan_api_available()) { + return; + } + + save_graph(); + + // API based on proposal https://github.com/taichi-dev/taichi/issues/3642 + // Initialize Vulkan program + taichi::uint64 *result_buffer{nullptr}; + taichi::lang::RuntimeContext host_ctx; + auto memory_pool = + std::make_unique(Arch::vulkan, nullptr); + result_buffer = (taichi::uint64 *)memory_pool->allocate( + sizeof(taichi::uint64) * taichi_result_buffer_entries, 8); + host_ctx.result_buffer = result_buffer; + + // Create Taichi Device for computation + lang::vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = + taichi::lang::vulkan::VulkanEnvSettings::kApiVersion(); + auto embedded_device = + std::make_unique(evd_params); + taichi::lang::vulkan::VulkanDevice *device_ = + static_cast( + embedded_device->device()); + // Create Vulkan runtime + vulkan::VkRuntime::Params params; + params.host_result_buffer = result_buffer; + 
params.device = device_; + auto vulkan_runtime = + std::make_unique(std::move(params)); + + // Run AOT module loader + vulkan::AotModuleParams mod_params; + mod_params.module_path = "."; + mod_params.runtime = vulkan_runtime.get(); + + std::unique_ptr vk_module = + aot::Module::load(Arch::vulkan, mod_params); + EXPECT_TRUE(vk_module); + + // Retrieve kernels/fields/etc from AOT module + auto root_size = vk_module->get_root_size(); + EXPECT_EQ(root_size, 0); + vulkan_runtime->add_root_buffer(root_size); + + auto graph = vk_module->get_graph("test"); + + const int size = 10; + taichi::lang::Device::AllocParams alloc_params; + alloc_params.host_write = true; + alloc_params.size = size * sizeof(int); + alloc_params.usage = taichi::lang::AllocUsage::Storage; + DeviceAllocation devalloc_arr_ = device_->allocate_memory(alloc_params); + + int src[size] = {0}; + src[0] = 2; + src[2] = 40; + write_devalloc(vulkan_runtime.get(), devalloc_arr_, src, sizeof(src)); + + std::unordered_map args; + auto arr = Ndarray(devalloc_arr_, PrimitiveType::i32, {size}); + args.insert({"arr", aot::IValue::create(arr)}); + args.insert({"x", aot::IValue::create(2)}); + graph->run(args); + vulkan_runtime->synchronize(); + + int dst[size] = {1}; + load_devalloc(vulkan_runtime.get(), devalloc_arr_, dst, sizeof(dst)); + + EXPECT_EQ(dst[0], 2); + EXPECT_EQ(dst[1], 2); + EXPECT_EQ(dst[2], 42); + device_->dealloc_memory(devalloc_arr_); +} #endif diff --git a/tests/cpp/program/graph_test.cpp b/tests/cpp/program/graph_test.cpp index 2ac688f63cad8..c6b36b1a64fd2 100644 --- a/tests/cpp/program/graph_test.cpp +++ b/tests/cpp/program/graph_test.cpp @@ -4,8 +4,9 @@ #include "taichi/inc/constants.h" #include "taichi/program/program.h" #include "tests/cpp/program/test_program.h" -#include "taichi/program/graph.h" +#include "taichi/aot/graph_data.h" #include "tests/cpp/ir/ndarray_kernel.h" +#include "taichi/program/graph_builder.h" #ifdef TI_WITH_VULKAN #include "taichi/backends/vulkan/vulkan_loader.h" #endif 
@@ -27,15 +28,16 @@ TEST(GraphTest, SimpleGraphRun) { auto ker1 = setup_kernel1(test_prog.prog()); auto ker2 = setup_kernel2(test_prog.prog()); - auto g = std::make_unique("test"); - auto seq = g->seq(); + auto g_builder = std::make_unique(); + auto seq = g_builder->seq(); auto arr_arg = aot::Arg{ "arr", PrimitiveType::i32.to_string(), aot::ArgKind::NDARRAY, {}}; seq->dispatch(ker1.get(), {arr_arg}); seq->dispatch(ker2.get(), {arr_arg, aot::Arg{"x", PrimitiveType::i32.to_string(), aot::ArgKind::SCALAR}}); - g->compile(); + + auto g = g_builder->compile(); auto array = Ndarray(test_prog.prog(), PrimitiveType::i32, {size}); array.write_int({0}, 2); From c39f5394da13374c5de362849e7cb5793faa7d10 Mon Sep 17 00:00:00 2001 From: ailzhang Date: Mon, 23 May 2022 13:30:33 +0800 Subject: [PATCH 110/176] [aot] Move ArgKind as first argument in Arg class Thought this might be more intuitive for users. ghstack-source-id: 865062f0982db4d69a41ba345a1d254c2054a12f Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5025 --- taichi/aot/graph_data.cpp | 6 +++--- taichi/aot/graph_data.h | 8 ++++---- tests/cpp/aot/aot_save_load_test.cpp | 6 +++--- tests/cpp/program/graph_test.cpp | 10 ++++++---- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/taichi/aot/graph_data.cpp b/taichi/aot/graph_data.cpp index 6b988356f6831..4f642ba6e9fc3 100644 --- a/taichi/aot/graph_data.cpp +++ b/taichi/aot/graph_data.cpp @@ -22,9 +22,9 @@ void CompiledGraph::run( TI_ERROR_IF(found == args.end(), "Missing runtime value for {}", symbolic_arg.name); const aot::IValue &ival = found->second; - if (ival.tag == aot::ArgKind::NDARRAY) { + if (ival.tag == aot::ArgKind::kNdarray) { Ndarray *arr = reinterpret_cast(ival.val); - TI_ERROR_IF(ival.tag != aot::ArgKind::NDARRAY, + TI_ERROR_IF(ival.tag != aot::ArgKind::kNdarray, "Required a ndarray for argument {}", symbolic_arg.name); auto ndarray_elem_shape = std::vector( arr->shape.end() - symbolic_arg.element_shape.size(), @@ -34,7 +34,7 @@ 
void CompiledGraph::run( symbolic_arg.name); set_runtime_ctx_ndarray(&ctx, i, arr); } else { - TI_ERROR_IF(ival.tag != aot::ArgKind::SCALAR, + TI_ERROR_IF(ival.tag != aot::ArgKind::kScalar, "Required a scalar for argument {}", symbolic_arg.name); ctx.set_arg(i, ival.val); } diff --git a/taichi/aot/graph_data.h b/taichi/aot/graph_data.h index 12e96062f59d6..d87e53bd15d31 100644 --- a/taichi/aot/graph_data.h +++ b/taichi/aot/graph_data.h @@ -14,16 +14,16 @@ class Ndarray; struct RuntimeContext; namespace aot { // Currently only scalar and ndarray are supported. -enum class ArgKind { SCALAR, NDARRAY, UNKNOWN }; +enum class ArgKind { kScalar, kNdarray, kUnknown }; /** * Symbolic argument used in building `Dispatch` nodes in the `Graph`. */ struct Arg { + ArgKind tag; std::string name; // TODO: real element dtype = dtype + element_shape std::string dtype_name; - ArgKind tag; std::vector element_shape; TI_IO_DEF(name, dtype_name, tag, element_shape); @@ -38,14 +38,14 @@ struct IValue { ArgKind tag; static IValue create(const Ndarray &ndarray) { - return IValue(reinterpret_cast(&ndarray), ArgKind::NDARRAY); + return IValue(reinterpret_cast(&ndarray), ArgKind::kNdarray); } template ::value, void>> static IValue create(T v) { return IValue(taichi_union_cast_with_different_sizes(v), - ArgKind::SCALAR); + ArgKind::kScalar); } private: diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index 4ed35ada03585..e0c0211fb195a 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -312,11 +312,11 @@ TEST(AotSaveLoad, VulkanNdarray) { auto g_builder = std::make_unique(); auto seq = g_builder->seq(); auto arr_arg = aot::Arg{ - "arr", PrimitiveType::i32.to_string(), aot::ArgKind::NDARRAY, {}}; + aot::ArgKind::kNdarray, "arr", PrimitiveType::i32.to_string(), {}}; seq->dispatch(ker1.get(), {arr_arg}); seq->dispatch(ker2.get(), - {arr_arg, aot::Arg{"x", PrimitiveType::i32.to_string(), - aot::ArgKind::SCALAR}}); 
+ {arr_arg, aot::Arg{aot::ArgKind::kScalar, "x", + PrimitiveType::i32.to_string()}}); auto graph = g_builder->compile(); aot_builder->add_graph("test", *graph); diff --git a/tests/cpp/program/graph_test.cpp b/tests/cpp/program/graph_test.cpp index c6b36b1a64fd2..2e062a31da7cb 100644 --- a/tests/cpp/program/graph_test.cpp +++ b/tests/cpp/program/graph_test.cpp @@ -31,11 +31,13 @@ TEST(GraphTest, SimpleGraphRun) { auto g_builder = std::make_unique(); auto seq = g_builder->seq(); auto arr_arg = aot::Arg{ - "arr", PrimitiveType::i32.to_string(), aot::ArgKind::NDARRAY, {}}; + aot::ArgKind::kNdarray, "arr", PrimitiveType::i32.to_string(), {}}; seq->dispatch(ker1.get(), {arr_arg}); - seq->dispatch(ker2.get(), - {arr_arg, aot::Arg{"x", PrimitiveType::i32.to_string(), - aot::ArgKind::SCALAR}}); + seq->dispatch(ker2.get(), {arr_arg, aot::Arg{ + aot::ArgKind::kScalar, + "x", + PrimitiveType::i32.to_string(), + }}); auto g = g_builder->compile(); From 9fd390b6f06656b8a0f4c06c6e64e179b0435d57 Mon Sep 17 00:00:00 2001 From: Ailing Date: Tue, 24 May 2022 14:24:20 +0800 Subject: [PATCH 111/176] [aot] Bind graph APIs to python and add mpm88 example (#5034) This PR supports graph builder and runner APIs in python. Note for simplicity I've merged builder and runner in the same Python class. Please feel free to comment if you have any suggestions. This PR also adds a test of saving mpm88 graph in aot module, as well as an example script to demonstrate the speed improvement (15fps -> 45fps) compared to the current taichi. 
ghstack-source-id: 600e604b141f9e534045f930d8424125c38ed875 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5026 --- python/taichi/__init__.py | 2 +- python/taichi/aot/module.py | 65 ++------ python/taichi/aot/utils.py | 68 ++++++++ python/taichi/examples/graph/mpm88_graph.py | 168 ++++++++++++++++++++ python/taichi/graph/__init__.py | 1 + python/taichi/graph/_graph.py | 68 ++++++++ taichi/aot/graph_data.h | 4 +- taichi/program/ndarray.h | 2 +- taichi/python/export_lang.cpp | 50 ++++++ tests/python/test_aot.py | 147 +++++++++++++++++ tests/python/test_api.py | 6 +- 11 files changed, 520 insertions(+), 61 deletions(-) create mode 100644 python/taichi/aot/utils.py create mode 100644 python/taichi/examples/graph/mpm88_graph.py create mode 100644 python/taichi/graph/__init__.py create mode 100644 python/taichi/graph/_graph.py diff --git a/python/taichi/__init__.py b/python/taichi/__init__.py index e89a221b99575..439a04a4565bc 100644 --- a/python/taichi/__init__.py +++ b/python/taichi/__init__.py @@ -9,7 +9,7 @@ # Provide a shortcut to types since they're commonly used. 
from taichi.types.primitive_types import * -from taichi import ad, experimental, linalg, math, tools +from taichi import ad, experimental, graph, linalg, math, tools from taichi.ui import GUI, hex_to_rgb, rgb_to_hex, ui # Issue#2223: Do not reorder, or we're busted with partially initialized module diff --git a/python/taichi/aot/module.py b/python/taichi/aot/module.py index 3a35cab650226..32cb16c63a083 100644 --- a/python/taichi/aot/module.py +++ b/python/taichi/aot/module.py @@ -1,15 +1,12 @@ from contextlib import contextmanager from pathlib import Path, PurePosixPath +from taichi.aot.utils import (produce_injected_args, + produce_injected_args_from_template) from taichi.lang import impl, kernel_impl -from taichi.lang._ndarray import ScalarNdarray -from taichi.lang.enums import Layout -from taichi.lang.exception import TaichiCompilationError from taichi.lang.field import ScalarField -from taichi.lang.matrix import MatrixField, MatrixNdarray, VectorNdarray +from taichi.lang.matrix import MatrixField from taichi.types.annotations import template -from taichi.types.ndarray_type import NdarrayType -from taichi.types.primitive_types import f32 class KernelTemplate: @@ -139,60 +136,20 @@ def add_kernel(self, kernel_fn, template_args=None, name=None): kernel_name = name or kernel_fn.__name__ kernel = kernel_fn._primal assert isinstance(kernel, kernel_impl.Kernel) - injected_args = [] - template_types = (NdarrayType, template) - num_template_args = len([ - arg.annotation for arg in kernel.arguments - if isinstance(arg.annotation, template_types) - ]) - if template_args is not None and num_template_args != len( - template_args): - raise TaichiCompilationError( - f'Need {num_template_args} inputs to instantiate the template ' - f'parameters, got {len(template_args)}') - i = 0 - for arg in kernel.arguments: - anno = arg.annotation - if isinstance(anno, template_types): - if template_args: - injected_args.append(template_args[arg.name]) - else: - if not isinstance(anno, 
NdarrayType): - raise TaichiCompilationError( - f'Expected Ndaray type, got {anno}') - if anno.element_shape is None or anno.field_dim is None: - raise TaichiCompilationError( - 'Please either specify both `element_shape` and `field_dim` ' - 'in the param annotation, or provide an example ' - f'ndarray for param={name}') - if anno.element_dim == 0: - injected_args.append( - ScalarNdarray(f32, (2, ) * anno.field_dim)) - elif anno.element_dim == 1: - injected_args.append( - VectorNdarray(anno.element_shape[0], - dtype=f32, - shape=(2, ) * anno.field_dim, - layout=Layout.AOS)) - elif anno.element_dim == 2: - injected_args.append( - MatrixNdarray(anno.element_shape[0], - anno.element_shape[1], - dtype=f32, - shape=(2, ) * anno.field_dim, - layout=Layout.AOS)) - else: - raise RuntimeError('') - i = i + 1 - else: - # For primitive types, we can just inject a dummy value. - injected_args.append(0) + if template_args is not None: + injected_args = produce_injected_args_from_template( + kernel, template_args) + else: + injected_args = produce_injected_args(kernel) kernel.ensure_compiled(*injected_args) self._aot_builder.add(kernel_name, kernel.kernel_cpp) # kernel AOT self._kernels.append(kernel) + def add_graph(self, name, graph): + self._aot_builder.add_graph(name, graph._compiled_graph) + @contextmanager def add_kernel_template(self, kernel_fn): """Add a taichi kernel (with template parameters) to the AOT module. 
diff --git a/python/taichi/aot/utils.py b/python/taichi/aot/utils.py new file mode 100644 index 0000000000000..fa6a0783c575a --- /dev/null +++ b/python/taichi/aot/utils.py @@ -0,0 +1,68 @@ +from taichi.lang._ndarray import ScalarNdarray +from taichi.lang.enums import Layout +from taichi.lang.exception import TaichiCompilationError +from taichi.lang.matrix import MatrixNdarray, VectorNdarray +from taichi.types.annotations import template +from taichi.types.ndarray_type import NdarrayType +from taichi.types.primitive_types import f32 + +template_types = (NdarrayType, template) + + +def produce_injected_args_from_template(kernel, template_args): + injected_args = [] + num_template_args = len([ + arg.annotation for arg in kernel.arguments + if isinstance(arg.annotation, template_types) + ]) + assert num_template_args == len( + template_args + ), f'Need {num_template_args} inputs to instantiate the template parameters, got {len(template_args)}' + for arg in kernel.arguments: + anno = arg.annotation + if isinstance(anno, template_types): + injected_args.append(template_args[arg.name]) + else: + injected_args.append(0) + return injected_args + + +def produce_injected_args(kernel, symbolic_args=None): + injected_args = [] + for j, arg in enumerate(kernel.arguments): + anno = arg.annotation + if isinstance(anno, template_types): + if not isinstance(anno, NdarrayType): + raise TaichiCompilationError( + f'Expected Ndaray type, got {anno}') + if symbolic_args is not None: + anno.element_shape = tuple(symbolic_args[j].element_shape) + anno.element_dim = len(anno.element_shape) + + if anno.element_shape is None or anno.field_dim is None: + raise TaichiCompilationError( + 'Please either specify both `element_shape` and `field_dim` ' + 'in the param annotation, or provide an example ' + f'ndarray for param={arg.name}') + if anno.element_dim == 0: + injected_args.append(ScalarNdarray(f32, + (2, ) * anno.field_dim)) + elif anno.element_dim == 1: + injected_args.append( + 
VectorNdarray(anno.element_shape[0], + dtype=f32, + shape=(2, ) * anno.field_dim, + layout=Layout.AOS)) + elif anno.element_dim == 2: + injected_args.append( + MatrixNdarray(anno.element_shape[0], + anno.element_shape[1], + dtype=f32, + shape=(2, ) * anno.field_dim, + layout=Layout.AOS)) + else: + raise RuntimeError('') + else: + # For primitive types, we can just inject a dummy value. + injected_args.append(0) + return injected_args diff --git a/python/taichi/examples/graph/mpm88_graph.py b/python/taichi/examples/graph/mpm88_graph.py new file mode 100644 index 0000000000000..d7a15119718cd --- /dev/null +++ b/python/taichi/examples/graph/mpm88_graph.py @@ -0,0 +1,168 @@ +import argparse +import numpy as np +import taichi as ti + +ti.init(arch=ti.vulkan) +n_particles = 8192 +n_grid = 128 +dx = 1 / n_grid +dt = 2e-4 + +p_rho = 1 +p_vol = (dx * 0.5)**2 +p_mass = p_vol * p_rho +gravity = 9.8 +bound = 3 +E = 400 +N_ITER = 500 # Use 500 to make speed diff more obvious + + + +@ti.kernel +def substep_reset_grid(grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for i, j in grid_m: + grid_v[i, j] = [0, 0] + grid_m[i, j] = 0 + + +@ti.kernel +def substep_p2g(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), + grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for p in x: + Xp = x[p] / dx + base = int(Xp - 0.5) + fx = Xp - base + w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] + stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2 + affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p] + for i, j in ti.static(ti.ndrange(3, 3)): + offset = ti.Vector([i, j]) + dpos = (offset - fx) * dx + weight = w[i].x * w[j].y + grid_v[base + offset] += weight * (p_mass * v[p] + affine @ dpos) + grid_m[base + offset] += weight * p_mass + + +@ti.kernel +def substep_update_grid_v(grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for i, j in grid_m: 
+ if grid_m[i, j] > 0: + grid_v[i, j] /= grid_m[i, j] + grid_v[i, j].y -= dt * gravity + if i < bound and grid_v[i, j].x < 0: + grid_v[i, j].x = 0 + if i > n_grid - bound and grid_v[i, j].x > 0: + grid_v[i, j].x = 0 + if j < bound and grid_v[i, j].y < 0: + grid_v[i, j].y = 0 + if j > n_grid - bound and grid_v[i, j].y > 0: + grid_v[i, j].y = 0 + + +@ti.kernel +def substep_g2p(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), + grid_v: ti.any_arr(field_dim=2)): + for p in x: + Xp = x[p] / dx + base = int(Xp - 0.5) + fx = Xp - base + w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] + new_v = ti.Vector.zero(float, 2) + new_C = ti.Matrix.zero(float, 2, 2) + for i, j in ti.static(ti.ndrange(3, 3)): + offset = ti.Vector([i, j]) + dpos = (offset - fx) * dx + weight = w[i].x * w[j].y + g_v = grid_v[base + offset] + new_v += weight * g_v + new_C += 4 * weight * g_v.outer_product(dpos) / dx**2 + v[p] = new_v + x[p] += dt * v[p] + J[p] *= 1 + dt * new_C.trace() + C[p] = new_C + + +@ti.kernel +def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + J: ti.any_arr(field_dim=1)): + for i in range(n_particles): + x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2] + v[i] = [0, -1] + J[i] = 1 + +x = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) +v = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) + +C = ti.Matrix.ndarray(2, 2, ti.f32, shape=(n_particles)) +J = ti.ndarray(ti.f32, shape=(n_particles)) +grid_v = ti.Vector.ndarray(2, ti.f32, shape=(n_grid, n_grid)) +grid_m = ti.ndarray(ti.f32, shape=(n_grid, n_grid)) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--baseline', + action='store_true') + args, unknown = parser.parse_known_args() + + if not args.baseline: + print('running in graph mode') + # Build graph + sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'x', 'f32', element_shape=(2, )) + sym_v = 
ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'v', 'f32', element_shape=(2, )) + sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'C', 'f32', element_shape=(2, 2)) + sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'J', 'f32', element_shape=()) + sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_v', 'f32', element_shape=(2, )) + sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_m', 'f32', element_shape=()) + g_init = ti.graph.Graph() + g_init.dispatch(init_particles, sym_x, sym_v, sym_J) + + g_update = ti.graph.Graph() + substep = g_update.create_sequential() + + substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) + substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, + sym_grid_m) + substep.dispatch(substep_update_grid_v, sym_grid_v, sym_grid_m) + substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v) + + for i in range(N_ITER): + g_update.append(substep) + + # Compile + g_init.compile() + g_update.compile() + + # Run + g_init.run({'x': x, 'v': v, 'J': J}) + + gui = ti.GUI('MPM88') + while gui.running: + g_update.run({ + 'x': x, + 'v': v, + 'C': C, + 'J': J, + 'grid_v': grid_v, + 'grid_m': grid_m + }) + gui.clear(0x112F41) + gui.circles(x.to_numpy(), radius=1.5, color=0x068587) + gui.show() + else: + init_particles(x, v, J) + gui = ti.GUI('MPM88') + while gui.running and not gui.get_event(gui.ESCAPE): + for s in range(N_ITER): + substep_reset_grid(grid_v, grid_m) + substep_p2g(x, v, C, J, grid_v, grid_m) + substep_update_grid_v(grid_v, grid_m) + substep_g2p(x, v, C, J, grid_v) + gui.clear(0x112F41) + gui.circles(x.to_numpy(), radius=1.5, color=0x068587) + gui.show() diff --git a/python/taichi/graph/__init__.py b/python/taichi/graph/__init__.py new file mode 100644 index 0000000000000..dd17d687c5fa5 --- /dev/null +++ b/python/taichi/graph/__init__.py @@ -0,0 +1 @@ +from ._graph import * diff --git a/python/taichi/graph/_graph.py b/python/taichi/graph/_graph.py new file mode 100644 index 0000000000000..1758ea0b7146d --- 
/dev/null +++ b/python/taichi/graph/_graph.py @@ -0,0 +1,68 @@ +from taichi._lib import core as _ti_core +from taichi.aot.utils import produce_injected_args +from taichi.lang import kernel_impl +from taichi.lang._ndarray import Ndarray +from taichi.lang.exception import TaichiRuntimeError + +ArgKind = _ti_core.ArgKind +Arg = _ti_core.Arg + + +def gen_cpp_kernel(kernel_fn, args): + kernel = kernel_fn._primal + assert isinstance(kernel, kernel_impl.Kernel) + injected_args = produce_injected_args(kernel, symbolic_args=args) + kernel.ensure_compiled(*injected_args) + return kernel.kernel_cpp + + +class Sequential: + def __init__(self, seq): + self.seq_ = seq + + def dispatch(self, kernel_fn, *args): + kernel_cpp = gen_cpp_kernel(kernel_fn, args) + self.seq_.dispatch(kernel_cpp, args) + + +class Graph: + def __init__(self): + self._graph_builder = _ti_core.GraphBuilder() + self._compiled_graph = None + + def dispatch(self, kernel_fn, *args): + kernel_cpp = gen_cpp_kernel(kernel_fn, args) + self._graph_builder.dispatch(kernel_cpp, args) + + def create_sequential(self): + return Sequential(self._graph_builder.create_sequential()) + + def append(self, node): + # TODO: support appending dispatch node as well. + assert isinstance(node, Sequential) + self._graph_builder.seq().append(node.seq_) + + def compile(self): + self._compiled_graph = self._graph_builder.compile() + + def run(self, args): + arg_ptrs = {} + # Only support native python numerical types (int, float) for now. 
+ arg_ints = {} + arg_floats = {} + + for k, v in args.items(): + if isinstance(v, Ndarray): + arg_ptrs[k] = v.arr + elif isinstance(v, int): + arg_ints[k] = v + elif isinstance(v, float): + arg_floats[k] = v + else: + raise TaichiRuntimeError( + 'Only python int, float and ti.Ndarray are supported as runtime arguments' + ) + self._compiled_graph.run(arg_ptrs, arg_ints, arg_floats) + + +__all__ = ['Graph', 'Arg', 'ArgKind'] diff --git a/taichi/aot/graph_data.h b/taichi/aot/graph_data.h index d87e53bd15d31..aa2c2479d1d67 100644 --- a/taichi/aot/graph_data.h +++ b/taichi/aot/graph_data.h @@ -32,7 +32,7 @@ struct Arg { /** * Runtime value used in graph execution. */ -struct IValue { +struct TI_DLL_EXPORT IValue { public: uint64 val; ArgKind tag; @@ -81,7 +81,7 @@ struct CompiledDispatch { TI_IO_DEF(kernel_name, symbolic_args); }; -struct CompiledGraph { +struct TI_DLL_EXPORT CompiledGraph { std::vector dispatches; void run(const std::unordered_map &args) const; diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index 31181fc55705a..a77686c1e194f 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -14,7 +14,7 @@ class Program; class LlvmProgramImpl; class NdarrayRwAccessorsBank; -class Ndarray { +class TI_DLL_EXPORT Ndarray { public: /* Constructs a Ndarray managed by Program. * Memory allocation and deallocation is handled by Program. 
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 850056d49642b..d2314d238ad4b 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -16,6 +16,7 @@ #include "taichi/ir/expression_ops.h" #include "taichi/ir/frontend_ir.h" #include "taichi/ir/statements.h" +#include "taichi/program/graph_builder.h" #include "taichi/program/extension.h" #include "taichi/program/async_engine.h" #include "taichi/program/ndarray.h" @@ -27,6 +28,7 @@ #include "taichi/python/snode_registry.h" #include "taichi/program/sparse_matrix.h" #include "taichi/program/sparse_solver.h" +#include "taichi/aot/graph_data.h" #include "taichi/ir/mesh.h" #include "taichi/program/kernel_profiler.h" @@ -450,6 +452,7 @@ void export_lang(py::module &m) { .def("add_field", &AotModuleBuilder::add_field) .def("add", &AotModuleBuilder::add) .def("add_kernel_template", &AotModuleBuilder::add_kernel_template) + .def("add_graph", &AotModuleBuilder::add_graph) .def("dump", &AotModuleBuilder::dump); py::class_(m, "Axis").def(py::init()); @@ -531,6 +534,53 @@ void export_lang(py::module &m) { .def_readonly("dtype", &Ndarray::dtype) .def_readonly("shape", &Ndarray::shape); + py::enum_(m, "ArgKind") + .value("SCALAR", aot::ArgKind::kScalar) + .value("NDARRAY", aot::ArgKind::kNdarray) + .export_values(); + + py::class_(m, "Arg") + .def(py::init>(), + py::arg("tag"), py::arg("name"), py::arg("dtype_name"), + py::arg("element_shape")) + .def_readonly("name", &aot::Arg::name) + .def_readonly("element_shape", &aot::Arg::element_shape); + + py::class_(m, "Node"); + + py::class_(m, "Sequential") + .def(py::init()) + .def("append", &Sequential::append) + .def("dispatch", &Sequential::dispatch); + + py::class_(m, "GraphBuilder") + .def(py::init<>()) + .def("dispatch", &GraphBuilder::dispatch) + .def("compile", &GraphBuilder::compile) + .def("create_sequential", &GraphBuilder::new_sequential_node, + py::return_value_policy::reference) + .def("seq", &GraphBuilder::seq, 
py::return_value_policy::reference); + + py::class_(m, "CompiledGraph") + .def("run", [](aot::CompiledGraph *self, const py::dict &arg_ptrs, + const py::dict &arg_ints, const py::dict &arg_floats) { + std::unordered_map args; + for (auto it : arg_ptrs) { + auto &val = it.second.cast(); + args.insert( + {py::cast(it.first), aot::IValue::create(val)}); + } + for (auto it : arg_ints) { + args.insert({py::cast(it.first), + aot::IValue::create(py::cast(it.second))}); + } + for (auto it : arg_floats) { + args.insert({py::cast(it.first), + aot::IValue::create(py::cast(it.second))}); + } + self->run(args); + }); + py::class_(m, "Kernel") .def("get_ret_int", &Kernel::get_ret_int) .def("get_ret_float", &Kernel::get_ret_float) diff --git a/tests/python/test_aot.py b/tests/python/test_aot.py index 9460e386936c3..9cb531d6efe65 100644 --- a/tests/python/test_aot.py +++ b/tests/python/test_aot.py @@ -620,3 +620,150 @@ def run(arr: ti.types.ndarray(), val1: ti.f32, val2: ti.template()): res = json.load(json_file) args_count = res['aot_data']['kernels']['run']['args_count'] assert args_count == 2, res # `arr` and `val1` + + +@test_utils.test(arch=ti.vulkan) +def test_mpm88_ndarray_graph_aot(): + n_particles = 8192 + n_grid = 128 + dx = 1 / n_grid + dt = 2e-4 + + p_rho = 1 + p_vol = (dx * 0.5)**2 + p_mass = p_vol * p_rho + gravity = 9.8 + bound = 3 + E = 400 + + @ti.kernel + def substep_reset_grid(grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for i, j in grid_m: + grid_v[i, j] = [0, 0] + grid_m[i, j] = 0 + + @ti.kernel + def substep_p2g(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), + grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for p in x: + Xp = x[p] / dx + base = int(Xp - 0.5) + fx = Xp - base + w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] + stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2 + affine = ti.Matrix([[stress, 0], [0, stress]]) + 
p_mass * C[p] + for i, j in ti.static(ti.ndrange(3, 3)): + offset = ti.Vector([i, j]) + dpos = (offset - fx) * dx + weight = w[i].x * w[j].y + grid_v[base + + offset] += weight * (p_mass * v[p] + affine @ dpos) + grid_m[base + offset] += weight * p_mass + + @ti.kernel + def substep_update_grid_v(grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for i, j in grid_m: + if grid_m[i, j] > 0: + grid_v[i, j] /= grid_m[i, j] + grid_v[i, j].y -= dt * gravity + if i < bound and grid_v[i, j].x < 0: + grid_v[i, j].x = 0 + if i > n_grid - bound and grid_v[i, j].x > 0: + grid_v[i, j].x = 0 + if j < bound and grid_v[i, j].y < 0: + grid_v[i, j].y = 0 + if j > n_grid - bound and grid_v[i, j].y > 0: + grid_v[i, j].y = 0 + + @ti.kernel + def substep_g2p(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), + grid_v: ti.any_arr(field_dim=2)): + for p in x: + Xp = x[p] / dx + base = int(Xp - 0.5) + fx = Xp - base + w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] + new_v = ti.Vector.zero(float, 2) + new_C = ti.Matrix.zero(float, 2, 2) + for i, j in ti.static(ti.ndrange(3, 3)): + offset = ti.Vector([i, j]) + dpos = (offset - fx) * dx + weight = w[i].x * w[j].y + g_v = grid_v[base + offset] + new_v += weight * g_v + new_C += 4 * weight * g_v.outer_product(dpos) / dx**2 + v[p] = new_v + x[p] += dt * v[p] + J[p] *= 1 + dt * new_C.trace() + C[p] = new_C + + @ti.kernel + def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + J: ti.any_arr(field_dim=1)): + for i in range(n_particles): + x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2] + v[i] = [0, -1] + J[i] = 1 + + N_ITER = 50 + + sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'x', + 'f32', + element_shape=(2, )) + sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'v', + 'f32', + element_shape=(2, )) + sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'C', + 'f32', + element_shape=(2, 2)) + sym_J = 
ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'J', + 'f32', + element_shape=()) + sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'grid_v', + 'f32', + element_shape=(2, )) + sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'grid_m', + 'f32', + element_shape=()) + g_init = ti.graph.Graph() + g_init.dispatch(init_particles, sym_x, sym_v, sym_J) + + g_update = ti.graph.Graph() + substep = g_update.create_sequential() + + substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) + substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, + sym_grid_m) + substep.dispatch(substep_update_grid_v, sym_grid_v, sym_grid_m) + substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v) + + for i in range(N_ITER): + g_update.append(substep) + + g_init.compile() + g_update.compile() + + x = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) + v = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) + + C = ti.Matrix.ndarray(2, 2, ti.f32, shape=(n_particles)) + J = ti.ndarray(ti.f32, shape=(n_particles)) + grid_v = ti.Vector.ndarray(2, ti.f32, shape=(n_grid, n_grid)) + grid_m = ti.ndarray(ti.f32, shape=(n_grid, n_grid)) + + with tempfile.TemporaryDirectory() as tmpdir: + mod = ti.aot.Module(ti.vulkan) + mod.add_graph('init', g_init) + mod.add_graph('update', g_update) + mod.save(tmpdir, '') diff --git a/tests/python/test_api.py b/tests/python/test_api.py index af554afcbd266..2c36c753e079b 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -73,9 +73,9 @@ def _get_expected_matrix_apis(): 'clear_all_gradients', 'cos', 'cpu', 'cuda', 'data_oriented', 'deactivate', 'deactivate_all_snodes', 'dx11', 'eig', 'exp', 'experimental', 'extension', 'f16', 'f32', 'f64', 'field', 'float16', 'float32', 'float64', 'floor', - 'func', 'get_addr', 'global_thread_idx', 'gpu', 'grouped', 'hex_to_rgb', - 'i', 'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', 'ik', 'ikl', - 'il', 'init', 'int16', 'int32', 'int64', 'int8', 'is_active', + 'func', 
'get_addr', 'global_thread_idx', 'gpu', 'graph', 'grouped', + 'hex_to_rgb', 'i', 'i16', 'i32', 'i64', 'i8', 'ij', 'ijk', 'ijkl', 'ijl', + 'ik', 'ikl', 'il', 'init', 'int16', 'int32', 'int64', 'int8', 'is_active', 'is_logging_effective', 'j', 'jk', 'jkl', 'jl', 'k', 'kernel', 'kl', 'l', 'lang', 'length', 'linalg', 'log', 'loop_config', 'math', 'max', 'mesh_local', 'mesh_patch_idx', 'metal', 'min', 'ndarray', 'ndrange', From b73da31c44b74772377b7e14d925c033302ebb02 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Wed, 25 May 2022 11:29:34 +0800 Subject: [PATCH 112/176] [Lang] [type] Refactor quant type definition APIs (#5036) * [Lang] [type] Refactor quant type definition APIs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/types/quantized_types.py | 46 ++++++------------- tests/python/test_custom_float.py | 6 +-- tests/python/test_custom_float_exponents.py | 30 ++---------- tests/python/test_custom_float_shared_exp.py | 44 ++++-------------- .../test_custom_float_time_integration.py | 10 +--- tests/python/test_custom_type_atomics.py | 6 +-- tests/python/test_matrix_different_type.py | 4 +- tests/python/test_snode_layout_inspection.py | 5 +- 8 files changed, 36 insertions(+), 115 deletions(-) diff --git a/python/taichi/types/quantized_types.py b/python/taichi/types/quantized_types.py index 16732d737751f..906ceebc64f3b 100644 --- a/python/taichi/types/quantized_types.py +++ b/python/taichi/types/quantized_types.py @@ -9,24 +9,6 @@ _type_factory = _ti_core.get_type_factory_instance() -def _custom_int(bits, signed=True, compute_type=None): - """Generates a custom int type. - - Args: - bits (int): Number of bits. - signed (bool): Signed or unsigned. - compute_type (DataType): Type for computation. - - Returns: - DataType: The specified type. 
- """ - if compute_type is None: - compute_type = impl.get_runtime().default_ip - if isinstance(compute_type, _ti_core.DataType): - compute_type = compute_type.get_ptr() - return _type_factory.get_custom_int_type(bits, signed, compute_type) - - def _custom_float(significand_type, exponent_type=None, compute_type=None, @@ -46,13 +28,11 @@ def _custom_float(significand_type, compute_type = impl.get_runtime().default_fp if isinstance(compute_type, _ti_core.DataType): compute_type = compute_type.get_ptr() - return _type_factory.get_custom_float_type(significand_type, - exponent_type, - compute_type, - scale=scale) + return _type_factory.get_custom_float_type(significand_type, exponent_type, + compute_type, scale) -def int(bits, signed=False, compute=None): # pylint: disable=W0622 +def int(bits, signed=True, compute=None): # pylint: disable=W0622 """Generates a quantized type for integers. Args: @@ -65,10 +45,12 @@ def int(bits, signed=False, compute=None): # pylint: disable=W0622 """ if compute is None: compute = impl.get_runtime().default_ip - return _custom_int(bits, signed, compute) + if isinstance(compute, _ti_core.DataType): + compute = compute.get_ptr() + return _type_factory.get_custom_int_type(bits, signed, compute) -def fixed(frac, signed=True, range=1.0, compute=None): # pylint: disable=W0622 +def fixed(frac, signed=True, range=1.0, compute=None, scale=None): # pylint: disable=W0622 """Generates a quantized type for fixed-point real numbers. Args: @@ -76,18 +58,18 @@ def fixed(frac, signed=True, range=1.0, compute=None): # pylint: disable=W0622 signed (bool): Signed or unsigned. range (float): Range of the number. compute (DataType): Type for computation. + scale (float): Scaling factor. The argument is prioritized over range. Returns: DataType: The specified type. 
""" # TODO: handle cases with frac > 32 frac_type = int(bits=frac, signed=signed, compute=i32) - if signed: - scale = range / 2**(frac - 1) - else: - scale = range / 2**frac - if compute is None: - compute = impl.get_runtime().default_fp + if scale is None: + if signed: + scale = range / 2**(frac - 1) + else: + scale = range / 2**frac return _custom_float(frac_type, None, compute, scale) @@ -107,8 +89,6 @@ def float(exp, frac, signed=True, compute=None): # pylint: disable=W0622 exp_type = int(bits=exp, signed=False, compute=i32) # TODO: handle cases with frac > 32 frac_type = int(bits=frac, signed=signed, compute=i32) - if compute is None: - compute = impl.get_runtime().default_fp return _custom_float(significand_type=frac_type, exponent_type=exp_type, compute_type=compute) diff --git a/tests/python/test_custom_float.py b/tests/python/test_custom_float.py index 251a045cbcbf1..789079385777f 100644 --- a/tests/python/test_custom_float.py +++ b/tests/python/test_custom_float.py @@ -57,8 +57,7 @@ def rotate_18_degrees(): @test_utils.test(require=ti.extension.quant_basic) def test_custom_float_implicit_cast(): - ci13 = ti.types.quant.int(bits=13) - cft = ti.types.quant._custom_float(significand_type=ci13, scale=0.1) + cft = ti.types.quant.fixed(frac=13, scale=0.1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -73,8 +72,7 @@ def foo(): @test_utils.test(require=ti.extension.quant_basic) def test_cache_read_only(): - ci15 = ti.types.quant.int(bits=15) - cft = ti.types.quant._custom_float(significand_type=ci15, scale=0.1) + cft = ti.types.quant.fixed(frac=15, scale=0.1) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) diff --git a/tests/python/test_custom_float_exponents.py b/tests/python/test_custom_float_exponents.py index 1411d566ae232..5c2be6f58eab5 100644 --- a/tests/python/test_custom_float_exponents.py +++ b/tests/python/test_custom_float_exponents.py @@ -8,11 +8,7 @@ @test_utils.test(require=ti.extension.quant) def 
test_custom_float_unsigned(): - cu13 = ti.types.quant.int(13, False) - exp = ti.types.quant.int(6, False) - cft = ti.types.quant._custom_float(significand_type=cu13, - exponent_type=exp, - scale=1) + cft = ti.types.quant.float(exp=6, frac=13, signed=False) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -31,11 +27,7 @@ def test_custom_float_unsigned(): @test_utils.test(require=ti.extension.quant) def test_custom_float_signed(): - cu13 = ti.types.quant.int(13, True) - exp = ti.types.quant.int(6, False) - cft = ti.types.quant._custom_float(significand_type=cu13, - exponent_type=exp, - scale=1) + cft = ti.types.quant.float(exp=6, frac=13, signed=True) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -63,11 +55,7 @@ def test_custom_float_signed(): @pytest.mark.parametrize('digits_bits', [23, 24]) @test_utils.test(require=ti.extension.quant) def test_custom_float_precision(digits_bits): - cu24 = ti.types.quant.int(digits_bits, True) - exp = ti.types.quant.int(8, False) - cft = ti.types.quant._custom_float(significand_type=cu24, - exponent_type=exp, - scale=1) + cft = ti.types.quant.float(exp=8, frac=digits_bits) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -88,11 +76,7 @@ def test_custom_float_precision(digits_bits): @pytest.mark.parametrize('signed', [True, False]) @test_utils.test(require=ti.extension.quant) def test_custom_float_truncation(signed): - cit = ti.types.quant.int(2, signed) - exp = ti.types.quant.int(5, False) - cft = ti.types.quant._custom_float(significand_type=cit, - exponent_type=exp, - scale=1) + cft = ti.types.quant.float(exp=5, frac=2, signed=signed) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) @@ -120,11 +104,7 @@ def test_custom_float_truncation(signed): @test_utils.test(require=ti.extension.quant) def test_custom_float_atomic_demotion(): - cit = ti.types.quant.int(2, True) - exp = ti.types.quant.int(5, False) - cft = 
ti.types.quant._custom_float(significand_type=cit, - exponent_type=exp, - scale=1) + cft = ti.types.quant.float(exp=5, frac=2) x = ti.field(dtype=cft) ti.root.bit_struct(num_bits=32).place(x) diff --git a/tests/python/test_custom_float_shared_exp.py b/tests/python/test_custom_float_shared_exp.py index 6f56ab96b290f..9ca7dd8f15b0f 100644 --- a/tests/python/test_custom_float_shared_exp.py +++ b/tests/python/test_custom_float_shared_exp.py @@ -8,15 +8,8 @@ @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_shared_exponents(exponent_bits): - exp = ti.types.quant.int(exponent_bits, False) - cit1 = ti.types.quant.int(10, False) - cit2 = ti.types.quant.int(14, False) - cft1 = ti.types.quant._custom_float(significand_type=cit1, - exponent_type=exp, - scale=1) - cft2 = ti.types.quant._custom_float(significand_type=cit2, - exponent_type=exp, - scale=1) + cft1 = ti.types.quant.float(exp=exponent_bits, frac=10, signed=False) + cft2 = ti.types.quant.float(exp=exponent_bits, frac=14, signed=False) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) @@ -78,15 +71,8 @@ def foo(x: ti.f32, y: ti.f32): @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_shared_exponent_add(exponent_bits): - exp = ti.types.quant.int(exponent_bits, False) - cit1 = ti.types.quant.int(10, False) - cit2 = ti.types.quant.int(14, False) - cft1 = ti.types.quant._custom_float(significand_type=cit1, - exponent_type=exp, - scale=1) - cft2 = ti.types.quant._custom_float(significand_type=cit2, - exponent_type=exp, - scale=1) + cft1 = ti.types.quant.float(exp=exponent_bits, frac=10, signed=False) + cft2 = ti.types.quant.float(exp=exponent_bits, frac=14, signed=False) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) @@ -118,15 +104,8 @@ def foo(x: ti.f32, y: ti.f32): 
@pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_shared_exponent_borrow(exponent_bits): - exp = ti.types.quant.int(exponent_bits, False) - cit1 = ti.types.quant.int(10, False) - cit2 = ti.types.quant.int(14, False) - cft1 = ti.types.quant._custom_float(significand_type=cit1, - exponent_type=exp, - scale=1) - cft2 = ti.types.quant._custom_float(significand_type=cit2, - exponent_type=exp, - scale=1) + cft1 = ti.types.quant.float(exp=exponent_bits, frac=10, signed=False) + cft2 = ti.types.quant.float(exp=exponent_bits, frac=14, signed=False) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) @@ -151,15 +130,8 @@ def inc(): @pytest.mark.parametrize('exponent_bits', [5, 6, 7, 8]) @test_utils.test(require=ti.extension.quant) def test_negative(exponent_bits): - exp = ti.types.quant.int(exponent_bits, False) - cit1 = ti.types.quant.int(10, False) - cit2 = ti.types.quant.int(14, True) - cft1 = ti.types.quant._custom_float(significand_type=cit1, - exponent_type=exp, - scale=1) - cft2 = ti.types.quant._custom_float(significand_type=cit2, - exponent_type=exp, - scale=1) + cft1 = ti.types.quant.float(exp=exponent_bits, frac=10, signed=False) + cft2 = ti.types.quant.float(exp=exponent_bits, frac=14, signed=True) a = ti.field(dtype=cft1) b = ti.field(dtype=cft2) ti.root.bit_struct(num_bits=32).place(a, b, shared_exponent=True) diff --git a/tests/python/test_custom_float_time_integration.py b/tests/python/test_custom_float_time_integration.py index 2cad8267dcb5b..459276781e001 100644 --- a/tests/python/test_custom_float_time_integration.py +++ b/tests/python/test_custom_float_time_integration.py @@ -14,11 +14,7 @@ def test_custom_float_time_integration(use_cft, use_exponent, use_shared_exp): if use_cft: if use_exponent: - exp = ti.types.quant.int(6, False) - cit = ti.types.quant.int(13, True) - cft = ti.types.quant._custom_float(significand_type=cit, - 
exponent_type=exp, - scale=1) + cft = ti.types.quant.float(exp=6, frac=13) x = ti.Vector.field(2, dtype=cft) if use_shared_exp: ti.root.bit_struct(num_bits=32).place(x, shared_exponent=True) @@ -26,9 +22,7 @@ def test_custom_float_time_integration(use_cft, use_exponent, use_shared_exp): ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(0)) ti.root.bit_struct(num_bits=32).place(x.get_scalar_field(1)) else: - cit = ti.types.quant.int(16, True) - cft = ti.types.quant._custom_float(significand_type=cit, - scale=1 / 2**14) + cft = ti.types.quant.fixed(frac=16, range=2) x = ti.Vector.field(2, dtype=cft) ti.root.bit_struct(num_bits=32).place(x) else: diff --git a/tests/python/test_custom_type_atomics.py b/tests/python/test_custom_type_atomics.py index d1c030e25de66..5569d4bf5f132 100644 --- a/tests/python/test_custom_type_atomics.py +++ b/tests/python/test_custom_type_atomics.py @@ -68,10 +68,8 @@ def foo(): @test_utils.test(require=ti.extension.quant_basic, debug=True) def test_custom_float_atomics(): - ci13 = ti.types.quant.int(13, True) - ci19 = ti.types.quant.int(19, False) - cft13 = ti.types.quant._custom_float(significand_type=ci13, scale=0.1) - cft19 = ti.types.quant._custom_float(significand_type=ci19, scale=0.1) + cft13 = ti.types.quant.fixed(frac=13, signed=True, scale=0.1) + cft19 = ti.types.quant.fixed(frac=19, signed=False, scale=0.1) x = ti.field(dtype=cft13) y = ti.field(dtype=cft19) diff --git a/tests/python/test_matrix_different_type.py b/tests/python/test_matrix_different_type.py index c1e899b8ea77d..05288080f3fae 100644 --- a/tests/python/test_matrix_different_type.py +++ b/tests/python/test_matrix_different_type.py @@ -71,9 +71,9 @@ def verify(): @test_utils.test(require=ti.extension.quant_basic) def test_custom_type(): cit1 = ti.types.quant.int(bits=10, signed=True) - cft1 = ti.types.quant._custom_float(cit1, scale=0.1) + cft1 = ti.types.quant.fixed(frac=10, signed=True, scale=0.1) cit2 = ti.types.quant.int(bits=22, signed=False) - cft2 = 
ti.types.quant._custom_float(cit2, scale=0.1) + cft2 = ti.types.quant.fixed(frac=22, signed=False, scale=0.1) type_list = [[cit1, cft2], [cft1, cit2]] a = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) b = ti.Matrix.field(len(type_list), len(type_list[0]), dtype=type_list) diff --git a/tests/python/test_snode_layout_inspection.py b/tests/python/test_snode_layout_inspection.py index 5282665712c2d..7b4a3bcec4c9e 100644 --- a/tests/python/test_snode_layout_inspection.py +++ b/tests/python/test_snode_layout_inspection.py @@ -41,9 +41,8 @@ def test_primitives(): @test_utils.test(arch=ti.cpu) def test_bit_struct(): - cit = ti.types.quant.int(16, False) - x = ti.field(dtype=cit) - y = ti.field(dtype=ti.types.quant._custom_float(significand_type=cit)) + x = ti.field(dtype=ti.types.quant.int(16, False)) + y = ti.field(dtype=ti.types.quant.fixed(16, False)) z = ti.field(dtype=ti.f32) n1 = ti.root.dense(ti.i, 32) From d034586bcd4435536f04d291318adbedb3bd30ce Mon Sep 17 00:00:00 2001 From: yekuang Date: Wed, 25 May 2022 11:34:34 +0800 Subject: [PATCH 113/176] [Metal] Support Ndarray (#4720) * [Metal] Support Ndarray * simple work * fix copying * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * wip * fixes * fix devalloc id bug, enable tests * fix extra_arg offset * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * remove size * rm size * ref * fix for ret matrix type * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix test * fix zero Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- .../backends/metal/aot_module_loader_impl.cpp | 6 +- taichi/backends/metal/codegen_metal.cpp | 116 ++++++++++-- taichi/backends/metal/device.cpp | 5 +- taichi/backends/metal/kernel_manager.cpp | 176 +++++++++++++++--- taichi/backends/metal/kernel_manager.h | 3 +- 
taichi/backends/metal/kernel_utils.cpp | 33 ++-- taichi/backends/metal/kernel_utils.h | 28 ++- taichi/backends/opengl/codegen_opengl.cpp | 4 +- taichi/backends/opengl/opengl_program.cpp | 1 + taichi/program/callable.cpp | 4 +- taichi/program/callable.h | 4 +- taichi/program/kernel.cpp | 3 - taichi/program/program.cpp | 15 +- taichi/transforms/ir_printer.cpp | 2 + tests/python/test_ndarray.py | 4 +- 15 files changed, 322 insertions(+), 82 deletions(-) diff --git a/taichi/backends/metal/aot_module_loader_impl.cpp b/taichi/backends/metal/aot_module_loader_impl.cpp index f0002a008992c..c573a9bdf556c 100644 --- a/taichi/backends/metal/aot_module_loader_impl.cpp +++ b/taichi/backends/metal/aot_module_loader_impl.cpp @@ -71,9 +71,9 @@ class AotModuleImpl : public aot::Module { return nullptr; } auto *kernel_data = itr->second; - runtime_->register_taichi_kernel(name, kernel_data->source_code, - kernel_data->kernel_attribs, - kernel_data->ctx_attribs); + runtime_->register_taichi_kernel( + name, kernel_data->source_code, kernel_data->kernel_attribs, + kernel_data->ctx_attribs, /*kernel=*/nullptr); return std::make_unique(runtime_, name); } diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index b6b5c0c3c0937..12157a74c94e4 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ -55,6 +55,10 @@ using BufferType = BufferDescriptor::Type; using BufferDescSet = std::unordered_set; +std::string ndarray_buffer_name(int arg_id) { + return fmt::format("ndarray_addr_{}", arg_id); +} + std::string buffer_to_name(const BufferDescriptor &b) { switch (b.type()) { case BufferType::Root: @@ -67,6 +71,8 @@ std::string buffer_to_name(const BufferDescriptor &b) { return kRuntimeBufferName; case BufferType::Print: return kPrintAssertBufferName; + case BufferType::Ndarray: + return ndarray_buffer_name(b.ndarray_arg_id()); default: TI_NOT_IMPLEMENTED; break; @@ -128,6 +134,7 @@ class TaskPreprocessor 
final : public BasicStmtVisitor { public: struct Result { bool should_init_randseeds{false}; + std::unordered_map arr_args_order; }; static Result run(Stmt *s) { @@ -140,6 +147,19 @@ class TaskPreprocessor final : public BasicStmtVisitor { void visit(RandStmt *) override { res_.should_init_randseeds = true; } + + void visit(ArgLoadStmt *stmt) override { + if (!stmt->is_ptr) { + return; + } + const auto arg_id = stmt->arg_id; + if (res_.arr_args_order.count(arg_id) > 0) { + return; + } + const int order = res_.arr_args_order.size(); + res_.arr_args_order[arg_id] = order; + } + using BasicStmtVisitor::visit; TaskPreprocessor() = default; @@ -419,8 +439,9 @@ class KernelCodegenImpl : public IRVisitor { void visit(ArgLoadStmt *stmt) override { const auto dt = metal_data_type_name(stmt->element_type()); if (stmt->is_ptr) { - emit("device {} *{} = {}.arg{}();", dt, stmt->raw_name(), kContextVarName, - stmt->arg_id); + const auto type_str = fmt::format("device {} *", dt); + emit("{}{} = reinterpret_cast<{}>({});", type_str, stmt->raw_name(), + type_str, ndarray_buffer_name(stmt->arg_id)); } else { emit("const {} {} = *{}.arg{}();", dt, stmt->raw_name(), kContextVarName, stmt->arg_id); @@ -449,15 +470,50 @@ class KernelCodegenImpl : public IRVisitor { const auto *argload = stmt->base_ptrs[0]->as(); const int arg_id = argload->arg_id; const int num_indices = stmt->indices.size(); - std::vector size_var_names; - for (int i = 0; i < num_indices; i++) { - std::string var_name = fmt::format("{}_size{}_", stmt->raw_name(), i); + const auto &element_shape = stmt->element_shape; + std::vector size_exprs; + enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; + const auto layout = stmt->element_dim <= 0 ? 
layout_AOS : layout_SOA; + + // Args buffer arrange dimensions from outer to inner + // AoS args buffer: array_shape|element_shape + // SoA args buffer: element_shape|array_shape + // + // ti.Matrix.ndarray(3, 2, ti.f32, (5, 4), layout=ti.Layout.AOS) + // args buffer: 5, 4, 3, 2 + // ti.Matrix.ndarray(3, 2, ti.f32, (5, 4), layout=ti.Layout.SOA) + // args buffer: 3, 2, 5, 4 + const int arr_shape_len = num_indices - element_shape.size(); + int index_i = 0; + const auto add_elem_shape_exprs = [&]() { + for (int es : element_shape) { + size_exprs.push_back(std::to_string(es)); + ++index_i; + } + }; + int arr_shape_offset = 0; + if (layout == layout_SOA) { + add_elem_shape_exprs(); + // When the layout is SOA, element shape comes before array shape, so + // we have to skip the element shapes first. + // TODO: Element shape is a compile-time known information, so extra + // args will always only need the array shape. + arr_shape_offset = element_shape.size(); + } + for (int i = 0; i < arr_shape_len; i++) { + std::string var_name = + fmt::format("{}_arr_dim{}_", stmt->raw_name(), i); emit("const int {} = {}.extra_arg({}, {});", var_name, kContextVarName, - arg_id, i); - size_var_names.push_back(std::move(var_name)); + arg_id, i + arr_shape_offset); + size_exprs.push_back(std::move(var_name)); + ++index_i; + } + if (layout == layout_AOS) { + add_elem_shape_exprs(); } + TI_ASSERT(index_i == num_indices); for (int i = 0; i < num_indices; i++) { - emit("{} *= {};", linear_index_name, size_var_names[i]); + emit("{} *= {};", linear_index_name, size_exprs[i]); emit("{} += {};", linear_index_name, stmt->indices[i]->raw_name()); } } @@ -1057,17 +1113,17 @@ class KernelCodegenImpl : public IRVisitor { ScopedIndent s(current_appender()); emit("explicit {}(device byte* addr) : addr_(addr) {{}}", class_name); for (const auto &arg : ctx_attribs_.args()) { - const auto dt_name = metal_data_type_name(arg.dt); - emit("device {}* arg{}() {{", dt_name, arg.index); if (arg.is_array) { - 
emit(" // array, size={} B", arg.stride); - } else { - emit(" // scalar, size={} B", arg.stride); + continue; } + const auto dt_name = metal_data_type_name(arg.dt); + emit("device {}* arg{}() {{", dt_name, arg.index); + emit(" // scalar, size={} B", arg.stride); emit(" return (device {}*)(addr_ + {});", dt_name, arg.offset_in_mem); emit("}}"); } for (const auto &ret : ctx_attribs_.rets()) { + // TODO: Why return still needs this? const auto dt_name = metal_data_type_name(ret.dt); emit("device {}* ret{}() {{", dt_name, ret.index); if (ret.is_array) { @@ -1124,6 +1180,24 @@ class KernelCodegenImpl : public IRVisitor { return result; } + static std::unordered_map make_arr_args_to_binding_indices( + const std::unordered_map &arr_args_order, + int binding_idx_offset) { + auto res = arr_args_order; + for (auto itr = res.begin(); itr != res.end(); ++itr) { + itr->second += binding_idx_offset; + } + return res; + } + + static void append_arr_buffer_descriptors( + const std::unordered_map &arr_bindings, + std::vector *descs) { + for (const auto &[arr_id, _] : arr_bindings) { + descs->push_back(BufferDescriptor::ndarray(arr_id)); + } + } + void generate_serial_kernel(OffloadedStmt *stmt, const BufferDescSet &root_buffer_descs, const TaskPreprocessor::Result &preproc_res) { @@ -1133,6 +1207,10 @@ class KernelCodegenImpl : public IRVisitor { ka.name = mtl_kernel_name; ka.task_type = stmt->task_type; ka.buffers = get_used_buffer_descriptors(root_buffer_descs); + ka.arr_args_to_binding_indices = make_arr_args_to_binding_indices( + preproc_res.arr_args_order, ka.buffers.size()); + append_arr_buffer_descriptors(ka.arr_args_to_binding_indices, + &(ka.buffers)); ka.advisory_total_num_threads = 1; ka.advisory_num_threads_per_group = 1; @@ -1165,7 +1243,10 @@ class KernelCodegenImpl : public IRVisitor { ka.name = mtl_kernel_name; ka.task_type = stmt->task_type; ka.buffers = get_used_buffer_descriptors(root_buffer_descs); - + ka.arr_args_to_binding_indices = 
make_arr_args_to_binding_indices( + preproc_res.arr_args_order, ka.buffers.size()); + append_arr_buffer_descriptors(ka.arr_args_to_binding_indices, + &(ka.buffers)); const bool used_tls = (stmt->tls_prologue != nullptr); KernelSigExtensions kernel_exts; kernel_exts.use_simdgroup = (used_tls && cgen_config_.allow_simdgroup); @@ -1263,7 +1344,10 @@ class KernelCodegenImpl : public IRVisitor { ka.name = mtl_kernel_name; ka.task_type = stmt->task_type; ka.buffers = get_used_buffer_descriptors(root_buffer_descs); - + ka.arr_args_to_binding_indices = make_arr_args_to_binding_indices( + preproc_res.arr_args_order, ka.buffers.size()); + append_arr_buffer_descriptors(ka.arr_args_to_binding_indices, + &(ka.buffers)); const bool used_tls = (stmt->tls_prologue != nullptr); KernelSigExtensions kernel_exts; kernel_exts.use_simdgroup = (used_tls && cgen_config_.allow_simdgroup); @@ -1681,7 +1765,7 @@ FunctionType compile_to_metal_executable( kernel_mgr->print_strtable(), offloaded); kernel_mgr->register_taichi_kernel( compiled_res.kernel_name, compiled_res.source_code, - compiled_res.kernel_attribs, compiled_res.ctx_attribs); + compiled_res.kernel_attribs, compiled_res.ctx_attribs, kernel); return [kernel_mgr, kernel_name = compiled_res.kernel_name](RuntimeContext &ctx) { kernel_mgr->launch_taichi_kernel(kernel_name, &ctx); diff --git a/taichi/backends/metal/device.cpp b/taichi/backends/metal/device.cpp index 26b363966f7af..83be76473f821 100644 --- a/taichi/backends/metal/device.cpp +++ b/taichi/backends/metal/device.cpp @@ -290,7 +290,9 @@ class DeviceImpl : public Device, public AllocToMTLBufferMapper { DeviceAllocation allocate_memory(const AllocParams ¶ms) override { DeviceAllocation res; res.device = this; - res.alloc_id = allocations_.size(); + // Do not use `allocations_.size()` as `alloc_id`, as items could be erased + // from `allocations_`. 
+ res.alloc_id = next_alloc_id_++; AllocationInternal &ialloc = allocations_[res.alloc_id]; // "i" for internal @@ -394,6 +396,7 @@ class DeviceImpl : public Device, public AllocToMTLBufferMapper { nsobj_unique_ptr command_queue_{nullptr}; std::unique_ptr stream_{nullptr}; std::unordered_map allocations_; + DeviceAllocationId next_alloc_id_{0}; }; } // namespace diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index 9f3524cc35eed..5e5d9eb9857de 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -14,6 +14,7 @@ #include "taichi/backends/metal/runtime_utils.h" #include "taichi/inc/constants.h" #include "taichi/math/arithmetic.h" +#include "taichi/program/kernel.h" #include "taichi/program/py_print_buffer.h" #include "taichi/util/action_recorder.h" #include "taichi/util/file_sequence_writer.h" @@ -292,11 +293,15 @@ class CompiledTaichiKernel { MemoryPool *mem_pool; KernelProfilerBase *profiler; const CompileConfig *compile_config; + const Kernel *kernel; + Device *rhi_device; }; CompiledTaichiKernel(Params params) : ti_kernel_attribs(*params.ti_kernel_attribs), - ctx_attribs(*params.ctx_attribs) { + ctx_attribs(*params.ctx_attribs), + kernel_(params.kernel), + rhi_device_(params.rhi_device) { auto *const device = params.device; auto kernel_lib = new_library_with_source( device, params.mtl_source_code, params.compile_config->fast_math, @@ -367,6 +372,49 @@ class CompiledTaichiKernel { } } + ~CompiledTaichiKernel() { + for (auto [_, alloc_n_sz] : ext_arr_arg_to_dev_alloc) { + rhi_device_->dealloc_memory(alloc_n_sz.alloc); + } + } + + void maybe_make_dev_alloc_for_ext_arrs(const RuntimeContext &host_ctx) { + for (const auto &arg : ctx_attribs.args()) { + if (!arg.is_array) { + continue; + } + const int arg_id = arg.index; + if (host_ctx.is_device_allocations[arg_id]) { + continue; + } + // Even in the face that external array has 0-length, we still allocate + // something, 
to prevent runtime edge cases. + const auto arr_sz = + std::max(host_ctx.array_runtime_sizes[arg.index], (uint64)4); + auto itr = ext_arr_arg_to_dev_alloc.find(arg_id); + const bool already_allocated = (itr != ext_arr_arg_to_dev_alloc.end()); + if (already_allocated && (itr->second.size >= arr_sz)) { + continue; + } + if (already_allocated) { + TI_TRACE("Dealloc dev_alloc for extarr size old={} new={}", + itr->second.size, arr_sz); + rhi_device_->dealloc_memory(itr->second.alloc); + } + // This are the device buffers for "ext_arr". Unlike Ndarray, an ext_arr + // is actually a template param, so its size is always fixed per + // instantiated kernel. + Device::AllocParams aparams; + aparams.size = arr_sz; + aparams.host_read = true; + aparams.host_write = true; + AllocAndSize alloc_n_sz; + alloc_n_sz.alloc = rhi_device_->allocate_memory(aparams); + alloc_n_sz.size = aparams.size; + ext_arr_arg_to_dev_alloc[arg_id] = alloc_n_sz; + } + } + // Have to be exposed as public for Impl to use. We cannot friend the Impl // class because it is private. 
std::vector> compiled_mtl_kernels; @@ -374,17 +422,32 @@ class CompiledTaichiKernel { KernelContextAttributes ctx_attribs; std::unique_ptr ctx_mem; nsobj_unique_ptr ctx_buffer; + + struct AllocAndSize { + DeviceAllocation alloc; + size_t size{0}; + }; + std::unordered_map ext_arr_arg_to_dev_alloc; + + private: + const Kernel *const kernel_; + Device *const rhi_device_; }; class HostMetalCtxBlitter { public: HostMetalCtxBlitter(const CompiledTaichiKernel &kernel, RuntimeContext *host_ctx, + Device *rhi_device, + AllocToMTLBufferMapper *alloc_mapper, uint64_t *host_result_buffer, const std::string &kernel_name) - : ti_kernel_attribs_(&kernel.ti_kernel_attribs), + : cti_kernel_(&kernel), + ti_kernel_attribs_(&kernel.ti_kernel_attribs), ctx_attribs_(&kernel.ctx_attribs), host_ctx_(host_ctx), + rhi_device_(rhi_device), + alloc_mapper_(alloc_mapper), host_result_buffer_(host_result_buffer), kernel_ctx_mem_(kernel.ctx_mem.get()), kernel_ctx_buffer_(kernel.ctx_buffer.get()), @@ -404,6 +467,11 @@ class HostMetalCtxBlitter { return; } char *const base = (char *)kernel_ctx_mem_->ptr(); + struct BufferAndSize { + MTLBuffer *buffer{nullptr}; + size_t size{0}; + }; + std::vector buf_sz; for (int i = 0; i < ctx_attribs_->args().size(); ++i) { const auto &arg = ctx_attribs_->args()[i]; const auto dt = arg.dt; @@ -415,8 +483,22 @@ class HostMetalCtxBlitter { ActionArg("offset_in_bytes", (int64)arg.offset_in_mem)}); } if (arg.is_array) { - const void *host_ptr = host_ctx_->get_arg(i); - std::memcpy(device_ptr, host_ptr, arg.stride); + if (host_ctx_->is_device_allocations[i]) { + // There is no way to write from host into Ndarray directly (yet), so + // we don't have to do anything here. 
+ } else { + const void *host_ptr = host_ctx_->get_arg(i); + const auto alloc_n_sz = cti_kernel_->ext_arr_arg_to_dev_alloc.at(i); + auto buf_mem = alloc_mapper_->find(alloc_n_sz.alloc); + TI_ASSERT(buf_mem.buffer != nullptr); + auto *mem = buf_mem.mem; + TI_ASSERT(mem != nullptr); + // NOTE: DO NOT use `alloc_n_sz.size`. That is the size for the + // allocation, NOT the array size. + const auto arr_size = host_ctx_->array_runtime_sizes[i]; + std::memcpy(mem->ptr(), host_ptr, arr_size); + buf_sz.push_back(BufferAndSize{buf_mem.buffer, arr_size}); + } } else if (dt == MetalDataType::i32) { TO_METAL(int32); } else if (dt == MetalDataType::u32) { @@ -440,8 +522,11 @@ class HostMetalCtxBlitter { std::memcpy(device_ptr, host_ctx_->extra_args, ctx_attribs_->extra_args_bytes()); #undef TO_METAL - did_modify_range(kernel_ctx_buffer_, /*location=*/0, - kernel_ctx_mem_->size()); + buf_sz.push_back( + BufferAndSize{kernel_ctx_buffer_, kernel_ctx_mem_->size()}); + for (auto bs : buf_sz) { + did_modify_range(bs.buffer, /*length=*/0, bs.size); + } } void metal_to_host() { @@ -456,11 +541,16 @@ class HostMetalCtxBlitter { char *const base = (char *)kernel_ctx_mem_->ptr(); for (int i = 0; i < ctx_attribs_->args().size(); ++i) { const auto &arg = ctx_attribs_->args()[i]; - char *device_ptr = base + arg.offset_in_mem; if (arg.is_array) { + if (host_ctx_->is_device_allocations[i]) { + continue; + } + const auto alloc_n_sz = cti_kernel_->ext_arr_arg_to_dev_alloc.at(i); + auto *mem = alloc_mapper_->find(alloc_n_sz.alloc).mem; + TI_ASSERT(mem != nullptr); void *host_ptr = host_ctx_->get_arg(i); - std::memcpy(host_ptr, device_ptr, arg.stride); - + const auto arr_size = host_ctx_->array_runtime_sizes[i]; + std::memcpy(host_ptr, mem->ptr(), arr_size); if (!ti_kernel_attribs_->is_jit_evaluator) { ActionRecorder::get_instance().record( "context_metal_to_host", @@ -469,10 +559,6 @@ class HostMetalCtxBlitter { ActionArg("arg_id", i), ActionArg("arg_type", "ptr"), ActionArg("size_in_bytes", 
(int64)arg.stride), - ActionArg("host_address", - fmt::format("0x{:x}", (uint64)host_ptr)), - ActionArg("device_address", - fmt::format("0x{:x}", (uint64)device_ptr)), }); } } @@ -512,19 +598,24 @@ class HostMetalCtxBlitter { static std::unique_ptr maybe_make( const CompiledTaichiKernel &kernel, RuntimeContext *ctx, + Device *rhi_device, + AllocToMTLBufferMapper *alloc_mapper, uint64_t *host_result_buffer, std::string name) { if (kernel.ctx_attribs.empty()) { return nullptr; } - return std::make_unique(kernel, ctx, - host_result_buffer, name); + return std::make_unique( + kernel, ctx, rhi_device, alloc_mapper, host_result_buffer, name); } private: + const CompiledTaichiKernel *const cti_kernel_; const TaichiKernelAttributes *const ti_kernel_attribs_; const KernelContextAttributes *const ctx_attribs_; RuntimeContext *const host_ctx_; + Device *const rhi_device_; + AllocToMTLBufferMapper *const alloc_mapper_; uint64_t *const host_result_buffer_; BufferMemoryView *const kernel_ctx_mem_; MTLBuffer *const kernel_ctx_buffer_; @@ -636,7 +727,8 @@ class KernelManager::Impl { void register_taichi_kernel(const std::string &taichi_kernel_name, const std::string &mtl_kernel_source_code, const TaichiKernelAttributes &ti_kernel_attribs, - const KernelContextAttributes &ctx_attribs) { + const KernelContextAttributes &ctx_attribs, + const Kernel *kernel) { TI_ASSERT(compiled_taichi_kernels_.find(taichi_kernel_name) == compiled_taichi_kernels_.end()); @@ -655,6 +747,8 @@ class KernelManager::Impl { params.mem_pool = mem_pool_; params.profiler = profiler_; params.compile_config = config_; + params.kernel = kernel; + params.rhi_device = rhi_device_.get(); compiled_taichi_kernels_[taichi_kernel_name] = std::make_unique(params); TI_DEBUG("Registered Taichi kernel <{}>", taichi_kernel_name); @@ -663,9 +757,12 @@ class KernelManager::Impl { void launch_taichi_kernel(const std::string &taichi_kernel_name, RuntimeContext *ctx) { mac::ScopedAutoreleasePool pool; - auto &ctk = 
*compiled_taichi_kernels_.find(taichi_kernel_name)->second; + auto &cti_kernel = + *compiled_taichi_kernels_.find(taichi_kernel_name)->second; + cti_kernel.maybe_make_dev_alloc_for_ext_arrs(*ctx); auto ctx_blitter = HostMetalCtxBlitter::maybe_make( - ctk, ctx, host_result_buffer_, taichi_kernel_name); + cti_kernel, ctx, rhi_device_.get(), devalloc_mapper_, + host_result_buffer_, taichi_kernel_name); if (config_->verbose_kernel_launches) { TI_INFO("Launching Taichi kernel <{}>", taichi_kernel_name); } @@ -681,14 +778,16 @@ class KernelManager::Impl { if (ctx_blitter) { ctx_blitter->host_to_metal(); - input_buffers[BufferDescriptor::context()] = ctk.ctx_buffer.get(); + input_buffers[BufferDescriptor::context()] = cti_kernel.ctx_buffer.get(); } + auto ndarray_buffers = get_dev_alloc_buffers(cti_kernel, *ctx); + input_buffers.insert(ndarray_buffers.begin(), ndarray_buffers.end()); - for (const auto &mk : ctk.compiled_mtl_kernels) { + for (const auto &mk : cti_kernel.compiled_mtl_kernels) { mk->launch(input_buffers, cur_command_buffer_.get()); } - const auto &used = ctk.ti_kernel_attribs.used_features; + const auto &used = cti_kernel.ti_kernel_attribs.used_features; const bool used_print_assert = (used.print || used.assertion); if (ctx_blitter || used_print_assert) { // TODO(k-ye): One optimization is to synchronize only when we absolutely @@ -702,6 +801,9 @@ class KernelManager::Impl { clear_print_assert_buffer(); buffers_to_blit.push_back(print_assert_idevalloc_.buffer); } + for (auto [_, buf] : ndarray_buffers) { + buffers_to_blit.push_back(buf); + } blit_buffers_and_sync(buffers_to_blit); if (ctx_blitter) { @@ -1118,6 +1220,30 @@ class KernelManager::Impl { return res; } + InputBuffersMap get_dev_alloc_buffers(const CompiledTaichiKernel &ctk, + const RuntimeContext &host_ctx) const { + InputBuffersMap res; + for (const auto &arg : ctk.ctx_attribs.args()) { + if (!arg.is_array) { + continue; + } + DeviceAllocation dev_alloc; + if 
(host_ctx.is_device_allocations[arg.index]) { + dev_alloc = *reinterpret_cast( + host_ctx.args[arg.index]); + TI_TRACE("Ndarray arg_id={} alloc_id={}", arg.index, + dev_alloc.alloc_id); + } else { + dev_alloc = ctk.ext_arr_arg_to_dev_alloc.at(arg.index).alloc; + TI_TRACE("ExtArr arg_id={} alloc_id={}", arg.index, dev_alloc.alloc_id); + } + MTLBuffer *buffer = devalloc_mapper_->find(dev_alloc).buffer; + TI_ASSERT(buffer != nullptr); + res[BufferDescriptor::ndarray(arg.index)] = buffer; + } + return res; + } + struct SNodesRootBuffer { BufferDescriptor desc; DevAllocWithInternals idevalloc; @@ -1175,7 +1301,8 @@ class KernelManager::Impl { void register_taichi_kernel(const std::string &taichi_kernel_name, const std::string &mtl_kernel_source_code, const TaichiKernelAttributes &ti_kernel_attribs, - const KernelContextAttributes &ctx_attribs) { + const KernelContextAttributes &ctx_attribs, + const Kernel *kernel) { TI_ERROR("Metal not supported on the current OS"); } @@ -1225,9 +1352,10 @@ void KernelManager::register_taichi_kernel( const std::string &taichi_kernel_name, const std::string &mtl_kernel_source_code, const TaichiKernelAttributes &ti_kernel_attribs, - const KernelContextAttributes &ctx_attribs) { + const KernelContextAttributes &ctx_attribs, + const Kernel *kernel) { impl_->register_taichi_kernel(taichi_kernel_name, mtl_kernel_source_code, - ti_kernel_attribs, ctx_attribs); + ti_kernel_attribs, ctx_attribs, kernel); } void KernelManager::launch_taichi_kernel(const std::string &taichi_kernel_name, diff --git a/taichi/backends/metal/kernel_manager.h b/taichi/backends/metal/kernel_manager.h index 10bf38965e14b..e1cccfbb3971c 100644 --- a/taichi/backends/metal/kernel_manager.h +++ b/taichi/backends/metal/kernel_manager.h @@ -52,7 +52,8 @@ class KernelManager { void register_taichi_kernel(const std::string &taichi_kernel_name, const std::string &mtl_kernel_source_code, const TaichiKernelAttributes &ti_kernel_attribs, - const KernelContextAttributes 
&ctx_attribs); + const KernelContextAttributes &ctx_attribs, + const Kernel *kernel); // Launch the given |taichi_kernel_name|. // Kernel launching is asynchronous, therefore the Metal memory is not valid diff --git a/taichi/backends/metal/kernel_utils.cpp b/taichi/backends/metal/kernel_utils.cpp index ac59811f999c5..fe5dd8d7f100c 100644 --- a/taichi/backends/metal/kernel_utils.cpp +++ b/taichi/backends/metal/kernel_utils.cpp @@ -31,11 +31,8 @@ bool BufferDescriptor::operator==(const BufferDescriptor &other) const { if (type_ != other.type_) { return false; } - if (type_ == Type::Root) { - return root_id_ == other.root_id_; - } - TI_ASSERT(root_id_ == -1); - return true; + + return id_ == other.id_; } std::string BufferDescriptor::debug_string() const { @@ -51,6 +48,9 @@ std::string BufferDescriptor::debug_string() const { if (type_ == Type::Root) { return fmt::format("Root_{}", root_id()); } + if (type_ == Type::Ndarray) { + return fmt::format("Ndarray_{}", ndarray_arg_id()); + } return m.find(type_)->second; } @@ -88,7 +88,7 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel) metal_data_type_name(ma.dt)); } ma.is_array = ka.is_array; - ma.stride = ma.is_array ? ka.size : dt_bytes; + ma.stride = ma.is_array ? 
0 : dt_bytes; ma.index = arg_attribs_vec_.size(); arg_attribs_vec_.push_back(ma); } @@ -122,7 +122,8 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel) ret_attribs_vec_.push_back(mr); } - auto arrange_scalar_before_array = [&bytes = this->ctx_bytes_](auto *vec) { + auto arrange_scalar_before_array = [&bytes = this->ctx_bytes_]( + auto *vec, bool allow_arr_mem_offset) { std::vector scalar_indices; std::vector array_indices; for (int i = 0; i < vec->size(); ++i) { @@ -144,15 +145,21 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel) // Then the array args for (int i : array_indices) { auto &attribs = (*vec)[i]; - const size_t dt_bytes = metal_data_type_bytes(attribs.dt); - bytes = (bytes + dt_bytes - 1) / dt_bytes * dt_bytes; - attribs.offset_in_mem = bytes; - bytes += attribs.stride; + if (allow_arr_mem_offset) { + const size_t dt_bytes = metal_data_type_bytes(attribs.dt); + bytes = (bytes + dt_bytes - 1) / dt_bytes * dt_bytes; + attribs.offset_in_mem = bytes; + bytes += attribs.stride; + } else { + // Array args are no longer embedded, they have dedicated MTLBuffers. 
+ attribs.offset_in_mem = -1; + } } }; - arrange_scalar_before_array(&arg_attribs_vec_); - arrange_scalar_before_array(&ret_attribs_vec_); + arrange_scalar_before_array(&arg_attribs_vec_, + /*allow_arr_mem_offset=*/false); + arrange_scalar_before_array(&ret_attribs_vec_, /*allow_arr_mem_offset=*/true); } } // namespace metal diff --git a/taichi/backends/metal/kernel_utils.h b/taichi/backends/metal/kernel_utils.h index 5138a374e36e6..f8e2404fb0a47 100644 --- a/taichi/backends/metal/kernel_utils.h +++ b/taichi/backends/metal/kernel_utils.h @@ -41,6 +41,7 @@ struct BufferDescriptor { Context, Runtime, Print, + Ndarray, }; BufferDescriptor() = default; @@ -65,13 +66,22 @@ struct BufferDescriptor { return BufferDescriptor{Type::Print}; } + static BufferDescriptor ndarray(int arr_arg_id) { + return BufferDescriptor{Type::Ndarray, arr_arg_id}; + } + Type type() const { return type_; } int root_id() const { TI_ASSERT(type_ == Type::Root); - return root_id_; + return id_; + } + + int ndarray_arg_id() const { + TI_ASSERT(type_ == Type::Ndarray); + return id_; } std::string debug_string() const; @@ -84,7 +94,7 @@ struct BufferDescriptor { struct Hasher { std::size_t operator()(const BufferDescriptor &desc) const { - return std::hash{}(desc.type()) ^ desc.root_id_; + return std::hash{}(desc.type()) ^ desc.id_; } }; @@ -92,13 +102,13 @@ struct BufferDescriptor { explicit BufferDescriptor(Type t) : type_(t) { } - explicit BufferDescriptor(Type t, int root_id) : type_(t), root_id_(root_id) { + explicit BufferDescriptor(Type t, int root_id) : type_(t), id_(root_id) { } Type type_{Type::Root}; - int root_id_{-1}; // only used if type==Root + int id_{-1}; // only used if type in {Root, Ndarray} public: - TI_IO_DEF(type_, root_id_); + TI_IO_DEF(type_, id_); }; // This struct holds the necessary information to launch a Metal kernel. 
@@ -138,6 +148,7 @@ struct KernelAttributes { const SNode *snode = nullptr; }; std::vector buffers; + std::unordered_map arr_args_to_binding_indices; // Only valid when |task_type| is `range_for`. std::optional range_for_attribs; // Only valid when |task_type| is `listgen`. @@ -187,7 +198,12 @@ class KernelContextAttributes { private: // Attributes that are shared by the input arg and the return value. struct AttribsBase { - // For array arg, this is #elements * stride(dt). Unit: byte + // This is tricky: + // * For Args + // * scalar: stride(dt) + // * array: 0 + // * For Return, this can actually be a matrix, where `is_array` is true... + // Unit: byte. size_t stride = 0; // Offset in the argument buffer size_t offset_in_mem = 0; diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index dbc762574c88a..c68ab809268d1 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -483,11 +483,11 @@ class KernelGen : public IRVisitor { const auto *argload = stmt->base_ptrs[0]->as(); const int arg_id = argload->arg_id; const int num_indices = stmt->indices.size(); - auto element_shape = stmt->element_shape; + const auto &element_shape = stmt->element_shape; std::vector size_var_names; std::vector element_shape_size_var_names; enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; - auto layout = stmt->element_dim <= 0 ? layout_AOS : layout_SOA; + const auto layout = stmt->element_dim <= 0 ? 
layout_AOS : layout_SOA; if (element_shape.size() > 0) { int elem_beg = 0; diff --git a/taichi/backends/opengl/opengl_program.cpp b/taichi/backends/opengl/opengl_program.cpp index 0b6defe956507..1905ba7515220 100644 --- a/taichi/backends/opengl/opengl_program.cpp +++ b/taichi/backends/opengl/opengl_program.cpp @@ -35,6 +35,7 @@ void OpenglProgramImpl::materialize_runtime(MemoryPool *memory_pool, DeviceAllocation OpenglProgramImpl::allocate_memory_ndarray( std::size_t alloc_size, uint64 *result_buffer) { + // FIXME: Why is host R/W set to true? return opengl_runtime_->device->allocate_memory( {alloc_size, /*host_write=*/true, /*host_read=*/true, /*export_sharing=*/false}); diff --git a/taichi/program/callable.cpp b/taichi/program/callable.cpp index 914a5973ac937..9f70896ee768a 100644 --- a/taichi/program/callable.cpp +++ b/taichi/program/callable.cpp @@ -20,8 +20,8 @@ int Callable::insert_ret(const DataType &dt) { int Callable::insert_arr_arg(const DataType &dt, int total_dim, std::vector element_shape) { - args.emplace_back(dt->get_compute_type(), true, /*size=*/0, total_dim, - element_shape); + args.emplace_back(dt->get_compute_type(), /*is_array=*/true, /*size=*/0, + total_dim, element_shape); return (int)args.size() - 1; } diff --git a/taichi/program/callable.h b/taichi/program/callable.h index dc9910a55fff3..663e683970723 100644 --- a/taichi/program/callable.h +++ b/taichi/program/callable.h @@ -19,18 +19,16 @@ class TI_DLL_EXPORT Callable { DataType dt; bool is_array{ false}; // This is true for both ndarray and external array args. - std::size_t size{0}; // TODO: size is runtime information, maybe remove? 
std::size_t total_dim{0}; // total dim of array std::vector element_shape = {}; // shape of each element explicit Arg(const DataType &dt = PrimitiveType::unknown, bool is_array = false, - std::size_t size = 0, + std::size_t size_unused = 0, int total_dim = 0, std::vector element_shape = {}) : dt(dt), is_array(is_array), - size(size), total_dim(total_dim), element_shape(std::move(element_shape)) { } diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 19f9ee8370ec6..3b12af94ab526 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -242,9 +242,6 @@ void Kernel::LaunchContextBuilder::set_arg_external_array( ActionArg("address", fmt::format("0x{:x}", ptr)), ActionArg("array_size_in_bytes", (int64)size)}); - // FIXME(https://github.com/taichi-dev/taichi/issues/4949): Make the Metal - // backend support Ndarray, then remove this line below. - kernel_->args[arg_id].size = size; ctx_->set_arg(arg_id, ptr); ctx_->set_array_runtime_size(arg_id, size); ctx_->set_array_is_device_allocation(arg_id, is_device_allocation); diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 94c95357d04b1..784c472c126aa 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -415,16 +415,17 @@ Kernel &Program::get_ndarray_reader(Ndarray *ndarray) { } auto ret = Stmt::make( ExprGroup(Expr(Expr::make( - keys.dtype, keys.num_active_indices, keys.num_active_indices, - 0))[indices])); + keys.dtype, keys.num_active_indices, + /*arg_id=*/keys.num_active_indices, 0))[indices])); this->current_ast_builder()->insert(std::move(ret)); }); ker.set_arch(get_accessor_arch()); ker.name = kernel_name; ker.is_accessor = true; - for (int i = 0; i < keys.num_active_indices; i++) - ker.insert_arg(PrimitiveType::i32, false); - ker.insert_arg(keys.dtype, true); + for (int i = 0; i < keys.num_active_indices; i++) { + ker.insert_arg(PrimitiveType::i32, /*is_array=*/false); + } + ker.insert_arg(keys.dtype, /*is_array=*/true); 
ker.insert_ret(keys.dtype); return ker; } @@ -439,8 +440,8 @@ Kernel &Program::get_ndarray_writer(Ndarray *ndarray) { indices.push_back(Expr::make(i, PrimitiveType::i32)); } auto expr = Expr(Expr::make( - keys.dtype, keys.num_active_indices, keys.num_active_indices + 1, - 0))[indices]; + keys.dtype, keys.num_active_indices, + /*arg_id=*/keys.num_active_indices + 1, 0))[indices]; this->current_ast_builder()->insert_assignment( expr, Expr::make(keys.num_active_indices, keys.dtype->get_compute_type())); diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp index 23c54d23a4c2c..7cb30527128d8 100644 --- a/taichi/transforms/ir_printer.cpp +++ b/taichi/transforms/ir_printer.cpp @@ -551,6 +551,8 @@ class IRPrinter : public IRVisitor { } s += ")"; } + s += fmt::format(" element_dim={} layout={}", stmt->element_dim, + (stmt->element_dim <= 0) ? "AOS" : "SOA"); print(fmt::format("{}{} = external_ptr {}", stmt->type_hint(), stmt->name(), s)); diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index 8579c04b4acf0..cadfca098497f 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -18,7 +18,9 @@ ndarray_shapes = [(), 8, (6, 12)] vector_dims = [3] matrix_dims = [(1, 2), (2, 3)] -supported_archs_taichi_ndarray = [ti.cpu, ti.cuda, ti.opengl, ti.vulkan] +supported_archs_taichi_ndarray = [ + ti.cpu, ti.cuda, ti.opengl, ti.vulkan, ti.metal +] def _test_scalar_ndarray(dtype, shape): From 296b7201e181db29527bb6fa4d5aa94845e700f5 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Wed, 25 May 2022 19:53:21 +0800 Subject: [PATCH 114/176] [Lang] Fix potential precision bug when using math vector and matrix types (#5032) * fix fast_gui rgba bug * fix floating precision problem for vector types * add more flexible initialization methods for matrix type * add more flexible initialization methods for matrix type * add glsl determinant and inverse function support * add glsl determinant and inverse function support * add 
glsl determinant and inverse function support * add glsl determinant and inverse function support * add glsl determinant and inverse function support * fix matrix precision type bug and use matrix-member inverse * fix matrix precision type bug and use matrix-member inverse --- python/taichi/math/mathimpl.py | 122 +++++++++++++++++++++++---------- tests/python/test_api.py | 14 ++-- 2 files changed, 94 insertions(+), 42 deletions(-) diff --git a/python/taichi/math/mathimpl.py b/python/taichi/math/mathimpl.py index 085e7d169ff2b..9561ffd74bd7b 100644 --- a/python/taichi/math/mathimpl.py +++ b/python/taichi/math/mathimpl.py @@ -10,44 +10,44 @@ import taichi as ti -vec2 = ti.types.vector(2, float) # pylint: disable=E1101 -"""2D float vector type. -""" +_get_uint_ip = lambda: ti.u32 if impl.get_runtime( +).default_ip == ti.i32 else ti.u64 -vec3 = ti.types.vector(3, float) # pylint: disable=E1101 -"""3D float vector type. -""" -vec4 = ti.types.vector(4, float) # pylint: disable=E1101 -"""4D float vector type. -""" +def vec2(*args): + """2D floating vector type. + """ + return ti.types.vector(2, float)(*args) # pylint: disable=E1101 -ivec2 = ti.types.vector(2, int) # pylint: disable=E1101 -"""2D int vector type. -""" -ivec3 = ti.types.vector(3, int) # pylint: disable=E1101 -"""3D int vector type. -""" +def vec3(*args): + """3D floating vector type. + """ + return ti.types.vector(3, float)(*args) # pylint: disable=E1101 -ivec4 = ti.types.vector(4, int) # pylint: disable=E1101 -"""4D int vector type. -""" -mat2 = ti.types.matrix(2, 2, float) # pylint: disable=E1101 -"""2x2 float matrix type. -""" +def vec4(*args): + """4D floating vector type. + """ + return ti.types.vector(4, float)(*args) # pylint: disable=E1101 -mat3 = ti.types.matrix(3, 3, float) # pylint: disable=E1101 -"""3x3 float matrix type. -""" -mat4 = ti.types.matrix(4, 4, float) # pylint: disable=E1101 -"""4x4 float matrix type. -""" +def ivec2(*args): + """2D signed int vector type. 
+ """ + return ti.types.vector(2, int)(*args) # pylint: disable=E1101 -_get_uint_ip = lambda: ti.u32 if impl.get_runtime( -).default_ip == ti.i32 else ti.u64 + +def ivec3(*args): + """3D signed int vector type. + """ + return ti.types.vector(3, int)(*args) # pylint: disable=E1101 + + +def ivec4(*args): + """4D signed int vector type. + """ + return ti.types.vector(4, int)(*args) # pylint: disable=E1101 def uvec2(*args): @@ -68,6 +68,24 @@ def uvec4(*args): return ti.types.vector(4, _get_uint_ip())(*args) # pylint: disable=E1101 +def mat2(*args): + """2x2 floating matrix type. + """ + return ti.types.matrix(2, 2, float)(*args) # pylint: disable=E1101 + + +def mat3(*args): + """3x3 floating matrix type. + """ + return ti.types.matrix(3, 3, float)(*args) # pylint: disable=E1101 + + +def mat4(*args): + """4x4 floating matrix type. + """ + return ti.types.matrix(4, 4, float)(*args) # pylint: disable=E1101 + + @ti.func def mix(x, y, a): """Performs a linear interpolation between `x` and `y` using @@ -611,12 +629,46 @@ def length(x): return x.norm() +@ti.func +def determinant(m): + """Alias for :func:`taichi.Matrix.determinant`. + """ + return m.determinant() + + +@ti.func +def inverse(mat): # pylint: disable=R1710 + """Calculate the inverse of a matrix. + + This function is equivalent to the `inverse` function in GLSL. + + Args: + mat (:class:`taichi.Matrix`): The matrix of which to take the inverse. + + Returns: + Inverse of the input matrix. 
+ + Example:: + + >>> @ti.kernel + >>> def test(): + >>> m = mat3([(1, 1, 0), (0, 1, 1), (0, 0, 1)]) + >>> print(inverse(m)) + >>> + >>> test() + [[1.000000, -1.000000, 1.000000], + [0.000000, 1.000000, -1.000000], + [0.000000, 0.000000, 1.000000]] + """ + return mat.inverse() + + __all__ = [ "acos", "asin", "atan2", "ceil", "clamp", "cos", "cross", "degrees", - "distance", "dot", "e", "exp", "eye", "floor", "fract", "ivec2", "ivec3", - "ivec4", "length", "log", "log2", "mat2", "mat3", "mat4", "max", "min", - "mix", "mod", "normalize", "pi", "pow", "radians", "reflect", "refract", - "rot2", "rot3", "rotate2d", "rotate3d", "round", "sign", "sin", - "smoothstep", "sqrt", "step", "tan", "tanh", "uvec2", "uvec3", "uvec4", - "vec2", "vec3", "vec4" + "determinant", "distance", "dot", "e", "exp", "eye", "floor", "fract", + "inverse", "ivec2", "ivec3", "ivec4", "length", "log", "log2", "mat2", + "mat3", "mat4", "max", "min", "mix", "mod", "normalize", "pi", "pow", + "radians", "reflect", "refract", "rot2", "rot3", "rotate2d", "rotate3d", + "round", "sign", "sin", "smoothstep", "sqrt", "step", "tan", "tanh", + "uvec2", "uvec3", "uvec4", "vec2", "vec3", "vec4" ] diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 2c36c753e079b..9f51201fd9f9f 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -98,13 +98,13 @@ def _get_expected_matrix_apis(): ] user_api[ti.math] = [ 'acos', 'asin', 'atan2', 'cconj', 'cdiv', 'ceil', 'cexp', 'cinv', 'clamp', - 'clog', 'cmul', 'cos', 'cpow', 'cross', 'csqrt', 'degrees', 'distance', - 'dot', 'e', 'exp', 'eye', 'floor', 'fract', 'ivec2', 'ivec3', 'ivec4', - 'length', 'log', 'log2', 'mat2', 'mat3', 'mat4', 'max', 'min', 'mix', - 'mod', 'normalize', 'pi', 'pow', 'radians', 'reflect', 'refract', 'rot2', - 'rot3', 'rotate2d', 'rotate3d', 'round', 'sign', 'sin', 'smoothstep', - 'sqrt', 'step', 'tan', 'tanh', 'uvec2', 'uvec3', 'uvec4', 'vec2', 'vec3', - 'vec4' + 'clog', 'cmul', 'cos', 'cpow', 'cross', 'csqrt', 
'degrees', 'determinant', + 'distance', 'dot', 'e', 'exp', 'eye', 'floor', 'fract', 'inverse', 'ivec2', + 'ivec3', 'ivec4', 'length', 'log', 'log2', 'mat2', 'mat3', 'mat4', 'max', + 'min', 'mix', 'mod', 'normalize', 'pi', 'pow', 'radians', 'reflect', + 'refract', 'rot2', 'rot3', 'rotate2d', 'rotate3d', 'round', 'sign', 'sin', + 'smoothstep', 'sqrt', 'step', 'tan', 'tanh', 'uvec2', 'uvec3', 'uvec4', + 'vec2', 'vec3', 'vec4' ] user_api[ti.Matrix] = _get_expected_matrix_apis() user_api[ti.MatrixField] = [ From 63f2d2ce3dedefc3cd15548b7649de1ed3a919b5 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Wed, 25 May 2022 20:12:05 +0800 Subject: [PATCH 115/176] [Vulkan] Fixed vulkan backend crash on AOT examples (#5047) --- taichi/backends/vulkan/vulkan_device.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/taichi/backends/vulkan/vulkan_device.h b/taichi/backends/vulkan/vulkan_device.h index 15d5cde4d88a8..852486aed7a58 100644 --- a/taichi/backends/vulkan/vulkan_device.h +++ b/taichi/backends/vulkan/vulkan_device.h @@ -176,7 +176,11 @@ class VulkanResourceBinder : public ResourceBinder { for (const auto &pair : set.bindings) { size_t binding_hash = 0; uint32_t *u32_ptr = (uint32_t *)&pair.second; - for (int i = 0; i < sizeof(Set) / sizeof(uint32_t); i++) { + static_assert( + sizeof(VulkanResourceBinder::Binding) % sizeof(uint32_t) == 0, + "sizeof(VulkanResourceBinder::Binding) is not a multiple of 4"); + size_t n = sizeof(VulkanResourceBinder::Binding) / sizeof(uint32_t); + for (int i = 0; i < n; i++) { binding_hash = binding_hash ^ u32_ptr[i]; binding_hash = (binding_hash << 7) | (binding_hash >> (64 - 7)); } From cdc07a5425ba795dd118fc37471764d421ec2773 Mon Sep 17 00:00:00 2001 From: PENGUINLIONG Date: Thu, 26 May 2022 10:30:20 +0800 Subject: [PATCH 116/176] Exit CI builds when download of prebuilt packages fails (#5043) --- .github/workflows/scripts/win_build.ps1 | 3 +++ .github/workflows/scripts/win_build_test_cpu.ps1 | 3 +++ 
ci/windows/win_build_test.ps1 | 2 ++ 3 files changed, 8 insertions(+) diff --git a/.github/workflows/scripts/win_build.ps1 b/.github/workflows/scripts/win_build.ps1 index 86ad4243742e5..52cf2a4259045 100644 --- a/.github/workflows/scripts/win_build.ps1 +++ b/.github/workflows/scripts/win_build.ps1 @@ -47,11 +47,13 @@ Push-Location $libsDir if (-not (Test-Path "taichi_llvm")) { WriteInfo("Download and extract LLVM") curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm } if (-not (Test-Path "taichi_clang")) { WriteInfo("Download and extract Clang") curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x clang-10.0.0-win.zip -otaichi_clang } $env:LLVM_DIR = "$libsDir\taichi_llvm" @@ -60,6 +62,7 @@ if ($installVulkan) { WriteInfo("Download and install Vulkan") if (-not (Test-Path "VulkanSDK")) { curl.exe --retry 10 --retry-delay 5 https://sdk.lunarg.com/sdk/download/1.2.189.0/windows/VulkanSDK-1.2.189.0-Installer.exe -Lo VulkanSDK.exe + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } $installer = Start-Process -FilePath VulkanSDK.exe -Wait -PassThru -ArgumentList @("/S"); $installer.WaitForExit(); } diff --git a/.github/workflows/scripts/win_build_test_cpu.ps1 b/.github/workflows/scripts/win_build_test_cpu.ps1 index 0296d98aa4d57..9d482fd31bd4b 100644 --- a/.github/workflows/scripts/win_build_test_cpu.ps1 +++ b/.github/workflows/scripts/win_build_test_cpu.ps1 @@ -24,11 +24,13 @@ Set-Location $libsDir if (-not (Test-Path "taichi_llvm")) { WriteInfo("Download and extract LLVM") curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO + if ($LASTEXITCODE -ne 0) { exit 
$LASTEXITCODE; } 7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm } if (-not (Test-Path "taichi_clang")) { WriteInfo("Download and extract Clang") curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x clang-10.0.0-win.zip -otaichi_clang } @@ -58,6 +60,7 @@ WriteInfo("ccache dir: $Env:CCACHE_DIR") md "$Env:CCACHE_DIR" -ea 0 if (-not (Test-Path "ccache-4.5.1-windows-64")) { curl.exe --retry 10 --retry-delay 5 https://github.com/ccache/ccache/releases/download/v4.5.1/ccache-4.5.1-windows-64.zip -LO + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x ccache-4.5.1-windows-64.zip $env:PATH += ";${pwd}/ccache-4.5.1-windows-64" } diff --git a/ci/windows/win_build_test.ps1 b/ci/windows/win_build_test.ps1 index 7d374b45510ea..f50a3bfe256c1 100644 --- a/ci/windows/win_build_test.ps1 +++ b/ci/windows/win_build_test.ps1 @@ -25,11 +25,13 @@ Push-Location $libsDir if (-not (Test-Path "taichi_llvm")) { WriteInfo("Download and extract LLVM") curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm } if (-not (Test-Path "taichi_clang")) { WriteInfo("Download and extract Clang") curl.exe --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO + if ($LASTEXITCODE -ne 0) { exit $LASTEXITCODE; } 7z x clang-10.0.0-win.zip -otaichi_clang } From 64b8c2a26770c693233737c64ccc3302e175ff68 Mon Sep 17 00:00:00 2001 From: yekuang Date: Thu, 26 May 2022 11:11:22 +0800 Subject: [PATCH 117/176] [ci] Run cpp tests via run_tests.py (#5035) * [ci] Run cpp tests via run_tests.py * default to False * enable cpp on win --- .github/workflows/scripts/unix_test.sh | 3 +-- ci/windows/win_build_test.ps1 | 1 + tests/run_tests.py 
| 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/.github/workflows/scripts/unix_test.sh b/.github/workflows/scripts/unix_test.sh index 7954f053d9903..d1f2ea59591af 100755 --- a/.github/workflows/scripts/unix_test.sh +++ b/.github/workflows/scripts/unix_test.sh @@ -48,8 +48,7 @@ ti diagnose ti changelog echo "wanted archs: $TI_WANTED_ARCHS" -TI_PATH=$(python3 -c "import taichi;print(taichi.__path__[0])" | tail -1) -TI_LIB_DIR="$TI_PATH/_lib/runtime" ./build/taichi_cpp_tests +python3 tests/run_tests.py --cpp if [ -z "$GPU_TEST" ]; then if [[ $PLATFORM == *"m1"* ]]; then diff --git a/ci/windows/win_build_test.ps1 b/ci/windows/win_build_test.ps1 index f50a3bfe256c1..35b6c6db3f6ee 100644 --- a/ci/windows/win_build_test.ps1 +++ b/ci/windows/win_build_test.ps1 @@ -61,5 +61,6 @@ python setup.py develop WriteInfo("Build finished") WriteInfo("Testing Taichi") +python tests/run_tests.py --cpp python tests/run_tests.py -vr2 -t2 -k "not torch and not paddle" -a cpu WriteInfo("Test finished") diff --git a/tests/run_tests.py b/tests/run_tests.py index ca7f557fe6a08..7621f985394ae 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -110,9 +110,9 @@ def test(): parser.add_argument('-c', '--cpp', dest='cpp', - default=True, + default=False, action='store_true', - help='Run the C++ tests') + help='Only run the C++ tests') parser.add_argument('-s', '--show', dest='show_output', @@ -209,6 +209,7 @@ def test(): if args.cpp: _test_cpp() + return if _test_python(args) != 0: exit(1) From e8a9732fcbfe1de7a41e94f31d9b33f61c4915b5 Mon Sep 17 00:00:00 2001 From: Ailing Date: Thu, 26 May 2022 11:30:10 +0800 Subject: [PATCH 118/176] Set host_write to false for opengl ndarray (#5038) As discussed ndarrays can be written through calling write kernels, but it shouldn't support directly map on host and write to it. 
--- taichi/backends/opengl/opengl_program.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/taichi/backends/opengl/opengl_program.cpp b/taichi/backends/opengl/opengl_program.cpp index 1905ba7515220..a862522bca984 100644 --- a/taichi/backends/opengl/opengl_program.cpp +++ b/taichi/backends/opengl/opengl_program.cpp @@ -37,7 +37,7 @@ DeviceAllocation OpenglProgramImpl::allocate_memory_ndarray( uint64 *result_buffer) { // FIXME: Why is host R/W set to true? return opengl_runtime_->device->allocate_memory( - {alloc_size, /*host_write=*/true, /*host_read=*/true, + {alloc_size, /*host_write=*/false, /*host_read=*/true, /*export_sharing=*/false}); } From da2b332813c23379007ad746e96cc8a780908f05 Mon Sep 17 00:00:00 2001 From: pengyu <6712304+FantasyVR@users.noreply.github.com> Date: Thu, 26 May 2022 15:42:59 +0800 Subject: [PATCH 119/176] [Lang] Build sparse matrix from ndarray (#4841) * build sparse matrix from ndarray * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add shape property * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix errors * fix pylint * fix failed test * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * improve * improve * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix ndarray data ptr not found * pylint * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * check ndarray dimension when build sparse matrix * improve * add example docstring * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- misc/sm_ndarray.py | 205 ++++++++++++++++++++++++++ python/taichi/linalg/sparse_matrix.py | 45 +++++- 
python/taichi/linalg/sparse_solver.py | 15 +- taichi/program/program.cpp | 2 +- taichi/program/program.h | 2 +- taichi/program/sparse_matrix.cpp | 29 ++++ taichi/program/sparse_matrix.h | 10 ++ taichi/python/export_lang.cpp | 6 + tests/python/test_sparse_matrix.py | 28 +++- 9 files changed, 334 insertions(+), 8 deletions(-) create mode 100644 misc/sm_ndarray.py diff --git a/misc/sm_ndarray.py b/misc/sm_ndarray.py new file mode 100644 index 0000000000000..75bf0695dc5c4 --- /dev/null +++ b/misc/sm_ndarray.py @@ -0,0 +1,205 @@ +import taichi as ti + +ti.init(arch=ti.cpu) + +N = 10 + +# memory_layout: [x,y,z],[x,y,z],...,[x,y,z] +st0 = ti.ndarray(shape=(N, 3), dtype=ti.f32) #(10, 3) +# (not works) memory_layout: [x,x,..,x],[y,y,..,y],[z,z,..,z] +st1 = ti.ndarray(shape=(3, N), dtype=ti.f32) # not works (3, 10) + +A = ti.linalg.SparseMatrix(n=10, m=10, dtype=ti.f32) + + +@ti.kernel +def fill_st0(ts: ti.types.ndarray()): + for i in range(N): + ts[i, 0] = i + ts[i, 1] = i + ts[i, 2] = i + + +@ti.kernel +def fill_st1(ts: ti.types.ndarray()): + for i in range(N): + ts[0, N] = i + ts[1, N] = i + ts[2, N] = i + + +fill_st0(st0) +A.build_from_ndarray(st0) +print(f"A built from st0:\n{A}") + +# Not works as we expected +fill_st1(st1) +A.build_from_ndarray(st1) +print(f"A built from st1:\n{A}") + +# memory_layout (AOS): [x,y,z],[x,y,z]....[x,y,z] +vt0_aos = ti.Vector.ndarray(n=3, dtype=ti.f32, shape=N, + layout=ti.Layout.AOS) #(10,3) +# (not works) memory_layout (SOA): [x,x,..,x],[y,y,..,y],[z,z,..,z] +vt0_soa = ti.Vector.ndarray(n=3, dtype=ti.f32, shape=N, + layout=ti.Layout.SOA) #(3,10) +# memory_layout (AOS/SOA): [x,y,z],[x,y,z]....[x,y,z] +vt1_aos = ti.Vector.ndarray(n=1, + dtype=ti.f32, + shape=3 * N, + layout=ti.Layout.AOS) #(30,1) +vt1_soa = ti.Vector.ndarray(n=1, + dtype=ti.f32, + shape=3 * N, + layout=ti.Layout.SOA) #(1,30) +vt2_aos = ti.Vector.ndarray(n=3 * N, + dtype=ti.f32, + shape=(), + layout=ti.Layout.AOS) #(30,) +vt2_soa = ti.Vector.ndarray(n=3 * N, + 
dtype=ti.f32, + shape=(), + layout=ti.Layout.SOA) #(30,) + +B = ti.linalg.SparseMatrix(n=10, m=10, dtype=ti.f32) + + +@ti.kernel +def fill_vt0_aos(triplets: ti.types.ndarray()): + for i in range(N): + triplet = ti.Vector([i, (i + 1) % N, i], dt=ti.f32) + triplets[i] = triplet + + +@ti.kernel +def fill_vt1(triplets: ti.types.ndarray()): + for i in range(N): + triplets[3 * i + 0][0] = i + triplets[3 * i + 1][0] = (i + 1) % N + triplets[3 * i + 2][0] = i + + +@ti.kernel +def fill_vt2(triplets: ti.types.ndarray()): + for i in range(N): + triplets[None][3 * i] = i + triplets[None][3 * i + 1] = (i + 1) % N + triplets[None][3 * i + 2] = i + + +fill_vt0_aos(vt0_aos) +B.build_from_ndarray(vt0_aos) +print(f"B built from vt0_aos:\n{B}") +fill_vt1(vt1_aos) +B.build_from_ndarray(vt1_aos) +print(f"B built from vt1 aos:\n{B}") +fill_vt1(vt1_soa) +B.build_from_ndarray(vt1_soa) +print(f"B built from vt1 soa:\n{B}") +fill_vt2(vt2_aos) +B.build_from_ndarray(vt2_aos) +print(f"B built from vt2 aos:\n{B}") +fill_vt2(vt2_soa) +B.build_from_ndarray(vt2_soa) +print(f"B built from vt2 soa:\n{B}") + +# memory_layout (AOS/SOA): [x,y,z],[x,y,z]....[x,y,z] +mt0_aos = ti.Matrix.ndarray(n=10, + m=3, + dtype=ti.f32, + shape=(), + layout=ti.Layout.AOS) #(10,3) +mt0_soa = ti.Matrix.ndarray(n=10, + m=3, + dtype=ti.f32, + shape=(), + layout=ti.Layout.SOA) #(10,3) +mt1_aos = ti.Matrix.ndarray(n=1, + m=3, + dtype=ti.f32, + shape=(10, ), + layout=ti.Layout.AOS) #(10,1,3) +mt3_aos = ti.Matrix.ndarray(n=3, + m=1, + dtype=ti.f32, + shape=(10, ), + layout=ti.Layout.AOS) #(10,3,1) + +# (not works) memory_layout (AOS/SOA): [x,x,..,x],[y,y,..,y],[z,z,..,z] +mt1_soa = ti.Matrix.ndarray(n=1, + m=3, + dtype=ti.f32, + shape=(10, ), + layout=ti.Layout.SOA) #(1,3,10) +mt3_soa = ti.Matrix.ndarray(n=3, + m=1, + dtype=ti.f32, + shape=(10, ), + layout=ti.Layout.SOA) #(3,1,10) +mt2_aos = ti.Matrix.ndarray(n=3, + m=10, + dtype=ti.f32, + shape=(), + layout=ti.Layout.AOS) #(3,10) +mt2_soa = ti.Matrix.ndarray(n=3, + m=10, + 
dtype=ti.f32, + shape=(), + layout=ti.Layout.SOA) #(3,10) + +C = ti.linalg.SparseMatrix(n=10, m=10, dtype=ti.f32) + + +@ti.kernel +def fill_mt0(mts: ti.types.ndarray()): + for i in range(N): + mts[None][i, 0] = i + mts[None][i, 1] = (i + 2) % N + mts[None][i, 2] = i + + +@ti.kernel +def fill_mt1(mts: ti.types.ndarray()): + for i in range(N): + mts[i][0, 0] = i + mts[i][0, 1] = (i + 2) % N + mts[i][0, 2] = i + + +@ti.kernel +def fill_mt2(mts: ti.types.ndarray()): + for i in range(N): + mts[None][0, i] = i + mts[None][1, i] = (i + 2) % N + mts[None][2, i] = i + + +@ti.kernel +def fill_mt3(mts: ti.types.ndarray()): + for i in range(N): + mts[i][0, 0] = i + mts[i][1, 0] = (i + 2) % N + mts[i][2, 0] = i + + +fill_mt0(mt0_aos) +C.build_from_ndarray(mt0_aos) +print(f"C built from mt0 aos:\n{C}") +fill_mt0(mt0_soa) +C.build_from_ndarray(mt0_soa) +print(f"C built from mt0 soa:\n{C}") + +fill_mt1(mt1_aos) +C.build_from_ndarray(mt1_aos) +print(f"C built from mt1 aos:\n{C}") +fill_mt3(mt3_aos) +C.build_from_ndarray(mt3_aos) +print(f"C built from mt3 soa:\n{C}") + +# Not works as we expected +fill_mt1(mt1_soa) +C.build_from_ndarray(mt1_soa) +print(f"C built from mt1 soa:\n{C}") +fill_mt3(mt3_soa) +C.build_from_ndarray(mt3_soa) +print(f"C built from mt3 soa:\n{C}") diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py index 40d87cd796f3a..83b4eb15d3b86 100644 --- a/python/taichi/linalg/sparse_matrix.py +++ b/python/taichi/linalg/sparse_matrix.py @@ -1,6 +1,8 @@ import numpy as np +from taichi.lang.exception import TaichiRuntimeError from taichi.lang.field import Field from taichi.lang.impl import get_runtime +from taichi.lang.matrix import Ndarray from taichi.lang.util import warning from taichi.types import annotations, f32 @@ -132,7 +134,9 @@ def __matmul__(self, other): assert self.m == other.shape[ 0], f"Dimension mismatch between sparse matrix ({self.n}, {self.m}) and vector ({other.shape})" return self.matrix.mat_vec_mul(other) - assert 
False, f"Sparse matrix-matrix/vector multiplication does not support {type(other)} for now. Supported types are SparseMatrix, ti.field, and numpy.ndarray." + raise TaichiRuntimeError( + f"Sparse matrix-matrix/vector multiplication does not support {type(other)} for now. Supported types are SparseMatrix, ti.field, and numpy ndarray." + ) def __getitem__(self, indices): return self.matrix.get_element(indices[0], indices[1]) @@ -147,10 +151,49 @@ def __str__(self): def __repr__(self): return self.matrix.to_string() + @property def shape(self): """The shape of the sparse matrix.""" return (self.n, self.m) + def build_from_ndarray(self, ndarray): + """Build the sparse matrix from a ndarray. + + Args: + ndarray (Union[ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]): the ndarray to build the sparse matrix from. + + Raises: + TaichiRuntimeError: If the input is not a ndarray or the length is not divisible by 3. + + Example:: + >>> N = 5 + >>> triplets = ti.Vector.ndarray(n=3, dtype=ti.f32, shape=10, layout=ti.Layout.AOS) + >>> @ti.kernel + >>> def fill(triplets: ti.types.ndarray()): + >>> for i in range(N): + >>> triplets[i] = ti.Vector([i, (i + 1) % N, i+1], dt=ti.f32) + >>> fill(triplets) + >>> A = ti.linalg.SparseMatrix(n=N, m=N, dtype=ti.f32) + >>> A.build_from_ndarray(triplets) + >>> print(A) + [0, 1, 0, 0, 0] + [0, 0, 2, 0, 0] + [0, 0, 0, 3, 0] + [0, 0, 0, 0, 4] + [5, 0, 0, 0, 0] + """ + if isinstance(ndarray, Ndarray): + if ndarray.arr.nelement() % 3 != 0: + raise TaichiRuntimeError( + "The number of ndarray elements must have a length that is divisible by 3." + ) + get_runtime().prog.make_sparse_matrix_from_ndarray( + self.matrix, ndarray.arr) + else: + raise TaichiRuntimeError( + 'Sparse matrix only supports building from [ti.ndarray, ti.Vector.ndarray, ti.Matrix.ndarray]' + ) + class SparseMatrixBuilder: """A python wrap around sparse matrix builder. 
diff --git a/python/taichi/linalg/sparse_solver.py b/python/taichi/linalg/sparse_solver.py index 67fca2e5f2ab3..021df4b9e8fb7 100644 --- a/python/taichi/linalg/sparse_solver.py +++ b/python/taichi/linalg/sparse_solver.py @@ -1,6 +1,7 @@ import numpy as np import taichi.lang from taichi._lib import core as _ti_core +from taichi.lang.exception import TaichiRuntimeError from taichi.lang.field import Field from taichi.linalg import SparseMatrix from taichi.types.primitive_types import f32 @@ -24,11 +25,15 @@ def __init__(self, dtype=f32, solver_type="LLT", ordering="AMD"): self.solver = _ti_core.make_sparse_solver(dtype, solver_type, ordering) else: - assert False, f"The solver type {solver_type} with {ordering} is not supported for now. Only {solver_type_list} with {solver_ordering} are supported." + raise TaichiRuntimeError( + f"The solver type {solver_type} with {ordering} is not supported for now. Only {solver_type_list} with {solver_ordering} are supported." + ) @staticmethod def _type_assert(sparse_matrix): - assert False, f"The parameter type: {type(sparse_matrix)} is not supported in linear solvers for now." + raise TaichiRuntimeError( + f"The parameter type: {type(sparse_matrix)} is not supported in linear solvers for now." + ) def compute(self, sparse_matrix): """This method is equivalent to calling both `analyze_pattern` and then `factorize`. @@ -63,7 +68,7 @@ def factorize(self, sparse_matrix): else: self._type_assert(sparse_matrix) - def solve(self, b): + def solve(self, b): # pylint: disable=R1710 """Computes the solution of the linear systems. Args: b (numpy.array or Field): The right-hand side of the linear systems. @@ -75,7 +80,9 @@ def solve(self, b): return self.solver.solve(b.to_numpy()) if isinstance(b, np.ndarray): return self.solver.solve(b) - assert False, f"The parameter type: {type(b)} is not supported in linear solvers for now." + raise TaichiRuntimeError( + f"The parameter type: {type(b)} is not supported in linear solvers for now." 
+ ) def info(self): """Check if the linear systems are solved successfully. diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 784c472c126aa..05ae5f128715d 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -560,7 +560,7 @@ Ndarray *Program::create_ndarray(const DataType type, return ndarrays_.back().get(); } -intptr_t Program::get_ndarray_data_ptr_as_int(Ndarray *ndarray) { +intptr_t Program::get_ndarray_data_ptr_as_int(const Ndarray *ndarray) { uint64_t *data_ptr{nullptr}; #ifdef TI_WITH_LLVM if (arch_is_cpu(config.arch) || config.arch == Arch::cuda) { diff --git a/taichi/program/program.h b/taichi/program/program.h index e411cad9c620c..a43d271c5ce4e 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -323,7 +323,7 @@ class TI_DLL_EXPORT Program { Ndarray *create_ndarray(const DataType type, const std::vector &shape); - intptr_t get_ndarray_data_ptr_as_int(Ndarray *ndarray); + intptr_t get_ndarray_data_ptr_as_int(const Ndarray *ndarray); void fill_ndarray_fast(Ndarray *ndarray, uint32_t val); diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index 1952b1b078bf3..c728b033e468e 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -162,5 +162,34 @@ std::unique_ptr make_sparse_matrix( storage_format); } +template +void build_ndarray_template(SparseMatrix &sm, + intptr_t data_ptr, + size_t num_triplets) { + using V = Eigen::Triplet; + std::vector triplets; + T *data = reinterpret_cast(data_ptr); + for (int i = 0; i < num_triplets; i++) { + triplets.push_back( + V(data[i * 3], data[i * 3 + 1], taichi_union_cast(data[i * 3 + 2]))); + } + sm.build_triplets(static_cast(&triplets)); +} + +void make_sparse_matrix_from_ndarray(Program *prog, + SparseMatrix &sm, + const Ndarray &ndarray) { + std::string sdtype = taichi::lang::data_type_name(sm.get_data_type()); + auto data_ptr = prog->get_ndarray_data_ptr_as_int(&ndarray); + auto num_triplets 
= ndarray.get_nelement() / 3; + if (sdtype == "f32") { + build_ndarray_template(sm, data_ptr, num_triplets); + } else if (sdtype == "f64") { + build_ndarray_template(sm, data_ptr, num_triplets); + } else { + TI_ERROR("Unsupported sparse matrix data type {}!", sdtype); + } +} + } // namespace lang } // namespace taichi diff --git a/taichi/program/sparse_matrix.h b/taichi/program/sparse_matrix.h index 5aedfd1b1d7b2..9501fc2781469 100644 --- a/taichi/program/sparse_matrix.h +++ b/taichi/program/sparse_matrix.h @@ -3,6 +3,8 @@ #include "taichi/common/core.h" #include "taichi/inc/constants.h" #include "taichi/ir/type_utils.h" +#include "taichi/program/ndarray.h" +#include "taichi/program/program.h" #include "Eigen/Sparse" @@ -74,6 +76,10 @@ class SparseMatrix { return nullptr; } + inline DataType get_data_type() { + return dtype_; + } + template T get_element(int row, int col) { std::cout << "get_element not implemented" << std::endl; @@ -188,5 +194,9 @@ std::unique_ptr make_sparse_matrix( int cols, DataType dt, const std::string &storage_format); + +void make_sparse_matrix_from_ndarray(Program *prog, + SparseMatrix &sm, + const Ndarray &ndarray); } // namespace lang } // namespace taichi diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index d2314d238ad4b..f4f63e458bbca 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -381,6 +381,12 @@ void export_lang(py::module &m) { "SparseMatrix only supports CPU for now."); return make_sparse_matrix(n, m, dtype, storage_format); }) + .def("make_sparse_matrix_from_ndarray", + [](Program *program, SparseMatrix &sm, const Ndarray &ndarray) { + TI_ERROR_IF(!arch_is_cpu(program->config.arch), + "SparseMatrix only supports CPU for now."); + return make_sparse_matrix_from_ndarray(program, sm, ndarray); + }) .def( "dump_dot", [](Program *program, std::optional rankdir, diff --git a/tests/python/test_sparse_matrix.py b/tests/python/test_sparse_matrix.py index 
5b49316cd81b0..d4f6b320c8484 100644 --- a/tests/python/test_sparse_matrix.py +++ b/tests/python/test_sparse_matrix.py @@ -54,6 +54,32 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): assert A[i, j] == i + j +@pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), + (ti.f32, 'row_major'), + (ti.f64, 'col_major'), + (ti.f64, 'row_major')]) +@test_utils.test(arch=ti.cpu) +def test_build_sparse_matrix_frome_ndarray(dtype, storage_format): + n = 8 + triplets = ti.Vector.ndarray(n=3, dtype=ti.f32, shape=n) + A = ti.linalg.SparseMatrix(n=10, + m=10, + dtype=ti.f32, + storage_format=storage_format) + + @ti.kernel + def fill(triplets: ti.types.ndarray()): + for i in range(n): + triplet = ti.Vector([i, i, i], dt=ti.f32) + triplets[i] = triplet + + fill(triplets) + A.build_from_ndarray(triplets) + + for i in range(n): + assert A[i, i] == i + + @pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), (ti.f32, 'row_major'), (ti.f64, 'col_major'), @@ -74,7 +100,7 @@ def fill(Abuilder: ti.types.sparse_matrix_builder()): fill(Abuilder) A = Abuilder.build() - assert A.shape() == (n, m) + assert A.shape == (n, m) @pytest.mark.parametrize('dtype, storage_format', [(ti.f32, 'col_major'), From 5cea4496f3e458168420680cd25d22482b4741f5 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 26 May 2022 16:31:11 +0800 Subject: [PATCH 120/176] [bug] Added type promotion support for atan2 (#5037) * [bug] Added type promotion rule for atan2 * Fixed minor issue * Modified type promotion rule --- docs/lang/articles/basic/type.md | 8 +++++-- taichi/ir/frontend_ir.cpp | 11 +++++++++ taichi/transforms/type_check.cpp | 18 +++++++++++++++ tests/python/test_type_promotion.py | 36 +++++++++++++++++++++++++++++ 4 files changed, 71 insertions(+), 2 deletions(-) diff --git a/docs/lang/articles/basic/type.md b/docs/lang/articles/basic/type.md index 1e2fd7511ced5..8af09258af9e3 100644 --- a/docs/lang/articles/basic/type.md +++ b/docs/lang/articles/basic/type.md @@ 
-125,8 +125,12 @@ A few exceptions: - `u8 << i32 -> u8` - `i16 << i8 -> i16` -2. logical operations: always return i32 -3. comparison operations: always return i32 +2. atan2 operation: return fp64 if either lhs or rhs is fp64, otherwise return fp32. +- `i32 atan f32 -> f32` +- `i32 atan f64 -> f64` + +3. logical operations: always return i32 +4. comparison operations: always return i32 #### Implicit type casting in assignments diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index e5fcb6d18cf83..45627374e1ddf 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -202,6 +202,17 @@ void BinaryOpExpression::type_check(CompileConfig *config) { return; } + // Some backends such as vulkan doesn't support fp64 + // Try not promoting to fp64 unless neccessary + if (type == BinaryOpType::atan2) { + if (lhs_type == PrimitiveType::f64 || rhs_type == PrimitiveType::f64) { + ret_type = PrimitiveType::f64; + } else { + ret_type = PrimitiveType::f32; + } + return; + } + if (type == BinaryOpType::truediv) { auto default_fp = config->default_fp; if (!is_real(lhs_type)) { diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index fc3092b01b105..a92b85ce96fed 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -268,6 +268,9 @@ class TypeCheck : public IRVisitor { } void cast(Stmt *&val, DataType dt) { + if (val->ret_type == dt) + return; + auto cast_stmt = insert_type_cast_after(val, val, dt); val = cast_stmt; } @@ -301,6 +304,21 @@ class TypeCheck : public IRVisitor { stmt->op_type = BinaryOpType::div; } + // Some backends such as vulkan doesn't support fp64 + // Always promote to fp32 unless neccessary + if (stmt->op_type == BinaryOpType::atan2) { + if (stmt->rhs->ret_type == PrimitiveType::f64 || + stmt->lhs->ret_type == PrimitiveType::f64) { + stmt->ret_type = PrimitiveType::f64; + cast(stmt->rhs, PrimitiveType::f64); + cast(stmt->lhs, PrimitiveType::f64); + } else { + 
stmt->ret_type = PrimitiveType::f32; + cast(stmt->rhs, PrimitiveType::f32); + cast(stmt->lhs, PrimitiveType::f32); + } + } + if (stmt->lhs->ret_type != stmt->rhs->ret_type) { auto promote_custom_int_type = [&](Stmt *stmt, Stmt *hs) { if (auto cit = hs->ret_type->cast()) { diff --git a/tests/python/test_type_promotion.py b/tests/python/test_type_promotion.py index abaade3775d7c..3374f7c807323 100644 --- a/tests/python/test_type_promotion.py +++ b/tests/python/test_type_promotion.py @@ -60,3 +60,39 @@ def func(): func() assert np.allclose(y.to_numpy(), np.sqrt(x.to_numpy())) + + +@test_utils.test() +def test_atan2(): + N = 1 + x = ti.field(ti.i32, shape=(N, )) + y = ti.field(ti.i32, shape=(N, )) + + @ti.kernel + def test_case_0() -> ti.f32: + i = ti.i32(2) + return ti.atan2(i, 1) + + @ti.kernel + def test_case_1() -> ti.f32: + x[0] = ti.i32(2) + return ti.atan2(x[0], 1) + + @ti.kernel + def test_case_2() -> ti.f32: + x[0] = ti.i32(3) + y[0] = ti.i32(1) + return ti.atan2(x[0], y[0]) + + ti_res0 = test_case_0() + np_res0 = np.arctan2(2, 1) + + ti_res1 = test_case_1() + np_res1 = np.arctan2(2, 1) + + ti_res2 = test_case_2() + np_res2 = np.arctan2(3, 1) + + assert np.allclose(ti_res0, np_res0) + assert np.allclose(ti_res1, np_res1) + assert np.allclose(ti_res2, np_res2) From 235c9bb551be03aef231a404e7813644276fb57b Mon Sep 17 00:00:00 2001 From: Vissidarte-Herman <93570324+Vissidarte-Herman@users.noreply.github.com> Date: Fri, 27 May 2022 11:23:40 +0800 Subject: [PATCH 121/176] [Doc] Updated type system (#5054) * Editorial updates * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- docs/lang/articles/basic/type.md | 46 +++++++++++++++----------------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/docs/lang/articles/basic/type.md b/docs/lang/articles/basic/type.md index 8af09258af9e3..3fc01da91f6d3 100644 --- 
a/docs/lang/articles/basic/type.md +++ b/docs/lang/articles/basic/type.md @@ -100,37 +100,35 @@ As a rule of thumb, implicit type casting is a major source of bugs. And Taichi #### Implicit type casting in binary operations -Taichi implements its own implicit type casting rules for binary operations, which are slightly different from [those for the C programming language](https://en.cppreference.com/w/c/language/conversion). +Taichi implements its own implicit type casting rules for binary operations, which are slightly different from [those for the C programming language](https://en.cppreference.com/w/c/language/conversion). In general we have three rules in descending order of priority: -In general we have three rules with descending priority: +1. Integer + floating point -> floating point + - `i32 + f32 -> f32` + - `i16 + f16 -> f16` -1. integral OP floating_point -> floating_point -- `i32 + f32 -> f32` -- `i16 + f16 -> f16` +2. Low-precision bits + high-precision bits -> high-precision bits + - `i16 + i32 -> i32` + - `f16 + f32 -> f32` + - `u8 + u16 -> u16` -2. low_precision_bits OP high_precision_bits -> high_precision_bits -- `i16 + i32 -> i32` -- `u8 + u16 -> u16` +3. Signed integer + unsigned integer -> unsigned integer + - `u32 + i32 -> u32` + - `u8 + i8 -> u8` -3. signed OP unsigned -> unsigned -- `u32 + i32 -> u32` -- `u8 + i8 -> u8` - -For conflicting rules, only the highest priority one will be applied. -- `u8 + i16 -> i16` (rule #2 conflicts with rule #3: apply rule #2) -- `f16 + i32 -> f16` (rule #1 conflicts with rule #2: apply rule #1) +When it comes to rule conflicts, the rule of the highest priority applies: + - `u8 + i16 -> i16` (when rule #2 conflicts with rule #3, rule #2 applies.) + - `f16 + i32 -> f16` (when rule #1 conflicts with rule #2, rule #1 applies.) A few exceptions: -1. bit-shift operations: always follow lhs's dtype -- `u8 << i32 -> u8` -- `i16 << i8 -> i16` - -2. 
atan2 operation: return fp64 if either lhs or rhs is fp64, otherwise return fp32. -- `i32 atan f32 -> f32` -- `i32 atan f64 -> f64` -3. logical operations: always return i32 -4. comparison operations: always return i32 +- bit-shift operations return lhs' (left hand side's) data type: + - `u8 << i32 -> u8` + - `i16 << i8 -> i16` +- atan2 operations return `f64` if either side is `f64`, or `f32` otherwise. + - `i32 atan f32 -> f32` + - `i32 atan f64 -> f64` +- Logical operations return `i32`. +- Comparison operations return `i32`. #### Implicit type casting in assignments From 8966069d5721f2affe2a1317eaeaf080f04b1a40 Mon Sep 17 00:00:00 2001 From: Ailing Date: Fri, 27 May 2022 11:35:38 +0800 Subject: [PATCH 122/176] [bug] Ndarray type should include primitive dtype as well (#5052) This PR does three things: - Switch cgraph `Arg` to take in `ti.f32/i32` instead of string `f32/i32` as inputs - Fix a bug that when we produce injected ndarray args for compilation we only produced f32 ndarrays, which won't work for ndarray of other primitive dtypes. - No need to specify `element_shape` if it's scalar arg or scalar ndarray arg. 
--- python/taichi/aot/utils.py | 14 ++++----- python/taichi/examples/graph/mpm88_graph.py | 12 ++++---- python/taichi/types/ndarray_type.py | 5 ++++ taichi/aot/graph_data.h | 33 +++++++++++++++++++-- taichi/python/export_lang.cpp | 9 +++--- tests/cpp/aot/aot_save_load_test.cpp | 8 ++--- tests/cpp/program/graph_test.cpp | 5 ++-- tests/python/test_aot.py | 12 ++++---- tests/python/test_graph.py | 23 ++++++++++++++ 9 files changed, 88 insertions(+), 33 deletions(-) create mode 100644 tests/python/test_graph.py diff --git a/python/taichi/aot/utils.py b/python/taichi/aot/utils.py index fa6a0783c575a..e0cbf3646cac4 100644 --- a/python/taichi/aot/utils.py +++ b/python/taichi/aot/utils.py @@ -4,7 +4,6 @@ from taichi.lang.matrix import MatrixNdarray, VectorNdarray from taichi.types.annotations import template from taichi.types.ndarray_type import NdarrayType -from taichi.types.primitive_types import f32 template_types = (NdarrayType, template) @@ -29,15 +28,16 @@ def produce_injected_args_from_template(kernel, template_args): def produce_injected_args(kernel, symbolic_args=None): injected_args = [] - for j, arg in enumerate(kernel.arguments): + for i, arg in enumerate(kernel.arguments): anno = arg.annotation if isinstance(anno, template_types): if not isinstance(anno, NdarrayType): raise TaichiCompilationError( f'Expected Ndaray type, got {anno}') if symbolic_args is not None: - anno.element_shape = tuple(symbolic_args[j].element_shape) + anno.element_shape = tuple(symbolic_args[i].element_shape) anno.element_dim = len(anno.element_shape) + anno.dtype = symbolic_args[i].dtype() if anno.element_shape is None or anno.field_dim is None: raise TaichiCompilationError( @@ -45,19 +45,19 @@ def produce_injected_args(kernel, symbolic_args=None): 'in the param annotation, or provide an example ' f'ndarray for param={arg.name}') if anno.element_dim == 0: - injected_args.append(ScalarNdarray(f32, - (2, ) * anno.field_dim)) + injected_args.append( + ScalarNdarray(anno.dtype, (2, ) * 
anno.field_dim)) elif anno.element_dim == 1: injected_args.append( VectorNdarray(anno.element_shape[0], - dtype=f32, + dtype=anno.dtype, shape=(2, ) * anno.field_dim, layout=Layout.AOS)) elif anno.element_dim == 2: injected_args.append( MatrixNdarray(anno.element_shape[0], anno.element_shape[1], - dtype=f32, + dtype=anno.dtype, shape=(2, ) * anno.field_dim, layout=Layout.AOS)) else: diff --git a/python/taichi/examples/graph/mpm88_graph.py b/python/taichi/examples/graph/mpm88_graph.py index d7a15119718cd..430142b6022d2 100644 --- a/python/taichi/examples/graph/mpm88_graph.py +++ b/python/taichi/examples/graph/mpm88_graph.py @@ -113,12 +113,12 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), if not args.baseline: print('running in graph mode') # Build graph - sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'x', 'f32', element_shape=(2, )) - sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'v', 'f32', element_shape=(2, )) - sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'C', 'f32', element_shape=(2, 2)) - sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'J', 'f32', element_shape=()) - sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_v', 'f32', element_shape=(2, )) - sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_m', 'f32', element_shape=()) + sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'x', ti.f32, element_shape=(2, )) + sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'v', ti.f32, element_shape=(2, )) + sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'C', ti.f32, element_shape=(2, 2)) + sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'J', ti.f32) + sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_v', ti.f32, element_shape=(2, )) + sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_m', ti.f32) g_init = ti.graph.Graph() g_init.dispatch(init_particles, sym_x, sym_v, sym_J) diff --git a/python/taichi/types/ndarray_type.py b/python/taichi/types/ndarray_type.py index c6ff82f453550..0bc2c610c3414 100644 --- 
a/python/taichi/types/ndarray_type.py +++ b/python/taichi/types/ndarray_type.py @@ -1,3 +1,6 @@ +from taichi.types.primitive_types import f32 + + class NdarrayType: """Type annotation for arbitrary arrays, including external arrays (numpy ndarrays and torch tensors) and Taichi ndarrays. @@ -11,6 +14,7 @@ class NdarrayType: layout (Union[Layout, NoneType], optional): None if not specified (will be treated as Layout.AOS for external arrays), Layout.AOS or Layout.SOA. """ def __init__(self, + dtype=f32, element_dim=None, element_shape=None, field_dim=None, @@ -24,6 +28,7 @@ def __init__(self, raise ValueError( f"Both element_shape and element_dim are specified, but shape doesn't match specified dim: {len(element_shape)}!={element_dim}" ) + self.dtype = dtype self.element_shape = element_shape self.element_dim = len( element_shape) if element_shape is not None else element_dim diff --git a/taichi/aot/graph_data.h b/taichi/aot/graph_data.h index aa2c2479d1d67..db87ca8702018 100644 --- a/taichi/aot/graph_data.h +++ b/taichi/aot/graph_data.h @@ -2,6 +2,7 @@ #include #include #include +#include "taichi/ir/type.h" #include "taichi/aot/module_data.h" template @@ -23,10 +24,38 @@ struct Arg { ArgKind tag; std::string name; // TODO: real element dtype = dtype + element_shape - std::string dtype_name; + PrimitiveTypeID dtype_id; std::vector element_shape; - TI_IO_DEF(name, dtype_name, tag, element_shape); + // For serialization & deserialization + explicit Arg() + : tag(ArgKind::kUnknown), + name(""), + dtype_id(PrimitiveTypeID::unknown), + element_shape({}) { + } + + explicit Arg(ArgKind tag, + const std::string &name, + PrimitiveTypeID dtype_id, + const std::vector &element_shape) + : tag(tag), name(name), dtype_id(dtype_id), element_shape(element_shape) { + } + + // Python/C++ interface that's user facing. 
+ explicit Arg(ArgKind tag, + const std::string &name, + const DataType &dtype, + const std::vector &element_shape = {}) + : tag(tag), name(name), element_shape(element_shape) { + dtype_id = dtype->as()->type; + } + + DataType dtype() const { + return PrimitiveType::get(dtype_id); + } + + TI_IO_DEF(name, dtype_id, tag, element_shape); }; /** diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index f4f63e458bbca..cc8423c1fde50 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -546,11 +546,12 @@ void export_lang(py::module &m) { .export_values(); py::class_(m, "Arg") - .def(py::init>(), - py::arg("tag"), py::arg("name"), py::arg("dtype_name"), - py::arg("element_shape")) + .def(py::init>(), + py::arg("tag"), py::arg("name"), py::arg("dtype"), + py::arg("element_shape") = py::tuple()) .def_readonly("name", &aot::Arg::name) - .def_readonly("element_shape", &aot::Arg::element_shape); + .def_readonly("element_shape", &aot::Arg::element_shape) + .def("dtype", &aot::Arg::dtype); py::class_(m, "Node"); diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index e0c0211fb195a..8142029e8d4f9 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -311,12 +311,10 @@ TEST(AotSaveLoad, VulkanNdarray) { auto g_builder = std::make_unique(); auto seq = g_builder->seq(); - auto arr_arg = aot::Arg{ - aot::ArgKind::kNdarray, "arr", PrimitiveType::i32.to_string(), {}}; + auto arr_arg = aot::Arg{aot::ArgKind::kNdarray, "arr", PrimitiveType::i32}; seq->dispatch(ker1.get(), {arr_arg}); - seq->dispatch(ker2.get(), - {arr_arg, aot::Arg{aot::ArgKind::kScalar, "x", - PrimitiveType::i32.to_string()}}); + seq->dispatch(ker2.get(), {arr_arg, aot::Arg{aot::ArgKind::kScalar, "x", + PrimitiveType::i32}}); auto graph = g_builder->compile(); aot_builder->add_graph("test", *graph); diff --git a/tests/cpp/program/graph_test.cpp b/tests/cpp/program/graph_test.cpp index 
2e062a31da7cb..958adb2f9d461 100644 --- a/tests/cpp/program/graph_test.cpp +++ b/tests/cpp/program/graph_test.cpp @@ -30,13 +30,12 @@ TEST(GraphTest, SimpleGraphRun) { auto g_builder = std::make_unique(); auto seq = g_builder->seq(); - auto arr_arg = aot::Arg{ - aot::ArgKind::kNdarray, "arr", PrimitiveType::i32.to_string(), {}}; + auto arr_arg = aot::Arg{aot::ArgKind::kNdarray, "arr", PrimitiveType::i32}; seq->dispatch(ker1.get(), {arr_arg}); seq->dispatch(ker2.get(), {arr_arg, aot::Arg{ aot::ArgKind::kScalar, "x", - PrimitiveType::i32.to_string(), + PrimitiveType::i32, }}); auto g = g_builder->compile(); diff --git a/tests/python/test_aot.py b/tests/python/test_aot.py index 9cb531d6efe65..55fb7a59cf8e2 100644 --- a/tests/python/test_aot.py +++ b/tests/python/test_aot.py @@ -714,27 +714,27 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'x', - 'f32', + ti.f32, element_shape=(2, )) sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'v', - 'f32', + ti.f32, element_shape=(2, )) sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'C', - 'f32', + ti.f32, element_shape=(2, 2)) sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'J', - 'f32', + ti.f32, element_shape=()) sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_v', - 'f32', + ti.f32, element_shape=(2, )) sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_m', - 'f32', + ti.f32, element_shape=()) g_init = ti.graph.Graph() g_init.dispatch(init_particles, sym_x, sym_v, sym_J) diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py new file mode 100644 index 0000000000000..5f4023725c581 --- /dev/null +++ b/tests/python/test_graph.py @@ -0,0 +1,23 @@ +import numpy as np + +import taichi as ti +from tests import test_utils + + +@test_utils.test(arch=ti.vulkan) +def test_ndarray_int(): + n = 4 + + @ti.kernel + def test(pos: ti.types.ndarray(field_dim=1, element_shape=())): + for i in range(n): + pos[i] = 1 + + sym_pos = 
ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'pos', ti.i32) + g_init = ti.graph.Graph() + g_init.dispatch(test, sym_pos) + g_init.compile() + + a = ti.ndarray(ti.i32, shape=(n, )) + g_init.run({'pos': a}) + assert (a.to_numpy() == np.ones(4)).all() From 78a013996a7a08c240cf431463e7aaeea120187e Mon Sep 17 00:00:00 2001 From: daylily Date: Sun, 29 May 2022 13:48:45 +0800 Subject: [PATCH 123/176] [Lang] [ir] Add short-circuit if-then-else operator (#5022) * [Lang] [ir] Add a short-circuit if-then-else operator and use it to implement IfExp * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/ast/ast_transformer.py | 12 +---- python/taichi/lang/ops.py | 23 +++++++++ taichi/ir/expression_ops.h | 1 + taichi/ir/frontend_ir.cpp | 51 ++++++++++++++++--- taichi/ir/stmt_op_types.cpp | 1 + taichi/ir/stmt_op_types.h | 2 +- taichi/python/export_lang.cpp | 1 + tests/cpp/ir/frontend_type_inference_test.cpp | 10 ++-- tests/python/test_type_check.py | 2 +- 9 files changed, 78 insertions(+), 25 deletions(-) diff --git a/python/taichi/lang/ast/ast_transformer.py b/python/taichi/lang/ast/ast_transformer.py index 9ffc80db38c0e..8a074f27dc500 100644 --- a/python/taichi/lang/ast/ast_transformer.py +++ b/python/taichi/lang/ast/ast_transformer.py @@ -1108,17 +1108,7 @@ def build_IfExp(ctx, node): node.ptr = build_stmt(ctx, node.orelse) return node.ptr - val = impl.expr_init(None) - - impl.begin_frontend_if(ctx.ast_builder, node.test.ptr) - ctx.ast_builder.begin_frontend_if_true() - val._assign(node.body.ptr) - ctx.ast_builder.pop_scope() - ctx.ast_builder.begin_frontend_if_false() - val._assign(node.orelse.ptr) - ctx.ast_builder.pop_scope() - - node.ptr = val + node.ptr = ti_ops.ifte(node.test.ptr, node.body.ptr, node.orelse.ptr) return node.ptr @staticmethod diff --git a/python/taichi/lang/ops.py b/python/taichi/lang/ops.py index 
e94675afad1c8..84fd67c069fc8 100644 --- a/python/taichi/lang/ops.py +++ b/python/taichi/lang/ops.py @@ -1143,6 +1143,29 @@ def py_select(cond, x1, x2): return _ternary_operation(_ti_core.expr_select, py_select, cond, x1, x2) +@ternary +def ifte(cond, x1, x2): + """Evaluate and return `x1` if `cond` is true; otherwise evaluate and return `x2`. This operator guarantees + short-circuit semantics: exactly one of `x1` or `x2` will be evaluated. + + Args: + cond (:mod:`~taichi.types.primitive_types`): \ + The condition. + x1, x2 (:mod:`~taichi.types.primitive_types`): \ + The outputs. + + Returns: + `x1` if `cond` is true and `x2` otherwise. + """ + # TODO: systematically resolve `-1 = True` problem by introducing u1: + cond = logical_not(logical_not(cond)) + + def py_ifte(cond, x1, x2): + return x1 if cond else x2 + + return _ternary_operation(_ti_core.expr_ifte, py_ifte, cond, x1, x2) + + @writeback_binary def atomic_add(x, y): """Atomically compute `x + y`, store the result in `x`, diff --git a/taichi/ir/expression_ops.h b/taichi/ir/expression_ops.h index dd3b7ee255c7b..dd18413c8867f 100644 --- a/taichi/ir/expression_ops.h +++ b/taichi/ir/expression_ops.h @@ -124,6 +124,7 @@ DEFINE_EXPRESSION_FUNC_BINARY(floordiv) DEFINE_EXPRESSION_FUNC_BINARY(bit_shr) DEFINE_EXPRESSION_FUNC_TERNARY(select) +DEFINE_EXPRESSION_FUNC_TERNARY(ifte) } // namespace lang } // namespace taichi diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 45627374e1ddf..cfc5a8f4ae3c8 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -263,6 +263,37 @@ void BinaryOpExpression::flatten(FlattenContext *ctx) { stmt = ctx->back_stmt(); } +void make_ifte(Expression::FlattenContext *ctx, + DataType ret_type, + Expr cond, + Expr true_val, + Expr false_val) { + auto result = ctx->push_back(ret_type); + flatten_rvalue(cond, ctx); + auto if_stmt = ctx->push_back(cond->stmt); + + Expression::FlattenContext lctx; + lctx.current_block = ctx->current_block; + 
flatten_rvalue(true_val, &lctx); + lctx.push_back(result, true_val->stmt); + + Expression::FlattenContext rctx; + rctx.current_block = ctx->current_block; + flatten_rvalue(false_val, &rctx); + rctx.push_back(result, false_val->stmt); + + auto true_block = std::make_unique(); + true_block->set_statements(std::move(lctx.stmts)); + if_stmt->set_true_statements(std::move(true_block)); + + auto false_block = std::make_unique(); + false_block->set_statements(std::move(rctx.stmts)); + if_stmt->set_false_statements(std::move(false_block)); + + ctx->push_back(LocalAddress(result, 0)); + return; +} + void TernaryOpExpression::type_check(CompileConfig *) { TI_ASSERT_TYPE_CHECKED(op1); TI_ASSERT_TYPE_CHECKED(op2); @@ -276,8 +307,9 @@ void TernaryOpExpression::type_check(CompileConfig *) { ternary_type_name(type), op1->ret_type->to_string(), op2->ret_type->to_string(), op3->ret_type->to_string())); }; - if (!is_integral(op1_type) || !op2_type->is() || - !op3_type->is()) + if (op1_type != PrimitiveType::i32) + error(); + if (!op2_type->is() || !op3_type->is()) error(); ret_type = promoted_type(op2_type, op3_type); } @@ -285,12 +317,17 @@ void TernaryOpExpression::type_check(CompileConfig *) { void TernaryOpExpression::flatten(FlattenContext *ctx) { // if (stmt) // return; - flatten_rvalue(op1, ctx); - flatten_rvalue(op2, ctx); - flatten_rvalue(op3, ctx); - ctx->push_back( - std::make_unique(type, op1->stmt, op2->stmt, op3->stmt)); + if (type == TernaryOpType::select) { + flatten_rvalue(op1, ctx); + flatten_rvalue(op2, ctx); + flatten_rvalue(op3, ctx); + ctx->push_back( + std::make_unique(type, op1->stmt, op2->stmt, op3->stmt)); + } else if (type == TernaryOpType::ifte) { + make_ifte(ctx, ret_type, op1, op2, op3); + } stmt = ctx->back_stmt(); + stmt->tb = tb; } void InternalFuncCallExpression::type_check(CompileConfig *) { diff --git a/taichi/ir/stmt_op_types.cpp b/taichi/ir/stmt_op_types.cpp index 74d7edfaaa4e7..a5f492f869c0a 100644 --- a/taichi/ir/stmt_op_types.cpp +++ 
b/taichi/ir/stmt_op_types.cpp @@ -77,6 +77,7 @@ std::string ternary_type_name(TernaryOpType type) { return #i; REGISTER_TYPE(select); + REGISTER_TYPE(ifte); #undef REGISTER_TYPE default: diff --git a/taichi/ir/stmt_op_types.h b/taichi/ir/stmt_op_types.h index 5e04525930a3f..a71d5512cd6ed 100644 --- a/taichi/ir/stmt_op_types.h +++ b/taichi/ir/stmt_op_types.h @@ -62,7 +62,7 @@ inline bool is_bit_op(BinaryOpType type) { std::string binary_op_type_symbol(BinaryOpType type); -enum class TernaryOpType : int { select, undefined }; +enum class TernaryOpType : int { select, ifte, undefined }; std::string ternary_type_name(TernaryOpType type); diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index cc8423c1fde50..96c55be9890b7 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -763,6 +763,7 @@ void export_lang(py::module &m) { DEFINE_EXPRESSION_OP(log) DEFINE_EXPRESSION_OP(select) + DEFINE_EXPRESSION_OP(ifte) DEFINE_EXPRESSION_OP(cmp_le) DEFINE_EXPRESSION_OP(cmp_lt) diff --git a/tests/cpp/ir/frontend_type_inference_test.cpp b/tests/cpp/ir/frontend_type_inference_test.cpp index 75131240abcce..570a6114cb7da 100644 --- a/tests/cpp/ir/frontend_type_inference_test.cpp +++ b/tests/cpp/ir/frontend_type_inference_test.cpp @@ -66,16 +66,16 @@ TEST(FrontendTypeInference, UnaryOp) { } TEST(FrontendTypeInference, TernaryOp) { - auto const_i16 = value(-(1 << 10)); - const_i16->type_check(nullptr); - EXPECT_EQ(const_i16->ret_type, PrimitiveType::i16); - auto cast_i8 = cast(const_i16, PrimitiveType::i8); + auto const_i32 = value(-(1 << 10)); + const_i32->type_check(nullptr); + EXPECT_EQ(const_i32->ret_type, PrimitiveType::i32); + auto cast_i8 = cast(const_i32, PrimitiveType::i8); cast_i8->type_check(nullptr); EXPECT_EQ(cast_i8->ret_type, PrimitiveType::i8); auto const_f32 = value(5.0); const_f32->type_check(nullptr); EXPECT_EQ(const_f32->ret_type, PrimitiveType::f32); - auto ternary_f32 = expr_select(const_i16, cast_i8, const_f32); + 
auto ternary_f32 = expr_select(const_i32, cast_i8, const_f32); ternary_f32->type_check(nullptr); EXPECT_EQ(ternary_f32->ret_type, PrimitiveType::f32); } diff --git a/tests/python/test_type_check.py b/tests/python/test_type_check.py index fe5ae0d8f5154..964e24131b106 100644 --- a/tests/python/test_type_check.py +++ b/tests/python/test_type_check.py @@ -41,7 +41,7 @@ def select(): d = b if a else c with pytest.raises(TypeError, - match="`if` conditions must be of type int32"): + match="unsupported operand type\\(s\\) for 'ifte'"): select() From 359afd22aff6caf7c2e81def79c597184f385753 Mon Sep 17 00:00:00 2001 From: bsavery Date: Sat, 28 May 2022 23:48:46 -0700 Subject: [PATCH 124/176] [Lang] Struct Classes implementation (#4989) * Initial Struct Classes implementation * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update method translation to mark as taichi funcs * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Revert unwanted changes to impl.py * Update test_api.py * Update struct.py Update with review comments. 
* Update class decorator docstring * Update func marking and add tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update tests/python/test_custom_struct.py Co-authored-by: Yi Xu * Update docstrings * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update python/taichi/lang/struct.py Co-authored-by: Yi Xu * Update python/taichi/lang/struct.py Co-authored-by: Yi Xu * Update python/taichi/lang/struct.py Co-authored-by: Yi Xu Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Yi Xu --- python/taichi/lang/impl.py | 8 +-- python/taichi/lang/struct.py | 92 +++++++++++++++++++++++++++--- tests/python/test_api.py | 8 +-- tests/python/test_custom_struct.py | 41 +++++++++++++ 4 files changed, 133 insertions(+), 16 deletions(-) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 90cfc7446d3af..6147f27e2e4ae 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -41,7 +41,7 @@ def expr_init(rhs): if isinstance(rhs, Matrix): return Matrix(rhs.to_list()) if isinstance(rhs, Struct): - return Struct(rhs.to_dict()) + return Struct(rhs.to_dict(include_methods=True)) if isinstance(rhs, list): return [expr_init(e) for e in rhs] if isinstance(rhs, tuple): @@ -168,9 +168,9 @@ def subscript(value, *_indices, skip_reordered=False): if isinstance(value, MatrixField): return _MatrixFieldElement(value, indices_expr_group) if isinstance(value, StructField): - return _IntermediateStruct( - {k: subscript(v, *_indices) - for k, v in value._items}) + entries = {k: subscript(v, *_indices) for k, v in value._items} + entries['__struct_methods'] = value.struct_methods + return _IntermediateStruct(entries) return Expr(_ti_core.subscript(_var, indices_expr_group)) if isinstance(value, AnyArray): # TODO: deprecate using get_attribute to get dim diff --git a/python/taichi/lang/struct.py 
b/python/taichi/lang/struct.py index cb750b8eee578..0d197ef78247e 100644 --- a/python/taichi/lang/struct.py +++ b/python/taichi/lang/struct.py @@ -1,4 +1,5 @@ import numbers +from types import MethodType from taichi.lang import expr, impl, ops from taichi.lang.common_ops import TaichiOperations @@ -21,7 +22,9 @@ class Struct(TaichiOperations): Args: entries (Dict[str, Union[Dict, Expr, Matrix, Struct]]): \ - keys and values for struct members. + keys and values for struct members. Entries can optionally + include a dictionary of functions with the key '__struct_methods' + which will be attached to the struct for executing on the struct data. Returns: An instance of this struct. @@ -49,6 +52,9 @@ def __init__(self, *args, **kwargs): raise TaichiSyntaxError( "Custom structs need to be initialized using either dictionary or keyword arguments" ) + self.methods = self.entries.pop("__struct_methods", {}) + self._register_methods() + for k, v in self.entries.items(): if isinstance(v, (list, tuple)): v = Matrix(v) @@ -95,6 +101,11 @@ def _register_members(self): Struct._make_setter(k), )) + def _register_methods(self): + for name, method in self.methods.items(): + # use MethodType to pass self (this object) to the method + setattr(self, name, MethodType(method, self)) + def __getitem__(self, key): ret = self.entries[key] if isinstance(ret, SNodeHostAccess): @@ -233,24 +244,30 @@ def __str__(self): def __repr__(self): return str(self.to_dict()) - def to_dict(self): + def to_dict(self, include_methods=False): """Converts the Struct to a dictionary. Args: + include_methods (bool): Whether any struct methods should be included + in the result dictionary under the key '__struct_methods'. Returns: Dict: The result dictionary. 
""" - return { + res_dict = { k: v.to_dict() if isinstance(v, Struct) else v.to_list() if isinstance(v, Matrix) else v for k, v in self.entries.items() } + if include_methods: + res_dict['__struct_methods'] = self.methods + return res_dict @classmethod @python_scope def field(cls, members, + methods={}, shape=None, name="", offset=None, @@ -261,6 +278,9 @@ def field(cls, Args: members (dict): a dict, each item is like `name: type`. + methods (dict): a dict of methods that should be included with + the field. Each struct item of the field will have the + methods as instance functions. shape (Tuple[int]): width and height of the field. offset (Tuple[int]): offset of the indices of the created field. For example if `offset=(-10, -10)` the indices of the field @@ -336,7 +356,7 @@ def field(cls, grads = tuple(e.grad for e in field_dict.values()) impl.root.dense(impl.index_nd(dim), shape).place(*grads, offset=offset) - return StructField(field_dict, name=name) + return StructField(field_dict, methods, name=name) class _IntermediateStruct(Struct): @@ -344,9 +364,13 @@ class _IntermediateStruct(Struct): Args: entries (Dict[str, Union[Expr, Matrix, Struct]]): keys and values for struct members. + Any methods included under the key '__struct_methods' will be applied to each + struct instance. """ def __init__(self, entries): assert isinstance(entries, dict) + self.methods = entries.pop('__struct_methods', {}) + self._register_methods() self.entries = entries self._register_members() @@ -359,11 +383,14 @@ class StructField(Field): Args: field_dict (Dict[str, Field]): Struct field members. + struct_methods (Dict[str, callable]): Dictionary of functions to apply + to each struct instance in the field. name (string, optional): The custom name of the field. 
""" - def __init__(self, field_dict, name=None): + def __init__(self, field_dict, struct_methods, name=None): # will not call Field initializer self.field_dict = field_dict + self.struct_methods = struct_methods self.name = name self._register_fields() @@ -572,14 +599,18 @@ def __getitem__(self, indices): v, ScalarField) else v[indices] for k, v in self._items } + entries['__struct_methods'] = self.struct_methods return Struct(entries) class StructType(CompoundType): def __init__(self, **kwargs): self.members = {} + self.methods = {} for k, dtype in kwargs.items(): - if isinstance(dtype, CompoundType): + if k == '__struct_methods': + self.methods = dtype + elif isinstance(dtype, CompoundType): self.members[k] = dtype else: self.members[k] = cook_dtype(dtype) @@ -620,6 +651,7 @@ def cast(self, struct): ) if dtype in primitive_types.integer_types else float(v) else: entries[k] = ops.cast(struct.entries[k], dtype) + entries['__struct_methods'] = self.methods return Struct(entries) def filled_with_scalar(self, value): @@ -629,10 +661,54 @@ def filled_with_scalar(self, value): entries[k] = dtype.filled_with_scalar(value) else: entries[k] = value + entries['__struct_methods'] = self.methods return Struct(entries) def field(self, **kwargs): - return Struct.field(self.members, **kwargs) + return Struct.field(self.members, self.methods, **kwargs) + +def struct_class(cls): + """Converts a class with field annotations and methods into a taichi struct type. + + This will return a normal custom struct type, with the functions added to it. + Struct fields can be generated in the normal way from the struct type. + Functions in the class can be run on the struct instance. + + This class decorator inspects the class for annotations and methods and + 1. Sets the annotations as fields for the struct + 2. 
Attaches the methods to the struct type + + Example:: -__all__ = ["Struct", "StructField"] + >>> @ti.stuct_class + >>> class Sphere: + >>> center: vec3 + >>> radius: ti.f32 + >>> + >>> @ti.func + >>> def area(self): + >>> return 4 * 3.14 * self.radius * self.radius + >>> + >>> my_spheres = Sphere.field(shape=(n, )) + >>> my_sphere[2].area() + + Args: + cls (Class): the class with annotations and methods to convert to a struct + + Returns: + A taichi struct with the annotations as fields + and methods from the class attached. + """ + # save the annotaion fields for the struct + fields = cls.__annotations__ + # get the class methods to be attached to the struct types + fields['__struct_methods'] = { + attribute: getattr(cls, attribute) + for attribute in dir(cls) + if callable(getattr(cls, attribute)) and not attribute.startswith('__') + } + return StructType(**fields) + + +__all__ = ["Struct", "StructField", "struct_class"] diff --git a/tests/python/test_api.py b/tests/python/test_api.py index 9f51201fd9f9f..a390ac63cebc3 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -83,10 +83,10 @@ def _get_expected_matrix_apis(): 'randn', 'random', 'raw_div', 'raw_mod', 'ref', 'rescale_index', 'reset', 'rgb_to_hex', 'root', 'round', 'rsqrt', 'select', 'set_logging_level', 'simt', 'sin', 'solve', 'sparse_matrix_builder', 'sqrt', 'static', - 'static_assert', 'static_print', 'stop_grad', 'svd', 'swizzle_generator', - 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools', 'types', 'u16', - 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64', 'uint8', 'vulkan', - 'wasm', 'x64', 'x86_64', 'zero' + 'static_assert', 'static_print', 'stop_grad', 'struct_class', 'svd', + 'swizzle_generator', 'sym_eig', 'sync', 'tan', 'tanh', 'template', 'tools', + 'types', 'u16', 'u32', 'u64', 'u8', 'ui', 'uint16', 'uint32', 'uint64', + 'uint8', 'vulkan', 'wasm', 'x64', 'x86_64', 'zero' ] user_api[ti.Field] = [ 'copy_from', 'dtype', 'fill', 'from_numpy', 'from_paddle', 
'from_torch', diff --git a/tests/python/test_custom_struct.py b/tests/python/test_custom_struct.py index 657e8f6820fd7..b6ccedd9eb168 100644 --- a/tests/python/test_custom_struct.py +++ b/tests/python/test_custom_struct.py @@ -204,6 +204,47 @@ def run_python_scope(): assert x[i].line.length == 5.0 +@test_utils.test(exclude=ti.cc) +def test_struct_class(): + # example struct class type + vec3f = ti.types.vector(3, float) + + @ti.struct_class + class Sphere: + center: vec3f + radius: ti.f32 + + @ti.func + def area(self): + return 4 * 3.14 * self.radius * self.radius + + def py_scope_area(self): + return 4 * 3.14 * self.radius * self.radius + + # test function usage from python scope + assert np.isclose( + Sphere(center=vec3f(0.0), radius=2.0).py_scope_area(), + 4.0 * 3.14 * 4.0) + + # test function usage from taichi scope + @ti.kernel + def get_area() -> ti.f32: + sphere = Sphere(center=vec3f(0.0), radius=2.0) + return sphere.area() + + assert np.isclose(get_area(), 4.0 * 3.14 * 4.0) + + # test function usage from taichi scope with field + struct_field = Sphere.field(shape=(4, )) + struct_field[3] = Sphere(center=vec3f(0.0), radius=2.0) + + @ti.kernel + def get_area_field() -> ti.f32: + return struct_field[3].area() + + assert np.isclose(get_area_field(), 4.0 * 3.14 * 4.0) + + @test_utils.test() def test_struct_assign(): n = 32 From 0f4c950ee73e61863e0841bdd71cc13ed1292287 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Mon, 30 May 2022 16:53:13 +0800 Subject: [PATCH 125/176] [Lang] [type] Disallow reading a whole bit_struct (#5061) --- misc/visualize_quant_types.py | 176 ------------------------------- python/taichi/lang/impl.py | 8 -- taichi/ir/expression_printer.h | 6 +- taichi/ir/frontend_ir.cpp | 10 +- taichi/ir/frontend_ir.h | 5 - taichi/python/export_lang.cpp | 4 - taichi/transforms/type_check.cpp | 6 +- 7 files changed, 4 insertions(+), 211 deletions(-) delete mode 100644 misc/visualize_quant_types.py diff --git a/misc/visualize_quant_types.py 
b/misc/visualize_quant_types.py deleted file mode 100644 index 62fb507a456a4..0000000000000 --- a/misc/visualize_quant_types.py +++ /dev/null @@ -1,176 +0,0 @@ -import argparse -import math -import os -from struct import pack, unpack - -import taichi as ti - -ti.init() - -f19 = ti.types.quant.float(exp=6, frac=13, signed=True) -f16 = ti.types.quant.float(exp=5, frac=11, signed=True) -fixed16 = ti.types.quant.fixed(frac=16, range=2) - -vf19 = ti.Vector.field(2, dtype=f19) -bs_vf19 = ti.root.bit_struct(num_bits=32) -bs_vf19.place(vf19, shared_exponent=True) - -vf16 = ti.Vector.field(2, dtype=f16) -bs_vf16 = ti.root.bit_struct(num_bits=32) -bs_vf16.place(vf16) - -vfixed16 = ti.Vector.field(2, dtype=fixed16) -bs_vfixed16 = ti.root.bit_struct(num_bits=32) -bs_vfixed16.place(vfixed16) - - -@ti.kernel -def set_vals(x: ti.f32, y: ti.f32): - val = ti.Vector([x, y]) - vf16[None] = val - vf19[None] = val - vfixed16[None] = val - - -def serialize_i32(x): - s = '' - for i in reversed(range(32)): - s += f'{(x>>i) & 1}' - return s - - -def serialize_f32(x): - b = pack('f', x) - n = unpack('i', b)[0] - return serialize_i32(n) - - -@ti.kernel -def fetch_bs(bs: ti.template()) -> ti.i32: - return bs[None] - - -coord = ti.GUI(res=(800, 800), background_color=0xFFFFFF) -numbers = ti.GUI(res=(800, 800), background_color=0xFFFFFF) - - -def draw_coord(t, f): - cx, cy = 0.5, 0.5 - lx, ly = 0.4, 0.4 - l1 = lx * 0.8 - al = 0.02 - coord.line(begin=(cx - lx, cy), - end=(cx + lx, cy), - radius=3, - color=0x666666) - coord.line(begin=(cx, cy - ly), - end=(cx, cy + ly), - radius=3, - color=0x666666) - coord.line(begin=(cx + lx - al, cy - al), - end=(cx + lx, cy), - radius=3, - color=0x666666) - coord.line(begin=(cx + lx - al, cy + al), - end=(cx + lx, cy), - radius=3, - color=0x666666) - coord.line(begin=(cx - al, cy + ly - al), - end=(cx, cy + ly), - radius=3, - color=0x666666) - coord.line(begin=(cx + al, cy + ly - al), - end=(cx, cy + ly), - radius=3, - color=0x666666) - - def transform(p): - 
return cx + l1 * p[0], cy + l1 * p[1] - - segments = 300 - for i in range(segments): - t1 = i / segments - t2 = (i + 1) / segments - coord.line(begin=transform(f(t1)), - end=transform(f(t2)), - radius=3, - color=0x0) - - coord.circle(pos=transform(f(t)), color=0xDD1122, radius=10) - - -frames = 300 - -parser = argparse.ArgumentParser() -parser.add_argument('-c', '--curve', type=int, help='Curve type', default=0) - -args = parser.parse_args() - -if args.curve == 0: - - def f(t): - return math.cos(t * 2 * math.pi), math.sin(t * 2 * math.pi) -elif args.curve == 1: - - def f(t): - t = math.cos(t * 2 * math.pi) * 0.5 + 0.5 - return 1 - t, t -elif args.curve == 2: - - def f(t): - t = math.cos(t * 2 * math.pi) - t = t * 2.3 - s = 0.1 - return math.exp(t) * s, math.exp(-t) * s - - -folder = f'curve{args.curve}' -os.makedirs(folder, exist_ok=True) - -for i in range(frames * 2 + 1): - t = i / frames - - draw_coord(t, f) - coord.show(f'{folder}/coord_{i:04d}.png') - - x, y = f(t) - set_vals(x, y) - - fs = 100 - color = 0x111111 - - def reorder(b, seg): - r = '' - seg = [0] + seg + [32] - for i in range(len(seg) - 1): - r = r + b[32 - seg[i + 1]:32 - seg[i]] - return r - - def real_to_str(x): - s = '' - if x < 0: - s = '' - else: - s = ' ' - return s + f'{x:.4f}' - - numbers.text(real_to_str(x), (0.05, 0.9), font_size=fs, color=color) - numbers.text(real_to_str(y), (0.55, 0.9), font_size=fs, color=color) - - fs = 49 - - bits = [bs_vf19, bs_vf16, bs_vfixed16] - seg = [[], [], [6, 19], [5, 16, 21], [16]] - bits = list(map(lambda x: serialize_i32(fetch_bs(x)), bits)) - - bits = [serialize_f32(x), serialize_f32(y)] + bits - - for j in range(len(bits)): - b = reorder(bits[j], seg[j]) - numbers.text(b, (0.05, 0.7 - j * 0.15), font_size=fs, color=color) - - numbers.show(f'{folder}/numbers_{i:04d}.png') - -os.system( - f'ti video {folder}/numbers*.png -f 60 -c 2 -o numbers{args.curve}.mp4') -os.system(f'ti video {folder}/coord*.png -f 60 -c 2 -o coord{args.curve}.mp4') diff --git 
a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 6147f27e2e4ae..d15731be885ac 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -191,14 +191,6 @@ def subscript(value, *_indices, skip_reordered=False): ]) ret.any_array_access = any_array_access return ret - if isinstance(value, SNode): - # When reading bit structure we only support the 0-D case for now. - field_dim = 0 - if field_dim != index_dim: - raise IndexError( - f'Field with dim {field_dim} accessed with indices of dim {index_dim}' - ) - return Expr(_ti_core.subscript(value.ptr, indices_expr_group)) # Directly evaluate in Python for non-Taichi types return value.__getitem__(*_indices) diff --git a/taichi/ir/expression_printer.h b/taichi/ir/expression_printer.h index f6bb7e607f2ef..29f391d17334a 100644 --- a/taichi/ir/expression_printer.h +++ b/taichi/ir/expression_printer.h @@ -101,11 +101,7 @@ class ExpressionHumanFriendlyPrinter : public ExpressionPrinter { } void visit(GlobalPtrExpression *expr) override { - if (expr->snode) { - emit(expr->snode->get_node_type_name_hinted()); - } else { - expr->var->accept(this); - } + expr->var->accept(this); emit('['); emit_vector(expr->indices.exprs); emit(']'); diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index cfc5a8f4ae3c8..9ded4ffae1146 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -365,10 +365,7 @@ void GlobalVariableExpression::flatten(FlattenContext *ctx) { void GlobalPtrExpression::type_check(CompileConfig *) { // Currently, dimension compatibility check happens in Python - if (snode != nullptr) { - TI_ASSERT(snode->dt->is()); - ret_type = snode->dt->cast()->get_physical_type(); - } else if (var.is()) { + if (var.is()) { ret_type = var.cast()->snode->dt->get_compute_type(); } else if (var.is()) { @@ -391,10 +388,7 @@ void GlobalPtrExpression::flatten(FlattenContext *ctx) { std::vector index_stmts; std::vector offsets; SNode *snode = nullptr; - if (this->snode != nullptr) { 
- snode = this->snode; - } - if (bool(var) && var.is()) { + if (var.is()) { snode = var.cast()->snode; offsets = snode->index_offsets; } diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index d34f7b8274c7f..93752a4fca9da 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -463,7 +463,6 @@ class GlobalVariableExpression : public Expression { class GlobalPtrExpression : public Expression { public: - SNode *snode{nullptr}; Expr var; ExprGroup indices; @@ -471,10 +470,6 @@ class GlobalPtrExpression : public Expression { : var(var), indices(indices) { } - GlobalPtrExpression(SNode *snode, const ExprGroup &indices) - : snode(snode), indices(indices) { - } - void type_check(CompileConfig *config) override; void flatten(FlattenContext *ctx) override; diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 96c55be9890b7..8b2a5546d8b6b 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -865,10 +865,6 @@ void export_lang(py::module &m) { Expr::make &, int>); - m.def("subscript", [](SNode *snode, const ExprGroup &indices) { - return Expr::make(snode, indices); - }); - m.def("get_external_tensor_dim", [](const Expr &expr) { TI_ASSERT(expr.is()); return expr.cast()->dim; diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index a92b85ce96fed..87c7cc39723c4 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -123,11 +123,7 @@ class TypeCheck : public IRVisitor { void visit(GlobalLoadStmt *stmt) override { auto pointee_type = stmt->src->ret_type.ptr_removed(); - if (auto bit_struct = pointee_type->cast()) { - stmt->ret_type = bit_struct->get_physical_type(); - } else { - stmt->ret_type = pointee_type->get_compute_type(); - } + stmt->ret_type = pointee_type->get_compute_type(); } void visit(SNodeOpStmt *stmt) override { From 2197fad07a03f2142781a7e308c7eb9ed001cd5c Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Mon, 30 May 2022 
16:56:29 +0800 Subject: [PATCH 126/176] [bug] Remove operator ! for Expr (#5062) --- taichi/ir/expr.cpp | 4 ---- taichi/ir/expr.h | 2 -- 2 files changed, 6 deletions(-) diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp index ff7ae1385d5dc..e56c679258c11 100644 --- a/taichi/ir/expr.cpp +++ b/taichi/ir/expr.cpp @@ -50,10 +50,6 @@ SNode *Expr::snode() const { return cast()->snode; } -Expr Expr::operator!() { - return Expr::make(UnaryOpType::logic_not, expr); -} - void Expr::set_grad(const Expr &o) { this->cast()->adjoint.set(o); } diff --git a/taichi/ir/expr.h b/taichi/ir/expr.h index f5d2ba4db8acf..1997431c1fdbc 100644 --- a/taichi/ir/expr.h +++ b/taichi/ir/expr.h @@ -83,8 +83,6 @@ class Expr { Expr operator[](const ExprGroup &indices) const; - Expr operator!(); - template static Expr make(Args &&...args) { return Expr(std::make_shared(std::forward(args)...)); From 69a53feac34f8c1674d59b2c0acb580e50cd79e0 Mon Sep 17 00:00:00 2001 From: Frost Ming Date: Tue, 31 May 2022 10:05:40 +0800 Subject: [PATCH 127/176] [build] [bug] Ensure the assets folder is copied to the project directory (#5063) * Bugfix: ensure the assets folder are copied * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * call copy_assets before setup() Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- setup.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/setup.py b/setup.py index d829a59315604..3f9b5a43cac67 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,6 @@ import multiprocessing import os import shutil -import subprocess import sys from distutils.command.clean import clean from distutils.dir_util import remove_tree @@ -66,17 +65,16 @@ def remove_tmp(taichi_dir): class EggInfo(egg_info): def finalize_options(self, *args, **kwargs): if '' not in self.distribution.package_dir: - #4975: skbuild loses the root package dir + # Issue#4975: skbuild loses the root package dir 
self.distribution.package_dir[''] = package_dir return super().finalize_options(*args, **kwargs) - def run(self): - taichi_dir = os.path.join(package_dir, 'taichi') - remove_tmp(taichi_dir) - shutil.copytree('external/assets', os.path.join(taichi_dir, 'assets')) +def copy_assets(): + taichi_dir = os.path.join(package_dir, 'taichi') + remove_tmp(taichi_dir) - egg_info.run(self) + shutil.copytree('external/assets', os.path.join(taichi_dir, 'assets')) class Clean(clean): @@ -147,6 +145,7 @@ def exclude_paths(manifest_files): ] +copy_assets() setup(name=project_name, packages=packages, package_dir={"": package_dir}, From e6cb2885a61a1072ddd952527a036a645a9a2130 Mon Sep 17 00:00:00 2001 From: Ailing Date: Mon, 30 May 2022 20:39:57 -0700 Subject: [PATCH 128/176] [refactor] Split GraphBuilder out of Graph class (#5064) --- python/taichi/examples/graph/mpm88_graph.py | 14 +++++++------- python/taichi/graph/_graph.py | 12 ++++++++---- tests/python/test_aot.py | 14 +++++++------- tests/python/test_graph.py | 6 +++--- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/python/taichi/examples/graph/mpm88_graph.py b/python/taichi/examples/graph/mpm88_graph.py index 430142b6022d2..50ba7554fa7b8 100644 --- a/python/taichi/examples/graph/mpm88_graph.py +++ b/python/taichi/examples/graph/mpm88_graph.py @@ -119,11 +119,11 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'J', ti.f32) sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_v', ti.f32, element_shape=(2, )) sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_m', ti.f32) - g_init = ti.graph.Graph() - g_init.dispatch(init_particles, sym_x, sym_v, sym_J) + g_init_builder = ti.graph.GraphBuilder() + g_init_builder.dispatch(init_particles, sym_x, sym_v, sym_J) - g_update = ti.graph.Graph() - substep = g_update.create_sequential() + g_update_builder = ti.graph.GraphBuilder() + substep = g_update_builder.create_sequential() 
substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, @@ -132,11 +132,11 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v) for i in range(N_ITER): - g_update.append(substep) + g_update_builder.append(substep) # Compile - g_init.compile() - g_update.compile() + g_init = g_init_builder.compile() + g_update = g_update_builder.compile() # Run g_init.run({'x': x, 'v': v, 'J': J}) diff --git a/python/taichi/graph/_graph.py b/python/taichi/graph/_graph.py index 1758ea0b7146d..f9f06b32c8aa0 100644 --- a/python/taichi/graph/_graph.py +++ b/python/taichi/graph/_graph.py @@ -25,10 +25,9 @@ def dispatch(self, kernel_fn, *args): self.seq_.dispatch(kernel_cpp, args) -class Graph: +class GraphBuilder: def __init__(self): self._graph_builder = _ti_core.GraphBuilder() - self._compiled_graph = None def dispatch(self, kernel_fn, *args): kernel_cpp = gen_cpp_kernel(kernel_fn, args) @@ -43,7 +42,12 @@ def append(self, node): self._graph_builder.seq().append(node.seq_) def compile(self): - self._compiled_graph = self._graph_builder.compile() + return Graph(self._graph_builder.compile()) + + +class Graph: + def __init__(self, compiled_graph) -> None: + self._compiled_graph = compiled_graph def run(self, args): arg_ptrs = {} @@ -65,4 +69,4 @@ def run(self, args): self._compiled_graph.run(arg_ptrs, arg_ints, arg_floats) -__all__ = ['Graph', 'Arg', 'ArgKind'] +__all__ = ['GraphBuilder', 'Graph', 'Arg', 'ArgKind'] diff --git a/tests/python/test_aot.py b/tests/python/test_aot.py index 55fb7a59cf8e2..b5593890ebefc 100644 --- a/tests/python/test_aot.py +++ b/tests/python/test_aot.py @@ -736,11 +736,11 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), 'grid_m', ti.f32, element_shape=()) - g_init = ti.graph.Graph() - g_init.dispatch(init_particles, sym_x, sym_v, sym_J) + g_init_builder = 
ti.graph.GraphBuilder() + g_init_builder.dispatch(init_particles, sym_x, sym_v, sym_J) - g_update = ti.graph.Graph() - substep = g_update.create_sequential() + g_update_builder = ti.graph.GraphBuilder() + substep = g_update_builder.create_sequential() substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, @@ -749,10 +749,10 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v) for i in range(N_ITER): - g_update.append(substep) + g_update_builder.append(substep) - g_init.compile() - g_update.compile() + g_init = g_init_builder.compile() + g_update = g_update_builder.compile() x = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) v = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py index 5f4023725c581..e08940c851b4f 100644 --- a/tests/python/test_graph.py +++ b/tests/python/test_graph.py @@ -14,10 +14,10 @@ def test(pos: ti.types.ndarray(field_dim=1, element_shape=())): pos[i] = 1 sym_pos = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'pos', ti.i32) - g_init = ti.graph.Graph() + g_init = ti.graph.GraphBuilder() g_init.dispatch(test, sym_pos) - g_init.compile() + g = g_init.compile() a = ti.ndarray(ti.i32, shape=(n, )) - g_init.run({'pos': a}) + g.run({'pos': a}) assert (a.to_numpy() == np.ones(4)).all() From e5b66392d023caa61c40eb3bd678bc2ad9627350 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Tue, 31 May 2022 18:35:53 +0800 Subject: [PATCH 129/176] [aot] [CUDA-AOT PR #0] Refactored compile_module_to_executable() to CUDAModuleToFunctionConverter (#5070) --- taichi/backends/cuda/codegen_cuda.cpp | 253 ++++++++++++++------------ taichi/backends/cuda/codegen_cuda.h | 18 ++ taichi/codegen/codegen_llvm.h | 9 +- 3 files changed, 161 insertions(+), 119 deletions(-) diff --git a/taichi/backends/cuda/codegen_cuda.cpp 
b/taichi/backends/cuda/codegen_cuda.cpp index ad72bf95f1413..83e748fc99673 100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -15,6 +15,7 @@ #include "taichi/backends/cuda/cuda_context.h" #include "taichi/codegen/codegen_llvm.h" #include "taichi/llvm/llvm_program.h" +#include "taichi/util/action_recorder.h" TLANG_NAMESPACE_BEGIN @@ -37,123 +38,12 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { FunctionType gen() override { auto compiled_res = run_compilation(); - return compile_module_to_executable(this->kernel, std::move(compiled_res)); - } - static FunctionType compile_module_to_executable( - Kernel *kernel, - CompiledData &&compiled_data) { -#ifdef TI_WITH_CUDA - auto *tlctx = - kernel->program->get_llvm_program_impl()->get_llvm_context(Arch::cuda); - for (auto &task : compiled_data.offloaded_tasks) { - llvm::Function *func = compiled_data.llvm_module->getFunction(task.name); - TI_ASSERT(func); - tlctx->mark_function_as_cuda_kernel(func, task.block_dim); - } - - auto jit = tlctx->jit.get(); - auto cuda_module = jit->add_module(std::move(compiled_data.llvm_module), - kernel->program->config.gpu_max_reg); - - return [cuda_module, kernel, - offloaded_tasks = - compiled_data.offloaded_tasks](RuntimeContext &context) { - CUDAContext::get_instance().make_current(); - auto args = kernel->args; - std::vector arg_buffers(args.size(), nullptr); - std::vector device_buffers(args.size(), nullptr); - - // We could also use kernel->make_launch_context() to create - // |ctx_builder|, but that implies the usage of Program's context. For the - // sake of decoupling, let's not do that and explicitly set the context we - // want to modify. 
- Kernel::LaunchContextBuilder ctx_builder(kernel, &context); - bool transferred = false; - for (int i = 0; i < (int)args.size(); i++) { - if (args[i].is_array) { - const auto arr_sz = context.array_runtime_sizes[i]; - if (arr_sz == 0) { - continue; - } - arg_buffers[i] = context.get_arg(i); - if (!context.is_device_allocations[i]) { - // Note: both numpy and PyTorch support arrays/tensors with zeros - // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes - // `arr_sz` zero. - unsigned int attr_val = 0; - uint32_t ret_code = - CUDADriver::get_instance().mem_get_attribute.call( - &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, - (void *)arg_buffers[i]); - if (ret_code != CUDA_SUCCESS || attr_val != CU_MEMORYTYPE_DEVICE) { - // Copy to device buffer if arg is on host - // - ret_code != CUDA_SUCCESS: - // arg_buffers[i] is not on device - // - attr_val != CU_MEMORYTYPE_DEVICE: - // Cuda driver is aware of arg_buffers[i] but it might be on - // host. - // See CUDA driver API `cuPointerGetAttribute` for more details. - transferred = true; - CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz); - CUDADriver::get_instance().memcpy_host_to_device( - (void *)device_buffers[i], arg_buffers[i], arr_sz); - } else { - device_buffers[i] = arg_buffers[i]; - } - // device_buffers[i] saves a raw ptr on CUDA device. - ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i], - arr_sz, - /*is_device_allocation=*/false); - - } else if (arr_sz > 0) { - // arg_buffers[i] is a DeviceAllocation* - // TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since - // it's shared by cpu and cuda. - DeviceAllocation *ptr = - static_cast(arg_buffers[i]); - device_buffers[i] = kernel->program->get_llvm_program_impl() - ->get_ndarray_alloc_info_ptr(*ptr); - // We compare arg_buffers[i] and device_buffers[i] later to check - // if transfer happened. - // TODO: this logic can be improved but I'll leave it to a followup - // PR. 
- arg_buffers[i] = device_buffers[i]; - - // device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i] - ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i], - arr_sz, - /*is_device_allocation=*/false); - } - } - } - if (transferred) { - CUDADriver::get_instance().stream_synchronize(nullptr); - } + CUDAModuleToFunctionConverter converter{ + tlctx, this->kernel->program->get_llvm_program_impl()}; - for (auto task : offloaded_tasks) { - TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, - task.block_dim); - cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0, - {&context}); - } - // copy data back to host - if (transferred) { - CUDADriver::get_instance().stream_synchronize(nullptr); - for (int i = 0; i < (int)args.size(); i++) { - if (device_buffers[i] != arg_buffers[i]) { - CUDADriver::get_instance().memcpy_device_to_host( - arg_buffers[i], (void *)device_buffers[i], - context.array_runtime_sizes[i]); - CUDADriver::get_instance().mem_free((void *)device_buffers[i]); - } - } - } - }; -#else - TI_ERROR("No CUDA"); - return nullptr; -#endif // TI_WITH_CUDA + return converter.convert(this->kernel, std::move(compiled_res.llvm_module), + std::move(compiled_res.offloaded_tasks)); } llvm::Value *create_print(std::string tag, @@ -934,9 +824,142 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { } }; +static void set_arg_external_array(RuntimeContext *ctx, + const std::string &kernel_name, + int arg_id, + uintptr_t ptr, + uint64 size, + bool is_device_allocation) { + ActionRecorder::get_instance().record( + "set_kernel_arg_ext_ptr", + {ActionArg("kernel_name", kernel_name), ActionArg("arg_id", arg_id), + ActionArg("address", fmt::format("0x{:x}", ptr)), + ActionArg("array_size_in_bytes", (int64)size)}); + + ctx->set_arg(arg_id, ptr); + ctx->set_array_runtime_size(arg_id, size); + ctx->set_array_is_device_allocation(arg_id, is_device_allocation); +} + FunctionType CodeGenCUDA::codegen() { TI_AUTO_PROF return 
CodeGenLLVMCUDA(kernel, ir).gen(); } +FunctionType CUDAModuleToFunctionConverter::convert( + const std::string &kernel_name, + const std::vector &args, + std::unique_ptr mod, + std::vector &&tasks) const { +#ifdef TI_WITH_CUDA + for (const auto &task : tasks) { + llvm::Function *func = mod->getFunction(task.name); + TI_ASSERT(func); + tlctx_->mark_function_as_cuda_kernel(func, task.block_dim); + } + + auto jit = tlctx_->jit.get(); + auto cuda_module = + jit->add_module(std::move(mod), program_->config->gpu_max_reg); + + return [cuda_module, kernel_name, args, offloaded_tasks = tasks, + program = this->program_](RuntimeContext &context) { + CUDAContext::get_instance().make_current(); + std::vector arg_buffers(args.size(), nullptr); + std::vector device_buffers(args.size(), nullptr); + + bool transferred = false; + for (int i = 0; i < (int)args.size(); i++) { + if (args[i].is_array) { + const auto arr_sz = context.array_runtime_sizes[i]; + if (arr_sz == 0) { + continue; + } + arg_buffers[i] = context.get_arg(i); + if (!context.is_device_allocations[i]) { + // Note: both numpy and PyTorch support arrays/tensors with zeros + // in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes + // `arr_sz` zero. + unsigned int attr_val = 0; + uint32_t ret_code = CUDADriver::get_instance().mem_get_attribute.call( + &attr_val, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, + (void *)arg_buffers[i]); + + if (ret_code != CUDA_SUCCESS || attr_val != CU_MEMORYTYPE_DEVICE) { + // Copy to device buffer if arg is on host + // - ret_code != CUDA_SUCCESS: + // arg_buffers[i] is not on device + // - attr_val != CU_MEMORYTYPE_DEVICE: + // Cuda driver is aware of arg_buffers[i] but it might be on + // host. + // See CUDA driver API `cuPointerGetAttribute` for more details. 
+ transferred = true; + CUDADriver::get_instance().malloc(&device_buffers[i], arr_sz); + CUDADriver::get_instance().memcpy_host_to_device( + (void *)device_buffers[i], arg_buffers[i], arr_sz); + } else { + device_buffers[i] = arg_buffers[i]; + } + // device_buffers[i] saves a raw ptr on CUDA device. + set_arg_external_array(&context, kernel_name, i, + (uint64)device_buffers[i], arr_sz, + /*is_device_allocation=*/false); + + } else if (arr_sz > 0) { + // arg_buffers[i] is a DeviceAllocation* + // TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since + // it's shared by cpu and cuda. + DeviceAllocation *ptr = + static_cast(arg_buffers[i]); + device_buffers[i] = program->get_ndarray_alloc_info_ptr(*ptr); + // We compare arg_buffers[i] and device_buffers[i] later to check + // if transfer happened. + // TODO: this logic can be improved but I'll leave it to a followup + // PR. + arg_buffers[i] = device_buffers[i]; + + // device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i] + set_arg_external_array(&context, kernel_name, i, + (uint64)device_buffers[i], arr_sz, + /*is_device_allocation=*/false); + } + } + } + if (transferred) { + CUDADriver::get_instance().stream_synchronize(nullptr); + } + + for (auto task : offloaded_tasks) { + TI_TRACE("Launching kernel {}<<<{}, {}>>>", task.name, task.grid_dim, + task.block_dim); + cuda_module->launch(task.name, task.grid_dim, task.block_dim, 0, + {&context}); + } + // copy data back to host + if (transferred) { + CUDADriver::get_instance().stream_synchronize(nullptr); + for (int i = 0; i < (int)args.size(); i++) { + if (device_buffers[i] != arg_buffers[i]) { + CUDADriver::get_instance().memcpy_device_to_host( + arg_buffers[i], (void *)device_buffers[i], + context.array_runtime_sizes[i]); + CUDADriver::get_instance().mem_free((void *)device_buffers[i]); + } + } + } + }; +#else + TI_ERROR("No CUDA"); + return nullptr; +#endif // TI_WITH_CUDA +} + +FunctionType CUDAModuleToFunctionConverter::convert( + const 
Kernel *kernel, + std::unique_ptr mod, + std::vector &&tasks) const { + return convert(kernel->name, infer_launch_args(kernel), std::move(mod), + std::move(tasks)); +} + TLANG_NAMESPACE_END diff --git a/taichi/backends/cuda/codegen_cuda.h b/taichi/backends/cuda/codegen_cuda.h index 0d4eec87e4b26..3285e0f7564b0 100644 --- a/taichi/backends/cuda/codegen_cuda.h +++ b/taichi/backends/cuda/codegen_cuda.h @@ -3,6 +3,7 @@ #pragma once #include "taichi/codegen/codegen.h" +#include "taichi/codegen/codegen_llvm.h" TLANG_NAMESPACE_BEGIN @@ -15,4 +16,21 @@ class CodeGenCUDA : public KernelCodeGen { FunctionType codegen() override; }; +class CUDAModuleToFunctionConverter : public ModuleToFunctionConverter { + public: + explicit CUDAModuleToFunctionConverter(TaichiLLVMContext *tlctx, + LlvmProgramImpl *program) + : ModuleToFunctionConverter(tlctx, program) { + } + + FunctionType convert(const std::string &kernel_name, + const std::vector &args, + std::unique_ptr mod, + std::vector &&tasks) const override; + + FunctionType convert(const Kernel *kernel, + std::unique_ptr mod, + std::vector &&tasks) const override; +}; + TLANG_NAMESPACE_END diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index bb83a3151b636..583acd5e80245 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -413,7 +413,8 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { class LlvmProgramImpl; -// This is for CPU, we need one for CUDA (AMDGPU) as well. 
+// TODO: Make ModuleToFunctionConverter abstract, +// Move CPU implementation to "taichi/backend/cpu/" class ModuleToFunctionConverter { public: explicit ModuleToFunctionConverter(TaichiLLVMContext *tlctx, @@ -426,9 +427,9 @@ class ModuleToFunctionConverter { std::unique_ptr mod, std::vector &&tasks) const; - FunctionType convert(const Kernel *kernel, - std::unique_ptr mod, - std::vector &&tasks) const; + virtual FunctionType convert(const Kernel *kernel, + std::unique_ptr mod, + std::vector &&tasks) const; protected: TaichiLLVMContext *tlctx_{nullptr}; From 5704111c80beb5858cce8d26ac3fa70952cdc71d Mon Sep 17 00:00:00 2001 From: ailzhang Date: Tue, 31 May 2022 15:51:32 +0800 Subject: [PATCH 130/176] [refactor] Specialized Ndarray Type is (element_type, shape, layout) ghstack-source-id: 977cd453359b8ccc09deccacc62a915abcd42734 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5065 --- python/taichi/aot/utils.py | 30 +++++++++++++++----------- python/taichi/lang/_ndarray.py | 9 ++++++++ python/taichi/lang/kernel_impl.py | 14 +++--------- python/taichi/lang/matrix.py | 14 +++++++----- python/taichi/types/compound_types.py | 6 ++++++ python/taichi/types/ndarray_type.py | 31 ++++++++++++++++----------- tests/python/test_aot.py | 5 ++--- tests/python/test_api.py | 8 +++---- tests/python/test_graph.py | 2 +- 9 files changed, 70 insertions(+), 49 deletions(-) diff --git a/python/taichi/aot/utils.py b/python/taichi/aot/utils.py index e0cbf3646cac4..6bf0b85992517 100644 --- a/python/taichi/aot/utils.py +++ b/python/taichi/aot/utils.py @@ -35,29 +35,33 @@ def produce_injected_args(kernel, symbolic_args=None): raise TaichiCompilationError( f'Expected Ndaray type, got {anno}') if symbolic_args is not None: - anno.element_shape = tuple(symbolic_args[i].element_shape) - anno.element_dim = len(anno.element_shape) - anno.dtype = symbolic_args[i].dtype() + element_shape = tuple(symbolic_args[i].element_shape) + element_dim = len(element_shape) + dtype = 
symbolic_args[i].dtype() + else: + element_shape = anno.element_shape + element_dim = anno.element_dim + dtype = anno.dtype - if anno.element_shape is None or anno.field_dim is None: + if element_shape is None or anno.field_dim is None: raise TaichiCompilationError( 'Please either specify both `element_shape` and `field_dim` ' 'in the param annotation, or provide an example ' f'ndarray for param={arg.name}') - if anno.element_dim == 0: + if element_dim is None or element_dim == 0: injected_args.append( - ScalarNdarray(anno.dtype, (2, ) * anno.field_dim)) - elif anno.element_dim == 1: + ScalarNdarray(dtype, (2, ) * anno.field_dim)) + elif element_dim == 1: injected_args.append( - VectorNdarray(anno.element_shape[0], - dtype=anno.dtype, + VectorNdarray(element_shape[0], + dtype=dtype, shape=(2, ) * anno.field_dim, layout=Layout.AOS)) - elif anno.element_dim == 2: + elif element_dim == 2: injected_args.append( - MatrixNdarray(anno.element_shape[0], - anno.element_shape[1], - dtype=anno.dtype, + MatrixNdarray(element_shape[0], + element_shape[1], + dtype=dtype, shape=(2, ) * anno.field_dim, layout=Layout.AOS)) else: diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index bc418f1798da7..902a003feeed1 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -4,6 +4,7 @@ from taichi.lang.enums import Layout from taichi.lang.util import cook_dtype, python_scope, to_numpy_type from taichi.types import primitive_types +from taichi.types.ndarray_type import SpecializeNdarrayType class Ndarray: @@ -15,10 +16,17 @@ class Ndarray: """ def __init__(self, dtype, arr_shape): self.host_accessor = None + self.layout = None + self.shape = None + self.element_type = None self.dtype = cook_dtype(dtype) self.arr = impl.get_runtime().prog.create_ndarray( cook_dtype(dtype), arr_shape) + def get_type(self): + return SpecializeNdarrayType(self.element_type, self.shape, + self.layout) + @property def element_shape(self): """Gets ndarray 
element shape. @@ -209,6 +217,7 @@ class ScalarNdarray(Ndarray): def __init__(self, dtype, arr_shape): super().__init__(dtype, arr_shape) self.shape = tuple(self.arr.shape) + self.element_type = dtype @property def element_shape(self): diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 1f689717f42a2..e5365981c7932 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -337,21 +337,13 @@ def extract_arg(arg, anno): return arg if isinstance(anno, ndarray_type.NdarrayType): if isinstance(arg, taichi.lang._ndarray.ScalarNdarray): - anno._check_element_dim(arg, 0) - anno._check_element_shape(()) - anno._check_field_dim(len(arg.shape)) + anno.match(arg.get_type()) return arg.dtype, len(arg.shape), (), Layout.AOS if isinstance(arg, taichi.lang.matrix.VectorNdarray): - anno._check_element_dim(arg, 1) - anno._check_element_shape((arg.n, )) - anno._check_field_dim(len(arg.shape)) - anno._check_layout(arg) + anno.match(arg.get_type()) return arg.dtype, len(arg.shape) + 1, (arg.n, ), arg.layout if isinstance(arg, taichi.lang.matrix.MatrixNdarray): - anno._check_element_dim(arg, 2) - anno._check_element_shape((arg.n, arg.m)) - anno._check_field_dim(len(arg.shape)) - anno._check_layout(arg) + anno.match(arg.get_type()) return arg.dtype, len(arg.shape) + 2, (arg.n, arg.m), arg.layout # external arrays diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index c5160c22d9dd1..7a4ff79009844 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -17,7 +17,7 @@ taichi_scope, to_numpy_type, to_paddle_type, to_pytorch_type, warning) from taichi.types import primitive_types -from taichi.types.compound_types import CompoundType +from taichi.types.compound_types import CompoundType, TensorType def _gen_swizzles(cls): @@ -1688,12 +1688,14 @@ class MatrixNdarray(Ndarray): >>> arr = ti.MatrixNdarray(2, 2, ti.f32, shape=(3, 3), layout=Layout.SOA) """ def __init__(self, n, m, 
dtype, shape, layout): - self.layout = layout - self.shape = shape self.n = n self.m = m + # TODO: we should pass in element_type, shape, layout instead. arr_shape = (n, m) + shape if layout == Layout.SOA else shape + (n, m) super().__init__(dtype, arr_shape) + self.layout = layout + self.shape = shape + self.element_type = TensorType((self.n, self.m), dtype) @property def element_shape(self): @@ -1783,11 +1785,13 @@ class VectorNdarray(Ndarray): >>> a = ti.VectorNdarray(3, ti.f32, (3, 3), layout=Layout.SOA) """ def __init__(self, n, dtype, shape, layout): - self.layout = layout - self.shape = shape self.n = n + # TODO: pass in element_type, shape, layout directly arr_shape = (n, ) + shape if layout == Layout.SOA else shape + (n, ) super().__init__(dtype, arr_shape) + self.layout = layout + self.shape = shape + self.element_type = TensorType((n, ), dtype) @property def element_shape(self): diff --git a/python/taichi/types/compound_types.py b/python/taichi/types/compound_types.py index 75f02e5afa846..4de57b9edb2d8 100644 --- a/python/taichi/types/compound_types.py +++ b/python/taichi/types/compound_types.py @@ -5,6 +5,12 @@ class CompoundType: pass +class TensorType(CompoundType): + def __init__(self, shape, dtype): + self.dtype = taichi.lang.util.cook_dtype(dtype) + self.shape = shape + + # TODO: maybe move MatrixType, StructType here to avoid the circular import? def matrix(n, m, dtype): """Creates a matrix type with given shape and data type. 
diff --git a/python/taichi/types/ndarray_type.py b/python/taichi/types/ndarray_type.py index 0bc2c610c3414..61be06f25213c 100644 --- a/python/taichi/types/ndarray_type.py +++ b/python/taichi/types/ndarray_type.py @@ -1,6 +1,13 @@ from taichi.types.primitive_types import f32 +class SpecializeNdarrayType: + def __init__(self, element_type, shape=None, layout=None): + self.element_type = element_type + self.shape = shape + self.layout = layout + + class NdarrayType: """Type annotation for arbitrary arrays, including external arrays (numpy ndarrays and torch tensors) and Taichi ndarrays. @@ -32,31 +39,31 @@ def __init__(self, self.element_shape = element_shape self.element_dim = len( element_shape) if element_shape is not None else element_dim + self.field_dim = field_dim self.layout = layout - def _check_element_dim(self, arg, arg_dim): - if self.element_dim is not None and self.element_dim != arg_dim: + def match(self, ndarray_type: SpecializeNdarrayType): + if self.element_dim is not None and self.element_dim != len( + ndarray_type.element_type.shape): raise ValueError( - f"Invalid argument into ti.types.ndarray() - required element_dim={self.element_dim}, but {arg} is provided" + f"Invalid argument into ti.types.ndarray() - required element_dim={self.element_dim}, but {len(ndarray_type.element_type.shape)} is provided" ) - def _check_layout(self, arg): - if self.layout is not None and self.layout != arg.layout: + if self.element_shape is not None and self.element_shape != ndarray_type.element_type.shape: raise ValueError( - f"Invalid argument into ti.types.ndarray() - required layout={self.layout}, but {arg} is provided" + f"Invalid argument into ti.types.ndarray() - required element_shape={self.element_shape}, but {ndarray_type.element_type.shape} is provided" ) - def _check_element_shape(self, shapes): - if self.element_shape is not None and shapes != self.element_shape: + if self.layout is not None and self.layout != ndarray_type.layout: raise ValueError( - 
f"Invalid argument into ti.types.ndarray() - required element_shape={self.element_shape}, but {shapes} is provided" + f"Invalid argument into ti.types.ndarray() - required layout={self.layout}, but {ndarray_type.layout} is provided" ) - def _check_field_dim(self, field_dim): - if self.field_dim is not None and field_dim != self.field_dim: + if self.field_dim is not None and self.field_dim != len( + ndarray_type.shape): raise ValueError( - f"Invalid argument into ti.types.ndarray() - required field_dim={self.field_dim}, but {field_dim} is provided" + f"Invalid argument into ti.types.ndarray() - required field_dim={self.field_dim}, but {ndarray_type.element_type} is provided" ) diff --git a/tests/python/test_aot.py b/tests/python/test_aot.py index b5593890ebefc..6cb0252fe3046 100644 --- a/tests/python/test_aot.py +++ b/tests/python/test_aot.py @@ -84,15 +84,14 @@ def test_aot_bind_id(): density1 = ti.ndarray(dtype=ti.f32, shape=(8, 8)) @ti.kernel - def init(x: ti.f32, density1: ti.types.ndarray(field_dim=2, - element_shape=())): + def init(x: ti.f32, density1: ti.types.ndarray(field_dim=2)): for i, j in density1: density[i, j] = x density1[i, j] = x + 1 with tempfile.TemporaryDirectory() as tmpdir: m = ti.aot.Module(ti.lang.impl.current_cfg().arch) - m.add_kernel(init) + m.add_kernel(init, {'density1': density1}) m.save(tmpdir, '') with open(os.path.join(tmpdir, 'metadata.json')) as json_file: res = json.load(json_file) diff --git a/tests/python/test_api.py b/tests/python/test_api.py index a390ac63cebc3..67def75173424 100644 --- a/tests/python/test_api.py +++ b/tests/python/test_api.py @@ -113,9 +113,9 @@ def _get_expected_matrix_apis(): 'to_torch' ] user_api[ti.MatrixNdarray] = [ - 'copy_from', 'element_shape', 'fill', 'from_numpy', 'to_numpy' + 'copy_from', 'element_shape', 'fill', 'from_numpy', 'get_type', 'to_numpy' ] -user_api[ti.Ndarray] = ['copy_from', 'element_shape', 'fill'] +user_api[ti.Ndarray] = ['copy_from', 'element_shape', 'fill', 'get_type'] 
user_api[ti.SNode] = [ 'bit_array', 'bit_struct', 'bitmasked', 'deactivate_all', 'dense', 'dynamic', 'lazy_grad', 'parent', 'place', 'pointer', 'shape' @@ -125,7 +125,7 @@ def _get_expected_matrix_apis(): 'parent', 'shape', 'snode', 'to_numpy', 'to_paddle', 'to_torch' ] user_api[ti.ScalarNdarray] = [ - 'copy_from', 'element_shape', 'fill', 'from_numpy', 'to_numpy' + 'copy_from', 'element_shape', 'fill', 'from_numpy', 'get_type', 'to_numpy' ] user_api[ti.Struct] = ['field', 'fill', 'items', 'keys', 'to_dict'] user_api[ti.StructField] = [ @@ -134,7 +134,7 @@ def _get_expected_matrix_apis(): 'to_paddle', 'to_torch' ] user_api[ti.VectorNdarray] = [ - 'copy_from', 'element_shape', 'fill', 'from_numpy', 'to_numpy' + 'copy_from', 'element_shape', 'fill', 'from_numpy', 'get_type', 'to_numpy' ] diff --git a/tests/python/test_graph.py b/tests/python/test_graph.py index e08940c851b4f..b6e22bf8e352b 100644 --- a/tests/python/test_graph.py +++ b/tests/python/test_graph.py @@ -9,7 +9,7 @@ def test_ndarray_int(): n = 4 @ti.kernel - def test(pos: ti.types.ndarray(field_dim=1, element_shape=())): + def test(pos: ti.types.ndarray(field_dim=1)): for i in range(n): pos[i] = 1 From f42a84922e411bf5e2f8e7a5bb3d66ce46597c6e Mon Sep 17 00:00:00 2001 From: ailzhang Date: Tue, 31 May 2022 15:51:33 +0800 Subject: [PATCH 131/176] [refactor] Pass element_shape and layout to C++ Ndarray Note we still flatten element_shape in the C++ Ndarray, which is blocked by the accessors and will be fixed in the following PRs. 
ghstack-source-id: 0cb5c05f0ad4c188546d7174a1d82f398bc717c2 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5066 --- python/taichi/lang/_ndarray.py | 12 ++++++---- python/taichi/lang/enums.py | 13 ++--------- python/taichi/lang/matrix.py | 20 +++++++++------- python/taichi/types/compound_types.py | 2 +- taichi/backends/opengl/codegen_opengl.cpp | 11 +++++---- taichi/inc/constants.h | 2 ++ taichi/program/ndarray.cpp | 28 +++++++++++++++++------ taichi/program/ndarray.h | 6 ++++- taichi/program/program.cpp | 7 ++++-- taichi/program/program.h | 6 ++++- taichi/python/export_lang.cpp | 14 ++++++++++-- 11 files changed, 78 insertions(+), 43 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 902a003feeed1..30514bac43393 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -14,14 +14,13 @@ class Ndarray: dtype (DataType): Data type of each value. shape (Tuple[int]): Shape of the Ndarray. """ - def __init__(self, dtype, arr_shape): + def __init__(self): self.host_accessor = None self.layout = None self.shape = None self.element_type = None - self.dtype = cook_dtype(dtype) - self.arr = impl.get_runtime().prog.create_ndarray( - cook_dtype(dtype), arr_shape) + self.dtype = None + self.arr = None def get_type(self): return SpecializeNdarrayType(self.element_type, self.shape, @@ -215,7 +214,10 @@ class ScalarNdarray(Ndarray): shape (Tuple[int]): Shape of the ndarray. 
""" def __init__(self, dtype, arr_shape): - super().__init__(dtype, arr_shape) + super().__init__() + self.dtype = cook_dtype(dtype) + self.arr = impl.get_runtime().prog.create_ndarray( + self.dtype, arr_shape) self.shape = tuple(self.arr.shape) self.element_type = dtype diff --git a/python/taichi/lang/enums.py b/python/taichi/lang/enums.py index 43f14d50dcccb..1eb81f136a834 100644 --- a/python/taichi/lang/enums.py +++ b/python/taichi/lang/enums.py @@ -1,14 +1,5 @@ -from enum import Enum, unique - - -@unique -class Layout(Enum): - """Layout of a Taichi field or ndarray. - - Currently, AOS (array of structures) and SOA (structure of arrays) are supported. - """ - AOS = 1 - SOA = 2 +from taichi._lib import core as _ti_core +Layout = _ti_core.Layout __all__ = ['Layout'] diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 7a4ff79009844..d94191b88f3c1 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1690,12 +1690,14 @@ class MatrixNdarray(Ndarray): def __init__(self, n, m, dtype, shape, layout): self.n = n self.m = m - # TODO: we should pass in element_type, shape, layout instead. - arr_shape = (n, m) + shape if layout == Layout.SOA else shape + (n, m) - super().__init__(dtype, arr_shape) + super().__init__() + self.dtype = cook_dtype(dtype) self.layout = layout self.shape = shape - self.element_type = TensorType((self.n, self.m), dtype) + self.element_type = TensorType((self.n, self.m), self.dtype) + # TODO: we should pass in element_type, shape, layout instead. 
+ self.arr = impl.get_runtime().prog.create_ndarray( + self.element_type.dtype, shape, self.element_type.shape, layout) @property def element_shape(self): @@ -1786,12 +1788,14 @@ class VectorNdarray(Ndarray): """ def __init__(self, n, dtype, shape, layout): self.n = n - # TODO: pass in element_type, shape, layout directly - arr_shape = (n, ) + shape if layout == Layout.SOA else shape + (n, ) - super().__init__(dtype, arr_shape) + super().__init__() + self.dtype = cook_dtype(dtype) self.layout = layout self.shape = shape - self.element_type = TensorType((n, ), dtype) + self.element_type = TensorType((n, ), self.dtype) + # TODO: pass in element_type, shape, layout directly + self.arr = impl.get_runtime().prog.create_ndarray( + self.element_type.dtype, shape, self.element_type.shape, layout) @property def element_shape(self): diff --git a/python/taichi/types/compound_types.py b/python/taichi/types/compound_types.py index 4de57b9edb2d8..2c4da9bec6d2d 100644 --- a/python/taichi/types/compound_types.py +++ b/python/taichi/types/compound_types.py @@ -7,7 +7,7 @@ class CompoundType: class TensorType(CompoundType): def __init__(self, shape, dtype): - self.dtype = taichi.lang.util.cook_dtype(dtype) + self.dtype = dtype self.shape = shape diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index c68ab809268d1..b25ae7852a6ea 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -486,13 +486,14 @@ class KernelGen : public IRVisitor { const auto &element_shape = stmt->element_shape; std::vector size_var_names; std::vector element_shape_size_var_names; - enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; - const auto layout = stmt->element_dim <= 0 ? layout_AOS : layout_SOA; + + const auto layout = stmt->element_dim <= 0 ? 
ExternalArrayLayout::kAOS + : ExternalArrayLayout::kSOA; if (element_shape.size() > 0) { int elem_beg = 0; int elem_end = 0; - if (layout == layout_SOA) { + if (layout == ExternalArrayLayout::kSOA) { elem_beg = 0; elem_end = element_shape.size(); } else { @@ -519,7 +520,7 @@ class KernelGen : public IRVisitor { // args buffer: 3, 2, 5, 4 int ind_beg = 0; int ind_end = 0; - if (layout == layout_SOA) { + if (layout == ExternalArrayLayout::kSOA) { ind_beg = element_shape.size(); ind_end = num_indices; } else { @@ -540,7 +541,7 @@ class KernelGen : public IRVisitor { size_var_names.push_back(std::move(var_name)); } // Arrange index stride and offsets in correct order - if (layout == layout_SOA) { + if (layout == ExternalArrayLayout::kSOA) { size_var_names.insert(size_var_names.begin(), element_shape_size_var_names.begin(), element_shape_size_var_names.end()); diff --git a/taichi/inc/constants.h b/taichi/inc/constants.h index c282d65291a05..970091c2d3e28 100644 --- a/taichi/inc/constants.h +++ b/taichi/inc/constants.h @@ -46,3 +46,5 @@ T taichi_union_cast(G g) { static_assert(sizeof(T) == sizeof(G)); return taichi_union_cast_with_different_sizes(g); } + +enum class ExternalArrayLayout { kAOS, kSOA, kNull }; diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index ae19dde3ea623..f022a66d08bb5 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -13,17 +13,31 @@ namespace lang { Ndarray::Ndarray(Program *prog, const DataType type, - const std::vector &shape) + const std::vector &shape_, + const std::vector &element_shape_, + ExternalArrayLayout layout_) : dtype(type), - shape(shape), - num_active_indices(shape.size()), - nelement_(std::accumulate(std::begin(shape), - std::end(shape), - 1, - std::multiplies<>())), + element_shape(element_shape_), + shape(shape_), + layout(layout_), element_size_(data_type_size(dtype)), prog_(prog), rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { + // TODO: Instead of flattening the 
element, shape/nelement_/num_active_indices + // should refer to field shape only. + // The only blocker left is the accessors should handle vector/matrix as well + // instead of scalar only. + if (layout == ExternalArrayLayout::kAOS) { + shape.insert(shape.end(), element_shape.begin(), element_shape.end()); + } else if (layout == ExternalArrayLayout::kSOA) { + shape.insert(shape.begin(), element_shape.begin(), element_shape.end()); + } + nelement_ = std::accumulate(std::begin(shape_), std::end(shape_), 1, + std::multiplies<>()) * + std::accumulate(std::begin(element_shape), + std::end(element_shape), 1, std::multiplies<>()); + num_active_indices = shape.size(); + ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, prog->result_buffer); } diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index a77686c1e194f..2e4320be77c8d 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -22,7 +22,9 @@ class TI_DLL_EXPORT Ndarray { */ explicit Ndarray(Program *prog, const DataType type, - const std::vector &shape); + const std::vector &shape, + const std::vector &element_shape = {}, + ExternalArrayLayout layout = ExternalArrayLayout::kNull); /* Constructs a Ndarray from an existing DeviceAllocation * It doesn't handle the allocation and deallocation. 
@@ -32,10 +34,12 @@ class TI_DLL_EXPORT Ndarray { const std::vector &shape); DeviceAllocation ndarray_alloc_{kDeviceNullAllocation}; DataType dtype; + std::vector element_shape; // Invariant: Since ndarray indices are flattened for vector/matrix, this is // always true: // num_active_indices = shape.size() std::vector shape; + ExternalArrayLayout layout{ExternalArrayLayout::kNull}; int num_active_indices{0}; intptr_t get_data_ptr_as_int() const; diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 05ae5f128715d..b12975ebfacc4 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -555,8 +555,11 @@ std::size_t Program::get_snode_num_dynamically_allocated(SNode *snode) { } Ndarray *Program::create_ndarray(const DataType type, - const std::vector &shape) { - ndarrays_.emplace_back(std::make_unique(this, type, shape)); + const std::vector &shape, + const std::vector &element_shape, + ExternalArrayLayout layout) { + ndarrays_.emplace_back( + std::make_unique(this, type, shape, element_shape, layout)); return ndarrays_.back().get(); } diff --git a/taichi/program/program.h b/taichi/program/program.h index a43d271c5ce4e..70f3f21950a6f 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -321,7 +321,11 @@ class TI_DLL_EXPORT Program { return program_impl_->allocate_memory_ndarray(alloc_size, result_buffer); } - Ndarray *create_ndarray(const DataType type, const std::vector &shape); + Ndarray *create_ndarray( + const DataType type, + const std::vector &shape, + const std::vector &element_shape = {}, + ExternalArrayLayout layout = ExternalArrayLayout::kNull); intptr_t get_ndarray_data_ptr_as_int(const Ndarray *ndarray); diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 8b2a5546d8b6b..8bc8492f984dc 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -89,6 +89,11 @@ void export_lang(py::module &m) { #undef PER_EXTENSION .export_values(); + py::enum_(m, 
"Layout", py::arithmetic()) + .value("AOS", ExternalArrayLayout::kAOS) + .value("SOA", ExternalArrayLayout::kSOA) + .export_values(); + // TODO(type): This should be removed py::class_(m, "DataType") .def(py::init()) @@ -427,9 +432,14 @@ void export_lang(py::module &m) { .def( "create_ndarray", [&](Program *program, const DataType &dt, - const std::vector &shape) -> Ndarray * { - return program->create_ndarray(dt, shape); + const std::vector &shape, + const std::vector &element_shape, + ExternalArrayLayout layout) -> Ndarray * { + return program->create_ndarray(dt, shape, element_shape, layout); }, + py::arg("dt"), py::arg("shape"), + py::arg("element_shape") = py::tuple(), + py::arg("layout") = ExternalArrayLayout::kNull, py::return_value_policy::reference) .def("get_ndarray_data_ptr_as_int", [](Program *program, Ndarray *ndarray) { From 2faf4899f4b8cd160c102ebbe30c2d7be20ed836 Mon Sep 17 00:00:00 2001 From: ailzhang Date: Tue, 31 May 2022 20:38:51 +0800 Subject: [PATCH 132/176] [Lang] Support constructing vector and matrix ndarray from ti.ndarray() ghstack-source-id: 3055ba79c35aecea61c449038b4f8c07e87571b9 Pull Request resolved: https://github.com/taichi-dev/taichi/pull/5073 --- python/taichi/lang/impl.py | 28 +++++++++++++++++++++------- python/taichi/lang/mesh.py | 2 +- taichi/python/export_lang.cpp | 1 + tests/python/test_ndarray.py | 21 +++++++++++++++++++++ 4 files changed, 44 insertions(+), 8 deletions(-) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index d15731be885ac..83e230a157686 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -8,12 +8,13 @@ from taichi.lang._ndarray import ScalarNdarray from taichi.lang._ndrange import GroupedNDRange, _Ndrange from taichi.lang.any_array import AnyArray, AnyArrayAccess +from taichi.lang.enums import Layout from taichi.lang.exception import TaichiRuntimeError, TaichiTypeError from taichi.lang.expr import Expr, make_expr_group from taichi.lang.field import Field, 
ScalarField from taichi.lang.kernel_arguments import SparseMatrixProxy -from taichi.lang.matrix import (Matrix, MatrixField, _IntermediateMatrix, - _MatrixFieldElement) +from taichi.lang.matrix import (Matrix, MatrixField, MatrixNdarray, MatrixType, + _IntermediateMatrix, _MatrixFieldElement) from taichi.lang.mesh import (ConvType, MeshElementFieldProxy, MeshInstance, MeshRelationAccessProxy, MeshReorderedMatrixFieldProxy, @@ -23,7 +24,7 @@ from taichi.lang.tape import TapeImpl from taichi.lang.util import (cook_dtype, get_traceback, is_taichi_class, python_scope, taichi_scope, warning) -from taichi.types.primitive_types import f16, f32, f64, i32, i64 +from taichi.types.primitive_types import f16, f32, f64, i32, i64, types @taichi_scope @@ -564,21 +565,34 @@ def field(dtype, shape=None, name="", offset=None, needs_grad=False): @python_scope -def ndarray(dtype, shape): +def ndarray(dtype, shape, layout=Layout.NULL): """Defines a Taichi ndarray with scalar elements. Args: - dtype (DataType): Data type of each value. + dtype (Union[DataType, MatrixType]): Data type of each element. This can be either a scalar type like ti.f32 or a compound type like ti.types.vector(3, ti.i32). shape (Union[int, tuple[int]]): Shape of the ndarray. + layout (Layout, optional): Layout of ndarray, only applicable when element is non-scalar type. Default is Layout.AOS. Example: The code below shows how a Taichi ndarray with scalar elements can be declared and defined:: - >>> x = ti.ndarray(ti.f32, shape=(16, 8)) + >>> x = ti.ndarray(ti.f32, shape=(16, 8)) # ndarray of shape (16, 8), each element is ti.f32 scalar. + >>> vec3 = ti.types.vector(3, ti.i32) + >>> y = ti.ndarray(vec3, shape=(10, 2)) # ndarray of shape (10, 2), each element is a vector of 3 ti.i32 scalars. + >>> matrix_ty = ti.types.matrix(3, 4, float) + >>> z = ti.ndarray(matrix_ty, shape=(4, 5), layout=ti.Layout.SOA) # ndarray of shape (4, 5), each element is a matrix of (3, 4) ti.float scalars. 
""" if isinstance(shape, numbers.Number): shape = (shape, ) - return ScalarNdarray(dtype, shape) + if dtype in types: + assert layout == Layout.NULL + return ScalarNdarray(dtype, shape) + if isinstance(dtype, MatrixType): + layout = Layout.AOS if layout == Layout.NULL else layout + return MatrixNdarray(dtype.n, dtype.m, dtype.dtype, shape, layout) + + raise TaichiRuntimeError( + f'{dtype} is not supported as ndarray element type') @taichi_scope diff --git a/python/taichi/lang/mesh.py b/python/taichi/lang/mesh.py index 42c15e935c0b0..e8592f7b69f8b 100644 --- a/python/taichi/lang/mesh.py +++ b/python/taichi/lang/mesh.py @@ -221,7 +221,7 @@ def place( """Declares mesh attributes for the mesh element in current mesh builder. Args: - members (Dict[str, Union[PrimitiveType, VectorType, MatrixType]]): \ + members (Dict[str, Union[PrimitiveType, MatrixType]]): \ names and types for element attributes. reorder: True if reorders the internal memory for coalesced data access within mesh-for loop. needs_grad: True if needs to record grad. 
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 8bc8492f984dc..fcb2c6fe4284e 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -92,6 +92,7 @@ void export_lang(py::module &m) { py::enum_(m, "Layout", py::arithmetic()) .value("AOS", ExternalArrayLayout::kAOS) .value("SOA", ExternalArrayLayout::kSOA) + .value("NULL", ExternalArrayLayout::kNull) .export_values(); // TODO(type): This should be removed diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index cadfca098497f..6454c37f501ba 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -165,6 +165,27 @@ def test_ndarray_2d(): _test_ndarray_2d() +@test_utils.test(arch=supported_archs_taichi_ndarray) +def test_ndarray_compound_element(): + n = 10 + a = ti.ndarray(ti.i32, shape=(n, )) + + vec3 = ti.types.vector(3, ti.i32) + b = ti.ndarray(vec3, shape=(n, n)) + assert isinstance(b, ti.MatrixNdarray) + assert b.shape == (n, n) + assert b.element_type.dtype == ti.i32 + assert b.element_type.shape == (3, 1) + + matrix34 = ti.types.matrix(3, 4, float) + c = ti.ndarray(matrix34, shape=(n, n + 1), layout=ti.Layout.SOA) + assert isinstance(c, ti.MatrixNdarray) + assert c.shape == (n, n + 1) + assert c.element_type.dtype == ti.f32 + assert c.element_type.shape == (3, 4) + assert c.layout == ti.Layout.SOA + + def _test_ndarray_copy_from_ndarray(): n = 16 a = ti.ndarray(ti.i32, shape=n) From e360191437e9692f0edcb03cd7eec8dee8c99c93 Mon Sep 17 00:00:00 2001 From: Ailing Date: Wed, 1 Jun 2022 08:50:58 +0800 Subject: [PATCH 133/176] [refactor] Resolve comments from #5065 (#5074) --- python/taichi/lang/_ndarray.py | 5 ++--- python/taichi/lang/kernel_impl.py | 6 +++--- python/taichi/types/ndarray_type.py | 4 ++-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 30514bac43393..7c0298ac37935 100644 --- a/python/taichi/lang/_ndarray.py +++ 
b/python/taichi/lang/_ndarray.py @@ -4,7 +4,7 @@ from taichi.lang.enums import Layout from taichi.lang.util import cook_dtype, python_scope, to_numpy_type from taichi.types import primitive_types -from taichi.types.ndarray_type import SpecializeNdarrayType +from taichi.types.ndarray_type import NdarrayTypeMetadata class Ndarray: @@ -23,8 +23,7 @@ def __init__(self): self.arr = None def get_type(self): - return SpecializeNdarrayType(self.element_type, self.shape, - self.layout) + return NdarrayTypeMetadata(self.element_type, self.shape, self.layout) @property def element_shape(self): diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index e5365981c7932..fab88a564e267 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -337,13 +337,13 @@ def extract_arg(arg, anno): return arg if isinstance(anno, ndarray_type.NdarrayType): if isinstance(arg, taichi.lang._ndarray.ScalarNdarray): - anno.match(arg.get_type()) + anno.check_matched(arg.get_type()) return arg.dtype, len(arg.shape), (), Layout.AOS if isinstance(arg, taichi.lang.matrix.VectorNdarray): - anno.match(arg.get_type()) + anno.check_matched(arg.get_type()) return arg.dtype, len(arg.shape) + 1, (arg.n, ), arg.layout if isinstance(arg, taichi.lang.matrix.MatrixNdarray): - anno.match(arg.get_type()) + anno.check_matched(arg.get_type()) return arg.dtype, len(arg.shape) + 2, (arg.n, arg.m), arg.layout # external arrays diff --git a/python/taichi/types/ndarray_type.py b/python/taichi/types/ndarray_type.py index 61be06f25213c..58ce958f09cfe 100644 --- a/python/taichi/types/ndarray_type.py +++ b/python/taichi/types/ndarray_type.py @@ -1,7 +1,7 @@ from taichi.types.primitive_types import f32 -class SpecializeNdarrayType: +class NdarrayTypeMetadata: def __init__(self, element_type, shape=None, layout=None): self.element_type = element_type self.shape = shape @@ -43,7 +43,7 @@ def __init__(self, self.field_dim = field_dim self.layout = layout - def 
match(self, ndarray_type: SpecializeNdarrayType): + def check_matched(self, ndarray_type: NdarrayTypeMetadata): if self.element_dim is not None and self.element_dim != len( ndarray_type.element_type.shape): raise ValueError( From 564dceab94849b29015c6050676dd921bde0f6fa Mon Sep 17 00:00:00 2001 From: Alex Brown <96645475+AlexBrown42@users.noreply.github.com> Date: Wed, 1 Jun 2022 09:47:10 +0800 Subject: [PATCH 134/176] [Example] Update mass_spring_3d_ggui.py to v2 (#3879) * cleaner mass_spring_3d_ggui.py * fix collision * Fix penetration * No capitalized globals * fix compute_force * parameter tweaks * Looks good now * Update TaichiCore.cmake * Update mass_spring_3d_ggui.py * Update mass_spring_3d_ggui.py Change variable name `allow_bending` to `bending_springs`. --- .../ggui_examples/mass_spring_3d_ggui.py | 173 +++++++++++------- 1 file changed, 106 insertions(+), 67 deletions(-) diff --git a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py index 2cfae527a3333..80302a772c5f0 100644 --- a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py +++ b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py @@ -1,104 +1,143 @@ import taichi as ti +ti.init(arch=ti.gpu) # Alternatively, ti.init(arch=ti.cpu) -arch = ti.vulkan if ti._lib.core.with_vulkan() else ti.cuda -ti.init(arch=arch) +n = 128 +quad_size = 1.0 / n +dt = 4e-2 / n +substeps = int(1 / 60 // dt) -N = 128 -cell_size = 1.0 / N -gravity = 0.5 -stiffness = 1600 -damping = 2 -dt = 5e-4 +gravity = ti.Vector([0, -9.8, 0]) +spring_Y = 3e4 +dashpot_damping = 1e4 +drag_damping = 1 -ball_radius = 0.2 -ball_center = ti.Vector.field(3, float, (1, )) +ball_radius = 0.3 +ball_center = ti.Vector.field(3, dtype=float, shape=(1, )) +ball_center[0] = [0, 0, 0] -x = ti.Vector.field(3, float, (N, N)) -v = ti.Vector.field(3, float, (N, N)) +x = ti.Vector.field(3, dtype=float, shape=(n, n)) +v = ti.Vector.field(3, dtype=float, shape=(n, n)) 
-num_triangles = (N - 1) * (N - 1) * 2 -indices = ti.field(int, num_triangles * 3) -vertices = ti.Vector.field(3, float, N * N) - - -def init_scene(): - for i, j in ti.ndrange(N, N): - x[i, j] = ti.Vector([ - i * cell_size, j * cell_size / ti.sqrt(2), - (N - j) * cell_size / ti.sqrt(2) - ]) - ball_center[0] = ti.Vector([0.5, -0.5, -0.0]) +num_triangles = (n - 1) * (n - 1) * 2 +indices = ti.field(int, shape=num_triangles * 3) +vertices = ti.Vector.field(3, dtype=float, shape=n * n) +bending_springs = False @ti.kernel -def set_indices(): - for i, j in ti.ndrange(N, N): - if i < N - 1 and j < N - 1: - square_id = (i * (N - 1)) + j - # 1st triangle of the square - indices[square_id * 6 + 0] = i * N + j - indices[square_id * 6 + 1] = (i + 1) * N + j - indices[square_id * 6 + 2] = i * N + (j + 1) - # 2nd triangle of the square - indices[square_id * 6 + 3] = (i + 1) * N + j + 1 - indices[square_id * 6 + 4] = i * N + (j + 1) - indices[square_id * 6 + 5] = (i + 1) * N + j +def initialize_mass_points(): + random_offset = ti.Vector([ti.random() - 0.5, ti.random() - 0.5]) * 0.1 + for i, j in x: + x[i, j] = [ + i * quad_size - 0.5 + random_offset[0], 0.6, + j * quad_size - 0.5 + random_offset[1] + ] + v[i, j] = [0, 0, 0] -links = [[-1, 0], [1, 0], [0, -1], [0, 1], [-1, -1], [1, -1], [-1, 1], [1, 1]] -links = [ti.Vector(v) for v in links] +@ti.kernel +def initialize_mesh_indices(): + for i, j in ti.ndrange(n - 1, n - 1): + quad_id = (i * (n - 1)) + j + # 1st triangle of the square + indices[quad_id * 6 + 0] = i * n + j + indices[quad_id * 6 + 1] = (i + 1) * n + j + indices[quad_id * 6 + 2] = i * n + (j + 1) + # 2nd triangle of the square + indices[quad_id * 6 + 3] = (i + 1) * n + j + 1 + indices[quad_id * 6 + 4] = i * n + (j + 1) + indices[quad_id * 6 + 5] = (i + 1) * n + j + + +initialize_mesh_indices() + +spring_offsets = [] +if bending_springs: + for i in range(-1, 2): + for j in range(-1, 2): + if (i, j) != (0, 0): + spring_offsets.append(ti.Vector([i, j])) + +else: + for i 
in range(-2, 3): + for j in range(-2, 3): + if (i, j) != (0, 0) and abs(i) + abs(j) <= 2: + spring_offsets.append(ti.Vector([i, j])) @ti.kernel -def step(): +def substep(): for i in ti.grouped(x): - v[i].y -= gravity * dt + v[i] += gravity * dt + for i in ti.grouped(x): force = ti.Vector([0.0, 0.0, 0.0]) - for d in ti.static(links): - j = min(max(i + d, 0), [N - 1, N - 1]) - relative_pos = x[j] - x[i] - current_length = relative_pos.norm() - original_length = cell_size * float(i - j).norm() - if original_length != 0: - force += stiffness * relative_pos.normalized() * ( - current_length - original_length) / original_length + for spring_offset in ti.static(spring_offsets): + j = i + spring_offset + if 0 <= j[0] < n and 0 <= j[1] < n: + x_ij = x[i] - x[j] + v_ij = v[i] - v[j] + d = x_ij.normalized() + current_dist = x_ij.norm() + original_dist = quad_size * float(i - j).norm() + # Spring force + force += -spring_Y * d * (current_dist / original_dist - 1) + # Dashpot damping + force += -v_ij.dot(d) * d * dashpot_damping * quad_size + v[i] += force * dt + for i in ti.grouped(x): - v[i] *= ti.exp(-damping * dt) - if (x[i] - ball_center[0]).norm() <= ball_radius: - v[i] = ti.Vector([0.0, 0.0, 0.0]) + v[i] *= ti.exp(-drag_damping * dt) + offset_to_center = x[i] - ball_center[0] + if offset_to_center.norm() <= ball_radius: + # Velocity projection + normal = offset_to_center.normalized() + v[i] -= min(v[i].dot(normal), 0) * normal x[i] += dt * v[i] -@ti.kernel -def set_vertices(): - for i, j in ti.ndrange(N, N): - vertices[i * N + j] = x[i, j] +@ti.kernel +def update_vertices(): + for i, j in ti.ndrange(n, n): + vertices[i * n + j] = x[i, j] -init_scene() -set_indices() -window = ti.ui.Window("Cloth", (800, 800), vsync=True) +window = ti.ui.Window("Taichi Cloth Simulation on GGUI", (1024, 1024), + vsync=True) canvas = window.get_canvas() +canvas.set_background_color((1, 1, 1)) scene = ti.ui.Scene() camera = ti.ui.make_camera() -while window.running: - for i in range(30): - 
step() - set_vertices() +current_t = 0.0 +initialize_mass_points() - camera.position(0.5, -0.5, 2) - camera.lookat(0.5, -0.5, 0) +while window.running: + if current_t > 1.5: + # Reset + initialize_mass_points() + current_t = 0 + + for i in range(substeps): + substep() + current_t += dt + update_vertices() + + camera.position(0.0, 0.0, 3) + camera.lookat(0.0, 0.0, 0) scene.set_camera(camera) - scene.point_light(pos=(0.5, 1, 2), color=(1, 1, 1)) + scene.point_light(pos=(0, 1, 2), color=(1, 1, 1)) scene.mesh(vertices, indices=indices, - color=(0.5, 0.5, 0.5), + color=(0.8, 0, 0), two_sided=True) - scene.particles(ball_center, radius=ball_radius, color=(0.5, 0, 0)) + + # Draw a smaller ball to avoid visual penetration + scene.particles(ball_center, radius=ball_radius * 0.95, color=(0.2, 0.6, 1)) canvas.scene(scene) window.show() + +#TODO: include self-collision handling From a2a9b44bd91bd05d88217842b4fcfca0961d6f5e Mon Sep 17 00:00:00 2001 From: Ailing Date: Wed, 1 Jun 2022 11:09:38 +0800 Subject: [PATCH 135/176] [doc] Fix broken link for github action status badge (#5076) * [doc] Fix link for github action status badge * Update README.md Co-authored-by: Bo Qiao * Update README.md Co-authored-by: Bo Qiao Co-authored-by: Bo Qiao --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 949fe62c312dd..0ec34b767dcbc 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ [![Latest Release](https://img.shields.io/github/v/release/taichi-dev/taichi?color=blue&label=Latest%20Release)](https://github.com/taichi-dev/taichi/releases/latest) [![downloads](https://pepy.tech/badge/taichi)](https://pepy.tech/project/taichi) -[![CI](https://github.com/taichi-dev/taichi/actions/workflows/testing.yml/badge.svg)](https://github.com/taichi-dev/taichi/actions/workflows/postsubmit.yml) -[![Docker Cloud Build 
Status](https://img.shields.io/docker/cloud/build/taichidev/taichi?label=Docker%20Image&logo=docker)](https://hub.docker.com/r/taichidev/taichi) +[![CI](https://github.com/taichi-dev/taichi/actions/workflows/testing.yml/badge.svg)](https://github.com/taichi-dev/taichi/actions/workflows/testing.yml) +[![Nightly Release](https://github.com/taichi-dev/taichi/actions/workflows/release.yml/badge.svg)](https://github.com/taichi-dev/taichi/actions/workflows/release.yml) [![Python Codecov Status](https://img.shields.io/codecov/c/github/taichi-dev/taichi?label=Python%20Coverage&logo=codecov)](https://codecov.io/gh/taichi-dev/taichi/src/master) ```py From c985372498a7370f6d273864b001b928c24f48cc Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Wed, 1 Jun 2022 11:13:58 +0800 Subject: [PATCH 136/176] [llvm] Specialize element shape for LLVM backend (#5071) * Specialize element shape for LLVM backend Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/codegen_llvm.cpp | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 0444be59f9b4a..6449b8a378b30 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1609,6 +1609,23 @@ void CodeGenLLVM::visit(ExternalPtrStmt *stmt) { auto arg_id = argload->arg_id; int num_indices = stmt->indices.size(); std::vector sizes(num_indices); + const auto &element_shape = stmt->element_shape; + enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; + const auto layout = stmt->element_dim <= 0 ? 
layout_AOS : layout_SOA; + // Determine the element shape position inside the indices vector + // TODO: change the outer layout in order to remove the element layout + // guess work + int element_shape_begin = -1; + int element_shape_end = -1; + if (element_shape.size() > 0) { + if (layout == layout_SOA) { + element_shape_begin = 0; + element_shape_end = element_shape.size(); + } else { + element_shape_begin = num_indices - element_shape.size(); + element_shape_end = num_indices; + } + } for (int i = 0; i < num_indices; i++) { auto raw_arg = create_call( @@ -1623,8 +1640,15 @@ void CodeGenLLVM::visit(ExternalPtrStmt *stmt) { llvm::PointerType::get(tlctx->get_data_type(dt), 0)); auto linear_index = tlctx->get_constant(0); + int element_shape_idx = 0; for (int i = 0; i < num_indices; i++) { - linear_index = builder->CreateMul(linear_index, sizes[i]); + if (i >= element_shape_begin && i < element_shape_end) { + llvm::Value *size_var = + tlctx->get_constant(element_shape[element_shape_idx++]); + linear_index = builder->CreateMul(linear_index, size_var); + } else { + linear_index = builder->CreateMul(linear_index, sizes[i]); + } linear_index = builder->CreateAdd(linear_index, llvm_val[stmt->indices[i]]); } From 1514d4b3ba41ea6e6823dd7271c9194f42de8e16 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Wed, 1 Jun 2022 11:14:38 +0800 Subject: [PATCH 137/176] [spirv] Specialize element shape for spirv codegen. (#5068) * Specialize element shape for spirv codegen. * Fix index for size_var_names * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Slight changes for better code style. 
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/spirv/spirv_codegen.cpp | 33 ++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index b8901c20c4dde..25127db5be3df 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -564,8 +564,30 @@ class TaskCodegen : public IRVisitor { { const int num_indices = stmt->indices.size(); std::vector size_var_names; + const auto &element_shape = stmt->element_shape; + enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; + const auto layout = stmt->element_dim <= 0 ? layout_AOS : layout_SOA; const auto extra_args_member_index = ctx_attribs_->args().size(); + + // Determine the element shape position inside the indices vector + // TODO: change the outer layout in order to remove the element layout + // guess work + int element_shape_begin = -1; + int element_shape_end = -1; + if (element_shape.size() > 0) { + if (layout == layout_SOA) { + element_shape_begin = 0; + element_shape_end = element_shape.size(); + } else { + element_shape_begin = num_indices - element_shape.size(); + element_shape_end = num_indices; + } + } for (int i = 0; i < num_indices; i++) { + // Skip expressions for element shapes. + if (i >= element_shape_begin && i < element_shape_end) { + continue; + } std::string var_name = fmt::format("{}_size{}_", stmt->raw_name(), i); const auto extra_arg_index = (arg_id * taichi_max_num_indices) + i; spirv::Value var_ptr = ir_->make_value( @@ -578,8 +600,16 @@ class TaskCodegen : public IRVisitor { ir_->register_value(var_name, var); size_var_names.push_back(std::move(var_name)); } + int size_var_names_idx = 0; for (int i = 0; i < num_indices; i++) { - spirv::Value size_var = ir_->query_value(size_var_names[i]); + spirv::Value size_var; + // Use immediate numbers to flatten index for element shapes. 
+ if (i >= element_shape_begin && i < element_shape_end) { + size_var = ir_->uint_immediate_number( + ir_->i32_type(), element_shape[i - element_shape_begin]); + } else { + size_var = ir_->query_value(size_var_names[size_var_names_idx++]); + } spirv::Value indices = ir_->query_value(stmt->indices[i]->raw_name()); linear_offset = ir_->mul(linear_offset, size_var); linear_offset = ir_->add(linear_offset, indices); @@ -592,7 +622,6 @@ class TaskCodegen : public IRVisitor { ir_->decorate(spv::OpDecorate, linear_offset, spv::DecorationNoSignedWrap); } - if (device_->get_cap(DeviceCapability::spirv_has_physical_storage_buffer)) { spirv::Value addr_ptr = ir_->make_value( spv::OpAccessChain, From f53e35171e7c0f0a6ba9f35ec40695755c158b0f Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Wed, 1 Jun 2022 11:39:41 +0800 Subject: [PATCH 138/176] [Lang] Add more initialization routines for glsl matrix types (#5069) * add more initialization routines for glsl matrix types * add more initialization routines for glsl matrix types --- python/taichi/math/mathimpl.py | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/python/taichi/math/mathimpl.py b/python/taichi/math/mathimpl.py index 9561ffd74bd7b..d3c3c0234331a 100644 --- a/python/taichi/math/mathimpl.py +++ b/python/taichi/math/mathimpl.py @@ -68,22 +68,50 @@ def uvec4(*args): return ti.types.vector(4, _get_uint_ip())(*args) # pylint: disable=E1101 +def _gen_matrix(n, *args): + """Supports more matrix construction routines. + + 1. Usual contruction (from a 2d list or a single scalar). + 2. From a 1-D array of n*n elements (glsl style). + 3. From a list of n-D vectors (glsl style). 
+ """ + if len(args) == n * n: # initialize with n*n scalars + data = [[args[k * n + i] for i in range(n)] for k in range(n)] + return ti.Matrix(data, float) + + if len(args) == n: # initialize with n vectors + # Matrix.rows() will do implict type inference + data = [list(x) for x in args] + return ti.Matrix(data, float) + + if len(args) == 1: # initialize with a scalar, a matrix or a 1d list + x = args[0] + if isinstance(x, ti.Matrix): + return x + + if hasattr(x, "__len__") and len(x) == n * n: + data = [[x[k * n + i] for i in range(n)] for k in range(n)] + return ti.Matrix(data, float) + + return ti.types.matrix(n, n, float)(*args) # pylint: disable=E1101 + + def mat2(*args): """2x2 floating matrix type. """ - return ti.types.matrix(2, 2, float)(*args) # pylint: disable=E1101 + return _gen_matrix(2, *args) def mat3(*args): """3x3 floating matrix type. """ - return ti.types.matrix(3, 3, float)(*args) # pylint: disable=E1101 + return _gen_matrix(3, *args) def mat4(*args): """4x4 floating matrix type. 
""" - return ti.types.matrix(4, 4, float)(*args) # pylint: disable=E1101 + return _gen_matrix(4, *args) @ti.func From 31f9bfbc0bd2a4c77d683362749aacfb26085589 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Wed, 1 Jun 2022 14:52:00 +0800 Subject: [PATCH 139/176] [cuda] [simt] Add assertions for warp intrinsics on old GPUs (#5077) * Add guard for cc smaller than 70 * Fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Fix pylint Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/impl.py | 4 ++++ python/taichi/lang/simt/warp.py | 10 ++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index 83e230a157686..f10ba72937b91 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -888,6 +888,10 @@ def call_internal(name, *args, with_runtime_context=True): with_runtime_context)) +def get_cuda_compute_capability(): + return _ti_core.query_int64("cuda_compute_capability") + + @taichi_scope def mesh_relation_access(mesh, from_index, to_element_type): # to support ti.mesh_local and access mesh attribute as field diff --git a/python/taichi/lang/simt/warp.py b/python/taichi/lang/simt/warp.py index 720b504577ade..4ae8cf8e9bc7e 100644 --- a/python/taichi/lang/simt/warp.py +++ b/python/taichi/lang/simt/warp.py @@ -98,6 +98,11 @@ def shfl_xor_i32(mask, val, offset): def match_any(mask, value): + # These intrinsics are only available on compute_70 or higher + # https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf + if impl.get_cuda_compute_capability() < 70: + raise AssertionError( + "match_any intrinsic only available on compute_70 or higher") return impl.call_internal("cuda_match_any_sync_i32", mask, value, @@ -105,6 +110,11 @@ def match_any(mask, value): def match_all(mask, val): + # These intrinsics are only available on compute_70 or higher + # 
https://docs.nvidia.com/cuda/pdf/NVVM_IR_Specification.pdf + if impl.get_cuda_compute_capability() < 70: + raise AssertionError( + "match_all intrinsic only available on compute_70 or higher") return impl.call_internal("cuda_match_all_sync_i32", mask, val, From 4b0ea28a16c74893f5abf5713e850272b5acb512 Mon Sep 17 00:00:00 2001 From: Ailing Date: Wed, 1 Jun 2022 16:00:30 +0800 Subject: [PATCH 140/176] [refactor] Correctly set ndarray element_size and nelement (#5080) --- python/taichi/lang/matrix.py | 6 ++---- python/taichi/linalg/sparse_matrix.py | 6 +++++- taichi/program/ndarray.cpp | 15 ++++++++++----- taichi/program/program.cpp | 5 +++-- taichi/program/sparse_matrix.cpp | 2 +- taichi/python/export_lang.cpp | 1 + tests/python/test_ndarray.py | 4 ++-- 7 files changed, 24 insertions(+), 15 deletions(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index d94191b88f3c1..39d9820eb6eef 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1709,8 +1709,7 @@ def element_shape(self): >>> arr.element_shape (2, 2) """ - arr_shape = tuple(self.arr.shape) - return arr_shape[:2] if self.layout == Layout.SOA else arr_shape[-2:] + return tuple(self.arr.element_shape) @python_scope def __setitem__(self, key, value): @@ -1807,8 +1806,7 @@ def element_shape(self): >>> a.element_shape (3,) """ - arr_shape = tuple(self.arr.shape) - return arr_shape[:1] if self.layout == Layout.SOA else arr_shape[-1:] + return tuple(self.arr.element_shape) @python_scope def __setitem__(self, key, value): diff --git a/python/taichi/linalg/sparse_matrix.py b/python/taichi/linalg/sparse_matrix.py index 83b4eb15d3b86..f04ddeb786340 100644 --- a/python/taichi/linalg/sparse_matrix.py +++ b/python/taichi/linalg/sparse_matrix.py @@ -1,3 +1,5 @@ +from functools import reduce + import numpy as np from taichi.lang.exception import TaichiRuntimeError from taichi.lang.field import Field @@ -183,7 +185,9 @@ def build_from_ndarray(self, ndarray): [5, 0, 0, 0, 0] 
""" if isinstance(ndarray, Ndarray): - if ndarray.arr.nelement() % 3 != 0: + num_scalars = reduce(lambda x, y: x * y, + ndarray.shape + ndarray.element_shape) + if num_scalars % 3 != 0: raise TaichiRuntimeError( "The number of ndarray elements must have a length that is divisible by 3." ) diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index f022a66d08bb5..2e6876dfb9676 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -20,7 +20,15 @@ Ndarray::Ndarray(Program *prog, element_shape(element_shape_), shape(shape_), layout(layout_), - element_size_(data_type_size(dtype)), + nelement_(std::accumulate(std::begin(shape_), + std::end(shape_), + 1, + std::multiplies<>())), + element_size_(data_type_size(dtype) * + std::accumulate(std::begin(element_shape), + std::end(element_shape), + 1, + std::multiplies<>())), prog_(prog), rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { // TODO: Instead of flattening the element, shape/nelement_/num_active_indices @@ -32,10 +40,7 @@ Ndarray::Ndarray(Program *prog, } else if (layout == ExternalArrayLayout::kSOA) { shape.insert(shape.begin(), element_shape.begin(), element_shape.end()); } - nelement_ = std::accumulate(std::begin(shape_), std::end(shape_), 1, - std::multiplies<>()) * - std::accumulate(std::begin(element_shape), - std::end(element_shape), 1, std::multiplies<>()); + num_active_indices = shape.size(); ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index b12975ebfacc4..fdce6b501fef3 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -582,8 +582,9 @@ void Program::fill_ndarray_fast(Ndarray *ndarray, uint32_t val) { // This is a temporary solution to bypass device api. // Should be moved to CommandList once available in CUDA. 
#ifdef TI_WITH_LLVM - get_llvm_program_impl()->fill_ndarray(ndarray->ndarray_alloc_, - ndarray->get_nelement(), val); + get_llvm_program_impl()->fill_ndarray( + ndarray->ndarray_alloc_, + ndarray->get_nelement() * ndarray->get_element_size(), val); #else TI_ERROR("Not supported"); #endif diff --git a/taichi/program/sparse_matrix.cpp b/taichi/program/sparse_matrix.cpp index c728b033e468e..975d6b344f538 100644 --- a/taichi/program/sparse_matrix.cpp +++ b/taichi/program/sparse_matrix.cpp @@ -181,7 +181,7 @@ void make_sparse_matrix_from_ndarray(Program *prog, const Ndarray &ndarray) { std::string sdtype = taichi::lang::data_type_name(sm.get_data_type()); auto data_ptr = prog->get_ndarray_data_ptr_as_int(&ndarray); - auto num_triplets = ndarray.get_nelement() / 3; + auto num_triplets = ndarray.get_nelement() * ndarray.get_element_size() / 3; if (sdtype == "f32") { build_ndarray_template(sm, data_ptr, num_triplets); } else if (sdtype == "f64") { diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index fcb2c6fe4284e..e2c53c5077071 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -549,6 +549,7 @@ void export_lang(py::module &m) { .def("write_int", &Ndarray::write_int) .def("write_float", &Ndarray::write_float) .def_readonly("dtype", &Ndarray::dtype) + .def_readonly("element_shape", &Ndarray::element_shape) .def_readonly("shape", &Ndarray::shape); py::enum_(m, "ArgKind") diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index 6454c37f501ba..a4861d4b88cd6 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -611,8 +611,8 @@ def _test_size_in_bytes(): assert a._get_nelement() == 8 b = ti.Vector.ndarray(10, ti.f64, 5) - assert b._get_element_size() == 8 - assert b._get_nelement() == 50 + assert b._get_element_size() == 80 + assert b._get_nelement() == 5 @test_utils.test(arch=[ti.cpu, ti.cuda]) From 530fa2a4c8b95e94923fad57f36d5fd51a522848 Mon Sep 17 00:00:00 2001 From: 
Bob Cao Date: Wed, 1 Jun 2022 01:26:18 -0700 Subject: [PATCH 141/176] [infra] Refactor Vulkan runtime into true Common Runtime (#5058) * Remove all references to Vulkan in common runtime & fix device API for OpenGL (bindings) and DirectX 11 (memory leaks) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix tests * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix cpp test * update * update Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- cmake/TaichiCXXFlags.cmake | 3 +- cmake/TaichiCore.cmake | 20 +++---- taichi/aot/module_loader.cpp | 15 +++-- taichi/backends/dx/dx_device.cpp | 20 ++++--- taichi/backends/dx/dx_program.cpp | 54 ++++++++++++----- taichi/backends/dx/dx_program.h | 55 +++++++++++++---- taichi/backends/opengl/opengl_device.cpp | 17 ++++-- taichi/backends/opengl/opengl_device.h | 12 +++- .../backends/vulkan/aot_module_loader_impl.h | 27 --------- taichi/backends/vulkan/vulkan_program.cpp | 41 ++++++------- taichi/backends/vulkan/vulkan_program.h | 12 ++-- taichi/program/program.cpp | 3 +- taichi/python/export_misc.cpp | 2 +- taichi/runtime/gfx/CMakeLists.txt | 16 +++++ .../gfx/aot_graph_data.h} | 16 ++--- .../gfx}/aot_module_builder_impl.cpp | 19 +++--- .../gfx}/aot_module_builder_impl.h | 13 ++-- .../gfx}/aot_module_loader_impl.cpp | 30 +++++----- taichi/runtime/gfx/aot_module_loader_impl.h | 30 ++++++++++ .../vulkan => runtime/gfx}/aot_utils.h | 6 +- taichi/runtime/{vulkan => gfx}/runtime.cpp | 59 +++++++------------ taichi/runtime/{vulkan => gfx}/runtime.h | 16 ++--- .../gfx}/snode_tree_manager.cpp | 12 ++-- .../gfx}/snode_tree_manager.h | 12 ++-- taichi/runtime/opengl/opengl_api.cpp | 12 ++-- taichi/runtime/vulkan/CMakeLists.txt | 13 ---- tests/cpp/aot/aot_save_load_test.cpp | 24 ++++---- tests/cpp/backends/dx11_device_test.cpp | 4 +- tests/python/test_ad_basics.py | 4 +- 
tests/python/test_clear_all_gradients.py | 2 +- tests/python/test_element_wise.py | 2 +- tests/python/test_f16.py | 4 +- tests/python/test_fields_builder.py | 6 +- tests/python/test_loop_grad.py | 4 +- tests/python/test_print.py | 18 ++++-- tests/python/test_reduction.py | 5 +- tests/python/test_struct.py | 2 +- tests/python/test_torch_io.py | 24 ++++---- tests/python/test_types.py | 20 ++++--- tests/test_utils.py | 2 +- 40 files changed, 367 insertions(+), 289 deletions(-) delete mode 100644 taichi/backends/vulkan/aot_module_loader_impl.h create mode 100644 taichi/runtime/gfx/CMakeLists.txt rename taichi/{backends/vulkan/vulkan_graph_data.h => runtime/gfx/aot_graph_data.h} (55%) rename taichi/{backends/vulkan => runtime/gfx}/aot_module_builder_impl.cpp (93%) rename taichi/{backends/vulkan => runtime/gfx}/aot_module_builder_impl.h (85%) rename taichi/{backends/vulkan => runtime/gfx}/aot_module_loader_impl.cpp (85%) create mode 100644 taichi/runtime/gfx/aot_module_loader_impl.h rename taichi/{backends/vulkan => runtime/gfx}/aot_utils.h (84%) rename taichi/runtime/{vulkan => gfx}/runtime.cpp (94%) rename taichi/runtime/{vulkan => gfx}/runtime.h (93%) rename taichi/{backends/vulkan => runtime/gfx}/snode_tree_manager.cpp (73%) rename taichi/{backends/vulkan => runtime/gfx}/snode_tree_manager.h (79%) delete mode 100644 taichi/runtime/vulkan/CMakeLists.txt diff --git a/cmake/TaichiCXXFlags.cmake b/cmake/TaichiCXXFlags.cmake index a2e555d22b44d..aceaae0428c57 100644 --- a/cmake/TaichiCXXFlags.cmake +++ b/cmake/TaichiCXXFlags.cmake @@ -20,7 +20,8 @@ endif () # Do not enable lto for APPLE since it made linking extremely slow. 
if (WIN32) if (CMAKE_CXX_COMPILER_ID STREQUAL "Clang") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -flto=thin") + set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "${CMAKE_CXX_FLAGS} -flto=thin") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS} -flto=thin") endif() endif() diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index 1bcafcf015530..c9584aa66f7df 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -372,24 +372,21 @@ add_subdirectory(external/SPIRV-Tools) # https://github.com/KhronosGroup/SPIRV-Tools/issues/1569#issuecomment-390250792 target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE SPIRV-Tools-opt ${SPIRV_TOOLS}) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Headers/include) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Reflect) + +add_subdirectory(taichi/runtime/gfx) +target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE gfx_runtime) + +# Vulkan Device API if (TI_WITH_VULKAN) include_directories(SYSTEM external/Vulkan-Headers/include) include_directories(SYSTEM external/volk) - target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Headers/include) - target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Reflect) - # By specifying SYSTEM, we suppressed the warnings from third-party headers. 
target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMemoryAllocator/include) - if (LINUX) - # shaderc requires pthread - set(THREADS_PREFER_PTHREAD_FLAG ON) - find_package(Threads REQUIRED) - target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE Threads::Threads) - endif() - if (APPLE) find_library(MOLTEN_VK libMoltenVK.dylib PATHS $HOMEBREW_CELLAR/molten-vk $VULKAN_SDK REQUIRED) configure_file(${MOLTEN_VK} ${CMAKE_BINARY_DIR}/libMoltenVK.dylib COPYONLY) @@ -398,9 +395,6 @@ if (TI_WITH_VULKAN) install(FILES ${CMAKE_BINARY_DIR}/libMoltenVK.dylib DESTINATION ${INSTALL_LIB_DIR}/runtime) endif() endif() - - add_subdirectory(taichi/runtime/vulkan) - target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE vulkan_runtime) endif () diff --git a/taichi/aot/module_loader.cpp b/taichi/aot/module_loader.cpp index 908cc60db02c1..6526dfbc57969 100644 --- a/taichi/aot/module_loader.cpp +++ b/taichi/aot/module_loader.cpp @@ -1,6 +1,6 @@ #include "taichi/aot/module_loader.h" -#include "taichi/backends/vulkan/aot_module_loader_impl.h" +#include "taichi/runtime/gfx/aot_module_loader_impl.h" #include "taichi/backends/metal/aot_module_loader_impl.h" namespace taichi { @@ -32,19 +32,18 @@ Kernel *KernelTemplate::get_kernel( std::unique_ptr Module::load(Arch arch, std::any mod_params) { if (arch == Arch::vulkan) { #ifdef TI_WITH_VULKAN - return vulkan::make_aot_module(mod_params); -#else - TI_NOT_IMPLEMENTED + return gfx::make_aot_module(mod_params, arch); +#endif + } else if (arch == Arch::dx11) { +#ifdef TI_WITH_DX11 + return gfx::make_aot_module(mod_params, arch); #endif } else if (arch == Arch::metal) { #ifdef TI_WITH_METAL return metal::make_aot_module(mod_params); -#else - TI_NOT_IMPLEMENTED #endif - } else { - TI_NOT_IMPLEMENTED; } + TI_NOT_IMPLEMENTED; } Kernel *Module::get_kernel(const std::string &name) { diff --git a/taichi/backends/dx/dx_device.cpp b/taichi/backends/dx/dx_device.cpp index 436a40c221aee..27746b68e0f1f 100644 --- 
a/taichi/backends/dx/dx_device.cpp +++ b/taichi/backends/dx/dx_device.cpp @@ -22,7 +22,7 @@ void dump_buffer(ID3D11Device *device, void check_dx_error(HRESULT hr, const char *msg) { if (!SUCCEEDED(hr)) { - TI_ERROR("Error in {}: {}", msg, hr); + TI_ERROR("Error in {}: {:x}", msg, uint32_t(hr)); } } @@ -593,16 +593,21 @@ DeviceAllocation Dx11Device::allocate_memory(const AllocParams ¶ms) { void Dx11Device::dealloc_memory(DeviceAllocation handle) { uint32_t alloc_id = handle.alloc_id; - if (alloc_id_to_buffer_.count(alloc_id) == 0) - return; + if (alloc_id_to_buffer_.find(alloc_id) == alloc_id_to_buffer_.end()) + TI_ERROR("Invalid handle, possible double free?"); ID3D11Buffer *buf = alloc_id_to_buffer_[alloc_id]; buf->Release(); alloc_id_to_buffer_.erase(alloc_id); ID3D11UnorderedAccessView *uav = alloc_id_to_uav_[alloc_id]; uav->Release(); - ID3D11Buffer *cpucopy = alloc_id_to_cpucopy_[alloc_id]; - if (cpucopy) - cpucopy->Release(); + if (alloc_id_to_cpucopy_.find(alloc_id) != alloc_id_to_cpucopy_.end()) { + alloc_id_to_cpucopy_[alloc_id]->Release(); + alloc_id_to_cpucopy_.erase(alloc_id); + } + if (alloc_id_to_cb_copy_.find(alloc_id) != alloc_id_to_cb_copy_.end()) { + alloc_id_to_cb_copy_[alloc_id]->Release(); + alloc_id_to_cb_copy_.erase(alloc_id); + } alloc_id_to_uav_.erase(alloc_id); } @@ -724,10 +729,9 @@ ID3D11UnorderedAccessView *Dx11Device::alloc_id_to_uav(uint32_t alloc_id) { } ID3D11Buffer *Dx11Device::create_or_get_cb_buffer(uint32_t alloc_id) { - if (alloc_id_to_cb_copy_.count(alloc_id) > 0) { + if (alloc_id_to_cb_copy_.find(alloc_id) != alloc_id_to_cb_copy_.end()) { return alloc_id_to_cb_copy_[alloc_id]; } - assert(alloc_id_to_buffer_.count(alloc_id) > 0); ID3D11Buffer *buf = alloc_id_to_buffer_[alloc_id]; ID3D11Buffer *cb_buf; HRESULT hr = create_constant_buffer_copy(device_, buf, &cb_buf); diff --git a/taichi/backends/dx/dx_program.cpp b/taichi/backends/dx/dx_program.cpp index 1b76bdeffde9d..f41617536d3eb 100644 --- a/taichi/backends/dx/dx_program.cpp 
+++ b/taichi/backends/dx/dx_program.cpp @@ -3,18 +3,20 @@ #include "taichi/backends/dx/dx_program.h" #include "taichi/backends/dx/dx_device.h" -#include "taichi/backends/vulkan/snode_tree_manager.h" +#include "taichi/runtime/gfx/aot_module_builder_impl.h" +#include "taichi/runtime/gfx/snode_tree_manager.h" +#include "taichi/runtime/gfx/aot_module_loader_impl.h" namespace taichi { namespace lang { namespace directx11 { FunctionType compile_to_executable(Kernel *kernel, - vulkan::VkRuntime *runtime, - vulkan::SNodeTreeManager *snode_tree_mgr) { + gfx::GfxRuntime *runtime, + gfx::SNodeTreeManager *snode_tree_mgr) { auto handle = runtime->register_taichi_kernel( - std::move(vulkan::run_codegen(kernel, runtime->get_ti_device(), - snode_tree_mgr->get_compiled_structs()))); + std::move(gfx::run_codegen(kernel, runtime->get_ti_device(), + snode_tree_mgr->get_compiled_structs()))); return [runtime, handle](RuntimeContext &ctx) { runtime->launch_kernel(handle, &ctx); }; @@ -40,28 +42,52 @@ void Dx11ProgramImpl::materialize_runtime(MemoryPool *memory_pool, device_ = std::make_shared(); - vulkan::VkRuntime::Params params; + gfx::GfxRuntime::Params params; params.host_result_buffer = *result_buffer_ptr; params.device = device_.get(); - runtime_ = std::make_unique(std::move(params)); - snode_tree_mgr_ = std::make_unique(runtime_.get()); + runtime_ = std::make_unique(std::move(params)); + snode_tree_mgr_ = std::make_unique(runtime_.get()); } -void Dx11ProgramImpl::synchronize() { - TI_NOT_IMPLEMENTED; +void Dx11ProgramImpl::compile_snode_tree_types(SNodeTree *tree) { + if (runtime_) { + snode_tree_mgr_->materialize_snode_tree(tree); + } else { + gfx::CompiledSNodeStructs compiled_structs = + gfx::compile_snode_structs(*tree->root()); + aot_compiled_snode_structs_.push_back(compiled_structs); + } } void Dx11ProgramImpl::materialize_snode_tree(SNodeTree *tree, - uint64 *result_buffer_ptr) { + uint64 *result_buffer) { snode_tree_mgr_->materialize_snode_tree(tree); } std::unique_ptr 
Dx11ProgramImpl::make_aot_module_builder() { - return nullptr; + if (runtime_) { + return std::make_unique( + snode_tree_mgr_->get_compiled_structs(), Arch::dx11); + } else { + return std::make_unique( + aot_compiled_snode_structs_, Arch::dx11); + } } -void Dx11ProgramImpl::destroy_snode_tree(SNodeTree *snode_tree) { - TI_NOT_IMPLEMENTED; +DeviceAllocation Dx11ProgramImpl::allocate_memory_ndarray( + std::size_t alloc_size, + uint64 *result_buffer) { + return get_compute_device()->allocate_memory( + {alloc_size, /*host_write=*/false, /*host_read=*/false, + /*export_sharing=*/false}); +} + +std::unique_ptr Dx11ProgramImpl::make_aot_kernel(Kernel &kernel) { + spirv::lower(&kernel); + std::vector compiled_structs; + gfx::GfxRuntime::RegisterParams kparams = + gfx::run_codegen(&kernel, get_compute_device(), compiled_structs); + return std::make_unique(runtime_.get(), std::move(kparams)); } } // namespace lang diff --git a/taichi/backends/dx/dx_program.h b/taichi/backends/dx/dx_program.h index 603ee63abc3c4..1ae5522e563ba 100644 --- a/taichi/backends/dx/dx_program.h +++ b/taichi/backends/dx/dx_program.h @@ -3,8 +3,8 @@ #ifdef TI_WITH_DX11 #include "taichi/backends/dx/dx_device.h" -#include "taichi/runtime/vulkan/runtime.h" -#include "taichi/backends/vulkan/snode_tree_manager.h" +#include "taichi/runtime/gfx/runtime.h" +#include "taichi/runtime/gfx/snode_tree_manager.h" #include "taichi/program/program_impl.h" namespace taichi { @@ -13,26 +13,59 @@ namespace lang { class Dx11ProgramImpl : public ProgramImpl { public: Dx11ProgramImpl(CompileConfig &config); - FunctionType compile(Kernel *kernel, OffloadedStmt *offloaded) override; + std::size_t get_snode_num_dynamically_allocated( SNode *snode, uint64 *result_buffer) override { - return 0; + return 0; // TODO: support sparse } - std::unique_ptr make_aot_module_builder(); + + void compile_snode_tree_types(SNodeTree *tree) override; + void materialize_runtime(MemoryPool *memory_pool, KernelProfilerBase *profiler, uint64 
**result_buffer_ptr) override; - virtual void materialize_snode_tree(SNodeTree *tree, - uint64 *result_buffer_ptr) override; - virtual void destroy_snode_tree(SNodeTree *snode_tree) override; - void synchronize() override; + + void materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) override; + + void synchronize() override { + runtime_->synchronize(); + } + + StreamSemaphore flush() override { + return runtime_->flush(); + } + + std::unique_ptr make_aot_module_builder() override; + + void destroy_snode_tree(SNodeTree *snode_tree) override { + TI_ASSERT(snode_tree_mgr_ != nullptr); + snode_tree_mgr_->destroy_snode_tree(snode_tree); + } + + DeviceAllocation allocate_memory_ndarray(std::size_t alloc_size, + uint64 *result_buffer) override; + + Device *get_compute_device() override { + return device_.get(); + } + + Device *get_graphics_device() override { + return device_.get(); + } + + DevicePtr get_snode_tree_device_ptr(int tree_id) override { + return snode_tree_mgr_->get_snode_tree_device_ptr(tree_id); + } + + std::unique_ptr make_aot_kernel(Kernel &kernel) override; private: std::shared_ptr device_{nullptr}; - std::unique_ptr runtime_{nullptr}; - std::unique_ptr snode_tree_mgr_{nullptr}; + std::unique_ptr runtime_{nullptr}; + std::unique_ptr snode_tree_mgr_{nullptr}; + std::vector aot_compiled_snode_structs_; }; } // namespace lang diff --git a/taichi/backends/opengl/opengl_device.cpp b/taichi/backends/opengl/opengl_device.cpp index b399cbb12e9ab..0da96f76d3685 100644 --- a/taichi/backends/opengl/opengl_device.cpp +++ b/taichi/backends/opengl/opengl_device.cpp @@ -197,7 +197,7 @@ void GLResourceBinder::rw_buffer(uint32_t set, DeviceAllocation alloc) { TI_ASSERT_INFO(set == 0, "OpenGL only supports set = 0, requested set = {}", set); - binding_map_[binding] = alloc.alloc_id; + ssbo_binding_map_[binding] = alloc.alloc_id; } void GLResourceBinder::buffer(uint32_t set, @@ -211,7 +211,9 @@ void GLResourceBinder::buffer(uint32_t set, void 
GLResourceBinder::buffer(uint32_t set, uint32_t binding, DeviceAllocation alloc) { - rw_buffer(set, binding, alloc); + TI_ASSERT_INFO(set == 0, "OpenGL only supports set = 0, requested set = {}", + set); + ubo_binding_map_[binding] = alloc.alloc_id; } void GLResourceBinder::image(uint32_t set, @@ -295,10 +297,17 @@ void GLCommandList::bind_pipeline(Pipeline *p) { void GLCommandList::bind_resources(ResourceBinder *_binder) { GLResourceBinder *binder = static_cast(_binder); - for (auto &[binding, buffer] : binder->binding_map()) { + for (auto &[binding, buffer] : binder->ssbo_binding_map()) { + auto cmd = std::make_unique(); + cmd->buffer = buffer; + cmd->index = binding; + recorded_commands_.push_back(std::move(cmd)); + } + for (auto &[binding, buffer] : binder->ubo_binding_map()) { auto cmd = std::make_unique(); cmd->buffer = buffer; cmd->index = binding; + cmd->target = GL_UNIFORM_BUFFER; recorded_commands_.push_back(std::move(cmd)); } } @@ -682,7 +691,7 @@ void GLCommandList::CmdBindPipeline::execute() { } void GLCommandList::CmdBindBufferToIndex::execute() { - glBindBufferBase(GL_SHADER_STORAGE_BUFFER, index, buffer); + glBindBufferBase(target, index, buffer); check_opengl_error("glBindBufferBase"); } diff --git a/taichi/backends/opengl/opengl_device.h b/taichi/backends/opengl/opengl_device.h index 99eb1cba77eb5..7ab1e8c6dc185 100644 --- a/taichi/backends/opengl/opengl_device.h +++ b/taichi/backends/opengl/opengl_device.h @@ -53,12 +53,17 @@ class GLResourceBinder : public ResourceBinder { // index_width = 2 -> uint16 index void index_buffer(DevicePtr ptr, size_t index_width) override; - const std::unordered_map &binding_map() { - return binding_map_; + const std::unordered_map &ssbo_binding_map() { + return ssbo_binding_map_; + } + + const std::unordered_map &ubo_binding_map() { + return ubo_binding_map_; } private: - std::unordered_map binding_map_; + std::unordered_map ssbo_binding_map_; + std::unordered_map ubo_binding_map_; }; class GLPipeline : public 
Pipeline { @@ -141,6 +146,7 @@ class GLCommandList : public CommandList { struct CmdBindBufferToIndex : public Cmd { GLuint buffer{0}; GLuint index{0}; + GLenum target{GL_SHADER_STORAGE_BUFFER}; void execute() override; }; diff --git a/taichi/backends/vulkan/aot_module_loader_impl.h b/taichi/backends/vulkan/aot_module_loader_impl.h deleted file mode 100644 index 16230e1411306..0000000000000 --- a/taichi/backends/vulkan/aot_module_loader_impl.h +++ /dev/null @@ -1,27 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "taichi/backends/vulkan/aot_utils.h" -#include "taichi/runtime/vulkan/runtime.h" -#include "taichi/codegen/spirv/kernel_utils.h" -#include "taichi/aot/module_builder.h" -#include "taichi/aot/module_loader.h" -#include "taichi/backends/vulkan/aot_module_builder_impl.h" -#include "taichi/backends/vulkan/vulkan_graph_data.h" - -namespace taichi { -namespace lang { -namespace vulkan { -struct TI_DLL_EXPORT AotModuleParams { - std::string module_path; - VkRuntime *runtime{nullptr}; -}; - -TI_DLL_EXPORT std::unique_ptr make_aot_module(std::any mod_params); - -} // namespace vulkan -} // namespace lang -} // namespace taichi diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/backends/vulkan/vulkan_program.cpp index cddb446ae469f..d7cac58be11b8 100644 --- a/taichi/backends/vulkan/vulkan_program.cpp +++ b/taichi/backends/vulkan/vulkan_program.cpp @@ -1,8 +1,8 @@ #include "taichi/backends/vulkan/vulkan_program.h" -#include "taichi/backends/vulkan/aot_module_builder_impl.h" -#include "taichi/backends/vulkan/snode_tree_manager.h" -#include "taichi/backends/vulkan/aot_module_loader_impl.h" +#include "taichi/runtime/gfx/aot_module_builder_impl.h" +#include "taichi/runtime/gfx/snode_tree_manager.h" +#include "taichi/runtime/gfx/aot_module_loader_impl.h" #if !defined(ANDROID) && !defined(TI_EMSCRIPTENED) #include "GLFW/glfw3.h" @@ -69,11 +69,11 @@ VulkanProgramImpl::VulkanProgramImpl(CompileConfig &config) } FunctionType 
compile_to_executable(Kernel *kernel, - VkRuntime *runtime, - SNodeTreeManager *snode_tree_mgr) { + gfx::GfxRuntime *runtime, + gfx::SNodeTreeManager *snode_tree_mgr) { auto handle = runtime->register_taichi_kernel( - run_codegen(kernel, runtime->get_ti_device(), - snode_tree_mgr->get_compiled_structs())); + gfx::run_codegen(kernel, runtime->get_ti_device(), + snode_tree_mgr->get_compiled_structs())); return [runtime, handle](RuntimeContext &ctx) { runtime->launch_kernel(handle, &ctx); }; @@ -144,20 +144,20 @@ void VulkanProgramImpl::materialize_runtime(MemoryPool *memory_pool, embedded_device_ = std::make_unique(evd_params); - vulkan::VkRuntime::Params params; + gfx::GfxRuntime::Params params; params.host_result_buffer = *result_buffer_ptr; params.device = embedded_device_->device(); - vulkan_runtime_ = std::make_unique(std::move(params)); + vulkan_runtime_ = std::make_unique(std::move(params)); snode_tree_mgr_ = - std::make_unique(vulkan_runtime_.get()); + std::make_unique(vulkan_runtime_.get()); } void VulkanProgramImpl::compile_snode_tree_types(SNodeTree *tree) { if (vulkan_runtime_) { snode_tree_mgr_->materialize_snode_tree(tree); } else { - CompiledSNodeStructs compiled_structs = - vulkan::compile_snode_structs(*tree->root()); + gfx::CompiledSNodeStructs compiled_structs = + gfx::compile_snode_structs(*tree->root()); aot_compiled_snode_structs_.push_back(compiled_structs); } } @@ -169,10 +169,11 @@ void VulkanProgramImpl::materialize_snode_tree(SNodeTree *tree, std::unique_ptr VulkanProgramImpl::make_aot_module_builder() { if (vulkan_runtime_) { - return std::make_unique( - snode_tree_mgr_->get_compiled_structs()); + return std::make_unique( + snode_tree_mgr_->get_compiled_structs(), Arch::vulkan); } else { - return std::make_unique(aot_compiled_snode_structs_); + return std::make_unique( + aot_compiled_snode_structs_, Arch::vulkan); } } @@ -187,11 +188,11 @@ DeviceAllocation VulkanProgramImpl::allocate_memory_ndarray( std::unique_ptr 
VulkanProgramImpl::make_aot_kernel( Kernel &kernel) { spirv::lower(&kernel); - std::vector compiled_structs; - VkRuntime::RegisterParams kparams = - run_codegen(&kernel, get_compute_device(), compiled_structs); - return std::make_unique(vulkan_runtime_.get(), - std::move(kparams)); + std::vector compiled_structs; + gfx::GfxRuntime::RegisterParams kparams = + gfx::run_codegen(&kernel, get_compute_device(), compiled_structs); + return std::make_unique(vulkan_runtime_.get(), + std::move(kparams)); } VulkanProgramImpl::~VulkanProgramImpl() { diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/backends/vulkan/vulkan_program.h index b8d7a820fb7be..830910ebcc858 100644 --- a/taichi/backends/vulkan/vulkan_program.h +++ b/taichi/backends/vulkan/vulkan_program.h @@ -6,8 +6,8 @@ #include "taichi/backends/vulkan/vulkan_device_creator.h" #include "taichi/backends/vulkan/vulkan_utils.h" #include "taichi/backends/vulkan/vulkan_loader.h" -#include "taichi/runtime/vulkan/runtime.h" -#include "taichi/backends/vulkan/snode_tree_manager.h" +#include "taichi/runtime/gfx/runtime.h" +#include "taichi/runtime/gfx/snode_tree_manager.h" #include "taichi/backends/vulkan/vulkan_device.h" #include "vk_mem_alloc.h" @@ -35,7 +35,7 @@ class VulkanProgramImpl : public ProgramImpl { std::size_t get_snode_num_dynamically_allocated( SNode *snode, uint64 *result_buffer) override { - return 0; // TODO: support sparse in vulkan + return 0; // TODO: support sparse } void compile_snode_tree_types(SNodeTree *tree) override; @@ -56,7 +56,7 @@ class VulkanProgramImpl : public ProgramImpl { std::unique_ptr make_aot_module_builder() override; - virtual void destroy_snode_tree(SNodeTree *snode_tree) override { + void destroy_snode_tree(SNodeTree *snode_tree) override { TI_ASSERT(snode_tree_mgr_ != nullptr); snode_tree_mgr_->destroy_snode_tree(snode_tree); } @@ -88,8 +88,8 @@ class VulkanProgramImpl : public ProgramImpl { private: std::unique_ptr embedded_device_{nullptr}; - std::unique_ptr 
vulkan_runtime_{nullptr}; - std::unique_ptr snode_tree_mgr_{nullptr}; + std::unique_ptr vulkan_runtime_{nullptr}; + std::unique_ptr snode_tree_mgr_{nullptr}; std::vector aot_compiled_snode_structs_; }; } // namespace lang diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index fdce6b501fef3..10603a6f7bbef 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -193,7 +193,8 @@ void Program::materialize_runtime() { } void Program::destroy_snode_tree(SNodeTree *snode_tree) { - TI_ASSERT(arch_uses_llvm(config.arch) || config.arch == Arch::vulkan); + TI_ASSERT(arch_uses_llvm(config.arch) || config.arch == Arch::vulkan || + config.arch == Arch::dx11); program_impl_->destroy_snode_tree(snode_tree); free_snode_tree_ids_.push(snode_tree->id()); } diff --git a/taichi/python/export_misc.cpp b/taichi/python/export_misc.cpp index 2ede85517e551..44e1f88c0e72c 100644 --- a/taichi/python/export_misc.cpp +++ b/taichi/python/export_misc.cpp @@ -5,7 +5,7 @@ #include "taichi/backends/metal/api.h" #include "taichi/runtime/opengl/opengl_api.h" -#include "taichi/runtime/vulkan/runtime.h" +#include "taichi/runtime/gfx/runtime.h" #include "taichi/backends/dx/dx_api.h" #include "taichi/common/core.h" #include "taichi/common/interface.h" diff --git a/taichi/runtime/gfx/CMakeLists.txt b/taichi/runtime/gfx/CMakeLists.txt new file mode 100644 index 0000000000000..bc5bcdf03b5fb --- /dev/null +++ b/taichi/runtime/gfx/CMakeLists.txt @@ -0,0 +1,16 @@ +# ./taichi/runtime/gfx/CMakeLists.txt + +add_library(gfx_runtime) +target_sources(gfx_runtime + PRIVATE + runtime.cpp + snode_tree_manager.cpp + aot_module_builder_impl.cpp + aot_module_loader_impl.cpp + ) +target_include_directories(gfx_runtime + PRIVATE + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/FP16/include + ) diff --git a/taichi/backends/vulkan/vulkan_graph_data.h b/taichi/runtime/gfx/aot_graph_data.h similarity index 55% 
rename from taichi/backends/vulkan/vulkan_graph_data.h rename to taichi/runtime/gfx/aot_graph_data.h index 6fa3cafc1e3e0..b9d8b4315ab65 100644 --- a/taichi/backends/vulkan/vulkan_graph_data.h +++ b/taichi/runtime/gfx/aot_graph_data.h @@ -1,12 +1,12 @@ #pragma once -#include "taichi/runtime/vulkan/runtime.h" +#include "taichi/runtime/gfx/runtime.h" namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { class KernelImpl : public aot::Kernel { public: - explicit KernelImpl(VkRuntime *runtime, VkRuntime::RegisterParams &¶ms) + explicit KernelImpl(GfxRuntime *runtime, GfxRuntime::RegisterParams &¶ms) : runtime_(runtime), params_(std::move(params)) { handle_ = runtime_->register_taichi_kernel(params_); } @@ -15,15 +15,15 @@ class KernelImpl : public aot::Kernel { runtime_->launch_kernel(handle_, ctx); } - const VkRuntime::RegisterParams ¶ms() { + const GfxRuntime::RegisterParams ¶ms() { return params_; } private: - VkRuntime *const runtime_; - VkRuntime::KernelHandle handle_; - const VkRuntime::RegisterParams params_; + GfxRuntime *const runtime_; + GfxRuntime::KernelHandle handle_; + const GfxRuntime::RegisterParams params_; }; -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/aot_module_builder_impl.cpp b/taichi/runtime/gfx/aot_module_builder_impl.cpp similarity index 93% rename from taichi/backends/vulkan/aot_module_builder_impl.cpp rename to taichi/runtime/gfx/aot_module_builder_impl.cpp index ed03800098bd1..48e3b7964de03 100644 --- a/taichi/backends/vulkan/aot_module_builder_impl.cpp +++ b/taichi/runtime/gfx/aot_module_builder_impl.cpp @@ -1,15 +1,15 @@ -#include "taichi/backends/vulkan/aot_module_builder_impl.h" +#include "taichi/runtime/gfx/aot_module_builder_impl.h" #include #include #include "taichi/aot/module_data.h" #include "taichi/codegen/spirv/spirv_codegen.h" -#include "taichi/backends/vulkan/vulkan_graph_data.h" +#include "taichi/runtime/gfx/aot_graph_data.h" 
namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { namespace { class AotDataConverter { @@ -66,7 +66,6 @@ class AotDataConverter { aot::CompiledOffloadedTask res{}; res.type = offloaded_task_type_name(in.task_type); res.name = in.name; - // TODO: update range_hint after ndarray is supported on vulkan. if (in.range_for_attribs && in.range_for_attribs->const_begin && in.range_for_attribs->const_end) { res.range_hint = std::to_string(in.range_for_attribs->end - @@ -98,9 +97,11 @@ class AotDataConverter { } // namespace AotModuleBuilderImpl::AotModuleBuilderImpl( - const std::vector &compiled_structs) - : compiled_structs_(compiled_structs) { - aot_target_device_ = std::make_unique(Arch::vulkan); + const std::vector &compiled_structs, + Arch device_api_backend) + : compiled_structs_(compiled_structs), + device_api_backend_(device_api_backend) { + aot_target_device_ = std::make_unique(device_api_backend_); if (!compiled_structs.empty()) { ti_aot_data_.root_buffer_size = compiled_structs[0].root_size; } @@ -120,7 +121,7 @@ std::string AotModuleBuilderImpl::write_spv_file( void AotModuleBuilderImpl::dump(const std::string &output_dir, const std::string &filename) const { TI_WARN_IF(!filename.empty(), - "Filename prefix is ignored on vulkan backend."); + "Filename prefix is ignored on Unified Device API backends."); const std::string bin_path = fmt::format("{}/metadata.tcb", output_dir); write_to_binary_file(ti_aot_data_, bin_path); @@ -199,6 +200,6 @@ void AotModuleBuilderImpl::add_per_backend_tmpl(const std::string &identifier, ti_aot_data_.spirv_codes.push_back(compiled.task_spirv_source_codes); } -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/aot_module_builder_impl.h b/taichi/runtime/gfx/aot_module_builder_impl.h similarity index 85% rename from taichi/backends/vulkan/aot_module_builder_impl.h rename to taichi/runtime/gfx/aot_module_builder_impl.h index 
40dc4157c06f4..a6eeaaa3af143 100644 --- a/taichi/backends/vulkan/aot_module_builder_impl.h +++ b/taichi/runtime/gfx/aot_module_builder_impl.h @@ -4,19 +4,20 @@ #include #include "taichi/aot/module_builder.h" -#include "taichi/backends/vulkan/aot_utils.h" -#include "taichi/runtime/vulkan/runtime.h" +#include "taichi/runtime/gfx/aot_utils.h" +#include "taichi/runtime/gfx/runtime.h" #include "taichi/codegen/spirv/snode_struct_compiler.h" #include "taichi/codegen/spirv/kernel_utils.h" namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { class AotModuleBuilderImpl : public AotModuleBuilder { public: explicit AotModuleBuilderImpl( - const std::vector &compiled_structs); + const std::vector &compiled_structs, + Arch device_api_backend); void dump(const std::string &output_dir, const std::string &filename) const override; @@ -45,8 +46,10 @@ class AotModuleBuilderImpl : public AotModuleBuilder { const std::vector &compiled_structs_; TaichiAotData ti_aot_data_; std::unique_ptr aot_target_device_; + + Arch device_api_backend_; }; -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/aot_module_loader_impl.cpp b/taichi/runtime/gfx/aot_module_loader_impl.cpp similarity index 85% rename from taichi/backends/vulkan/aot_module_loader_impl.cpp rename to taichi/runtime/gfx/aot_module_loader_impl.cpp index 4ea34de89fc7c..497636b5b7c2c 100644 --- a/taichi/backends/vulkan/aot_module_loader_impl.cpp +++ b/taichi/runtime/gfx/aot_module_loader_impl.cpp @@ -1,30 +1,30 @@ -#include "taichi/backends/vulkan/aot_module_loader_impl.h" +#include "taichi/runtime/gfx/aot_module_loader_impl.h" #include #include -#include "taichi/runtime/vulkan/runtime.h" +#include "taichi/runtime/gfx/runtime.h" #include "taichi/aot/graph_data.h" namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { namespace { class FieldImpl : public aot::Field { public: - explicit FieldImpl(VkRuntime *runtime, const 
aot::CompiledFieldData &field) + explicit FieldImpl(GfxRuntime *runtime, const aot::CompiledFieldData &field) : runtime_(runtime), field_(field) { } private: - VkRuntime *const runtime_; + GfxRuntime *const runtime_; aot::CompiledFieldData field_; }; class AotModuleImpl : public aot::Module { public: - explicit AotModuleImpl(const AotModuleParams ¶ms) - : runtime_(params.runtime) { + explicit AotModuleImpl(const AotModuleParams ¶ms, Arch device_api_backend) + : runtime_(params.runtime), device_api_backend_(device_api_backend) { const std::string bin_path = fmt::format("{}/metadata.tcb", params.module_path); read_from_binary_file(ti_aot_data_, bin_path); @@ -63,7 +63,7 @@ class AotModuleImpl : public aot::Module { // Module metadata Arch arch() const override { - return Arch::vulkan; + return device_api_backend_; } uint64_t version() const override { TI_NOT_IMPLEMENTED; @@ -82,7 +82,7 @@ class AotModuleImpl : public aot::Module { } bool get_kernel_params_by_name(const std::string &name, - VkRuntime::RegisterParams &kernel) { + GfxRuntime::RegisterParams &kernel) { for (int i = 0; i < ti_aot_data_.kernels.size(); ++i) { // Offloaded task names encode more than the name of the function, but for // AOT, only use the name of the function which should be the first part @@ -102,7 +102,7 @@ class AotModuleImpl : public aot::Module { std::unique_ptr make_new_kernel( const std::string &name) override { - VkRuntime::RegisterParams kparams; + GfxRuntime::RegisterParams kparams; if (!get_kernel_params_by_name(name, kparams)) { TI_DEBUG("Failed to load kernel {}", name); return nullptr; @@ -139,16 +139,18 @@ class AotModuleImpl : public aot::Module { } TaichiAotData ti_aot_data_; - VkRuntime *runtime_{nullptr}; + GfxRuntime *runtime_{nullptr}; + Arch device_api_backend_; }; } // namespace -std::unique_ptr make_aot_module(std::any mod_params) { +std::unique_ptr make_aot_module(std::any mod_params, + Arch device_api_backend) { AotModuleParams params = std::any_cast(mod_params); - 
return std::make_unique(params); + return std::make_unique(params, device_api_backend); } -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/runtime/gfx/aot_module_loader_impl.h b/taichi/runtime/gfx/aot_module_loader_impl.h new file mode 100644 index 0000000000000..6fec0a063bf24 --- /dev/null +++ b/taichi/runtime/gfx/aot_module_loader_impl.h @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +#include "taichi/runtime/gfx/aot_utils.h" +#include "taichi/runtime/gfx/runtime.h" +#include "taichi/runtime/gfx/aot_module_builder_impl.h" +#include "taichi/runtime/gfx/aot_graph_data.h" +#include "taichi/codegen/spirv/kernel_utils.h" +#include "taichi/aot/module_builder.h" +#include "taichi/aot/module_loader.h" + +namespace taichi { +namespace lang { +namespace gfx { + +struct TI_DLL_EXPORT AotModuleParams { + std::string module_path; + GfxRuntime *runtime{nullptr}; +}; + +TI_DLL_EXPORT std::unique_ptr make_aot_module( + std::any mod_params, + Arch device_api_backend); + +} // namespace gfx +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/vulkan/aot_utils.h b/taichi/runtime/gfx/aot_utils.h similarity index 84% rename from taichi/backends/vulkan/aot_utils.h rename to taichi/runtime/gfx/aot_utils.h index 5c00d4023efa8..e8c1f5b0ea150 100644 --- a/taichi/backends/vulkan/aot_utils.h +++ b/taichi/runtime/gfx/aot_utils.h @@ -7,10 +7,10 @@ namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { /** - * AOT module data for the vulkan backend. + * AOT module data for the Unified Device API backend. 
*/ struct TaichiAotData { // BufferMetaData metadata; @@ -22,6 +22,6 @@ struct TaichiAotData { TI_IO_DEF(kernels, fields, root_buffer_size); }; -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/runtime/vulkan/runtime.cpp b/taichi/runtime/gfx/runtime.cpp similarity index 94% rename from taichi/runtime/vulkan/runtime.cpp rename to taichi/runtime/gfx/runtime.cpp index 3a4d199824a13..d47e19c71ce46 100644 --- a/taichi/runtime/vulkan/runtime.cpp +++ b/taichi/runtime/gfx/runtime.cpp @@ -1,4 +1,4 @@ -#include "taichi/runtime/vulkan/runtime.h" +#include "taichi/runtime/gfx/runtime.h" #include "taichi/program/program.h" #include @@ -18,28 +18,9 @@ namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { namespace { -class StopWatch { - public: - StopWatch() : begin_(std::chrono::system_clock::now()) { - } - - int get_micros() { - typedef std::chrono::duration fsec; - - auto now = std::chrono::system_clock::now(); - - fsec fs = now - begin_; - begin_ = now; - auto d = std::chrono::duration_cast(fs); - return d.count(); - } - - private: - std::chrono::time_point begin_; -}; class HostDeviceContextBlitter { public: @@ -123,7 +104,7 @@ class HostDeviceContextBlitter { break; } } - TI_ERROR("Vulkan does not support arg type={}", + TI_ERROR("Device does not support arg type={}", PrimitiveType::get(arg.dtype).to_string()); } while (0); } @@ -227,7 +208,7 @@ class HostDeviceContextBlitter { continue; } } - TI_ERROR("Vulkan does not support return value type={}", + TI_ERROR("Device does not support return value type={}", data_type_name(PrimitiveType::get(ret.dtype))); } } @@ -268,7 +249,7 @@ constexpr size_t kGtmpBufferSize = 1024 * 1024; constexpr size_t kListGenBufferSize = 32 << 20; // Info for launching a compiled Taichi kernel, which consists of a series of -// Vulkan pipelines. +// Unified Device API pipelines. 
CompiledTaichiKernel::CompiledTaichiKernel(const Params &ti_params) : ti_kernel_attribs_(*ti_params.ti_kernel_attribs), @@ -372,14 +353,14 @@ void CompiledTaichiKernel::generate_command_list( } } -VkRuntime::VkRuntime(const Params ¶ms) +GfxRuntime::GfxRuntime(const Params ¶ms) : device_(params.device), host_result_buffer_(params.host_result_buffer) { TI_ASSERT(host_result_buffer_ != nullptr); current_cmdlist_pending_since_ = high_res_clock::now(); init_nonroot_buffers(); } -VkRuntime::~VkRuntime() { +GfxRuntime::~GfxRuntime() { synchronize(); { decltype(ti_kernels_) tmp; @@ -388,8 +369,8 @@ VkRuntime::~VkRuntime() { global_tmps_buffer_.reset(); } -VkRuntime::KernelHandle VkRuntime::register_taichi_kernel( - VkRuntime::RegisterParams reg_params) { +GfxRuntime::KernelHandle GfxRuntime::register_taichi_kernel( + GfxRuntime::RegisterParams reg_params) { CompiledTaichiKernel::Params params; params.ti_kernel_attribs = &(reg_params.kernel_attribs); params.num_snode_trees = reg_params.num_snode_trees; @@ -414,7 +395,7 @@ VkRuntime::KernelHandle VkRuntime::register_taichi_kernel( return res; } -void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { +void GfxRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { auto *ti_kernel = ti_kernels_[handle.id_].get(); std::unique_ptr args_buffer{nullptr}, @@ -553,13 +534,13 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { } } -void VkRuntime::synchronize() { +void GfxRuntime::synchronize() { flush(); device_->wait_idle(); ctx_buffers_.clear(); } -StreamSemaphore VkRuntime::flush() { +StreamSemaphore GfxRuntime::flush() { StreamSemaphore sema; if (current_cmdlist_) { sema = device_->get_compute_stream()->submit(current_cmdlist_.get()); @@ -572,11 +553,11 @@ StreamSemaphore VkRuntime::flush() { return sema; } -Device *VkRuntime::get_ti_device() const { +Device *GfxRuntime::get_ti_device() const { return device_; } -void VkRuntime::init_nonroot_buffers() { +void 
GfxRuntime::init_nonroot_buffers() { global_tmps_buffer_ = device_->allocate_memory_unique( {kGtmpBufferSize, /*host_write=*/false, /*host_read=*/false, @@ -598,7 +579,7 @@ void VkRuntime::init_nonroot_buffers() { stream->submit_synced(cmdlist.get()); } -void VkRuntime::add_root_buffer(size_t root_buffer_size) { +void GfxRuntime::add_root_buffer(size_t root_buffer_size) { if (root_buffer_size == 0) { root_buffer_size = 4; // there might be empty roots } @@ -617,14 +598,14 @@ void VkRuntime::add_root_buffer(size_t root_buffer_size) { root_buffers_size_map_[root_buffers_.back().get()] = root_buffer_size; } -DeviceAllocation *VkRuntime::get_root_buffer(int id) const { +DeviceAllocation *GfxRuntime::get_root_buffer(int id) const { if (id >= root_buffers_.size()) { TI_ERROR("root buffer id {} not found", id); } return root_buffers_[id].get(); } -size_t VkRuntime::get_root_buffer_size(int id) const { +size_t GfxRuntime::get_root_buffer_size(int id) const { auto it = root_buffers_size_map_.find(root_buffers_[id].get()); if (id >= root_buffers_.size() || it == root_buffers_size_map_.end()) { TI_ERROR("root buffer id {} not found", id); @@ -632,7 +613,7 @@ size_t VkRuntime::get_root_buffer_size(int id) const { return it->second; } -VkRuntime::RegisterParams run_codegen( +GfxRuntime::RegisterParams run_codegen( Kernel *kernel, Device *device, const std::vector &compiled_structs) { @@ -647,12 +628,12 @@ VkRuntime::RegisterParams run_codegen( params.enable_spv_opt = kernel->program->config.external_optimization_level > 0; spirv::KernelCodegen codegen(params); - VkRuntime::RegisterParams res; + GfxRuntime::RegisterParams res; codegen.run(res.kernel_attribs, res.task_spirv_source_codes); res.num_snode_trees = compiled_structs.size(); return res; } -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/runtime/vulkan/runtime.h b/taichi/runtime/gfx/runtime.h similarity index 93% rename from taichi/runtime/vulkan/runtime.h rename to 
taichi/runtime/gfx/runtime.h index c4f356783bcea..cbe1a48d758d1 100644 --- a/taichi/runtime/vulkan/runtime.h +++ b/taichi/runtime/gfx/runtime.h @@ -14,7 +14,7 @@ namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { using namespace taichi::lang::spirv; @@ -73,20 +73,20 @@ class CompiledTaichiKernel { std::vector> pipelines_; }; -class TI_DLL_EXPORT VkRuntime { +class TI_DLL_EXPORT GfxRuntime { public: struct Params { uint64_t *host_result_buffer{nullptr}; Device *device{nullptr}; }; - explicit VkRuntime(const Params ¶ms); + explicit GfxRuntime(const Params ¶ms); // To make Pimpl + std::unique_ptr work - ~VkRuntime(); + ~GfxRuntime(); class KernelHandle { private: - friend class VkRuntime; + friend class GfxRuntime; int id_ = -1; }; @@ -113,7 +113,7 @@ class TI_DLL_EXPORT VkRuntime { size_t get_root_buffer_size(int id) const; private: - friend class taichi::lang::vulkan::SNodeTreeManager; + friend class taichi::lang::gfx::SNodeTreeManager; void init_nonroot_buffers(); @@ -135,11 +135,11 @@ class TI_DLL_EXPORT VkRuntime { std::unordered_map root_buffers_size_map_; }; -VkRuntime::RegisterParams run_codegen( +GfxRuntime::RegisterParams run_codegen( Kernel *kernel, Device *device, const std::vector &compiled_structs); -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/snode_tree_manager.cpp b/taichi/runtime/gfx/snode_tree_manager.cpp similarity index 73% rename from taichi/backends/vulkan/snode_tree_manager.cpp rename to taichi/runtime/gfx/snode_tree_manager.cpp index b7d4816ae42d9..0ace8e561304d 100644 --- a/taichi/backends/vulkan/snode_tree_manager.cpp +++ b/taichi/runtime/gfx/snode_tree_manager.cpp @@ -1,17 +1,17 @@ -#include "taichi/backends/vulkan/snode_tree_manager.h" +#include "taichi/runtime/gfx/snode_tree_manager.h" -#include "taichi/runtime/vulkan/runtime.h" +#include "taichi/runtime/gfx/runtime.h" namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { 
-SNodeTreeManager::SNodeTreeManager(VkRuntime *rtm) : runtime_(rtm) { +SNodeTreeManager::SNodeTreeManager(GfxRuntime *rtm) : runtime_(rtm) { } void SNodeTreeManager::materialize_snode_tree(SNodeTree *tree) { auto *const root = tree->root(); - CompiledSNodeStructs compiled_structs = vulkan::compile_snode_structs(*root); + CompiledSNodeStructs compiled_structs = compile_snode_structs(*root); runtime_->add_root_buffer(compiled_structs.root_size); compiled_snode_structs_.push_back(compiled_structs); } @@ -33,6 +33,6 @@ DevicePtr SNodeTreeManager::get_snode_tree_device_ptr(int tree_id) { return runtime_->root_buffers_[tree_id]->get_ptr(); } -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/backends/vulkan/snode_tree_manager.h b/taichi/runtime/gfx/snode_tree_manager.h similarity index 79% rename from taichi/backends/vulkan/snode_tree_manager.h rename to taichi/runtime/gfx/snode_tree_manager.h index d946c308632b9..3add69e6c4a41 100644 --- a/taichi/backends/vulkan/snode_tree_manager.h +++ b/taichi/runtime/gfx/snode_tree_manager.h @@ -8,12 +8,12 @@ namespace taichi { namespace lang { -namespace vulkan { +namespace gfx { -class VkRuntime; +class GfxRuntime; /** - * @brief Manages the SNodeTrees for the Vulkan backend. + * @brief Manages the SNodeTrees for the underlying backend. 
* */ class SNodeTreeManager { @@ -21,7 +21,7 @@ class SNodeTreeManager { using CompiledSNodeStructs = taichi::lang::spirv::CompiledSNodeStructs; public: - explicit SNodeTreeManager(VkRuntime *rtm); + explicit SNodeTreeManager(GfxRuntime *rtm); const std::vector &get_compiled_structs() const { return compiled_snode_structs_; @@ -34,10 +34,10 @@ class SNodeTreeManager { DevicePtr get_snode_tree_device_ptr(int tree_id); private: - VkRuntime *const runtime_; + GfxRuntime *const runtime_; std::vector compiled_snode_structs_; }; -} // namespace vulkan +} // namespace gfx } // namespace lang } // namespace taichi diff --git a/taichi/runtime/opengl/opengl_api.cpp b/taichi/runtime/opengl/opengl_api.cpp index d4c0c93667592..daa6144e398bd 100644 --- a/taichi/runtime/opengl/opengl_api.cpp +++ b/taichi/runtime/opengl/opengl_api.cpp @@ -461,12 +461,12 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, for (const auto &task : program_.tasks) { auto binder = compiled_pipeline_[i]->resource_binder(); auto &core_bufs = runtime->impl->core_bufs; - binder->buffer(0, static_cast(GLBufId::Runtime), core_bufs.runtime); + binder->rw_buffer(0, static_cast(GLBufId::Runtime), core_bufs.runtime); if (program_.used.buf_data) - binder->buffer(0, static_cast(GLBufId::Root), core_bufs.root); - binder->buffer(0, static_cast(GLBufId::Gtmp), core_bufs.gtmp); + binder->rw_buffer(0, static_cast(GLBufId::Root), core_bufs.root); + binder->rw_buffer(0, static_cast(GLBufId::Gtmp), core_bufs.gtmp); if (program_.args_buf_size || program_.ret_buf_size) - binder->buffer(0, static_cast(GLBufId::Args), *args_buf_); + binder->rw_buffer(0, static_cast(GLBufId::Args), *args_buf_); // TODO: properly assert and throw if we bind more than allowed SSBOs. // On most devices this number is 8. But I need to look up how // to query this information so currently this is thrown from OpenGl. 
@@ -475,9 +475,9 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx, DeviceAllocation *ptr = static_cast((void *)ctx.args[arg_id]); - binder->buffer(0, bind_id, *ptr); + binder->rw_buffer(0, bind_id, *ptr); } else { - binder->buffer(0, bind_id, ext_arr_bufs_[arg_id]); + binder->rw_buffer(0, bind_id, ext_arr_bufs_[arg_id]); } } diff --git a/taichi/runtime/vulkan/CMakeLists.txt b/taichi/runtime/vulkan/CMakeLists.txt deleted file mode 100644 index 00ecee7a09caf..0000000000000 --- a/taichi/runtime/vulkan/CMakeLists.txt +++ /dev/null @@ -1,13 +0,0 @@ -# ./taichi/runtime/vulkan/CMakeLists.txt - -add_library(vulkan_runtime) -target_sources(vulkan_runtime - PRIVATE - runtime.cpp - ) -target_include_directories(vulkan_runtime - PRIVATE - ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include - ${PROJECT_SOURCE_DIR}/external/eigen - ${PROJECT_SOURCE_DIR}/external/FP16/include - ) diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index 8142029e8d4f9..f83f5e55e9c18 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -7,8 +7,8 @@ #include "tests/cpp/program/test_program.h" #include "taichi/aot/graph_data.h" #include "taichi/program/graph_builder.h" +#include "taichi/runtime/gfx/aot_module_loader_impl.h" #ifdef TI_WITH_VULKAN -#include "taichi/backends/vulkan/aot_module_loader_impl.h" #include "taichi/backends/device.h" #include "taichi/backends/vulkan/vulkan_device.h" #include "taichi/backends/vulkan/vulkan_device_creator.h" @@ -121,7 +121,7 @@ using namespace lang; #ifdef TI_WITH_VULKAN [[maybe_unused]] static void write_devalloc( - taichi::lang::vulkan::VkRuntime *vulkan_runtime, + taichi::lang::gfx::GfxRuntime *vulkan_runtime, taichi::lang::DeviceAllocation &alloc, const void *data, size_t size) { @@ -132,7 +132,7 @@ using namespace lang; } [[maybe_unused]] static void load_devalloc( - taichi::lang::vulkan::VkRuntime *vulkan_runtime, + taichi::lang::gfx::GfxRuntime *vulkan_runtime, 
taichi::lang::DeviceAllocation &alloc, void *data, size_t size) { @@ -169,14 +169,14 @@ TEST(AotSaveLoad, Vulkan) { std::make_unique(evd_params); // Create Vulkan runtime - vulkan::VkRuntime::Params params; + gfx::GfxRuntime::Params params; params.host_result_buffer = result_buffer; params.device = embedded_device->device(); auto vulkan_runtime = - std::make_unique(std::move(params)); + std::make_unique(std::move(params)); // Run AOT module loader - vulkan::AotModuleParams mod_params; + gfx::AotModuleParams mod_params; mod_params.module_path = "."; mod_params.runtime = vulkan_runtime.get(); @@ -242,14 +242,14 @@ TEST(AotSaveLoad, VulkanNdarray) { std::make_unique(evd_params); // Create Vulkan runtime - vulkan::VkRuntime::Params params; + gfx::GfxRuntime::Params params; params.host_result_buffer = result_buffer; params.device = embedded_device->device(); auto vulkan_runtime = - std::make_unique(std::move(params)); + std::make_unique(std::move(params)); // Run AOT module loader - vulkan::AotModuleParams mod_params; + gfx::AotModuleParams mod_params; mod_params.module_path = "."; mod_params.runtime = vulkan_runtime.get(); @@ -350,14 +350,14 @@ TEST(AotLoadGraph, Vulkan) { static_cast( embedded_device->device()); // Create Vulkan runtime - vulkan::VkRuntime::Params params; + gfx::GfxRuntime::Params params; params.host_result_buffer = result_buffer; params.device = device_; auto vulkan_runtime = - std::make_unique(std::move(params)); + std::make_unique(std::move(params)); // Run AOT module loader - vulkan::AotModuleParams mod_params; + gfx::AotModuleParams mod_params; mod_params.module_path = "."; mod_params.runtime = vulkan_runtime.get(); diff --git a/tests/cpp/backends/dx11_device_test.cpp b/tests/cpp/backends/dx11_device_test.cpp index a8584170ee75b..d35b62a015585 100644 --- a/tests/cpp/backends/dx11_device_test.cpp +++ b/tests/cpp/backends/dx11_device_test.cpp @@ -127,8 +127,8 @@ TEST(Dx11ProgramTest, MaterializeRuntimeTest) { /* This test needs allocate_memory 
because of the call stack here: Dx11ProgramImpl::materialize_runtime - - VkRuntime::VkRuntime - - VkRuntime::init_buffers + - GfxRuntime::GfxRuntime + - GfxRuntime::init_buffers - Dx11Device::allocate_memory_unique - Dx11Device::get_compute_stream - Dx11Stream::new_command_list diff --git a/tests/python/test_ad_basics.py b/tests/python/test_ad_basics.py index a870e643a6d9f..7d4191760baaa 100644 --- a/tests/python/test_ad_basics.py +++ b/tests/python/test_ad_basics.py @@ -90,7 +90,7 @@ def test_poly(tifunc): (lambda x: ti.asin(x), lambda x: np.arcsin(x)), ]) @if_has_autograd -@test_utils.test(exclude=[ti.vulkan]) +@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_trigonometric(tifunc, npfunc): grad_test(tifunc, npfunc) @@ -319,7 +319,7 @@ def work(): assert 'RandStmt not supported' in e.value.args[0] -@test_utils.test(exclude=[ti.cc, ti.vulkan, ti.opengl]) +@test_utils.test(exclude=[ti.cc, ti.vulkan, ti.opengl, ti.dx11]) def test_ad_frac(): @ti.func def frac(x): diff --git a/tests/python/test_clear_all_gradients.py b/tests/python/test_clear_all_gradients.py index f93e780494b4d..fbde26b3a740e 100644 --- a/tests/python/test_clear_all_gradients.py +++ b/tests/python/test_clear_all_gradients.py @@ -4,7 +4,7 @@ from tests import test_utils -@test_utils.test(exclude=[ti.vulkan]) +@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_clear_all_gradients(): x = ti.field(ti.f32) y = ti.field(ti.f32) diff --git a/tests/python/test_element_wise.py b/tests/python/test_element_wise.py index 642d7cb606e7f..cecbccc9fb6b6 100644 --- a/tests/python/test_element_wise.py +++ b/tests/python/test_element_wise.py @@ -11,7 +11,7 @@ def _c_mod(a, b): @pytest.mark.parametrize('lhs_is_mat,rhs_is_mat', [(True, True), (True, False), (False, True)]) -@test_utils.test(fast_math=False, exclude=[ti.vulkan]) +@test_utils.test(fast_math=False, exclude=[ti.vulkan, ti.dx11]) def test_binary_f(lhs_is_mat, rhs_is_mat): x = ti.Matrix.field(3, 2, ti.f32, 16) if lhs_is_mat: diff --git 
a/tests/python/test_f16.py b/tests/python/test_f16.py index 2cf9728b65b7d..b27526546ad6e 100644 --- a/tests/python/test_f16.py +++ b/tests/python/test_f16.py @@ -102,7 +102,7 @@ def init(): @pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') -@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan) +@test_utils.test(arch=archs_support_f16, exclude=[ti.vulkan, ti.dx11]) def test_to_paddle(): import paddle n = 16 @@ -123,7 +123,7 @@ def init(): @pytest.mark.skipif(not has_paddle(), reason='Paddle not installed.') -@test_utils.test(arch=archs_support_f16, exclude=ti.vulkan) +@test_utils.test(arch=archs_support_f16, exclude=[ti.vulkan, ti.dx11]) def test_from_paddle(): import paddle n = 16 diff --git a/tests/python/test_fields_builder.py b/tests/python/test_fields_builder.py index 590fc2986e27a..888f8773be12b 100644 --- a/tests/python/test_fields_builder.py +++ b/tests/python/test_fields_builder.py @@ -38,7 +38,7 @@ def assign_field_multiple(): assert x[i] == i -@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal]) +@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.dx11, ti.metal]) def test_fields_builder_dense(): shape = 5 fb1 = ti.FieldsBuilder() @@ -142,7 +142,7 @@ def assign_field_multiple_struct_for(): # See https://docs.taichi-lang.org/docs/type#primitive-types for more details. 
@pytest.mark.parametrize('test_1d_size', [1, 10, 100]) @pytest.mark.parametrize('field_type', [ti.f32, ti.i32]) -@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.metal]) +@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.dx11, ti.metal]) def test_fields_builder_destroy(test_1d_size, field_type): def test_for_single_destroy_multi_fields(): fb = ti.FieldsBuilder() @@ -180,7 +180,7 @@ def test_for_raise_destroy_twice(): c.destroy() -@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan]) +@test_utils.test(arch=[ti.cpu, ti.cuda, ti.vulkan, ti.dx11]) def test_field_initialize_zero(): fb0 = ti.FieldsBuilder() a = ti.field(ti.i32) diff --git a/tests/python/test_loop_grad.py b/tests/python/test_loop_grad.py index ab209613430a2..11482fb39384c 100644 --- a/tests/python/test_loop_grad.py +++ b/tests/python/test_loop_grad.py @@ -2,7 +2,7 @@ from tests import test_utils -@test_utils.test(exclude=[ti.vulkan]) +@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_loop_grad(): x = ti.field(ti.f32) @@ -32,7 +32,7 @@ def func(): assert x.grad[k, i] == 2**(m - 1 - i) -@test_utils.test(exclude=[ti.vulkan]) +@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_loop_grad_complex(): return # This case is not supported yet x = ti.field(ti.f32) diff --git a/tests/python/test_print.py b/tests/python/test_print.py index 3e62d19199378..e9f0943123755 100644 --- a/tests/python/test_print.py +++ b/tests/python/test_print.py @@ -23,7 +23,8 @@ def func(): # TODO: As described by @k-ye above, what we want to ensure # is that, the content shows on console is *correct*. 
-@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.vulkan, + ti.dx11]) # TODO(changyu): enable ti.vulkan def test_multi_print(): @ti.kernel def func(x: ti.i32, y: ti.f32): @@ -33,7 +34,8 @@ def func(x: ti.i32, y: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.vulkan, + ti.dx11]) # TODO(changyu): enable ti.vulkan def test_print_string(): @ti.kernel def func(x: ti.i32, y: ti.f32): @@ -45,7 +47,8 @@ def func(x: ti.i32, y: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.vulkan, + ti.dx11]) # TODO(changyu): enable ti.vulkan def test_print_matrix(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=()) y = ti.Vector.field(3, dtype=ti.f32, shape=3) @@ -61,7 +64,8 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.vulkan, + ti.dx11]) # TODO(changyu): enable ti.vulkan def test_print_sep_end(): @ti.kernel def func(): @@ -81,7 +85,8 @@ def func(): ti.sync() -@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.vulkan, + ti.dx11]) # TODO(changyu): enable ti.vulkan def test_print_multiple_threads(): x = ti.field(dtype=ti.f32, shape=(128, )) @@ -97,7 +102,8 @@ def func(k: ti.f32): ti.sync() -@test_utils.test(exclude=[ti.vulkan]) # TODO(changyu): enable ti.vulkan +@test_utils.test(exclude=[ti.vulkan, + ti.dx11]) # TODO(changyu): enable ti.vulkan def test_print_list(): x = ti.Matrix.field(2, 3, dtype=ti.f32, shape=(2, 3)) y = ti.Vector.field(3, dtype=ti.f32, shape=()) diff --git a/tests/python/test_reduction.py b/tests/python/test_reduction.py index 88e7f93d097f5..a1e62a741d769 100644 --- a/tests/python/test_reduction.py +++ b/tests/python/test_reduction.py @@ -33,8 +33,9 @@ def _test_reduction_single(dtype, criterion, op): N = 1024 * 1024 - if 
(ti.lang.impl.current_cfg().arch == ti.opengl or - ti.lang.impl.current_cfg().arch == ti.vulkan) and dtype == ti.f32: + if (ti.lang.impl.current_cfg().arch == ti.opengl + or ti.lang.impl.current_cfg().arch == ti.vulkan + or ti.lang.impl.current_cfg().arch == ti.dx11) and dtype == ti.f32: # OpenGL/Vulkan are not capable of such large number in its float32... N = 1024 * 16 diff --git a/tests/python/test_struct.py b/tests/python/test_struct.py index e6f7b240f91f6..e499f41a190b7 100644 --- a/tests/python/test_struct.py +++ b/tests/python/test_struct.py @@ -61,7 +61,7 @@ def test_linear_nested_aos(): assert y[i] == i + 123 -@test_utils.test(exclude=[ti.vulkan]) +@test_utils.test(exclude=[ti.vulkan, ti.dx11]) def test_2d_nested(): x = ti.field(ti.i32) diff --git a/tests/python/test_torch_io.py b/tests/python/test_torch_io.py index 1f05bfdcb1f3e..0c96605da431a 100644 --- a/tests/python/test_torch_io.py +++ b/tests/python/test_torch_io.py @@ -11,7 +11,7 @@ @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io_devices(): n = 32 x = ti.field(dtype=ti.i32, shape=n) @@ -48,7 +48,7 @@ def store(y: ti.types.ndarray()): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io(): n = 32 @@ -88,7 +88,7 @@ def backward(ctx, outp_grad): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io_2d(): n = 32 @@ -112,7 +112,7 @@ def forward(ctx, inp): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io_3d(): n = 16 @@ -138,7 +138,7 @@ def 
forward(ctx, inp): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io_simple(): n = 32 @@ -165,7 +165,7 @@ def test_io_simple(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io_zeros(): mat = ti.Matrix.field(2, 6, dtype=ti.f32, shape=(), needs_grad=True) zeros = torch.zeros((2, 6)) @@ -179,7 +179,7 @@ def test_io_zeros(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_io_struct(): n = 16 x1 = ti.Struct.field({"a": ti.i32, "b": ti.f32}, shape=(n, )) @@ -199,7 +199,7 @@ def test_io_struct(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_fused_kernels(): n = 12 X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n)) @@ -211,7 +211,7 @@ def test_fused_kernels(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_device(): n = 12 X = ti.Matrix.field(3, 2, ti.f32, shape=(n, n, n)) @@ -222,7 +222,7 @@ def test_device(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_shape_matrix(): n = 12 x = ti.Matrix.field(3, 2, ti.f32, shape=(n, n)) @@ -242,7 +242,7 @@ def test_shape_matrix(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, 
ti.vulkan, ti.dx11]) def test_shape_vector(): n = 12 x = ti.Vector.field(3, ti.f32, shape=(n, n)) @@ -261,7 +261,7 @@ def test_shape_vector(): @pytest.mark.skipif(not has_pytorch(), reason='Pytorch not installed.') -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_torch_zero(): @ti.kernel def test_torch(arr: ti.types.ndarray()): diff --git a/tests/python/test_types.py b/tests/python/test_types.py index 2d1cb785136be..893112ce546c8 100644 --- a/tests/python/test_types.py +++ b/tests/python/test_types.py @@ -20,13 +20,14 @@ def func(value: dt): @pytest.mark.parametrize('dt', _TI_TYPES) -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_type_assign_argument(dt): _test_type_assign_argument(dt) @pytest.mark.parametrize('dt', _TI_64_TYPES) -@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11], + require=ti.extension.data64) def test_type_assign_argument64(dt): _test_type_assign_argument(dt) @@ -52,13 +53,14 @@ def func(): @pytest.mark.parametrize('dt', _TI_TYPES) -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_type_operator(dt): _test_type_operator(dt) @pytest.mark.parametrize('dt', _TI_64_TYPES) -@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11], + require=ti.extension.data64) def test_type_operator64(dt): _test_type_operator(dt) @@ -77,13 +79,14 @@ def func(i: ti.i32, j: ti.i32): @pytest.mark.parametrize('dt', _TI_TYPES) -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_type_field(dt): _test_type_field(dt) @pytest.mark.parametrize('dt', _TI_64_TYPES) -@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64) 
+@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11], + require=ti.extension.data64) def test_type_field64(dt): _test_type_field(dt) @@ -119,7 +122,7 @@ def func(): (ti.i32, 32), (ti.u32, 32), ]) -@test_utils.test(exclude=[ti.opengl, ti.vulkan]) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11]) def test_overflow(dt, n): _test_overflow(dt, n) @@ -128,7 +131,8 @@ def test_overflow(dt, n): (ti.i64, 64), (ti.u64, 64), ]) -@test_utils.test(exclude=[ti.opengl, ti.vulkan], require=ti.extension.data64) +@test_utils.test(exclude=[ti.opengl, ti.vulkan, ti.dx11], + require=ti.extension.data64) def test_overflow64(dt, n): _test_overflow(dt, n) diff --git a/tests/test_utils.py b/tests/test_utils.py index 9f7d9babb0aa7..c42d230cb886a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -7,7 +7,7 @@ import pytest from taichi._lib import core as _ti_core -from taichi.lang import cc, cpu, cuda, gpu, metal, opengl, vulkan +from taichi.lang import cc, cpu, cuda, dx11, gpu, metal, opengl, vulkan from taichi.lang.misc import is_arch_supported import taichi as ti From 3983d8517ba680b83ab8c1fcb1ebfe97ef60c0f3 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Wed, 1 Jun 2022 18:15:36 +0800 Subject: [PATCH 142/176] [llvm] [aot] CUDA-AOT PR #1: Extracted common logics from CPUAotModuleImpl into LLVMAotModule (#5072) * [llvm] [aot] CUDA-AOT PR #1: Extracted common logics from CPUAotModuleImpl into LLVMAotModule * Renamed LLVMAotModule * Fixed minor issue --- .../backends/cpu/aot_module_loader_impl.cpp | 53 +++++-------------- taichi/llvm/llvm_aot_module_loader.cpp | 41 ++++++++++++++ taichi/llvm/llvm_aot_module_loader.h | 42 +++++++++++++++ 3 files changed, 95 insertions(+), 41 deletions(-) create mode 100644 taichi/llvm/llvm_aot_module_loader.cpp create mode 100644 taichi/llvm/llvm_aot_module_loader.h diff --git a/taichi/backends/cpu/aot_module_loader_impl.cpp b/taichi/backends/cpu/aot_module_loader_impl.cpp index 0026bc41e299d..fcd1761e20e99 100644 --- 
a/taichi/backends/cpu/aot_module_loader_impl.cpp +++ b/taichi/backends/cpu/aot_module_loader_impl.cpp @@ -1,4 +1,5 @@ #include "taichi/backends/cpu/aot_module_loader_impl.h" +#include "taichi/llvm/llvm_aot_module_loader.h" #include "taichi/llvm/llvm_offline_cache.h" #include "taichi/llvm/llvm_program.h" @@ -6,51 +7,23 @@ namespace taichi { namespace lang { -namespace cpu { namespace { -class KernelImpl : public aot::Kernel { - public: - explicit KernelImpl(FunctionType fn) : fn_(fn) { - } - - void launch(RuntimeContext *ctx) override { - fn_(*ctx); - } - - private: - FunctionType fn_; -}; - -class AotModuleImpl : public aot::Module { +class AotModuleImpl : public LlvmAotModule { public: - explicit AotModuleImpl(const AotModuleParams ¶ms) - : program_(params.program), - cache_reader_(LlvmOfflineCacheFileReader::make(params.module_path)) { - TI_ASSERT(program_ != nullptr); + explicit AotModuleImpl(const cpu::AotModuleParams ¶ms) + : LlvmAotModule(params.module_path, params.program) { } Arch arch() const override { return Arch::x64; } - uint64_t version() const override { - return 0; - } - - size_t get_root_size() const override { - return 0; - } - private: - std::unique_ptr make_new_kernel( - const std::string &name) override { - TI_ASSERT(cache_reader_ != nullptr); + FunctionType convert_module_to_function( + const std::string &name, + LlvmOfflineCache::KernelCacheData &&loaded) override { auto *tlctx = program_->get_llvm_context(program_->config->arch); - LlvmOfflineCache::KernelCacheData loaded; - auto ok = cache_reader_->get_kernel_cache( - loaded, name, *tlctx->get_this_thread_context()); - TI_ERROR_IF(!ok, "Failed to load kernel={}", name); const auto &tasks = loaded.offloaded_task_list; std::vector offloaded_tasks; @@ -62,11 +35,10 @@ class AotModuleImpl : public aot::Module { ot.grid_dim = t.grid_dim; offloaded_tasks.push_back(std::move(ot)); } + ModuleToFunctionConverter converter{tlctx, program_}; - auto fn = - converter.convert(name, loaded.args, 
std::move(loaded.owned_module), - std::move(offloaded_tasks)); - return std::make_unique(fn); + return converter.convert(name, loaded.args, std::move(loaded.owned_module), + std::move(offloaded_tasks)); } std::unique_ptr make_new_kernel_template( @@ -79,13 +51,12 @@ class AotModuleImpl : public aot::Module { TI_NOT_IMPLEMENTED; return nullptr; } - - LlvmProgramImpl *const program_{nullptr}; - std::unique_ptr cache_reader_{nullptr}; }; } // namespace +namespace cpu { + std::unique_ptr make_aot_module(std::any mod_params) { auto mod = std::make_unique( std::any_cast(mod_params)); diff --git a/taichi/llvm/llvm_aot_module_loader.cpp b/taichi/llvm/llvm_aot_module_loader.cpp new file mode 100644 index 0000000000000..5d725927388d7 --- /dev/null +++ b/taichi/llvm/llvm_aot_module_loader.cpp @@ -0,0 +1,41 @@ +#include "taichi/llvm/llvm_aot_module_loader.h" + +namespace taichi { +namespace lang { +namespace { + +class KernelImpl : public aot::Kernel { + public: + explicit KernelImpl(FunctionType fn) : fn_(fn) { + } + + void launch(RuntimeContext *ctx) override { + fn_(*ctx); + } + + private: + FunctionType fn_; +}; + +} // namespace + +LlvmOfflineCache::KernelCacheData LlvmAotModule::load_kernel_from_cache( + const std::string &name) { + TI_ASSERT(cache_reader_ != nullptr); + auto *tlctx = program_->get_llvm_context(program_->config->arch); + LlvmOfflineCache::KernelCacheData loaded; + auto ok = cache_reader_->get_kernel_cache(loaded, name, + *tlctx->get_this_thread_context()); + TI_ERROR_IF(!ok, "Failed to load kernel={}", name); + return loaded; +} + +std::unique_ptr LlvmAotModule::make_new_kernel( + const std::string &name) { + auto loaded = load_kernel_from_cache(name); + auto fn = convert_module_to_function(name, std::move(loaded)); + return std::make_unique(fn); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_aot_module_loader.h b/taichi/llvm/llvm_aot_module_loader.h new file mode 100644 index 0000000000000..7056eba37d212 --- /dev/null +++ 
b/taichi/llvm/llvm_aot_module_loader.h @@ -0,0 +1,42 @@ +#pragma once + +#include "taichi/aot/module_loader.h" +#include "taichi/llvm/llvm_program.h" + +namespace taichi { +namespace lang { + +class LlvmAotModule : public aot::Module { + public: + explicit LlvmAotModule(const std::string &module_path, + LlvmProgramImpl *program) + : program_(program), + cache_reader_(LlvmOfflineCacheFileReader::make(module_path)) { + TI_ASSERT(program_ != nullptr); + } + + uint64_t version() const override { + return 0; + } + + size_t get_root_size() const override { + return 0; + } + + protected: + virtual FunctionType convert_module_to_function( + const std::string &name, + LlvmOfflineCache::KernelCacheData &&loaded) = 0; + + LlvmOfflineCache::KernelCacheData load_kernel_from_cache( + const std::string &name); + + std::unique_ptr make_new_kernel( + const std::string &name) override; + + LlvmProgramImpl *const program_{nullptr}; + std::unique_ptr cache_reader_{nullptr}; +}; + +} // namespace lang +} // namespace taichi From 41c9736c8e42afd4a6d8441a7c34fdf329d1996f Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Thu, 2 Jun 2022 09:14:53 +0800 Subject: [PATCH 143/176] [llvm] [refactor] Merge AtomicOpStmt codegen in CPU and CUDA backends (#5086) * [llvm] [refactor] Merge AtomicOpStmt codegen in CodeGenLLVMCUDA and CodeGenLLVM * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/backends/cuda/codegen_cuda.cpp | 117 +-------------- taichi/codegen/codegen_llvm.cpp | 199 ++++++++++++++------------ taichi/codegen/codegen_llvm.h | 18 ++- 3 files changed, 120 insertions(+), 214 deletions(-) diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp index 83e748fc99673..8715a6786172b 100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -191,7 +191,7 @@ class 
CodeGenLLVMCUDA : public CodeGenLLVM { // Not all reduction statements can be optimized. // If the operation cannot be optimized, this function returns nullptr. - llvm::Value *optimized_reduction(AtomicOpStmt *stmt) { + llvm::Value *optimized_reduction(AtomicOpStmt *stmt) override { if (!stmt->is_reduction) { return nullptr; } @@ -227,39 +227,6 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { {llvm_val[stmt->dest], llvm_val[stmt->val]}); } - llvm::Value *custom_type_atomic(AtomicOpStmt *stmt) { - if (stmt->op_type != AtomicOpType::add) { - return nullptr; - } - - auto dst_type = stmt->dest->ret_type->as()->get_pointee_type(); - if (auto cit = dst_type->cast()) { - return atomic_add_custom_int(stmt, cit); - } else if (auto cft = dst_type->cast()) { - return atomic_add_custom_float(stmt, cft); - } else { - return nullptr; - } - } - - llvm::Value *integral_type_atomic(AtomicOpStmt *stmt) { - if (!is_integral(stmt->val->ret_type)) { - return nullptr; - } - std::unordered_map bin_op; - bin_op[AtomicOpType::add] = llvm::AtomicRMWInst::BinOp::Add; - bin_op[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::Min; - bin_op[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::Max; - - bin_op[AtomicOpType::bit_and] = llvm::AtomicRMWInst::BinOp::And; - bin_op[AtomicOpType::bit_or] = llvm::AtomicRMWInst::BinOp::Or; - bin_op[AtomicOpType::bit_xor] = llvm::AtomicRMWInst::BinOp::Xor; - TI_ASSERT(bin_op.find(stmt->op_type) != bin_op.end()); - return builder->CreateAtomicRMW( - bin_op.at(stmt->op_type), llvm_val[stmt->dest], llvm_val[stmt->val], - llvm::AtomicOrdering::SequentiallyConsistent); - } - // A huge hack for supporting f16 atomic add/max/min! 
Borrowed from // https://github.com/tensorflow/tensorflow/blob/470d58a83470f8ede3beaa584e6992bc71b7baa6/tensorflow/compiler/xla/service/gpu/ir_emitter.cc#L378-L490 // The reason is that LLVM10 does not support generating atomicCAS for f16 on @@ -311,7 +278,7 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { llvm::Value *atomic_op_using_cas( llvm::Value *output_address, llvm::Value *val, - std::function op) { + std::function op) override { llvm::PointerType *output_address_type = llvm::dyn_cast(output_address->getType()); TI_ASSERT(output_address_type != nullptr); @@ -406,86 +373,6 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { return output_address; } - llvm::Value *real_or_unsigned_type_atomic(AtomicOpStmt *stmt) { - if (!stmt->val->ret_type->is()) { - return nullptr; - } - AtomicOpType op = stmt->op_type; - if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f16)) { - switch (op) { - case AtomicOpType::add: - return atomic_op_using_cas( - llvm_val[stmt->dest], llvm_val[stmt->val], - [&](auto v1, auto v2) { return builder->CreateFAdd(v1, v2); }); - case AtomicOpType::max: - return atomic_op_using_cas( - llvm_val[stmt->dest], llvm_val[stmt->val], - [&](auto v1, auto v2) { return builder->CreateMaxNum(v1, v2); }); - case AtomicOpType::min: - return atomic_op_using_cas( - llvm_val[stmt->dest], llvm_val[stmt->val], - [&](auto v1, auto v2) { return builder->CreateMinNum(v1, v2); }); - default: - break; - } - } - - PrimitiveTypeID prim_type = - stmt->val->ret_type->cast()->type; - - std::unordered_map> - atomics; - - atomics[PrimitiveTypeID::f32][AtomicOpType::add] = "atomic_add_f32"; - atomics[PrimitiveTypeID::f64][AtomicOpType::add] = "atomic_add_f64"; - atomics[PrimitiveTypeID::f32][AtomicOpType::min] = "atomic_min_f32"; - atomics[PrimitiveTypeID::f64][AtomicOpType::min] = "atomic_min_f64"; - atomics[PrimitiveTypeID::f32][AtomicOpType::max] = "atomic_max_f32"; - atomics[PrimitiveTypeID::f64][AtomicOpType::max] = "atomic_max_f64"; - 
atomics[PrimitiveTypeID::u32][AtomicOpType::min] = "atomic_min_u32"; - atomics[PrimitiveTypeID::u64][AtomicOpType::min] = "atomic_min_u64"; - atomics[PrimitiveTypeID::u32][AtomicOpType::max] = "atomic_max_u32"; - atomics[PrimitiveTypeID::u64][AtomicOpType::max] = "atomic_max_u64"; - - if (atomics.find(prim_type) == atomics.end()) { - return nullptr; - } - if (is_integral(stmt->val->ret_type) && - atomics.at(prim_type).find(op) == atomics.at(prim_type).end()) { - return nullptr; - } - TI_ASSERT(atomics.at(prim_type).find(op) != atomics.at(prim_type).end()); - - return create_call(atomics.at(prim_type).at(op), - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } - - void visit(AtomicOpStmt *stmt) override { - // https://llvm.org/docs/NVPTXUsage.html#address-spaces - bool is_local = stmt->dest->is(); - if (is_local) { - TI_ERROR("Local atomics should have been demoted."); - } - TI_ASSERT(stmt->width() == 1); - for (int l = 0; l < stmt->width(); l++) { - llvm::Value *old_value; - - if (llvm::Value *result = optimized_reduction(stmt)) { - old_value = result; - } else if (llvm::Value *result = custom_type_atomic(stmt)) { - old_value = result; - } else if (llvm::Value *result = real_or_unsigned_type_atomic(stmt)) { - old_value = result; - } else if (llvm::Value *result = integral_type_atomic(stmt)) { - old_value = result; - } else { - TI_NOT_IMPLEMENTED - } - llvm_val[stmt] = old_value; - } - } - void visit(RangeForStmt *for_stmt) override { create_naive_range_for(for_stmt); } diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 6449b8a378b30..306b0c863b29a 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1207,6 +1207,44 @@ void CodeGenLLVM::visit(SNodeOpStmt *stmt) { } } +llvm::Value *CodeGenLLVM::optimized_reduction(AtomicOpStmt *stmt) { + return nullptr; +} + +llvm::Value *CodeGenLLVM::custom_type_atomic(AtomicOpStmt *stmt) { + // TODO(type): support all AtomicOpTypes on custom types + if 
(stmt->op_type != AtomicOpType::add) { + return nullptr; + } + + auto dst_type = stmt->dest->ret_type->as()->get_pointee_type(); + if (auto cit = dst_type->cast()) { + return atomic_add_custom_int(stmt, cit); + } else if (auto cft = dst_type->cast()) { + return atomic_add_custom_float(stmt, cft); + } else { + return nullptr; + } +} + +llvm::Value *CodeGenLLVM::integral_type_atomic(AtomicOpStmt *stmt) { + if (!is_integral(stmt->val->ret_type)) { + return nullptr; + } + std::unordered_map bin_op; + bin_op[AtomicOpType::add] = llvm::AtomicRMWInst::BinOp::Add; + bin_op[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::Min; + bin_op[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::Max; + + bin_op[AtomicOpType::bit_and] = llvm::AtomicRMWInst::BinOp::And; + bin_op[AtomicOpType::bit_or] = llvm::AtomicRMWInst::BinOp::Or; + bin_op[AtomicOpType::bit_xor] = llvm::AtomicRMWInst::BinOp::Xor; + TI_ASSERT(bin_op.find(stmt->op_type) != bin_op.end()); + return builder->CreateAtomicRMW(bin_op.at(stmt->op_type), + llvm_val[stmt->dest], llvm_val[stmt->val], + llvm::AtomicOrdering::SequentiallyConsistent); +} + llvm::Value *CodeGenLLVM::atomic_op_using_cas( llvm::Value *dest, llvm::Value *val, @@ -1242,104 +1280,77 @@ llvm::Value *CodeGenLLVM::atomic_op_using_cas( return old_val; } +llvm::Value *CodeGenLLVM::real_or_unsigned_type_atomic(AtomicOpStmt *stmt) { + if (!stmt->val->ret_type->is()) { + return nullptr; + } + AtomicOpType op = stmt->op_type; + if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f16)) { + switch (op) { + case AtomicOpType::add: + return atomic_op_using_cas( + llvm_val[stmt->dest], llvm_val[stmt->val], + [&](auto v1, auto v2) { return builder->CreateFAdd(v1, v2); }); + case AtomicOpType::max: + return atomic_op_using_cas( + llvm_val[stmt->dest], llvm_val[stmt->val], + [&](auto v1, auto v2) { return builder->CreateMaxNum(v1, v2); }); + case AtomicOpType::min: + return atomic_op_using_cas( + llvm_val[stmt->dest], llvm_val[stmt->val], + [&](auto v1, auto v2) { 
return builder->CreateMinNum(v1, v2); }); + default: + break; + } + } + + PrimitiveTypeID prim_type = stmt->val->ret_type->cast()->type; + + std::unordered_map> + atomics; + + atomics[PrimitiveTypeID::f32][AtomicOpType::add] = "atomic_add_f32"; + atomics[PrimitiveTypeID::f64][AtomicOpType::add] = "atomic_add_f64"; + atomics[PrimitiveTypeID::f32][AtomicOpType::min] = "atomic_min_f32"; + atomics[PrimitiveTypeID::f64][AtomicOpType::min] = "atomic_min_f64"; + atomics[PrimitiveTypeID::f32][AtomicOpType::max] = "atomic_max_f32"; + atomics[PrimitiveTypeID::f64][AtomicOpType::max] = "atomic_max_f64"; + atomics[PrimitiveTypeID::u32][AtomicOpType::min] = "atomic_min_u32"; + atomics[PrimitiveTypeID::u64][AtomicOpType::min] = "atomic_min_u64"; + atomics[PrimitiveTypeID::u32][AtomicOpType::max] = "atomic_max_u32"; + atomics[PrimitiveTypeID::u64][AtomicOpType::max] = "atomic_max_u64"; + + if (atomics.find(prim_type) == atomics.end()) { + return nullptr; + } + if (is_integral(stmt->val->ret_type) && + atomics.at(prim_type).find(op) == atomics.at(prim_type).end()) { + return nullptr; + } + TI_ASSERT(atomics.at(prim_type).find(op) != atomics.at(prim_type).end()); + + return create_call(atomics.at(prim_type).at(op), + {llvm_val[stmt->dest], llvm_val[stmt->val]}); +} + void CodeGenLLVM::visit(AtomicOpStmt *stmt) { - // auto mask = stmt->parent->mask(); - // TODO: deal with mask when vectorized - // TODO(type): support all AtomicOpTypes on custom types + bool is_local = stmt->dest->is(); + if (is_local) { + TI_ERROR("Local atomics should have been demoted."); + } TI_ASSERT(stmt->width() == 1); for (int l = 0; l < stmt->width(); l++) { llvm::Value *old_value; - if (stmt->op_type == AtomicOpType::add) { - auto dst_type = - stmt->dest->ret_type->as()->get_pointee_type(); - if (dst_type->is() && is_integral(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::Add, llvm_val[stmt->dest], - llvm_val[stmt->val], 
llvm::AtomicOrdering::SequentiallyConsistent); - } else if (!dst_type->is() && - is_real(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::FAdd, llvm_val[stmt->dest], - llvm_val[stmt->val], llvm::AtomicOrdering::SequentiallyConsistent); - } else if (auto cit = dst_type->cast()) { - old_value = atomic_add_custom_int(stmt, cit); - } else if (auto cft = dst_type->cast()) { - old_value = atomic_add_custom_float(stmt, cft); - } else { - TI_NOT_IMPLEMENTED - } - } else if (stmt->op_type == AtomicOpType::min) { - if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::u32)) { - old_value = create_call("atomic_min_u32", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::u64)) { - old_value = create_call("atomic_min_u64", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else if (is_integral(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::Min, llvm_val[stmt->dest], - llvm_val[stmt->val], llvm::AtomicOrdering::SequentiallyConsistent); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f16)) { - old_value = atomic_op_using_cas( - llvm_val[stmt->dest], llvm_val[stmt->val], - [&](auto v1, auto v2) { return builder->CreateMinNum(v1, v2); }); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f32)) { - old_value = create_call("atomic_min_f32", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f64)) { - old_value = create_call("atomic_min_f64", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else { - TI_NOT_IMPLEMENTED - } - } else if (stmt->op_type == AtomicOpType::max) { - if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::u32)) { - old_value = create_call("atomic_max_u32", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::u64)) { - old_value = create_call("atomic_max_u64", 
- {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else if (is_integral(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::Max, llvm_val[stmt->dest], - llvm_val[stmt->val], llvm::AtomicOrdering::SequentiallyConsistent); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f16)) { - old_value = atomic_op_using_cas( - llvm_val[stmt->dest], llvm_val[stmt->val], - [&](auto v1, auto v2) { return builder->CreateMaxNum(v1, v2); }); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f32)) { - old_value = create_call("atomic_max_f32", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f64)) { - old_value = create_call("atomic_max_f64", - {llvm_val[stmt->dest], llvm_val[stmt->val]}); - } else { - TI_NOT_IMPLEMENTED - } - } else if (stmt->op_type == AtomicOpType::bit_and) { - if (is_integral(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::And, llvm_val[stmt->dest], - llvm_val[stmt->val], llvm::AtomicOrdering::SequentiallyConsistent); - } else { - TI_NOT_IMPLEMENTED - } - } else if (stmt->op_type == AtomicOpType::bit_or) { - if (is_integral(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::Or, llvm_val[stmt->dest], - llvm_val[stmt->val], llvm::AtomicOrdering::SequentiallyConsistent); - } else { - TI_NOT_IMPLEMENTED - } - } else if (stmt->op_type == AtomicOpType::bit_xor) { - if (is_integral(stmt->val->ret_type)) { - old_value = builder->CreateAtomicRMW( - llvm::AtomicRMWInst::BinOp::Xor, llvm_val[stmt->dest], - llvm_val[stmt->val], llvm::AtomicOrdering::SequentiallyConsistent); - } else { - TI_NOT_IMPLEMENTED - } + + if (llvm::Value *result = optimized_reduction(stmt)) { + old_value = result; + } else if (llvm::Value *result = custom_type_atomic(stmt)) { + old_value = result; + } else if (llvm::Value *result = real_or_unsigned_type_atomic(stmt)) { + old_value = 
result; + } else if (llvm::Value *result = integral_type_atomic(stmt)) { + old_value = result; } else { TI_NOT_IMPLEMENTED } diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 583acd5e80245..6bc5c15c26229 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -230,6 +230,19 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { CustomIntType *cit, llvm::Value *real); + virtual llvm::Value *optimized_reduction(AtomicOpStmt *stmt); + + virtual llvm::Value *custom_type_atomic(AtomicOpStmt *stmt); + + virtual llvm::Value *integral_type_atomic(AtomicOpStmt *stmt); + + virtual llvm::Value *atomic_op_using_cas( + llvm::Value *output_address, + llvm::Value *val, + std::function op); + + virtual llvm::Value *real_or_unsigned_type_atomic(AtomicOpStmt *stmt); + void visit(AtomicOpStmt *stmt) override; void visit(GlobalPtrStmt *stmt) override; @@ -392,11 +405,6 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { llvm::Value *get_exponent_offset(llvm::Value *exponent, CustomFloatType *cft); - llvm::Value *atomic_op_using_cas( - llvm::Value *dest, - llvm::Value *val, - std::function op); - void visit(FuncCallStmt *stmt) override; llvm::Value *bitcast_from_u64(llvm::Value *val, DataType type); From 4c42fc991bc4fdad773539ec97a6ea20092a09b9 Mon Sep 17 00:00:00 2001 From: Ailing Date: Thu, 2 Jun 2022 10:43:46 +0800 Subject: [PATCH 144/176] [refactor] Make sure Ndarray shape is field shape (#5085) --- python/taichi/lang/_ndarray.py | 12 ++++--- python/taichi/lang/matrix.py | 4 +-- taichi/program/kernel.cpp | 13 +++++-- taichi/program/ndarray.cpp | 16 ++++----- taichi/program/ndarray.h | 6 +++- taichi/program/ndarray_rw_accessors_bank.cpp | 36 +++++++++++--------- taichi/program/ndarray_rw_accessors_bank.h | 2 +- taichi/program/program.cpp | 4 +-- taichi/python/export_lang.cpp | 1 + tests/python/test_ndarray.py | 4 +-- 10 files changed, 57 insertions(+), 41 deletions(-) diff --git 
a/python/taichi/lang/_ndarray.py b/python/taichi/lang/_ndarray.py index 7c0298ac37935..85e676d6caaaa 100644 --- a/python/taichi/lang/_ndarray.py +++ b/python/taichi/lang/_ndarray.py @@ -81,7 +81,8 @@ def _ndarray_to_numpy(self): Returns: numpy.ndarray: The result numpy array. """ - arr = np.zeros(shape=self.arr.shape, dtype=to_numpy_type(self.dtype)) + arr = np.zeros(shape=self.arr.total_shape(), + dtype=to_numpy_type(self.dtype)) from taichi._kernels import ndarray_to_ext_arr # pylint: disable=C0415 ndarray_to_ext_arr(self, arr) impl.get_runtime().sync() @@ -93,7 +94,8 @@ def _ndarray_matrix_to_numpy(self, layout, as_vector): Returns: numpy.ndarray: The result numpy array. """ - arr = np.zeros(shape=self.arr.shape, dtype=to_numpy_type(self.dtype)) + arr = np.zeros(shape=self.arr.total_shape(), + dtype=to_numpy_type(self.dtype)) from taichi._kernels import \ ndarray_matrix_to_ext_arr # pylint: disable=C0415 layout_is_aos = 1 if layout == Layout.AOS else 0 @@ -109,7 +111,7 @@ def _ndarray_from_numpy(self, arr): """ if not isinstance(arr, np.ndarray): raise TypeError(f"{np.ndarray} expected, but {type(arr)} provided") - if tuple(self.arr.shape) != tuple(arr.shape): + if tuple(self.arr.total_shape()) != tuple(arr.shape): raise ValueError( f"Mismatch shape: {tuple(self.arr.shape)} expected, but {tuple(arr.shape)} provided" ) @@ -128,7 +130,7 @@ def _ndarray_matrix_from_numpy(self, arr, layout, as_vector): """ if not isinstance(arr, np.ndarray): raise TypeError(f"{np.ndarray} expected, but {type(arr)} provided") - if tuple(self.arr.shape) != tuple(arr.shape): + if tuple(self.arr.total_shape()) != tuple(arr.shape): raise ValueError( f"Mismatch shape: {tuple(self.arr.shape)} expected, but {tuple(arr.shape)} provided" ) @@ -195,7 +197,7 @@ def _pad_key(self, key): key = () if not isinstance(key, (tuple, list)): key = (key, ) - assert len(key) == len(self.arr.shape) + assert len(key) == len(self.arr.total_shape()) return key def _initialize_host_accessor(self): diff --git 
a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 39d9820eb6eef..adc69777c6092 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1693,7 +1693,7 @@ def __init__(self, n, m, dtype, shape, layout): super().__init__() self.dtype = cook_dtype(dtype) self.layout = layout - self.shape = shape + self.shape = tuple(shape) self.element_type = TensorType((self.n, self.m), self.dtype) # TODO: we should pass in element_type, shape, layout instead. self.arr = impl.get_runtime().prog.create_ndarray( @@ -1790,7 +1790,7 @@ def __init__(self, n, dtype, shape, layout): super().__init__() self.dtype = cook_dtype(dtype) self.layout = layout - self.shape = shape + self.shape = tuple(shape) self.element_type = TensorType((n, ), self.dtype) # TODO: pass in element_type, shape, layout directly self.arr = impl.get_runtime().prog.create_ndarray( diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 3b12af94ab526..7148650722e78 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -269,8 +269,17 @@ void Kernel::LaunchContextBuilder::set_arg_ndarray(int arg_id, /*is_device_allocation=*/true); TI_ASSERT_INFO(arr.shape.size() <= taichi_max_num_indices, "External array cannot have > {max_num_indices} indices"); - for (uint64 i = 0; i < arr.shape.size(); ++i) { - this->set_extra_arg_int(arg_id, i, arr.shape[i]); + // TODO: Update the codegen so that we don't reserve slots for element_shape + // in extra_args, especially in SOA case. 
+ if (arr.layout == ExternalArrayLayout::kAOS) { + for (uint64 i = 0; i < arr.shape.size(); ++i) { + this->set_extra_arg_int(arg_id, i, arr.shape[i]); + } + } else { + auto element_dim = arr.element_shape.size(); + for (uint64 i = element_dim; i < arr.total_shape().size(); ++i) { + this->set_extra_arg_int(arg_id, i, arr.shape[i - element_dim]); + } } } diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index 2e6876dfb9676..a158c56526f88 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -31,18 +31,17 @@ Ndarray::Ndarray(Program *prog, std::multiplies<>())), prog_(prog), rw_accessors_bank_(&prog->get_ndarray_rw_accessors_bank()) { - // TODO: Instead of flattening the element, shape/nelement_/num_active_indices - // should refer to field shape only. - // The only blocker left is the accessors should handle vector/matrix as well - // instead of scalar only. + // Now that we have two shapes which may be concatenated differently + // depending on layout, total_shape_ comes handy. 
+ total_shape_ = shape; if (layout == ExternalArrayLayout::kAOS) { - shape.insert(shape.end(), element_shape.begin(), element_shape.end()); + total_shape_.insert(total_shape_.end(), element_shape.begin(), + element_shape.end()); } else if (layout == ExternalArrayLayout::kSOA) { - shape.insert(shape.begin(), element_shape.begin(), element_shape.end()); + total_shape_.insert(total_shape_.begin(), element_shape.begin(), + element_shape.end()); } - num_active_indices = shape.size(); - ndarray_alloc_ = prog->allocate_memory_ndarray(nelement_ * element_size_, prog->result_buffer); } @@ -53,7 +52,6 @@ Ndarray::Ndarray(DeviceAllocation &devalloc, : ndarray_alloc_(devalloc), dtype(type), shape(shape), - num_active_indices(shape.size()), nelement_(std::accumulate(std::begin(shape), std::end(shape), 1, diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index 2e4320be77c8d..2ff2e70b34683 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -40,7 +40,6 @@ class TI_DLL_EXPORT Ndarray { // num_active_indices = shape.size() std::vector shape; ExternalArrayLayout layout{ExternalArrayLayout::kNull}; - int num_active_indices{0}; intptr_t get_data_ptr_as_int() const; intptr_t get_device_allocation_ptr_as_int() const; @@ -51,11 +50,16 @@ class TI_DLL_EXPORT Ndarray { float64 read_float(const std::vector &i); void write_int(const std::vector &i, int64 val); void write_float(const std::vector &i, float64 val); + + const std::vector &total_shape() const { + return total_shape_; + } ~Ndarray(); private: std::size_t nelement_{1}; std::size_t element_size_{1}; + std::vector total_shape_; Program *prog_{nullptr}; // TODO: maybe remove these? 
diff --git a/taichi/program/ndarray_rw_accessors_bank.cpp b/taichi/program/ndarray_rw_accessors_bank.cpp index 1d5078f9dbd9f..da7ae39cbf380 100644 --- a/taichi/program/ndarray_rw_accessors_bank.cpp +++ b/taichi/program/ndarray_rw_accessors_bank.cpp @@ -15,15 +15,17 @@ void set_kernel_args(const std::vector &I, void set_kernel_extra_args(const Ndarray *ndarray, int arg_id, Kernel::LaunchContextBuilder *launch_ctx) { - for (int i = 0; i < ndarray->num_active_indices; i++) { - launch_ctx->set_extra_arg_int(arg_id, i, ndarray->shape[i]); + // accessor kernels are special as they use element_shape as runtime + // information so it's required to use total_shape here. + for (int i = 0; i < ndarray->total_shape().size(); ++i) { + launch_ctx->set_extra_arg_int(arg_id, i, ndarray->total_shape()[i]); } } } // namespace NdarrayRwAccessorsBank::Accessors NdarrayRwAccessorsBank::get( Ndarray *ndarray) { - NdarrayRwKeys keys{ndarray->num_active_indices, ndarray->dtype}; + NdarrayRwKeys keys{ndarray->total_shape().size(), ndarray->dtype}; if (ndarray_to_kernels_.find(keys) == ndarray_to_kernels_.end()) { ndarray_to_kernels_[keys] = {&(program_->get_ndarray_reader(ndarray)), &(program_->get_ndarray_writer(ndarray))}; @@ -45,14 +47,14 @@ NdarrayRwAccessorsBank::Accessors::Accessors(const Ndarray *ndarray, void NdarrayRwAccessorsBank::Accessors::write_float(const std::vector &I, float64 val) { auto launch_ctx = writer_->make_launch_context(); - set_kernel_args(I, ndarray_->num_active_indices, &launch_ctx); - launch_ctx.set_arg_float(ndarray_->num_active_indices, val); + set_kernel_args(I, ndarray_->total_shape().size(), &launch_ctx); + launch_ctx.set_arg_float(ndarray_->total_shape().size(), val); launch_ctx.set_arg_external_array( - ndarray_->num_active_indices + 1, + ndarray_->total_shape().size() + 1, ndarray_->get_device_allocation_ptr_as_int(), ndarray_->get_nelement() * ndarray_->get_element_size(), /*is_device_allocation=*/true); - set_kernel_extra_args(ndarray_, 
ndarray_->num_active_indices + 1, + set_kernel_extra_args(ndarray_, ndarray_->total_shape().size() + 1, &launch_ctx); prog_->synchronize(); (*writer_)(launch_ctx); @@ -62,13 +64,13 @@ float64 NdarrayRwAccessorsBank::Accessors::read_float( const std::vector &I) { prog_->synchronize(); auto launch_ctx = reader_->make_launch_context(); - set_kernel_args(I, ndarray_->num_active_indices, &launch_ctx); + set_kernel_args(I, ndarray_->total_shape().size(), &launch_ctx); launch_ctx.set_arg_external_array( - ndarray_->num_active_indices, + ndarray_->total_shape().size(), ndarray_->get_device_allocation_ptr_as_int(), ndarray_->get_nelement() * ndarray_->get_element_size(), /*is_device_allocation=*/true); - set_kernel_extra_args(ndarray_, ndarray_->num_active_indices, &launch_ctx); + set_kernel_extra_args(ndarray_, ndarray_->total_shape().size(), &launch_ctx); (*reader_)(launch_ctx); prog_->synchronize(); auto ret = reader_->get_ret_float(0); @@ -79,14 +81,14 @@ float64 NdarrayRwAccessorsBank::Accessors::read_float( void NdarrayRwAccessorsBank::Accessors::write_int(const std::vector &I, int64 val) { auto launch_ctx = writer_->make_launch_context(); - set_kernel_args(I, ndarray_->num_active_indices, &launch_ctx); - launch_ctx.set_arg_int(ndarray_->num_active_indices, val); + set_kernel_args(I, ndarray_->total_shape().size(), &launch_ctx); + launch_ctx.set_arg_int(ndarray_->total_shape().size(), val); launch_ctx.set_arg_external_array( - ndarray_->num_active_indices + 1, + ndarray_->total_shape().size() + 1, ndarray_->get_device_allocation_ptr_as_int(), ndarray_->get_nelement() * ndarray_->get_element_size(), /*is_device_allocation=*/true); - set_kernel_extra_args(ndarray_, ndarray_->num_active_indices + 1, + set_kernel_extra_args(ndarray_, ndarray_->total_shape().size() + 1, &launch_ctx); prog_->synchronize(); (*writer_)(launch_ctx); @@ -95,13 +97,13 @@ void NdarrayRwAccessorsBank::Accessors::write_int(const std::vector &I, int64 
NdarrayRwAccessorsBank::Accessors::read_int(const std::vector &I) { prog_->synchronize(); auto launch_ctx = reader_->make_launch_context(); - set_kernel_args(I, ndarray_->num_active_indices, &launch_ctx); + set_kernel_args(I, ndarray_->total_shape().size(), &launch_ctx); launch_ctx.set_arg_external_array( - ndarray_->num_active_indices, + ndarray_->total_shape().size(), ndarray_->get_device_allocation_ptr_as_int(), ndarray_->get_nelement() * ndarray_->get_element_size(), /*is_device_allocation=*/true); - set_kernel_extra_args(ndarray_, ndarray_->num_active_indices, &launch_ctx); + set_kernel_extra_args(ndarray_, ndarray_->total_shape().size(), &launch_ctx); (*reader_)(launch_ctx); prog_->synchronize(); auto ret = reader_->get_ret_int(0); diff --git a/taichi/program/ndarray_rw_accessors_bank.h b/taichi/program/ndarray_rw_accessors_bank.h index 567859bbf8e09..b278b35d646b3 100644 --- a/taichi/program/ndarray_rw_accessors_bank.h +++ b/taichi/program/ndarray_rw_accessors_bank.h @@ -25,7 +25,7 @@ class Ndarray; * in get_ndarray_reader/writer in program.cpp. 
*/ struct NdarrayRwKeys { - int num_active_indices; + size_t num_active_indices; DataType dtype; struct Hasher { diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 10603a6f7bbef..994fcbfbdf351 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -408,7 +408,7 @@ Kernel &Program::get_snode_writer(SNode *snode) { Kernel &Program::get_ndarray_reader(Ndarray *ndarray) { auto kernel_name = fmt::format("ndarray_reader_{}", ndarray_reader_counter_++); - NdarrayRwKeys keys{ndarray->num_active_indices, ndarray->dtype}; + NdarrayRwKeys keys{ndarray->total_shape().size(), ndarray->dtype}; auto &ker = kernel([keys, this] { ExprGroup indices; for (int i = 0; i < keys.num_active_indices; i++) { @@ -434,7 +434,7 @@ Kernel &Program::get_ndarray_reader(Ndarray *ndarray) { Kernel &Program::get_ndarray_writer(Ndarray *ndarray) { auto kernel_name = fmt::format("ndarray_writer_{}", ndarray_writer_counter_++); - NdarrayRwKeys keys{ndarray->num_active_indices, ndarray->dtype}; + NdarrayRwKeys keys{ndarray->total_shape().size(), ndarray->dtype}; auto &ker = kernel([keys, this] { ExprGroup indices; for (int i = 0; i < keys.num_active_indices; i++) { diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index e2c53c5077071..768bbf95bfd86 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -548,6 +548,7 @@ void export_lang(py::module &m) { .def("read_float", &Ndarray::read_float) .def("write_int", &Ndarray::write_int) .def("write_float", &Ndarray::write_float) + .def("total_shape", &Ndarray::total_shape) .def_readonly("dtype", &Ndarray::dtype) .def_readonly("element_shape", &Ndarray::element_shape) .def_readonly("shape", &Ndarray::shape); diff --git a/tests/python/test_ndarray.py b/tests/python/test_ndarray.py index a4861d4b88cd6..b91eeccf432e5 100644 --- a/tests/python/test_ndarray.py +++ b/tests/python/test_ndarray.py @@ -303,13 +303,13 @@ def test_ndarray_fill(): assert (a.to_numpy() == 
anp).all() b = ti.Vector.ndarray(4, ti.f32, shape=(n)) - bnp = np.ones(shape=b.arr.shape, dtype=np.float32) + bnp = np.ones(shape=b.arr.total_shape(), dtype=np.float32) b.fill(2.5) bnp.fill(2.5) assert (b.to_numpy() == bnp).all() c = ti.Matrix.ndarray(4, 4, ti.f32, shape=(n)) - cnp = np.ones(shape=c.arr.shape, dtype=np.float32) + cnp = np.ones(shape=c.arr.total_shape(), dtype=np.float32) c.fill(1.5) cnp.fill(1.5) assert (c.to_numpy() == cnp).all() From a645b99b56486ededda7d1ba221d678bd4a08fc3 Mon Sep 17 00:00:00 2001 From: Mingrui Zhang <33411325+erizmr@users.noreply.github.com> Date: Thu, 2 Jun 2022 10:58:15 +0800 Subject: [PATCH 145/176] [autodiff] Allocate dual and adjoint snode (#5083) * allocate dual and decouple grad and adjoint * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update * update * update the adjoint name * fix matrix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * recover the grad name Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- cpp_examples/autograd.cpp | 30 ++++++++++++++--------- python/taichi/lang/field.py | 30 ++++++++++++++++++++--- python/taichi/lang/impl.py | 22 ++++++++--------- python/taichi/lang/matrix.py | 13 +++++----- python/taichi/lang/misc.py | 2 +- taichi/analysis/gen_offline_cache_key.cpp | 1 + taichi/analysis/offline_cache_util.cpp | 7 ++++-- taichi/ir/expr.cpp | 6 ++++- taichi/ir/expr.h | 4 ++- taichi/ir/frontend_ir.h | 1 + taichi/ir/snode.cpp | 19 ++++++++++---- taichi/ir/snode.h | 11 ++++++--- taichi/program/snode_expr_utils.cpp | 13 ++++++++-- taichi/python/export_lang.cpp | 6 +++-- taichi/transforms/auto_diff.cpp | 20 +++++++-------- 15 files changed, 126 insertions(+), 59 deletions(-) diff --git a/cpp_examples/autograd.cpp b/cpp_examples/autograd.cpp index 0b3bc8f43ed95..81c5a0f4422bc 100644 --- a/cpp_examples/autograd.cpp +++ b/cpp_examples/autograd.cpp @@ 
-55,7 +55,10 @@ void autograd() { bool is_primal() const override { return true; } - SNode *grad_snode() const override { + SNode *adjoint_snode() const override { + return snode; + } + SNode *dual_snode() const override { return snode; } }; @@ -66,7 +69,10 @@ void autograd() { bool is_primal() const override { return false; } - SNode *grad_snode() const override { + SNode *adjoint_snode() const override { + return nullptr; + } + SNode *dual_snode() const override { return nullptr; } }; @@ -76,8 +82,8 @@ void autograd() { snode->dt = PrimitiveType::f32; snode->grad_info = std::make_unique( &root->dense(Axis(0), n, false).insert_children(SNodeType::place)); - snode->get_grad()->dt = PrimitiveType::f32; - snode->get_grad()->grad_info = std::make_unique(); + snode->get_adjoint()->dt = PrimitiveType::f32; + snode->get_adjoint()->grad_info = std::make_unique(); return snode; }; auto *a = get_snode_grad(), *b = get_snode_grad(), *c = get_snode_grad(); @@ -100,12 +106,12 @@ void autograd() { builder.create_add(i, one)); builder.create_global_store(builder.create_global_ptr(c, {i}), zero); - builder.create_global_store(builder.create_global_ptr(a->get_grad(), {i}), - zero); - builder.create_global_store(builder.create_global_ptr(b->get_grad(), {i}), - zero); - builder.create_global_store(builder.create_global_ptr(c->get_grad(), {i}), - one); + builder.create_global_store( + builder.create_global_ptr(a->get_adjoint(), {i}), zero); + builder.create_global_store( + builder.create_global_ptr(b->get_adjoint(), {i}), zero); + builder.create_global_store( + builder.create_global_ptr(c->get_adjoint(), {i}), one); } kernel_init = @@ -141,13 +147,13 @@ void autograd() { auto *ext_a = builder.create_external_ptr( builder.create_arg_load(0, PrimitiveType::f32, true), {i}); auto *a_grad_i = builder.create_global_load( - builder.create_global_ptr(a->get_grad(), {i})); + builder.create_global_ptr(a->get_adjoint(), {i})); builder.create_global_store(ext_a, a_grad_i); auto *ext_b = 
builder.create_external_ptr( builder.create_arg_load(1, PrimitiveType::f32, true), {i}); auto *b_grad_i = builder.create_global_load( - builder.create_global_ptr(b->get_grad(), {i})); + builder.create_global_ptr(b->get_adjoint(), {i})); builder.create_global_store(ext_b, b_grad_i); auto *ext_c = builder.create_external_ptr( diff --git a/python/taichi/lang/field.py b/python/taichi/lang/field.py index 5213ebd99c9af..123959a2af677 100644 --- a/python/taichi/lang/field.py +++ b/python/taichi/lang/field.py @@ -19,6 +19,8 @@ def __init__(self, _vars): self.vars = _vars self.host_accessors = None self.grad = None + self.adjoint = None + self.dual = None @property def snode(self): @@ -92,13 +94,35 @@ def _loop_range(self): """ return self.vars[0].ptr - def _set_grad(self, grad): - """Sets corresponding gradient field. + def _set_grad(self, grad, reverse_mode=True): + """Binds corresponding gradient field to adjoint or dual. Args: grad (Field): Corresponding gradient field. + reverse_mode (Bool): set for reverse or forward mode """ - self.grad = grad + if reverse_mode: + self._set_adjoint(grad) + self.grad = self.adjoint + else: + self._set_dual(grad) + self.grad = self.dual + + def _set_adjoint(self, adjoint): + """Sets corresponding adjoint field (reverse mode). + + Args: + adjoint (Field): Corresponding adjoint field. + """ + self.adjoint = adjoint + + def _set_dual(self, dual): + """Sets corresponding dual field (forward mode). + + Args: + dual (Field): Corresponding dual field. 
+ """ + self.dual = dual @python_scope def fill(self, val): diff --git a/python/taichi/lang/impl.py b/python/taichi/lang/impl.py index f10ba72937b91..e950d53adba0c 100644 --- a/python/taichi/lang/impl.py +++ b/python/taichi/lang/impl.py @@ -496,16 +496,16 @@ def create_field_member(dtype, name): x.ptr.set_is_primal(True) pytaichi.global_vars.append(x) - x_grad = None + x_adjoint = None if _ti_core.needs_grad(dtype): # adjoint - x_grad = Expr(get_runtime().prog.make_id_expr("")) - x_grad.ptr = _ti_core.global_new(x_grad.ptr, dtype) - x_grad.ptr.set_name(name + ".grad") - x_grad.ptr.set_is_primal(False) - x.ptr.set_grad(x_grad.ptr) + x_adjoint = Expr(get_runtime().prog.make_id_expr("")) + x_adjoint.ptr = _ti_core.global_new(x_adjoint.ptr, dtype) + x_adjoint.ptr.set_name(name + ".grad") + x_adjoint.ptr.set_is_primal(False) + x.ptr.set_adjoint(x_adjoint.ptr) - return x, x_grad + return x, x_adjoint @python_scope @@ -552,15 +552,15 @@ def field(dtype, shape=None, name="", offset=None, needs_grad=False): assert (offset is None or shape is not None), 'The shape cannot be None when offset is being set' - x, x_grad = create_field_member(dtype, name) - x, x_grad = ScalarField(x), ScalarField(x_grad) - x._set_grad(x_grad) + x, x_adjoint = create_field_member(dtype, name) + x, x_adjoint = ScalarField(x), ScalarField(x_adjoint) + x._set_grad(x_adjoint, reverse_mode=True) if shape is not None: dim = len(shape) root.dense(index_nd(dim), shape).place(x, offset=offset) if needs_grad: - root.dense(index_nd(dim), shape).place(x_grad) + root.dense(index_nd(dim), shape).place(x_adjoint) return x diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index adc69777c6092..1c7add5cd19c3 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1117,10 +1117,10 @@ def field(cls, else: for _ in range(n * m): entries.append(impl.create_field_member(dtype, name=name)) - entries, entries_grad = zip(*entries) - entries, entries_grad = 
MatrixField(entries, n, m), MatrixField( - entries_grad, n, m) - entries._set_grad(entries_grad) + entries, entries_adjoint = zip(*entries) + entries, entries_adjoint = MatrixField(entries, n, m), MatrixField( + entries_adjoint, n, m) + entries._set_grad(entries_adjoint, reverse_mode=True) impl.get_runtime().matrix_fields.append(entries) if shape is None: @@ -1143,7 +1143,7 @@ def field(cls, impl.root.dense(impl.index_nd(dim), shape).place(ScalarField(e), offset=offset) if needs_grad: - for e in entries_grad._get_field_members(): + for e in entries_adjoint._get_field_members(): impl.root.dense(impl.index_nd(dim), shape).place(ScalarField(e), offset=offset) @@ -1152,7 +1152,8 @@ def field(cls, offset=offset) if needs_grad: impl.root.dense(impl.index_nd(dim), - shape).place(entries_grad, offset=offset) + shape).place(entries_adjoint, + offset=offset) return entries @classmethod diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py index 1af43927d945b..2d03a935063d4 100644 --- a/python/taichi/lang/misc.py +++ b/python/taichi/lang/misc.py @@ -692,7 +692,7 @@ def Tape(loss, clear_gradients=True): if len(loss.shape) != 0: raise RuntimeError( 'The loss of `Tape` must be a 0-D field, i.e. 
scalar') - if not loss.snode.ptr.has_grad(): + if not loss.snode.ptr.has_adjoint(): raise RuntimeError( 'Gradients of loss are not allocated, please use ti.field(..., needs_grad=True)' ' for all fields that are required by autodiff.') diff --git a/taichi/analysis/gen_offline_cache_key.cpp b/taichi/analysis/gen_offline_cache_key.cpp index bab660c68409b..f3ecaae6171e7 100644 --- a/taichi/analysis/gen_offline_cache_key.cpp +++ b/taichi/analysis/gen_offline_cache_key.cpp @@ -136,6 +136,7 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { emit(expr->ambient_value); emit(expr->is_primal); emit(expr->adjoint); + emit(expr->dual); } void visit(GlobalPtrExpression *expr) override { diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index bcffd829f3940..2d4793d39c86e 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -112,8 +112,11 @@ static void get_offline_cache_key_of_snode_impl( serializer(snode->ambient_val.stringify()); } if (snode->grad_info && !snode->grad_info->is_primal()) { - if (auto *grad_snode = snode->grad_info->grad_snode()) { - get_offline_cache_key_of_snode_impl(grad_snode, serializer, visited); + if (auto *adjoint_snode = snode->grad_info->adjoint_snode()) { + get_offline_cache_key_of_snode_impl(adjoint_snode, serializer, visited); + } + if (auto *dual_snode = snode->grad_info->dual_snode()) { + get_offline_cache_key_of_snode_impl(dual_snode, serializer, visited); } } if (snode->exp_snode) { diff --git a/taichi/ir/expr.cpp b/taichi/ir/expr.cpp index e56c679258c11..fd6f0deb8a09c 100644 --- a/taichi/ir/expr.cpp +++ b/taichi/ir/expr.cpp @@ -50,10 +50,14 @@ SNode *Expr::snode() const { return cast()->snode; } -void Expr::set_grad(const Expr &o) { +void Expr::set_adjoint(const Expr &o) { this->cast()->adjoint.set(o); } +void Expr::set_dual(const Expr &o) { + this->cast()->dual.set(o); +} + Expr::Expr(int16 x) : Expr() { expr = 
std::make_shared(PrimitiveType::i16, x); } diff --git a/taichi/ir/expr.h b/taichi/ir/expr.h index 1997431c1fdbc..d826026c25133 100644 --- a/taichi/ir/expr.h +++ b/taichi/ir/expr.h @@ -93,7 +93,9 @@ class Expr { // traceback for type checking error message void set_tb(const std::string &tb); - void set_grad(const Expr &o); + void set_adjoint(const Expr &o); + + void set_dual(const Expr &o); void set_attribute(const std::string &key, const std::string &value); diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 93752a4fca9da..6343de337e831 100644 --- a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -439,6 +439,7 @@ class GlobalVariableExpression : public Expression { TypedConstant ambient_value; bool is_primal{true}; Expr adjoint; + Expr dual; GlobalVariableExpression(DataType dt, const Identifier &ident) : ident(ident), dt(dt) { diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp index b5b715884868c..1a583cda431b5 100644 --- a/taichi/ir/snode.cpp +++ b/taichi/ir/snode.cpp @@ -304,13 +304,22 @@ bool SNode::is_primal() const { return grad_info->is_primal(); } -bool SNode::has_grad() const { - return is_primal() && (grad_info->grad_snode() != nullptr); +bool SNode::has_adjoint() const { + return is_primal() && (grad_info->adjoint_snode() != nullptr); } -SNode *SNode::get_grad() const { - TI_ASSERT(has_grad()); - return grad_info->grad_snode(); +bool SNode::has_dual() const { + return is_primal() && (grad_info->dual_snode() != nullptr); +} + +SNode *SNode::get_adjoint() const { + TI_ASSERT(has_adjoint()); + return grad_info->adjoint_snode(); +} + +SNode *SNode::get_dual() const { + TI_ASSERT(has_dual()); + return grad_info->dual_snode(); } void SNode::set_snode_tree_id(int id) { diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h index a3e7f55987f0f..8a21721c2a7bc 100644 --- a/taichi/ir/snode.h +++ b/taichi/ir/snode.h @@ -93,7 +93,8 @@ class SNode { public: virtual ~GradInfoProvider() = default; virtual bool is_primal() const = 0; - virtual 
SNode *grad_snode() const = 0; + virtual SNode *adjoint_snode() const = 0; + virtual SNode *dual_snode() const = 0; template T *cast() { @@ -286,9 +287,13 @@ class SNode { bool is_scalar() const; - bool has_grad() const; + bool has_adjoint() const; - SNode *get_grad() const; + SNode *get_adjoint() const; + + bool has_dual() const; + + SNode *get_dual() const; SNode *get_least_sparse_ancestor() const; diff --git a/taichi/program/snode_expr_utils.cpp b/taichi/program/snode_expr_utils.cpp index 666c08790ff91..e5a9d4d7b9e3f 100644 --- a/taichi/program/snode_expr_utils.cpp +++ b/taichi/program/snode_expr_utils.cpp @@ -16,7 +16,7 @@ class GradInfoImpl final : public SNode::GradInfoProvider { return glb_var_->is_primal; } - SNode *grad_snode() const override { + SNode *adjoint_snode() const override { auto &adj = glb_var_->adjoint; if (adj.expr == nullptr) { return nullptr; @@ -24,6 +24,14 @@ class GradInfoImpl final : public SNode::GradInfoProvider { return adj.snode(); } + SNode *dual_snode() const override { + auto &dual = glb_var_->dual; + if (dual.expr == nullptr) { + return nullptr; + } + return dual.snode(); + } + private: GlobalVariableExpression *glb_var_; }; @@ -102,8 +110,9 @@ void make_lazy_grad(SNode *snode, SNodeGlobalVarExprMap *snode_to_exprs) { } std::vector new_grads; for (auto &c : snode->ch) { + // TODO: handle the dual SNode if (c->type == SNodeType::place && c->is_primal() && needs_grad(c->dt) && - !c->has_grad()) { + !c->has_adjoint()) { new_grads.push_back(snode_to_exprs->at(c.get())->adjoint); } } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 768bbf95bfd86..228e57b67e4bc 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -512,7 +512,8 @@ void export_lang(py::module &m) { .def("read_int", &SNode::read_int) .def("read_uint", &SNode::read_uint) .def("read_float", &SNode::read_float) - .def("has_grad", &SNode::has_grad) + .def("has_adjoint", &SNode::has_adjoint) + .def("has_dual", 
&SNode::has_dual) .def("is_primal", &SNode::is_primal) .def("is_place", &SNode::is_place) .def("get_expr", &SNode::get_expr) @@ -662,7 +663,8 @@ void export_lang(py::module &m) { [&](Expr *expr, bool v) { expr->cast()->is_primal = v; }) - .def("set_grad", &Expr::set_grad) + .def("set_adjoint", &Expr::set_adjoint) + .def("set_dual", &Expr::set_dual) .def("set_attribute", &Expr::set_attribute) .def("get_ret_type", &Expr::get_ret_type) .def("type_check", &Expr::type_check) diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp index b9b7d493648f4..871f092a4a28f 100644 --- a/taichi/transforms/auto_diff.cpp +++ b/taichi/transforms/auto_diff.cpp @@ -33,7 +33,7 @@ class IndependentBlocksJudger : public BasicStmtVisitor { return; TI_ASSERT(stmt->dest->is()); for (const auto &node : stmt->dest->cast()->snodes.data) { - if (node->has_grad()) { + if (node->has_adjoint()) { qualified_atomics_ = false; break; } @@ -953,7 +953,7 @@ class MakeAdjoint : public IRVisitor { GlobalPtrStmt *src = stmt->src->as(); TI_ASSERT(src->width() == 1); auto snodes = src->snodes; - if (!snodes[0]->has_grad()) { + if (!snodes[0]->has_adjoint()) { // No adjoint SNode. Do nothing return; } @@ -961,8 +961,8 @@ class MakeAdjoint : public IRVisitor { // gradients stopped, do nothing. 
return; } - TI_ASSERT(snodes[0]->get_grad() != nullptr); - snodes[0] = snodes[0]->get_grad(); + TI_ASSERT(snodes[0]->get_adjoint() != nullptr); + snodes[0] = snodes[0]->get_adjoint(); auto adj_ptr = insert(snodes, src->indices); insert(AtomicOpType::add, adj_ptr, load(adjoint(stmt))); } @@ -972,12 +972,12 @@ class MakeAdjoint : public IRVisitor { GlobalPtrStmt *dest = stmt->dest->as(); TI_ASSERT(dest->width() == 1); auto snodes = dest->snodes; - if (!snodes[0]->has_grad()) { + if (!snodes[0]->has_adjoint()) { // no gradient (likely integer types) return; } - TI_ASSERT(snodes[0]->get_grad() != nullptr); - snodes[0] = snodes[0]->get_grad(); + TI_ASSERT(snodes[0]->get_adjoint() != nullptr); + snodes[0] = snodes[0]->get_adjoint(); auto adjoint_ptr = insert(snodes, dest->indices); auto load = insert(adjoint_ptr); accumulate(stmt->val, load); @@ -989,9 +989,9 @@ class MakeAdjoint : public IRVisitor { GlobalPtrStmt *dest = stmt->dest->as(); TI_ASSERT(dest->width() == 1); auto snodes = dest->snodes; - if (snodes[0]->has_grad()) { - TI_ASSERT(snodes[0]->get_grad() != nullptr); - snodes[0] = snodes[0]->get_grad(); + if (snodes[0]->has_adjoint()) { + TI_ASSERT(snodes[0]->get_adjoint() != nullptr); + snodes[0] = snodes[0]->get_adjoint(); auto adjoint_ptr = insert(snodes, dest->indices); accumulate(stmt->val, insert(adjoint_ptr)); } else { From 094c47f0c1a1dcf8cd555d55e73b2c86caf179a1 Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Thu, 2 Jun 2022 12:51:31 +0800 Subject: [PATCH 146/176] [build] [refactor] Change CMake global include_directories to target based function (#5082) * Change to target_include_directories * Update runtime cmake * Pre-commit format --- CMakeLists.txt | 2 - cmake/TaichiCore.cmake | 59 ++++++++++++++++++++++------ cmake/TaichiExamples.cmake | 11 ++++-- cmake/TaichiTests.cmake | 23 +++++++++-- taichi/runtime/gfx/CMakeLists.txt | 6 +++ taichi/runtime/opengl/CMakeLists.txt | 8 +++- 6 files changed, 85 insertions(+), 24 deletions(-) diff --git a/CMakeLists.txt 
b/CMakeLists.txt index ceeeda040a349..08be5fed709de 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -105,8 +105,6 @@ if (TI_BUILD_EXAMPLES) include(cmake/TaichiExamples.cmake) endif() -include_directories(${PROJECT_SOURCE_DIR}/external/eigen) - message("C++ Flags: ${CMAKE_CXX_FLAGS}") message("Build type: ${CMAKE_BUILD_TYPE}") diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index c9584aa66f7df..c3ccfe0ce47f0 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -252,12 +252,14 @@ if (APPLE) endif() # TODO: replace these includes per target basis -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -include_directories(external/include) -include_directories(external/spdlog/include) -include_directories(external/glad/include) -include_directories(external/SPIRV-Tools/include) -include_directories(external/PicoSHA2) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE ${CMAKE_SOURCE_DIR}) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/include) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/spdlog/include) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Tools/include) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/PicoSHA2) +target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/eigen) + + if (TI_WITH_OPENGL) target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/glad/include) endif() @@ -294,7 +296,8 @@ if(TI_WITH_LLVM) message(FATAL_ERROR "LLVM version < 10 is not supported") endif() message(STATUS "Using LLVMConfig.cmake in: ${LLVM_DIR}") - include_directories(${LLVM_INCLUDE_DIRS}) + target_include_directories(${CORE_LIBRARY_NAME} PUBLIC ${LLVM_INCLUDE_DIRS}) + message("LLVM include dirs ${LLVM_INCLUDE_DIRS}") message("LLVM library dirs ${LLVM_LIBRARY_DIRS}") add_definitions(${LLVM_DEFINITIONS}) @@ -339,8 +342,8 @@ if (TI_WITH_CUDA_TOOLKIT) message(STATUS "TI_WITH_CUDA_TOOLKIT = ON") message(STATUS 
"CUDA_TOOLKIT_ROOT_DIR=$ENV{CUDA_TOOLKIT_ROOT_DIR}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_CUDA_TOOLKIT") - include_directories($ENV{CUDA_TOOLKIT_ROOT_DIR}/include) - link_directories($ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64) + target_include_directories(${CORE_LIBRARY_NAME} PRIVATE $ENV{CUDA_TOOLKIT_ROOT_DIR}/include) + target_link_directories(${CORE_LIBRARY_NAME} PRIVATE $ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64) #libraries for cuda kernel profiler CuptiToolkit target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE cupti nvperf_host) endif() @@ -380,9 +383,10 @@ target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE gfx_runtime) # Vulkan Device API if (TI_WITH_VULKAN) - include_directories(SYSTEM external/Vulkan-Headers/include) + target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/Vulkan-Headers/include) + + target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/volk) - include_directories(SYSTEM external/volk) # By specifying SYSTEM, we suppressed the warnings from third-party headers. 
target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMemoryAllocator/include) @@ -475,6 +479,27 @@ if(TI_WITH_PYTHON AND NOT TI_EMSCRIPTENED) # https://cmake.org/cmake/help/v3.13/command/target_link_libraries.html?highlight=target_link_libraries#linking-object-libraries target_link_libraries(${CORE_WITH_PYBIND_LIBRARY_NAME} PRIVATE ${CORE_LIBRARY_NAME}) + # TODO 4832: move some header dependencis to other targets, e.g., gui + target_include_directories(${CORE_WITH_PYBIND_LIBRARY_NAME} + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${PROJECT_SOURCE_DIR}/external/glad/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/volk + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/Vulkan-Headers/include + ${PROJECT_SOURCE_DIR}/external/imgui + ${PROJECT_SOURCE_DIR}/external/imgui/backends + ) + + if (NOT ANDROID) + target_include_directories(${CORE_WITH_PYBIND_LIBRARY_NAME} + PRIVATE + external/glfw/include + ) + endif () + # These commands should apply to the DLL that is loaded from python, not the OBJECT library. if (MSVC) set_property(TARGET ${CORE_WITH_PYBIND_LIBRARY_NAME} APPEND PROPERTY LINK_FLAGS /DEBUG) @@ -501,18 +526,26 @@ if(TI_EMSCRIPTENED) endif() if(TI_WITH_GGUI) - include_directories(SYSTEM PRIVATE external/glm) + # PUBLIC as required by python module + target_include_directories(${CORE_LIBRARY_NAME} PUBLIC external/glm) # Dear ImGui add_definitions(-DIMGUI_IMPL_VULKAN_NO_PROTOTYPES) set(IMGUI_DIR external/imgui) - include_directories(SYSTEM ${IMGUI_DIR} ${IMGUI_DIR}/backends ..) if(ANDROID) add_library(imgui ${IMGUI_DIR}/backends/imgui_impl_android.cpp ${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp ${IMGUI_DIR}/imgui.cpp ${IMGUI_DIR}/imgui_draw.cpp ${IMGUI_DIR}/imgui_tables.cpp ${IMGUI_DIR}/imgui_widgets.cpp) + +target_include_directories(imgui PUBLIC ${IMGUI_DIR} ${IMGUI_DIR}/backends ..) 
+ else() include_directories(external/glfw/include) add_library(imgui ${IMGUI_DIR}/backends/imgui_impl_glfw.cpp ${IMGUI_DIR}/backends/imgui_impl_vulkan.cpp ${IMGUI_DIR}/imgui.cpp ${IMGUI_DIR}/imgui_draw.cpp ${IMGUI_DIR}/imgui_tables.cpp ${IMGUI_DIR}/imgui_widgets.cpp) + + target_include_directories(imgui PUBLIC ${IMGUI_DIR} ${IMGUI_DIR}/backends ..) + target_include_directories(imgui PRIVATE external/glfw/include) + endif() + target_include_directories(imgui PRIVATE external/Vulkan-Headers/include) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE imgui) endif() diff --git a/cmake/TaichiExamples.cmake b/cmake/TaichiExamples.cmake index 3ab7d0084a1f3..dbcbf1ee3715c 100644 --- a/cmake/TaichiExamples.cmake +++ b/cmake/TaichiExamples.cmake @@ -11,10 +11,6 @@ file(GLOB_RECURSE TAICHI_EXAMPLES_SOURCE "cpp_examples/aot_save.cpp" ) -include_directories( - ${PROJECT_SOURCE_DIR}, -) - add_executable(${EXAMPLES_NAME} ${TAICHI_EXAMPLES_SOURCE}) if (WIN32) # Output the executable to bin/ instead of build/Debug/... @@ -27,4 +23,11 @@ if (WIN32) endif() target_link_libraries(${EXAMPLES_NAME} PRIVATE taichi_isolated_core) +# TODO 4832: be specific on the header dependencis here, e.g., ir +target_include_directories(${EXAMPLES_NAME} + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${PROJECT_SOURCE_DIR}/external/eigen + ) endif() diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index b3085731ba612..0ad2b23cab576 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -22,10 +22,6 @@ file(GLOB_RECURSE TAICHI_TESTS_SOURCE "tests/cpp/struct/*.cpp" "tests/cpp/transforms/*.cpp") -include_directories( - ${PROJECT_SOURCE_DIR}, -) - add_executable(${TESTS_NAME} ${TAICHI_TESTS_SOURCE}) if (WIN32) # Output the executable to bin/ instead of build/Debug/... 
@@ -39,4 +35,23 @@ endif() target_link_libraries(${TESTS_NAME} PRIVATE taichi_isolated_core) target_link_libraries(${TESTS_NAME} PRIVATE gtest_main) +target_include_directories(${TESTS_NAME} + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${PROJECT_SOURCE_DIR}/external/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/volk + ${PROJECT_SOURCE_DIR}/external/glad/include + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/Vulkan-Headers/include + ) + +if (NOT ANDROID) + target_include_directories(${TESTS_NAME} + PRIVATE + external/glfw/include + ) +endif () + add_test(NAME ${TESTS_NAME} COMMAND ${TESTS_NAME}) diff --git a/taichi/runtime/gfx/CMakeLists.txt b/taichi/runtime/gfx/CMakeLists.txt index bc5bcdf03b5fb..f33df7268a8a7 100644 --- a/taichi/runtime/gfx/CMakeLists.txt +++ b/taichi/runtime/gfx/CMakeLists.txt @@ -8,9 +8,15 @@ target_sources(gfx_runtime aot_module_builder_impl.cpp aot_module_loader_impl.cpp ) +#TODO 4832, some dependencies here should not be required as they +# are build requirements of other targets. target_include_directories(gfx_runtime PRIVATE + ${PROJECT_SOURCE_DIR} ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include ${PROJECT_SOURCE_DIR}/external/eigen ${PROJECT_SOURCE_DIR}/external/FP16/include + PRIVATE + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} ) diff --git a/taichi/runtime/opengl/CMakeLists.txt b/taichi/runtime/opengl/CMakeLists.txt index e736722bade47..f846fd8996491 100644 --- a/taichi/runtime/opengl/CMakeLists.txt +++ b/taichi/runtime/opengl/CMakeLists.txt @@ -5,12 +5,18 @@ target_sources(opengl_runtime PRIVATE opengl_api.cpp ) + +#TODO #4832, some path here should not be included as they are +# dependencies of other targets. 
target_include_directories(opengl_runtime PRIVATE - ${CMAKE_CURRENT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/shaders ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include ${PROJECT_SOURCE_DIR}/external/glad/include ${PROJECT_SOURCE_DIR}/external/eigen ${PROJECT_SOURCE_DIR}/external/glfw/include + PRIVATE + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} ) From 98e6cecb5d8c67b1c5a4ae25166e1263bf986532 Mon Sep 17 00:00:00 2001 From: bsavery Date: Wed, 1 Jun 2022 22:06:59 -0700 Subject: [PATCH 147/176] [Doc] Add documentation of Taichi Struct Classes. (#5075) * Add documentation of Taichi Struct Classes. * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * edits * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update docs/lang/articles/advanced/odop.md Co-authored-by: Yi Xu * Update docs/lang/articles/advanced/odop.md Co-authored-by: Yi Xu * Update docs/lang/articles/advanced/odop.md Co-authored-by: Yi Xu * Update docs/lang/articles/advanced/odop.md Co-authored-by: Yi Xu * Update docs/lang/articles/advanced/odop.md Co-authored-by: Yi Xu * Update docs/lang/articles/advanced/odop.md Co-authored-by: Yi Xu * Fix capitalization of Python Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Yi Xu --- docs/lang/articles/advanced/odop.md | 76 +++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/docs/lang/articles/advanced/odop.md b/docs/lang/articles/advanced/odop.md index 0384482c8fb01..0ed023630e2cd 100644 --- a/docs/lang/articles/advanced/odop.md +++ b/docs/lang/articles/advanced/odop.md @@ -280,3 +280,79 @@ b = Counter((4, 10)) print(a.num()) # 6 print(b.num()) # 7 ``` + +## Python classes as Taichi struct types +Taichi provides custom [struct types](../basic/type.md#compound-types) for developers to associate pieces of data together. 
However, it is often convenient to have: + 1. A Python representation of the struct type which is more object oriented. + 2. Functions associated with a struct type. (C++ style structs) + + +To achieve these two points, developers can use the `@ti.struct_class` decorator on a Python class. This is heavily inspired by the Python [dataclass](https://docs.python.org/3/library/dataclasses.html) feature, which uses class fields with annotations to create data types. + +### Creating a struct from a Python class +Here is an example of how we could create a Taichi struct type from a Python class: + +```python +@ti.struct_class +class Sphere: + center: vec3 + radius: ti.f32 +``` +This will create the *exact* same type as doing: + +```python +Sphere = ti.types.struct(center=vec3, radius=ti.f32) +``` +Using the `@ti.struct_class` decorator will convert the annotated fields in the Python class to members in the resulting struct type. In both of the above examples you would create a field of the struct the same way. + +```python +sphere_field = Sphere.field(shape=(n,)) +``` + +### Associating functions with the struct type +Python classes can have functions attached to them, as can Taichi struct types. Building from the above example, here is how one would add functions to the struct. + +```python +@ti.struct_class +class Sphere: + center: vec3 + radius: ti.f32 + + @ti.func + def area(self): + # a function to run in taichi scope + return 4 * math.pi * self.radius * self.radius + + def is_zero_sized(self): + # a python scope function + return self.radius == 0.0 +``` + +Functions associated with structs follow the same [scope rules](../basic/syntax.md#taichi-scope-vs-python-scope) as normal functions, in that they can be in Taichi or Python scope. Each instance of the `Sphere` struct type now will have the above functions added to them. 
The functions can be called such as: + +```python +a_python_struct = Sphere(center=vec3(0.0), radius=1.0) +# calls a python scope function from python +a_python_struct.is_zero_sized() # False + +@ti.kernel +def get_area() -> ti.f32: + a_taichi_struct = Sphere(center=vec3(0.0), radius=4.0) + # return the area of the sphere, a taichi scope function + return a_taichi_struct.area() +get_area() # 201.062... +``` + +### Notes on struct classes +- Inheritance of struct classes is not implemented. +- While functions attached to a struct with the `@ti.struct_class` decorator is convenient and encouraged, it is actually possible to associated a function to structs with the older method of defining structs. As mentioned above, the two methods for defining a struct type are identical in their output. To do this, use the `__struct_method` argument with the `ti.types.struct` call: + +```python +@ti.func +def area(self): + # a function to run in taichi scope + return 4 * math.pi * self.radius * self.radius + +Sphere = ti.types.struct(center=vec3, radius=ti.f32, + __struct_methods={'area': area}) +``` From 8dc598d00fbeec6d3f33749cbd6b71e292b7752c Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Thu, 2 Jun 2022 17:19:07 +0800 Subject: [PATCH 148/176] [llvm] [aot] Add LLVM-CPU AOT tests (#5079) * [llvm] [aot] Add LLVM-CPU AOT tests * Refactored AOT test framework * Fixed minor issue * Enabled LLVM CPU-AOT for arm64 architecture * Added aot unit tests programming guide * Fixed typo * Refactored AOT test framework --- cmake/TaichiTests.cmake | 3 +- taichi/llvm/llvm_program.cpp | 2 +- tests/cpp/backends/llvm/cpu_aot.py | 25 ++++++++ tests/cpp/backends/llvm/llvm_program_test.cpp | 56 +++++++++++++++++ tests/run_tests.py | 60 ++++++++++++++++--- tests/test_utils.py | 29 +++++++++ 6 files changed, 164 insertions(+), 11 deletions(-) create mode 100644 tests/cpp/backends/llvm/cpu_aot.py create mode 100644 tests/cpp/backends/llvm/llvm_program_test.cpp diff --git a/cmake/TaichiTests.cmake 
b/cmake/TaichiTests.cmake index 0ad2b23cab576..6fd216b99fe37 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -14,10 +14,11 @@ file(GLOB_RECURSE TAICHI_TESTS_SOURCE "tests/cpp/analysis/*.cpp" "tests/cpp/aot/*.cpp" "tests/cpp/backends/*.cpp" + "tests/cpp/backends/llvm/*.cpp" "tests/cpp/codegen/*.cpp" "tests/cpp/common/*.cpp" "tests/cpp/ir/*.cpp" - "tests/cpp/llvm/*.cpp", + "tests/cpp/llvm/*.cpp" "tests/cpp/program/*.cpp" "tests/cpp/struct/*.cpp" "tests/cpp/transforms/*.cpp") diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 026a2c876b127..35772e33489d8 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -338,7 +338,7 @@ void LlvmProgramImpl::print_list_manager_info(void *list_manager, } std::unique_ptr LlvmProgramImpl::make_aot_module_builder() { - if (config->arch == Arch::x64) { + if (config->arch == Arch::x64 || config->arch == Arch::arm64) { return std::make_unique(); } TI_NOT_IMPLEMENTED; diff --git a/tests/cpp/backends/llvm/cpu_aot.py b/tests/cpp/backends/llvm/cpu_aot.py new file mode 100644 index 0000000000000..aec1315448224 --- /dev/null +++ b/tests/cpp/backends/llvm/cpu_aot.py @@ -0,0 +1,25 @@ +import os + +import taichi as ti + + +def compile_aot(): + ti.init(arch=ti.x64) + + @ti.kernel + def run(base: int, arr: ti.types.ndarray()): + for i in arr: + arr[i] = base + i + + arr = ti.ndarray(int, shape=16) + run(42, arr) + + assert "TAICHI_AOT_FOLDER_PATH" in os.environ.keys() + dir_name = str(os.environ["TAICHI_AOT_FOLDER_PATH"]) + + m = ti.aot.Module(ti.x64) + m.add_kernel(run, template_args={'arr': arr}) + m.save(dir_name, 'x64-aot') + + +compile_aot() diff --git a/tests/cpp/backends/llvm/llvm_program_test.cpp b/tests/cpp/backends/llvm/llvm_program_test.cpp new file mode 100644 index 0000000000000..44514216f39b0 --- /dev/null +++ b/tests/cpp/backends/llvm/llvm_program_test.cpp @@ -0,0 +1,56 @@ +#include "gtest/gtest.h" + +#include "taichi/program/kernel_profiler.h" +#include 
"taichi/llvm/llvm_program.h" +#include "taichi/system/memory_pool.h" +#include "taichi/backends/cpu/aot_module_loader_impl.h" + +#define TI_RUNTIME_HOST +#include "taichi/program/context.h" +#undef TI_RUNTIME_HOST + +namespace taichi { +namespace lang { + +TEST(LlvmProgramTest, FullPipeline) { + CompileConfig cfg; + cfg.arch = Arch::x64; + cfg.kernel_profiler = false; + constexpr KernelProfilerBase *kNoProfiler = nullptr; + LlvmProgramImpl prog{cfg, kNoProfiler}; + auto *compute_device = prog.get_compute_device(); + // Must have handled all the arch fallback logic by this point. + auto memory_pool = std::make_unique(cfg.arch, compute_device); + prog.initialize_host(); + uint64 *result_buffer{nullptr}; + prog.materialize_runtime(memory_pool.get(), kNoProfiler, &result_buffer); + + constexpr int kArrLen = 32; + constexpr int kArrBytes = kArrLen * sizeof(int32_t); + auto arr_devalloc = prog.allocate_memory_ndarray(kArrBytes, result_buffer); + + cpu::AotModuleParams aot_params; + const auto folder_dir = getenv("TAICHI_AOT_FOLDER_PATH"); + + std::stringstream aot_mod_ss; + aot_mod_ss << folder_dir; + aot_params.module_path = aot_mod_ss.str(); + aot_params.program = &prog; + auto mod = cpu::make_aot_module(aot_params); + auto *k_run = mod->get_kernel("run"); + RuntimeContext ctx; + ctx.runtime = prog.get_llvm_runtime(); + ctx.set_arg(0, /*v=*/0); + ctx.set_arg_devalloc(/*arg_id=*/1, arr_devalloc, /*shape=*/{kArrLen}); + ctx.set_array_runtime_size(/*arg_id=*/1, kArrBytes); + k_run->launch(&ctx); + + auto *data = reinterpret_cast( + prog.get_ndarray_alloc_info_ptr(arr_devalloc)); + for (int i = 0; i < kArrLen; ++i) { + EXPECT_EQ(data[i], i); + } +} + +} // namespace lang +} // namespace taichi diff --git a/tests/run_tests.py b/tests/run_tests.py index 7621f985394ae..9acafa614b738 100644 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -3,12 +3,15 @@ import pdb import subprocess import sys +import tempfile import warnings +from test_utils import __aot_test_cases, 
print_aot_test_guide + import taichi as ti -def _test_cpp(): +def _run_cpp_test(gtest_option="", extra_env=None): ti.reset() print("Running C++ tests...") ti_lib_dir = os.path.join(ti.__path__[0], '_lib', 'runtime') @@ -16,17 +19,48 @@ def _test_cpp(): cpp_test_filename = 'taichi_cpp_tests' curr_dir = os.path.dirname(os.path.abspath(__file__)) build_dir = os.path.join(curr_dir, '../build') + if os.path.exists(os.path.join(build_dir, cpp_test_filename)): env_copy = os.environ.copy() env_copy['TI_LIB_DIR'] = ti_lib_dir - subprocess.check_call(f'./{cpp_test_filename}', - env=env_copy, - cwd=build_dir) - else: - warnings.warn( - f"C++ tests are skipped due to missing {cpp_test_filename} in {build_dir}." - "Try building taichi with `TAICHI_CMAKE_ARGS=\'-DTI_BUILD_TESTS:BOOL=ON\' python setup.py develop`" - "if you want to enable it.") + + cmd = [f'./{cpp_test_filename}'] + if gtest_option: cmd.append(gtest_option) + if extra_env: env_copy.update(extra_env) + + subprocess.check_call(cmd, env=env_copy, cwd=build_dir) + + +def _test_cpp_aot(): + tests_visited = [] + for cpp_test_name, python_rpath in __aot_test_cases.items(): + # Temporary folder will be removed upon handle destruction + temp_handle = tempfile.TemporaryDirectory() + temp_folderpath = temp_handle.name + + curr_dir = os.path.dirname(os.path.abspath(__file__)) + python_file_path = os.path.join(curr_dir, python_rpath) + + extra_env = {"TAICHI_AOT_FOLDER_PATH": temp_folderpath} + env_copy = os.environ.copy() + env_copy.update(extra_env) + + subprocess.check_call([sys.executable, python_file_path], env=env_copy) + + # Run AOT C++ codes + _run_cpp_test(f"--gtest_filter={cpp_test_name}", extra_env) + tests_visited.append(cpp_test_name) + + exclude_tests_cmd = "--gtest_filter=-" + ":".join(tests_visited) + return exclude_tests_cmd + + +def _test_cpp(): + # Run AOT test cases + exclude_tests_cmd = _test_cpp_aot() + + # Run rest of the cpp tests + _run_cpp_test(exclude_tests_cmd) def _test_python(args): @@ -196,10 
+230,18 @@ def test(): action='store_true', help= 'Exclude arch(s) from test instead of include them, together with -a') + parser.add_argument('--help-aot', + action='store_true', + default=False, + help='Show AOT test programming guide') args = parser.parse_args() print(args) + if args.help_aot: + print_aot_test_guide() + exit(1) + if args.arch: arch = args.arch if args.exclusive: diff --git a/tests/test_utils.py b/tests/test_utils.py index c42d230cb886a..25b4b3b6dbd7a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,6 +12,35 @@ import taichi as ti +__aot_test_cases = { + "LlvmProgramTest.FullPipeline": + os.path.join('cpp', 'backends', 'llvm', 'cpu_aot.py'), +} + + +def print_aot_test_guide(): + message = f""" +[ AOT Unit Tests Programming Guide ] + +An AOT test is usually composed of: +1. A python script that compiles the Kernels and serialize into file. +2. A C++ test that loads the file then perform execution. + +AOT test writer will have to configure your test case for "__aot_test_cases", +the format of which follows: + + "cpp_test_name" : "python_program_path" + +For example: + + "LlvmProgramTest.FullPipeline": "cpp/backends/llvm.cpu_aot.py" + +The temporary directory where serialized cache file stays will be generated by run_tests.py. Both python program and C++ tests receives this directory path via environment variable "TAICHI_AOT_FOLDER_PATH". + +For each AOT test, run_tests.py will first run the python program specified by "python_program_path", followed by execution of the corresponding C++ test named "cpp_test_name". 
+""" + print(message) + # Helper functions def get_rel_eps(): From 52a7cd8325672bc160e5540e54064c960c78256d Mon Sep 17 00:00:00 2001 From: Mingrui Zhang <33411325+erizmr@users.noreply.github.com> Date: Thu, 2 Jun 2022 21:51:35 +0800 Subject: [PATCH 149/176] [autodiff] Extract shared components for reverse and forward mode (#5088) extract shared components for reverse and forward mode --- taichi/transforms/auto_diff.cpp | 187 ++++++++++++++++++-------------- 1 file changed, 107 insertions(+), 80 deletions(-) diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp index 871f092a4a28f..798d00509a0ee 100644 --- a/taichi/transforms/auto_diff.cpp +++ b/taichi/transforms/auto_diff.cpp @@ -488,10 +488,9 @@ class ReverseOuterLoops : public BasicStmtVisitor { } }; -// Generate the adjoint version of an independent block - -class MakeAdjoint : public IRVisitor { - private: +// Base class for both reverse (make adjoint) and forward (make dual) mode +class ADTransform : public IRVisitor { + protected: Stmt *constant(float32 x) { return insert(TypedConstant(x)); } @@ -556,6 +555,107 @@ class MakeAdjoint : public IRVisitor { } public: + virtual Stmt *insert_grad_stmt(std::unique_ptr &&stmt) = 0; + + template + Stmt *insert(Args &&...args) { + return insert_grad_stmt(Stmt::make(args...)); + } + + void visit(AllocaStmt *alloca) override { + // do nothing. + } + + void visit(AdStackAllocaStmt *alloca) override { + // do nothing. + } + + void visit(ArgLoadStmt *stmt) override { + // do nothing. + } + + void visit(LoopIndexStmt *stmt) override { + // do nothing. 
+ } + + void visit(PrintStmt *print_stmt) override { + // do nothing + } + + void visit(ConstStmt *const_stmt) override { + // do nothing + } + + void visit(WhileControlStmt *stmt) override { + TI_NOT_IMPLEMENTED + } + + void visit(ContinueStmt *stmt) override { + TI_NOT_IMPLEMENTED; + } + + void visit(WhileStmt *stmt) override { + TI_NOT_IMPLEMENTED + } + + void visit(GlobalPtrStmt *stmt) override { + // do nothing + } + + Stmt *load(Stmt *alloc) { + TI_ASSERT(alloc != nullptr); + if (alloc->is()) { + return insert(LocalAddress(alloc, 0)); + } else { + // non alloca + return alloc; + } + } + + bool gradients_stopped(GlobalLoadStmt *stmt, SNode *snode) { + for (auto block = stmt->parent; block; block = block->parent_block()) { + for (auto s : block->stop_gradients) { + if (s == snode) { + return true; + } + } + } + return false; + } + + void visit(ElementShuffleStmt *stmt) override { + TI_NOT_IMPLEMENTED + } + + void visit(AssertStmt *stmt) override { + // do nothing + } + + void visit(RangeAssumptionStmt *stmt) override { + // do nothing + } + + void visit(LinearizeStmt *stmt) override { + // do nothing + } + + void visit(BitExtractStmt *stmt) override { + // do nothing + } + + void visit(IntegerOffsetStmt *stmt) override { + // do nothing + } + + void visit(RandStmt *stmt) override { + TI_ERROR("RandStmt not supported in AutoDiff for now."); + } +}; + +// Generate the adjoint version of an independent block +class MakeAdjoint : public ADTransform { + public: + using ADTransform::visit; Block *current_block; Block *alloca_block; // Backup the forward pass (the forward pass might be modified during the @@ -593,17 +693,12 @@ class MakeAdjoint : public IRVisitor { } } - Stmt *insert_back(std::unique_ptr &&stmt) { + Stmt *insert_grad_stmt(std::unique_ptr &&stmt) override { auto ptr = stmt.get(); current_block->insert(std::move(stmt), -1); return ptr; } - template - Stmt *insert(Args &&...args) { - return insert_back(Stmt::make(args...)); - } - // Accumulate [value] to 
the adjoint of [primal] void accumulate(Stmt *primal, Stmt *value) { auto alloca_ = adjoint(primal); @@ -675,22 +770,6 @@ class MakeAdjoint : public IRVisitor { return adjoint_stmt[stmt]; } - void visit(AllocaStmt *alloca) override { - // do nothing. - } - - void visit(AdStackAllocaStmt *alloca) override { - // do nothing. - } - - void visit(ArgLoadStmt *stmt) override { - // do nothing. - } - - void visit(LoopIndexStmt *stmt) override { - // do nothing. - } - void visit(UnaryOpStmt *stmt) override { if (stmt->op_type == UnaryOpType::floor || stmt->op_type == UnaryOpType::ceil) { @@ -827,34 +906,14 @@ class MakeAdjoint : public IRVisitor { } current_block = old_current_block; } - insert_back(std::move(new_if)); - } - - void visit(PrintStmt *print_stmt) override { - // do nothing - } - - void visit(ConstStmt *const_stmt) override { - // do nothing - } - - void visit(WhileControlStmt *stmt) override { - TI_NOT_IMPLEMENTED - } - - void visit(ContinueStmt *stmt) override { - TI_NOT_IMPLEMENTED; - } - - void visit(WhileStmt *stmt) override { - TI_NOT_IMPLEMENTED + insert_grad_stmt(std::move(new_if)); } void visit(RangeForStmt *for_stmt) override { auto new_for = for_stmt->clone(); auto new_for_ptr = new_for->as(); new_for_ptr->reversed = !new_for_ptr->reversed; - insert_back(std::move(new_for)); + insert_grad_stmt(std::move(new_for)); const int len = new_for_ptr->body->size(); for (int i = 0; i < len; i++) { @@ -889,10 +948,6 @@ class MakeAdjoint : public IRVisitor { for_stmt->body->accept(this); } - void visit(GlobalPtrStmt *stmt) override { - // do nothing - } - // Equivalent to AdStackLoadTopStmt when no stack is needed void visit(LocalLoadStmt *stmt) override { // TI_ASSERT(!needs_grad(stmt->ret_type)); @@ -999,34 +1054,6 @@ class MakeAdjoint : public IRVisitor { } stmt->parent->erase(stmt); } - - void visit(ElementShuffleStmt *stmt) override { - TI_NOT_IMPLEMENTED - } - - void visit(AssertStmt *stmt) override { - // do nothing - } - - void visit(RangeAssumptionStmt 
*stmt) override { - // do nothing - } - - void visit(LinearizeStmt *stmt) override { - // do nothing - } - - void visit(BitExtractStmt *stmt) override { - // do nothing - } - - void visit(IntegerOffsetStmt *stmt) override { - // do nothing - } - - void visit(RandStmt *stmt) override { - TI_ERROR("RandStmt not supported in AutoDiff for now."); - } }; class BackupSSA : public BasicStmtVisitor { From fb7de6dc670f93f95c644934063cf46aa6f4a671 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Mon, 6 Jun 2022 09:36:59 +0800 Subject: [PATCH 150/176] [llvm] [refactor] Use LLVM native atomic ops if possible (#5091) * [llvm] [refactor] Use LLVM native atomic ops if possible * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/codegen_llvm.cpp | 44 +++++++++++++++------------------ taichi/codegen/codegen_llvm.h | 2 +- 2 files changed, 21 insertions(+), 25 deletions(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 306b0c863b29a..5dd24f975d985 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1231,11 +1231,16 @@ llvm::Value *CodeGenLLVM::integral_type_atomic(AtomicOpStmt *stmt) { if (!is_integral(stmt->val->ret_type)) { return nullptr; } + std::unordered_map bin_op; bin_op[AtomicOpType::add] = llvm::AtomicRMWInst::BinOp::Add; - bin_op[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::Min; - bin_op[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::Max; - + if (is_signed(stmt->val->ret_type)) { + bin_op[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::Min; + bin_op[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::Max; + } else { + bin_op[AtomicOpType::min] = llvm::AtomicRMWInst::BinOp::UMin; + bin_op[AtomicOpType::max] = llvm::AtomicRMWInst::BinOp::UMax; + } bin_op[AtomicOpType::bit_and] = llvm::AtomicRMWInst::BinOp::And; bin_op[AtomicOpType::bit_or] = 
llvm::AtomicRMWInst::BinOp::Or; bin_op[AtomicOpType::bit_xor] = llvm::AtomicRMWInst::BinOp::Xor; @@ -1280,12 +1285,14 @@ llvm::Value *CodeGenLLVM::atomic_op_using_cas( return old_val; } -llvm::Value *CodeGenLLVM::real_or_unsigned_type_atomic(AtomicOpStmt *stmt) { - if (!stmt->val->ret_type->is()) { +llvm::Value *CodeGenLLVM::real_type_atomic(AtomicOpStmt *stmt) { + if (!is_real(stmt->val->ret_type)) { return nullptr; } + + PrimitiveTypeID prim_type = stmt->val->ret_type->cast()->type; AtomicOpType op = stmt->op_type; - if (stmt->val->ret_type->is_primitive(PrimitiveTypeID::f16)) { + if (prim_type == PrimitiveTypeID::f16) { switch (op) { case AtomicOpType::add: return atomic_op_using_cas( @@ -1304,32 +1311,21 @@ llvm::Value *CodeGenLLVM::real_or_unsigned_type_atomic(AtomicOpStmt *stmt) { } } - PrimitiveTypeID prim_type = stmt->val->ret_type->cast()->type; + if (op == AtomicOpType::add) { + return builder->CreateAtomicRMW( + llvm::AtomicRMWInst::FAdd, llvm_val[stmt->dest], llvm_val[stmt->val], + llvm::AtomicOrdering::SequentiallyConsistent); + } std::unordered_map> atomics; - - atomics[PrimitiveTypeID::f32][AtomicOpType::add] = "atomic_add_f32"; - atomics[PrimitiveTypeID::f64][AtomicOpType::add] = "atomic_add_f64"; atomics[PrimitiveTypeID::f32][AtomicOpType::min] = "atomic_min_f32"; atomics[PrimitiveTypeID::f64][AtomicOpType::min] = "atomic_min_f64"; atomics[PrimitiveTypeID::f32][AtomicOpType::max] = "atomic_max_f32"; atomics[PrimitiveTypeID::f64][AtomicOpType::max] = "atomic_max_f64"; - atomics[PrimitiveTypeID::u32][AtomicOpType::min] = "atomic_min_u32"; - atomics[PrimitiveTypeID::u64][AtomicOpType::min] = "atomic_min_u64"; - atomics[PrimitiveTypeID::u32][AtomicOpType::max] = "atomic_max_u32"; - atomics[PrimitiveTypeID::u64][AtomicOpType::max] = "atomic_max_u64"; - - if (atomics.find(prim_type) == atomics.end()) { - return nullptr; - } - if (is_integral(stmt->val->ret_type) && - atomics.at(prim_type).find(op) == atomics.at(prim_type).end()) { - return nullptr; - } + 
TI_ASSERT(atomics.find(prim_type) != atomics.end()); TI_ASSERT(atomics.at(prim_type).find(op) != atomics.at(prim_type).end()); - return create_call(atomics.at(prim_type).at(op), {llvm_val[stmt->dest], llvm_val[stmt->val]}); } @@ -1347,7 +1343,7 @@ void CodeGenLLVM::visit(AtomicOpStmt *stmt) { old_value = result; } else if (llvm::Value *result = custom_type_atomic(stmt)) { old_value = result; - } else if (llvm::Value *result = real_or_unsigned_type_atomic(stmt)) { + } else if (llvm::Value *result = real_type_atomic(stmt)) { old_value = result; } else if (llvm::Value *result = integral_type_atomic(stmt)) { old_value = result; diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 6bc5c15c26229..9252e78a0d769 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -241,7 +241,7 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { llvm::Value *val, std::function op); - virtual llvm::Value *real_or_unsigned_type_atomic(AtomicOpStmt *stmt); + virtual llvm::Value *real_type_atomic(AtomicOpStmt *stmt); void visit(AtomicOpStmt *stmt) override; From 45870ecf19a188206edc270d2349d032090584f7 Mon Sep 17 00:00:00 2001 From: Ailing Date: Mon, 6 Jun 2022 12:10:24 +0800 Subject: [PATCH 151/176] [bug] Minor fix for ndarray element_shape in graph mode (#5093) Now that ndarray's element_shape is separated from shape, this hack can be removed. 
--- taichi/aot/graph_data.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/taichi/aot/graph_data.cpp b/taichi/aot/graph_data.cpp index 4f642ba6e9fc3..4731b968b10fe 100644 --- a/taichi/aot/graph_data.cpp +++ b/taichi/aot/graph_data.cpp @@ -26,10 +26,7 @@ void CompiledGraph::run( Ndarray *arr = reinterpret_cast(ival.val); TI_ERROR_IF(ival.tag != aot::ArgKind::kNdarray, "Required a ndarray for argument {}", symbolic_arg.name); - auto ndarray_elem_shape = std::vector( - arr->shape.end() - symbolic_arg.element_shape.size(), - arr->shape.end()); - TI_ERROR_IF(ndarray_elem_shape != symbolic_arg.element_shape, + TI_ERROR_IF(arr->element_shape != symbolic_arg.element_shape, "Mismatched shape information for argument {}", symbolic_arg.name); set_runtime_ctx_ndarray(&ctx, i, arr); From e1ae06ee1c02a01e7ea6cf56ef073570954e1e48 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Mon, 6 Jun 2022 13:57:27 +0800 Subject: [PATCH 152/176] Use pre-calculated runtime size array for gfx runtime. (#5094) --- taichi/runtime/gfx/runtime.cpp | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/taichi/runtime/gfx/runtime.cpp b/taichi/runtime/gfx/runtime.cpp index d47e19c71ce46..770a4619a138b 100644 --- a/taichi/runtime/gfx/runtime.cpp +++ b/taichi/runtime/gfx/runtime.cpp @@ -444,31 +444,11 @@ void GfxRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) { any_arrays[i] = kDeviceNullAllocation; } } else { - // Compute ext arr sizes - size_t size = arg.stride; - bool has_zero_axis = false; - - for (int ax = 0; ax < 8; ax++) { - // FIXME: how and when do we determine the size of ext arrs? - size_t axis_size = host_ctx->extra_args[i][ax]; - if (axis_size) { - if (has_zero_axis) { - // e.g. 
shape [1, 0, 1] - size = 0; - } else { - size *= host_ctx->extra_args[i][ax]; - } - } else { - has_zero_axis = true; - } - } - - ext_array_size[i] = size; - + ext_array_size[i] = host_ctx->array_runtime_sizes[i]; // Alloc ext arr - if (size) { + if (ext_array_size[i]) { DeviceAllocation extarr_buf = device_->allocate_memory( - {size, /*host_write=*/true, /*host_read=*/true, + {ext_array_size[i], /*host_write=*/true, /*host_read=*/true, /*export_sharing=*/false, AllocUsage::Storage}); any_arrays[i] = extarr_buf; } else { From 8ab9b9fd04fdcc5ce0859bdb8967286e7fd907ff Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Mon, 6 Jun 2022 16:35:50 +0800 Subject: [PATCH 153/176] [Doc] Improve ODOP doc structure (#5089) --- docs/lang/articles/advanced/odop.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/lang/articles/advanced/odop.md b/docs/lang/articles/advanced/odop.md index 0ed023630e2cd..2f55163c36579 100644 --- a/docs/lang/articles/advanced/odop.md +++ b/docs/lang/articles/advanced/odop.md @@ -18,6 +18,8 @@ programming** (ODOP). ## Data-oriented classes +### Introduction + If you need to define a **Taichi kernel** as a Python class member function, please decorate the class with a `@ti.data_oriented` decorator. You can then define `ti.kernel`s and `ti.func`s in your *data-oriented* Python class. :::note @@ -110,8 +112,6 @@ print(a.y) # [ 5. 13. 21. 29.] ``` -## Integrating features from the Python classes - ### Inheritance of data-oriented classes The *data-oriented* property will be automatically carried beyond the Python class inheriting. This means the **Taichi Kernel** could be called while any of the ancestor classes are decorated by the `@ti.data_oriented` decorator. 
@@ -167,7 +167,7 @@ c = BaseClass() # The two lines above will trigger a kernel define error, since class c is not decorated by @ti.data_oriented ``` -### Python-built-in-decorators +### Python built-in decorators Common decorators that are pre-built in Python, `@staticmethod`[^1] and `@classmethod`[^2], could decorate to a **Taichi kernel** in *data-oriented* classes. @@ -345,7 +345,7 @@ get_area() # 201.062... ### Notes on struct classes - Inheritance of struct classes is not implemented. -- While functions attached to a struct with the `@ti.struct_class` decorator is convenient and encouraged, it is actually possible to associated a function to structs with the older method of defining structs. As mentioned above, the two methods for defining a struct type are identical in their output. To do this, use the `__struct_method` argument with the `ti.types.struct` call: +- While functions attached to a struct with the `@ti.struct_class` decorator is convenient and encouraged, it is actually possible to associate a function to structs with the older method of defining structs. As mentioned above, the two methods for defining a struct type are identical in their output. 
To do this, use the `__struct_methods` argument with the `ti.types.struct` call: ```python @ti.func From 8e8a792bfff5db149e5e872723222f54c964e9f9 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 6 Jun 2022 17:00:40 +0800 Subject: [PATCH 154/176] [llvm] [aot] CUDA-AOT PR #2: Implemented AOTModuleLoader & AOTModuleBuilder for LLVM-CUDA backend (#5087) * [llvm] [aot] Add LLVM-CPU AOT tests * Refactored AOT test framework * Fixed minor issue * Enabled LLVM CPU-AOT for arm64 architecture * Added aot unit tests programming guide * [llvm] [aot] CUDA-AOT PR #2: Implemented AOT Module Loader for LLVM-CUDA backend * Fixed typo * Fixed minor issue * Refactored AOT test framework * [llvm] [aot] Add LLVM-CUDA AOT tests * Added cuda device availability check --- .../backends/cpu/aot_module_builder_impl.cpp | 29 +------- taichi/backends/cpu/aot_module_builder_impl.h | 12 +--- .../backends/cpu/aot_module_loader_impl.cpp | 8 +-- .../backends/cuda/aot_module_builder_impl.cpp | 19 ++++++ .../backends/cuda/aot_module_builder_impl.h | 18 +++++ .../backends/cuda/aot_module_loader_impl.cpp | 66 +++++++++++++++++++ taichi/backends/cuda/aot_module_loader_impl.h | 21 ++++++ taichi/backends/cuda/codegen_cuda.cpp | 8 +++ taichi/backends/cuda/codegen_cuda.h | 6 ++ taichi/llvm/llvm_aot_module_builder.cpp | 38 +++++++++++ taichi/llvm/llvm_aot_module_builder.h | 24 +++++++ taichi/llvm/llvm_aot_module_loader.h | 4 ++ taichi/llvm/llvm_program.cpp | 8 +++ tests/cpp/backends/llvm/cuda_aot.py | 25 +++++++ tests/cpp/backends/llvm/llvm_program_test.cpp | 49 ++++++++++++++ tests/test_utils.py | 2 + 16 files changed, 296 insertions(+), 41 deletions(-) create mode 100644 taichi/backends/cuda/aot_module_builder_impl.cpp create mode 100644 taichi/backends/cuda/aot_module_builder_impl.h create mode 100644 taichi/backends/cuda/aot_module_loader_impl.cpp create mode 100644 taichi/backends/cuda/aot_module_loader_impl.h create mode 100644 taichi/llvm/llvm_aot_module_builder.cpp create mode 100644 
taichi/llvm/llvm_aot_module_builder.h create mode 100644 tests/cpp/backends/llvm/cuda_aot.py diff --git a/taichi/backends/cpu/aot_module_builder_impl.cpp b/taichi/backends/cpu/aot_module_builder_impl.cpp index a65b4c01db57e..11d71e0e2b788 100644 --- a/taichi/backends/cpu/aot_module_builder_impl.cpp +++ b/taichi/backends/cpu/aot_module_builder_impl.cpp @@ -3,40 +3,15 @@ #include #include "taichi/backends/cpu/codegen_cpu.h" - #include "taichi/llvm/launch_arg_info.h" namespace taichi { namespace lang { namespace cpu { -void AotModuleBuilderImpl::dump(const std::string &output_dir, - const std::string &filename) const { - LlvmOfflineCacheFileWriter writer; - writer.set_data(std::move(cache_)); - writer.dump(output_dir); -} - -void AotModuleBuilderImpl::add_per_backend(const std::string &identifier, - Kernel *kernel) { +CodeGenLLVM::CompiledData AotModuleBuilderImpl::compile_kernel(Kernel *kernel) { auto cgen = CodeGenCPU::make_codegen_llvm(kernel, /*ir=*/nullptr); - auto compiled = cgen->run_compilation(); - LlvmOfflineCache::KernelCacheData kcache; - kcache.kernel_key = identifier; - kcache.module = compiled.llvm_module.get(); - kcache.owned_module = std::move(compiled.llvm_module); - const auto &tasks = compiled.offloaded_tasks; - kcache.args = infer_launch_args(kernel); - kcache.offloaded_task_list.resize(tasks.size()); - std::transform(tasks.begin(), tasks.end(), kcache.offloaded_task_list.begin(), - [](const auto &t) -> LlvmOfflineCache::OffloadedTaskCacheData { - LlvmOfflineCache::OffloadedTaskCacheData res; - res.name = t.name; - res.block_dim = t.block_dim; - res.grid_dim = t.grid_dim; - return res; - }); - cache_.kernels[identifier] = std::move(kcache); + return cgen->run_compilation(); } } // namespace cpu diff --git a/taichi/backends/cpu/aot_module_builder_impl.h b/taichi/backends/cpu/aot_module_builder_impl.h index b800398f49f79..1d81fa41d7c2e 100644 --- a/taichi/backends/cpu/aot_module_builder_impl.h +++ b/taichi/backends/cpu/aot_module_builder_impl.h @@ 
-2,21 +2,15 @@ #include "taichi/aot/module_builder.h" #include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/llvm/llvm_aot_module_builder.h" namespace taichi { namespace lang { namespace cpu { -class AotModuleBuilderImpl : public AotModuleBuilder { - public: - void dump(const std::string &output_dir, - const std::string &filename) const override; - - protected: - void add_per_backend(const std::string &identifier, Kernel *kernel) override; - +class AotModuleBuilderImpl : public LlvmAotModuleBuilder { private: - mutable LlvmOfflineCache cache_; + CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) override; }; } // namespace cpu diff --git a/taichi/backends/cpu/aot_module_loader_impl.cpp b/taichi/backends/cpu/aot_module_loader_impl.cpp index fcd1761e20e99..e2ff3b2ecf0f6 100644 --- a/taichi/backends/cpu/aot_module_loader_impl.cpp +++ b/taichi/backends/cpu/aot_module_loader_impl.cpp @@ -15,15 +15,13 @@ class AotModuleImpl : public LlvmAotModule { : LlvmAotModule(params.module_path, params.program) { } - Arch arch() const override { - return Arch::x64; - } - private: FunctionType convert_module_to_function( const std::string &name, LlvmOfflineCache::KernelCacheData &&loaded) override { - auto *tlctx = program_->get_llvm_context(program_->config->arch); + Arch arch = program_->config->arch; + TI_ASSERT(arch == Arch::x64 || arch == Arch::arm64); + auto *tlctx = program_->get_llvm_context(arch); const auto &tasks = loaded.offloaded_task_list; std::vector offloaded_tasks; diff --git a/taichi/backends/cuda/aot_module_builder_impl.cpp b/taichi/backends/cuda/aot_module_builder_impl.cpp new file mode 100644 index 0000000000000..bf9833d46bda1 --- /dev/null +++ b/taichi/backends/cuda/aot_module_builder_impl.cpp @@ -0,0 +1,19 @@ +#include "taichi/backends/cuda/aot_module_builder_impl.h" + +#include + +#include "taichi/backends/cuda/codegen_cuda.h" +#include "taichi/llvm/launch_arg_info.h" + +namespace taichi { +namespace lang { +namespace cuda { + 
+CodeGenLLVM::CompiledData AotModuleBuilderImpl::compile_kernel(Kernel *kernel) { + auto cgen = CodeGenCUDA::make_codegen_llvm(kernel, /*ir=*/nullptr); + return cgen->run_compilation(); +} + +} // namespace cuda +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cuda/aot_module_builder_impl.h b/taichi/backends/cuda/aot_module_builder_impl.h new file mode 100644 index 0000000000000..f0fdc74e14f9c --- /dev/null +++ b/taichi/backends/cuda/aot_module_builder_impl.h @@ -0,0 +1,18 @@ +#pragma once + +#include "taichi/aot/module_builder.h" +#include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/llvm/llvm_aot_module_builder.h" + +namespace taichi { +namespace lang { +namespace cuda { + +class AotModuleBuilderImpl : public LlvmAotModuleBuilder { + private: + CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) override; +}; + +} // namespace cuda +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cuda/aot_module_loader_impl.cpp b/taichi/backends/cuda/aot_module_loader_impl.cpp new file mode 100644 index 0000000000000..b08efdc9632da --- /dev/null +++ b/taichi/backends/cuda/aot_module_loader_impl.cpp @@ -0,0 +1,66 @@ +#include "taichi/backends/cuda/aot_module_loader_impl.h" +#include "taichi/llvm/llvm_aot_module_loader.h" + +#include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/llvm/llvm_program.h" +#include "taichi/backends/cuda/codegen_cuda.h" + +namespace taichi { +namespace lang { +namespace { + +class AotModuleImpl : public LlvmAotModule { + public: + explicit AotModuleImpl(const cuda::AotModuleParams ¶ms) + : LlvmAotModule(params.module_path, params.program) { + } + + private: + FunctionType convert_module_to_function( + const std::string &name, + LlvmOfflineCache::KernelCacheData &&loaded) override { + Arch arch = program_->config->arch; + TI_ASSERT(arch == Arch::cuda); + auto *tlctx = program_->get_llvm_context(arch); + + const auto &tasks = loaded.offloaded_task_list; + std::vector offloaded_tasks; + 
offloaded_tasks.reserve(tasks.size()); + for (const auto &t : tasks) { + OffloadedTask ot{/*codegen=*/nullptr}; + ot.name = t.name; + ot.block_dim = t.block_dim; + ot.grid_dim = t.grid_dim; + offloaded_tasks.push_back(std::move(ot)); + } + + CUDAModuleToFunctionConverter converter{tlctx, program_}; + return converter.convert(name, loaded.args, std::move(loaded.owned_module), + std::move(offloaded_tasks)); + } + + std::unique_ptr make_new_kernel_template( + const std::string &name) override { + TI_NOT_IMPLEMENTED; + return nullptr; + } + + std::unique_ptr make_new_field(const std::string &name) override { + TI_NOT_IMPLEMENTED; + return nullptr; + } +}; + +} // namespace + +namespace cuda { + +std::unique_ptr make_aot_module(std::any mod_params) { + auto mod = std::make_unique( + std::any_cast(mod_params)); + return mod; +} + +} // namespace cuda +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cuda/aot_module_loader_impl.h b/taichi/backends/cuda/aot_module_loader_impl.h new file mode 100644 index 0000000000000..03bd0a07be7e7 --- /dev/null +++ b/taichi/backends/cuda/aot_module_loader_impl.h @@ -0,0 +1,21 @@ +#pragma once + +#include "taichi/aot/module_loader.h" + +namespace taichi { +namespace lang { + +class LlvmProgramImpl; + +namespace cuda { + +struct TI_DLL_EXPORT AotModuleParams { + std::string module_path; + LlvmProgramImpl *program{nullptr}; +}; + +TI_DLL_EXPORT std::unique_ptr make_aot_module(std::any mod_params); + +} // namespace cuda +} // namespace lang +} // namespace taichi diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp index 8715a6786172b..61cbffa52928d 100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -711,6 +711,14 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { } }; +#ifdef TI_WITH_LLVM +// static +std::unique_ptr CodeGenCUDA::make_codegen_llvm(Kernel *kernel, + IRNode *ir) { + return std::make_unique(kernel, ir); +} +#endif // 
TI_WITH_LLVM + static void set_arg_external_array(RuntimeContext *ctx, const std::string &kernel_name, int arg_id, diff --git a/taichi/backends/cuda/codegen_cuda.h b/taichi/backends/cuda/codegen_cuda.h index 3285e0f7564b0..7b1e873c7805d 100644 --- a/taichi/backends/cuda/codegen_cuda.h +++ b/taichi/backends/cuda/codegen_cuda.h @@ -13,6 +13,12 @@ class CodeGenCUDA : public KernelCodeGen { : KernelCodeGen(kernel, ir) { } +// TODO: Stop defining this macro guards in the headers +#ifdef TI_WITH_LLVM + static std::unique_ptr make_codegen_llvm(Kernel *kernel, + IRNode *ir); +#endif // TI_WITH_LLVM + FunctionType codegen() override; }; diff --git a/taichi/llvm/llvm_aot_module_builder.cpp b/taichi/llvm/llvm_aot_module_builder.cpp new file mode 100644 index 0000000000000..d23ee5c47c564 --- /dev/null +++ b/taichi/llvm/llvm_aot_module_builder.cpp @@ -0,0 +1,38 @@ +#include "taichi/llvm/llvm_aot_module_builder.h" + +#include +#include "taichi/llvm/launch_arg_info.h" + +namespace taichi { +namespace lang { + +void LlvmAotModuleBuilder::dump(const std::string &output_dir, + const std::string &filename) const { + LlvmOfflineCacheFileWriter writer; + writer.set_data(std::move(cache_)); + writer.dump(output_dir); +} + +void LlvmAotModuleBuilder::add_per_backend(const std::string &identifier, + Kernel *kernel) { + auto compiled = compile_kernel(kernel); + LlvmOfflineCache::KernelCacheData kcache; + kcache.kernel_key = identifier; + kcache.module = compiled.llvm_module.get(); + kcache.owned_module = std::move(compiled.llvm_module); + const auto &tasks = compiled.offloaded_tasks; + kcache.args = infer_launch_args(kernel); + kcache.offloaded_task_list.resize(tasks.size()); + std::transform(tasks.begin(), tasks.end(), kcache.offloaded_task_list.begin(), + [](const auto &t) -> LlvmOfflineCache::OffloadedTaskCacheData { + LlvmOfflineCache::OffloadedTaskCacheData res; + res.name = t.name; + res.block_dim = t.block_dim; + res.grid_dim = t.grid_dim; + return res; + }); + 
cache_.kernels[identifier] = std::move(kcache); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_aot_module_builder.h b/taichi/llvm/llvm_aot_module_builder.h new file mode 100644 index 0000000000000..b88133a761783 --- /dev/null +++ b/taichi/llvm/llvm_aot_module_builder.h @@ -0,0 +1,24 @@ +#pragma once + +#include "taichi/aot/module_builder.h" +#include "taichi/llvm/llvm_offline_cache.h" +#include "taichi/codegen/codegen_llvm.h" + +namespace taichi { +namespace lang { + +class LlvmAotModuleBuilder : public AotModuleBuilder { + public: + void dump(const std::string &output_dir, + const std::string &filename) const override; + + protected: + void add_per_backend(const std::string &identifier, Kernel *kernel) override; + virtual CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) = 0; + + private: + mutable LlvmOfflineCache cache_; +}; + +} // namespace lang +} // namespace taichi diff --git a/taichi/llvm/llvm_aot_module_loader.h b/taichi/llvm/llvm_aot_module_loader.h index 7056eba37d212..b5e8f527cea67 100644 --- a/taichi/llvm/llvm_aot_module_loader.h +++ b/taichi/llvm/llvm_aot_module_loader.h @@ -15,6 +15,10 @@ class LlvmAotModule : public aot::Module { TI_ASSERT(program_ != nullptr); } + Arch arch() const override { + return program_->config->arch; + } + uint64_t version() const override { return 0; } diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 35772e33489d8..65bc1a75e12e1 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -17,6 +17,7 @@ #include "taichi/backends/cuda/cuda_device.h" #if defined(TI_WITH_CUDA) +#include "taichi/backends/cuda/aot_module_builder_impl.h" #include "taichi/backends/cuda/cuda_driver.h" #include "taichi/backends/cuda/codegen_cuda.h" #include "taichi/backends/cuda/cuda_context.h" @@ -341,6 +342,13 @@ std::unique_ptr LlvmProgramImpl::make_aot_module_builder() { if (config->arch == Arch::x64 || config->arch == Arch::arm64) { return 
std::make_unique(); } + +#if defined(TI_WITH_CUDA) + if (config->arch == Arch::cuda) { + return std::make_unique(); + } +#endif + TI_NOT_IMPLEMENTED; return nullptr; } diff --git a/tests/cpp/backends/llvm/cuda_aot.py b/tests/cpp/backends/llvm/cuda_aot.py new file mode 100644 index 0000000000000..a347cb4899558 --- /dev/null +++ b/tests/cpp/backends/llvm/cuda_aot.py @@ -0,0 +1,25 @@ +import os + +import taichi as ti + + +def compile_aot(): + ti.init(arch=ti.cuda) + + @ti.kernel + def run(base: int, arr: ti.types.ndarray()): + for i in arr: + arr[i] = base + i + + arr = ti.ndarray(int, shape=16) + run(42, arr) + + assert "TAICHI_AOT_FOLDER_PATH" in os.environ.keys() + dir_name = str(os.environ["TAICHI_AOT_FOLDER_PATH"]) + + m = ti.aot.Module(ti.cuda) + m.add_kernel(run, template_args={'arr': arr}) + m.save(dir_name, 'cuda-aot') + + +compile_aot() diff --git a/tests/cpp/backends/llvm/llvm_program_test.cpp b/tests/cpp/backends/llvm/llvm_program_test.cpp index 44514216f39b0..dca0aa2ea1631 100644 --- a/tests/cpp/backends/llvm/llvm_program_test.cpp +++ b/tests/cpp/backends/llvm/llvm_program_test.cpp @@ -4,6 +4,9 @@ #include "taichi/llvm/llvm_program.h" #include "taichi/system/memory_pool.h" #include "taichi/backends/cpu/aot_module_loader_impl.h" +#include "taichi/backends/cuda/aot_module_loader_impl.h" +#include "taichi/backends/cuda/cuda_driver.h" +#include "taichi/platform/cuda/detect_cuda.h" #define TI_RUNTIME_HOST #include "taichi/program/context.h" @@ -52,5 +55,51 @@ TEST(LlvmProgramTest, FullPipeline) { } } +TEST(LlvmProgramTest, FullPipelineCUDA) { + if (is_cuda_api_available()) { + CompileConfig cfg; + cfg.arch = Arch::cuda; + cfg.kernel_profiler = false; + constexpr KernelProfilerBase *kNoProfiler = nullptr; + LlvmProgramImpl prog{cfg, kNoProfiler}; + + // Must have handled all the arch fallback logic by this point. 
+ prog.initialize_host(); + uint64 *result_buffer{nullptr}; + prog.materialize_runtime(nullptr, kNoProfiler, &result_buffer); + + constexpr int kArrLen = 32; + constexpr int kArrBytes = kArrLen * sizeof(int32_t); + auto arr_devalloc = prog.allocate_memory_ndarray(kArrBytes, result_buffer); + + cuda::AotModuleParams aot_params; + const auto folder_dir = getenv("TAICHI_AOT_FOLDER_PATH"); + + std::stringstream aot_mod_ss; + aot_mod_ss << folder_dir; + aot_params.module_path = aot_mod_ss.str(); + aot_params.program = &prog; + auto mod = cuda::make_aot_module(aot_params); + auto *k_run = mod->get_kernel("run"); + RuntimeContext ctx; + ctx.runtime = prog.get_llvm_runtime(); + ctx.set_arg(0, /*v=*/0); + ctx.set_arg_devalloc(/*arg_id=*/1, arr_devalloc, /*shape=*/{kArrLen}); + ctx.set_array_runtime_size(/*arg_id=*/1, kArrBytes); + k_run->launch(&ctx); + + auto *data = reinterpret_cast( + prog.get_ndarray_alloc_info_ptr(arr_devalloc)); + + std::vector cpu_data(kArrLen); + CUDADriver::get_instance().memcpy_device_to_host( + (void *)cpu_data.data(), (void *)data, kArrLen * sizeof(int32_t)); + + for (int i = 0; i < kArrLen; ++i) { + EXPECT_EQ(cpu_data[i], i); + } + } +} + } // namespace lang } // namespace taichi diff --git a/tests/test_utils.py b/tests/test_utils.py index 25b4b3b6dbd7a..7974ce0709f6b 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -15,6 +15,8 @@ __aot_test_cases = { "LlvmProgramTest.FullPipeline": os.path.join('cpp', 'backends', 'llvm', 'cpu_aot.py'), + "LlvmProgramTest.FullPipelineCUDA": + os.path.join('cpp', 'backends', 'llvm', 'cuda_aot.py'), } From f0d7d6925a7963ba33ad68485d44c470cc79d7f3 Mon Sep 17 00:00:00 2001 From: Mingrui Zhang <33411325+erizmr@users.noreply.github.com> Date: Mon, 6 Jun 2022 17:50:40 +0800 Subject: [PATCH 155/176] clean hidden override functions (#5097) --- taichi/transforms/auto_diff.cpp | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/taichi/transforms/auto_diff.cpp 
b/taichi/transforms/auto_diff.cpp index 798d00509a0ee..65ade4cd7289f 100644 --- a/taichi/transforms/auto_diff.cpp +++ b/taichi/transforms/auto_diff.cpp @@ -982,27 +982,6 @@ class MakeAdjoint : public ADTransform { insert(stmt->stack); } - Stmt *load(Stmt *alloc) { - TI_ASSERT(alloc != nullptr); - if (alloc->is()) { - return insert(LocalAddress(alloc, 0)); - } else { - // non alloca - return alloc; - } - } - - bool gradients_stopped(GlobalLoadStmt *stmt, SNode *snode) { - for (auto block = stmt->parent; block; block = block->parent_block()) { - for (auto s : block->stop_gradients) { - if (s == snode) { - return true; - } - } - } - return false; - } - void visit(GlobalLoadStmt *stmt) override { // issue global store to adjoint GlobalPtrStmt *src = stmt->src->as(); From 3d9539636ab24b2cb84f3ed5329637680cd0c998 Mon Sep 17 00:00:00 2001 From: Ailing Date: Mon, 6 Jun 2022 18:08:06 +0800 Subject: [PATCH 156/176] [refactor] Update Ndarray constructor used in AOT runtime. (#5095) This constructor is mainly used to construct an Ndarray out of an existing device allocation. This PR updates the behavior of this constructor to seprate element_shape out of shape. 
--- taichi/program/ndarray.cpp | 26 +++++++++++- taichi/program/ndarray.h | 9 ++++- tests/cpp/aot/runtime_test.cpp | 72 ++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 4 deletions(-) create mode 100644 tests/cpp/aot/runtime_test.cpp diff --git a/taichi/program/ndarray.cpp b/taichi/program/ndarray.cpp index a158c56526f88..fc2648b2c3391 100644 --- a/taichi/program/ndarray.cpp +++ b/taichi/program/ndarray.cpp @@ -48,15 +48,37 @@ Ndarray::Ndarray(Program *prog, Ndarray::Ndarray(DeviceAllocation &devalloc, const DataType type, - const std::vector &shape) + const std::vector &shape, + const std::vector &element_shape, + ExternalArrayLayout layout) : ndarray_alloc_(devalloc), dtype(type), + element_shape(element_shape), shape(shape), + layout(layout), nelement_(std::accumulate(std::begin(shape), std::end(shape), 1, std::multiplies<>())), - element_size_(data_type_size(dtype)) { + element_size_(data_type_size(dtype) * + std::accumulate(std::begin(element_shape), + std::end(element_shape), + 1, + std::multiplies<>())) { + // When element_shape is specfied but layout is not, default layout is AOS. + if (!element_shape.empty() && layout == ExternalArrayLayout::kNull) { + layout = ExternalArrayLayout::kAOS; + } + // Now that we have two shapes which may be concatenated differently + // depending on layout, total_shape_ comes handy. 
+ total_shape_ = shape; + if (layout == ExternalArrayLayout::kAOS) { + total_shape_.insert(total_shape_.end(), element_shape.begin(), + element_shape.end()); + } else if (layout == ExternalArrayLayout::kSOA) { + total_shape_.insert(total_shape_.begin(), element_shape.begin(), + element_shape.end()); + } } Ndarray::~Ndarray() { diff --git a/taichi/program/ndarray.h b/taichi/program/ndarray.h index 2ff2e70b34683..dae837153cfd6 100644 --- a/taichi/program/ndarray.h +++ b/taichi/program/ndarray.h @@ -26,12 +26,17 @@ class TI_DLL_EXPORT Ndarray { const std::vector &element_shape = {}, ExternalArrayLayout layout = ExternalArrayLayout::kNull); - /* Constructs a Ndarray from an existing DeviceAllocation + /* Constructs a Ndarray from an existing DeviceAllocation. * It doesn't handle the allocation and deallocation. + * You can see a Ndarray as a view or interpretation of DeviceAllocation + * with specified element_shape & dtype & layout. */ explicit Ndarray(DeviceAllocation &devalloc, const DataType type, - const std::vector &shape); + const std::vector &shape, + const std::vector &element_shape = {}, + ExternalArrayLayout layout = ExternalArrayLayout::kNull); + DeviceAllocation ndarray_alloc_{kDeviceNullAllocation}; DataType dtype; std::vector element_shape; diff --git a/tests/cpp/aot/runtime_test.cpp b/tests/cpp/aot/runtime_test.cpp new file mode 100644 index 0000000000000..2a8e6fae82dc2 --- /dev/null +++ b/tests/cpp/aot/runtime_test.cpp @@ -0,0 +1,72 @@ +#include "gtest/gtest.h" +#define TI_RUNTIME_HOST +#include "taichi/common/core.h" +#include "taichi/program/ndarray.h" +#include "taichi/program/context.h" +#include "taichi/system/memory_pool.h" +#include "taichi/runtime/gfx/runtime.h" +#ifdef TI_WITH_VULKAN +#include "taichi/backends/device.h" +#include "taichi/backends/vulkan/vulkan_device.h" +#include "taichi/backends/vulkan/vulkan_device_creator.h" +#include "taichi/backends/vulkan/vulkan_loader.h" +#include "taichi/backends/vulkan/vulkan_utils.h" +#endif + +using 
namespace taichi; +using namespace lang; + +#ifdef TI_WITH_VULKAN +TEST(RuntimeTest, ViewDevAllocAsNdarray) { + // Otherwise will segfault on macOS VM, + // where Vulkan is installed but no devices are present + if (!vulkan::is_vulkan_api_available()) { + return; + } + + // API based on proposal https://github.com/taichi-dev/taichi/issues/3642 + // Initialize Vulkan program + taichi::uint64 *result_buffer{nullptr}; + auto memory_pool = + std::make_unique(Arch::vulkan, nullptr); + result_buffer = (taichi::uint64 *)memory_pool->allocate( + sizeof(taichi::uint64) * taichi_result_buffer_entries, 8); + + // Create Taichi Device for computation + lang::vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = + taichi::lang::vulkan::VulkanEnvSettings::kApiVersion(); + auto embedded_device = + std::make_unique(evd_params); + taichi::lang::vulkan::VulkanDevice *device_ = + static_cast( + embedded_device->device()); + // Create Vulkan runtime + gfx::GfxRuntime::Params params; + params.host_result_buffer = result_buffer; + params.device = device_; + auto vulkan_runtime = + std::make_unique(std::move(params)); + + const int size = 40; + taichi::lang::Device::AllocParams alloc_params; + alloc_params.host_write = true; + alloc_params.size = size * sizeof(int); + alloc_params.usage = taichi::lang::AllocUsage::Storage; + DeviceAllocation devalloc_arr_ = device_->allocate_memory(alloc_params); + + std::vector element_shape = {4}; + auto arr1 = Ndarray(devalloc_arr_, PrimitiveType::i32, {10}, element_shape); + EXPECT_TRUE(arr1.element_shape == element_shape); + EXPECT_EQ(arr1.total_shape()[0], 10); + EXPECT_EQ(arr1.total_shape()[1], 4); + + auto arr2 = Ndarray(devalloc_arr_, PrimitiveType::i32, {10}, element_shape, + ExternalArrayLayout::kSOA); + EXPECT_TRUE(arr2.element_shape == element_shape); + EXPECT_EQ(arr2.total_shape()[0], 4); + EXPECT_EQ(arr2.total_shape()[1], 10); + + device_->dealloc_memory(devalloc_arr_); +} +#endif From 
fe250ac96337dc648b9fb72053c3d0b68e110a34 Mon Sep 17 00:00:00 2001 From: Haidong Lan Date: Tue, 7 Jun 2022 22:07:48 +0800 Subject: [PATCH 157/176] [refactor] Remove ndarray element shape from extra arg buffer (#5100) * Remove element shape from extra args. --- python/taichi/lang/kernel_impl.py | 17 ++++-- taichi/backends/cc/codegen_cc.cpp | 18 ++++++- taichi/backends/metal/codegen_metal.cpp | 48 +++++------------ taichi/backends/opengl/codegen_opengl.cpp | 64 +++++------------------ taichi/codegen/codegen_llvm.cpp | 34 +++++------- taichi/codegen/spirv/spirv_codegen.cpp | 34 ++++-------- taichi/program/kernel.cpp | 13 +---- taichi/transforms/lower_ast.cpp | 13 ++--- 8 files changed, 84 insertions(+), 157 deletions(-) diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index fab88a564e267..884298e9f99c7 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -656,13 +656,24 @@ def func__(*args): is_numpy = isinstance(v, np.ndarray) is_torch = isinstance(v, torch.Tensor) if has_torch else False + + # Element shapes are already spcialized in Taichi codegen. + # The shape information for element dims are no longer needed. + # Therefore we strip the element shapes from the shape vector, + # so that it only holds "real" array shapes. + is_soa = needed.layout == Layout.SOA + array_shape = v.shape + element_dim = needed.element_dim + if element_dim: + array_shape = v.shape[ + element_dim:] if is_soa else v.shape[:-element_dim] if is_numpy: tmp = np.ascontiguousarray(v) # Purpose: DO NOT GC |tmp|! 
tmps.append(tmp) launch_ctx.set_arg_external_array_with_shape( actual_argument_slot, int(tmp.ctypes.data), - tmp.nbytes, v.shape) + tmp.nbytes, array_shape) elif is_torch: is_ndarray = False tmp, torch_callbacks = self.get_torch_callbacks( @@ -670,7 +681,7 @@ def func__(*args): callbacks += torch_callbacks launch_ctx.set_arg_external_array_with_shape( actual_argument_slot, int(tmp.data_ptr()), - tmp.element_size() * tmp.nelement(), v.shape) + tmp.element_size() * tmp.nelement(), array_shape) else: # For now, paddle.fluid.core.Tensor._ptr() is only available on develop branch tmp, paddle_callbacks = self.get_paddle_callbacks( @@ -678,7 +689,7 @@ def func__(*args): callbacks += paddle_callbacks launch_ctx.set_arg_external_array_with_shape( actual_argument_slot, int(tmp._ptr()), - v.element_size() * v.size, v.shape) + v.element_size() * v.size, array_shape) elif isinstance(needed, MatrixType): if id(needed.dtype) in primitive_types.real_type_ids: diff --git a/taichi/backends/cc/codegen_cc.cpp b/taichi/backends/cc/codegen_cc.cpp index 6f78f2182e739..838f859cbbb92 100644 --- a/taichi/backends/cc/codegen_cc.cpp +++ b/taichi/backends/cc/codegen_cc.cpp @@ -156,9 +156,23 @@ class CCTransformer : public IRVisitor { std::string offset = "0"; const auto *argload = stmt->base_ptrs[0]->as(); const int arg_id = argload->arg_id; + const auto element_shape = stmt->element_shape; + const auto layout = stmt->element_dim < 0 ? ExternalArrayLayout::kAOS + : ExternalArrayLayout::kSOA; + const size_t element_shape_index_offset = + (layout == ExternalArrayLayout::kAOS) + ? 
stmt->indices.size() - element_shape.size() + : 0; + size_t size_var_index = 0; for (int i = 0; i < stmt->indices.size(); i++) { - auto stride = fmt::format("ti_ctx->earg[{} * {} + {}]", arg_id, - taichi_max_num_indices, i); + std::string stride; + if (i >= element_shape_index_offset && + i < element_shape_index_offset + element_shape.size()) { + stride = fmt::format("{}", element_shape[i - element_shape.size()]); + } else { + stride = fmt::format("ti_ctx->earg[{} * {} + {}]", arg_id, + taichi_max_num_indices, size_var_index++); + } offset = fmt::format("({} * {} + {})", offset, stride, stmt->indices[i]->raw_name()); } diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index 12157a74c94e4..5068a9239fb82 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ -472,50 +472,30 @@ class KernelCodegenImpl : public IRVisitor { const int num_indices = stmt->indices.size(); const auto &element_shape = stmt->element_shape; std::vector size_exprs; - enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; - const auto layout = stmt->element_dim <= 0 ? layout_AOS : layout_SOA; - - // Args buffer arrange dimensions from outer to inner - // AoS args buffer: array_shape|element_shape - // SoA args buffer: element_shape|array_shape - // - // ti.Matrix.ndarray(3, 2, ti.f32, (5, 4), layout=ti.Layout.AOS) - // args buffer: 5, 4, 3, 2 - // ti.Matrix.ndarray(3, 2, ti.f32, (5, 4), layout=ti.Layout.SOA) - // args buffer: 3, 2, 5, 4 + const auto layout = stmt->element_dim <= 0 ? 
ExternalArrayLayout::kAOS + : ExternalArrayLayout::kSOA; const int arr_shape_len = num_indices - element_shape.size(); - int index_i = 0; - const auto add_elem_shape_exprs = [&]() { - for (int es : element_shape) { - size_exprs.push_back(std::to_string(es)); - ++index_i; - } - }; - int arr_shape_offset = 0; - if (layout == layout_SOA) { - add_elem_shape_exprs(); - // When the layout is SOA, element shape comes before array shape, so - // we have to skip the element shapes first. - // TODO: Element shape is a compile-time known information, so extra - // args will always only need the array shape. - arr_shape_offset = element_shape.size(); - } + const size_t element_shape_index_offset = + (layout == ExternalArrayLayout::kAOS) ? arr_shape_len : 0; for (int i = 0; i < arr_shape_len; i++) { std::string var_name = fmt::format("{}_arr_dim{}_", stmt->raw_name(), i); emit("const int {} = {}.extra_arg({}, {});", var_name, kContextVarName, - arg_id, i + arr_shape_offset); + arg_id, i); size_exprs.push_back(std::move(var_name)); - ++index_i; - } - if (layout == layout_AOS) { - add_elem_shape_exprs(); } - TI_ASSERT(index_i == num_indices); + size_t size_var_index = 0; for (int i = 0; i < num_indices; i++) { - emit("{} *= {};", linear_index_name, size_exprs[i]); + if (i >= element_shape_index_offset && + i < element_shape_index_offset + element_shape.size()) { + emit("{} *= {};", linear_index_name, + element_shape[i - element_shape_index_offset]); + } else { + emit("{} *= {};", linear_index_name, size_exprs[size_var_index++]); + } emit("{} += {};", linear_index_name, stmt->indices[i]->raw_name()); } + TI_ASSERT(size_var_index == arr_shape_len); } emit("}}"); diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index b25ae7852a6ea..910bde1882d65 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -485,49 +485,13 @@ class KernelGen : public IRVisitor { const int num_indices = 
stmt->indices.size(); const auto &element_shape = stmt->element_shape; std::vector size_var_names; - std::vector element_shape_size_var_names; const auto layout = stmt->element_dim <= 0 ? ExternalArrayLayout::kAOS : ExternalArrayLayout::kSOA; - - if (element_shape.size() > 0) { - int elem_beg = 0; - int elem_end = 0; - if (layout == ExternalArrayLayout::kSOA) { - elem_beg = 0; - elem_end = element_shape.size(); - } else { - elem_beg = num_indices - element_shape.size(); - elem_end = num_indices; - } - for (int i = elem_beg; i < elem_end; i++) { - used.int32 = true; - std::string var_name = fmt::format("_s{}_{}{}", i, "arr", arg_id); - if (!loaded_args_.count(var_name)) { - emit("int {} = {};", var_name, element_shape[i - elem_beg]); - loaded_args_.insert(var_name); - } - element_shape_size_var_names.push_back(std::move(var_name)); - } - } - // Args buffer arrange dimensions from outer to inner - // AoS args buffer: array_shape|element_shape - // SoA args buffer: element_shape|array_shape - // - // ti.Matrix.ndarray(3, 2, ti.f32, (5, 4), layout=ti.Layout.AOS) - // args buffer: 5, 4, 3, 2 - // ti.Matrix.ndarray(3, 2, ti.f32, (5, 4), layout=ti.Layout.SOA) - // args buffer: 3, 2, 5, 4 - int ind_beg = 0; - int ind_end = 0; - if (layout == ExternalArrayLayout::kSOA) { - ind_beg = element_shape.size(); - ind_end = num_indices; - } else { - ind_beg = 0; - ind_end = num_indices - element_shape.size(); - } - for (int i = ind_beg; i < ind_end; i++) { + const size_t element_shape_index_offset = + layout == ExternalArrayLayout::kAOS ? 
num_indices - element_shape.size() + : 0; + for (int i = 0; i < num_indices - element_shape.size(); i++) { used.buf_args = true; used.int32 = true; std::string var_name = fmt::format("_s{}_{}{}", i, "arr", arg_id); @@ -540,22 +504,20 @@ class KernelGen : public IRVisitor { } size_var_names.push_back(std::move(var_name)); } - // Arrange index stride and offsets in correct order - if (layout == ExternalArrayLayout::kSOA) { - size_var_names.insert(size_var_names.begin(), - element_shape_size_var_names.begin(), - element_shape_size_var_names.end()); - } else { - size_var_names.insert(size_var_names.end(), - element_shape_size_var_names.begin(), - element_shape_size_var_names.end()); - } emit("int {} = {};", linear_index_name, num_indices == 0 ? "0" : stmt->indices[0]->short_name()); + size_t size_var_name_index = (layout == ExternalArrayLayout::kAOS) ? 1 : 0; for (int i = 1; i < num_indices; i++) { - emit("{} *= {};", linear_index_name, size_var_names[i]); + if (i >= element_shape_index_offset && + i < element_shape_index_offset + element_shape.size()) { + emit("{} *= {};", linear_index_name, + std::to_string(element_shape[i - element_shape_index_offset])); + } else { + emit("{} *= {};", linear_index_name, + size_var_names[size_var_name_index++]); + } emit("{} += {};", linear_index_name, stmt->indices[i]->short_name()); } diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 5dd24f975d985..730b37feb69c6 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1617,24 +1617,13 @@ void CodeGenLLVM::visit(ExternalPtrStmt *stmt) { int num_indices = stmt->indices.size(); std::vector sizes(num_indices); const auto &element_shape = stmt->element_shape; - enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; - const auto layout = stmt->element_dim <= 0 ? 
layout_AOS : layout_SOA; - // Determine the element shape position inside the indices vector - // TODO: change the outer layout in order to remove the element layout - // guess work - int element_shape_begin = -1; - int element_shape_end = -1; - if (element_shape.size() > 0) { - if (layout == layout_SOA) { - element_shape_begin = 0; - element_shape_end = element_shape.size(); - } else { - element_shape_begin = num_indices - element_shape.size(); - element_shape_end = num_indices; - } - } + const auto layout = stmt->element_dim <= 0 ? ExternalArrayLayout::kAOS + : ExternalArrayLayout::kSOA; + const size_t element_shape_index_offset = + (layout == ExternalArrayLayout::kAOS) ? num_indices - element_shape.size() + : 0; - for (int i = 0; i < num_indices; i++) { + for (int i = 0; i < num_indices - element_shape.size(); i++) { auto raw_arg = create_call( "RuntimeContext_get_extra_args", {get_context(), tlctx->get_constant(arg_id), tlctx->get_constant(i)}); @@ -1647,18 +1636,19 @@ void CodeGenLLVM::visit(ExternalPtrStmt *stmt) { llvm::PointerType::get(tlctx->get_data_type(dt), 0)); auto linear_index = tlctx->get_constant(0); - int element_shape_idx = 0; + size_t size_var_index = 0; for (int i = 0; i < num_indices; i++) { - if (i >= element_shape_begin && i < element_shape_end) { + if (i >= element_shape_index_offset && + i < element_shape_index_offset + element_shape.size()) { llvm::Value *size_var = - tlctx->get_constant(element_shape[element_shape_idx++]); + tlctx->get_constant(element_shape[i - element_shape_index_offset]); linear_index = builder->CreateMul(linear_index, size_var); } else { - linear_index = builder->CreateMul(linear_index, sizes[i]); + linear_index = builder->CreateMul(linear_index, sizes[size_var_index++]); } linear_index = builder->CreateAdd(linear_index, llvm_val[stmt->indices[i]]); } - + TI_ASSERT(size_var_index == num_indices - element_shape.size()) llvm_val[stmt] = builder->CreateGEP(base, linear_index); } diff --git 
a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index 25127db5be3df..f812fae204433 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -565,29 +565,14 @@ class TaskCodegen : public IRVisitor { const int num_indices = stmt->indices.size(); std::vector size_var_names; const auto &element_shape = stmt->element_shape; - enum ExternalArrayLayout { layout_AOS = 0, layout_SOA = 1 }; - const auto layout = stmt->element_dim <= 0 ? layout_AOS : layout_SOA; + const auto layout = stmt->element_dim <= 0 ? ExternalArrayLayout::kAOS + : ExternalArrayLayout::kSOA; const auto extra_args_member_index = ctx_attribs_->args().size(); - - // Determine the element shape position inside the indices vector - // TODO: change the outer layout in order to remove the element layout - // guess work - int element_shape_begin = -1; - int element_shape_end = -1; - if (element_shape.size() > 0) { - if (layout == layout_SOA) { - element_shape_begin = 0; - element_shape_end = element_shape.size(); - } else { - element_shape_begin = num_indices - element_shape.size(); - element_shape_end = num_indices; - } - } - for (int i = 0; i < num_indices; i++) { - // Skip expressions for element shapes. - if (i >= element_shape_begin && i < element_shape_end) { - continue; - } + const size_t element_shape_index_offset = + (layout == ExternalArrayLayout::kAOS) + ? num_indices - element_shape.size() + : 0; + for (int i = 0; i < num_indices - element_shape.size(); i++) { std::string var_name = fmt::format("{}_size{}_", stmt->raw_name(), i); const auto extra_arg_index = (arg_id * taichi_max_num_indices) + i; spirv::Value var_ptr = ir_->make_value( @@ -604,9 +589,10 @@ class TaskCodegen : public IRVisitor { for (int i = 0; i < num_indices; i++) { spirv::Value size_var; // Use immediate numbers to flatten index for element shapes. 
- if (i >= element_shape_begin && i < element_shape_end) { + if (i >= element_shape_index_offset && + i < element_shape_index_offset + element_shape.size()) { size_var = ir_->uint_immediate_number( - ir_->i32_type(), element_shape[i - element_shape_begin]); + ir_->i32_type(), element_shape[i - element_shape_index_offset]); } else { size_var = ir_->query_value(size_var_names[size_var_names_idx++]); } diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 7148650722e78..3b12af94ab526 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -269,17 +269,8 @@ void Kernel::LaunchContextBuilder::set_arg_ndarray(int arg_id, /*is_device_allocation=*/true); TI_ASSERT_INFO(arr.shape.size() <= taichi_max_num_indices, "External array cannot have > {max_num_indices} indices"); - // TODO: Update the codegen so that we don't reserve slots for element_shape - // in extra_args, especially in SOA case. - if (arr.layout == ExternalArrayLayout::kAOS) { - for (uint64 i = 0; i < arr.shape.size(); ++i) { - this->set_extra_arg_int(arg_id, i, arr.shape[i]); - } - } else { - auto element_dim = arr.element_shape.size(); - for (uint64 i = element_dim; i < arr.total_shape().size(); ++i) { - this->set_extra_arg_int(arg_id, i, arr.shape[i - element_dim]); - } + for (uint64 i = 0; i < arr.shape.size(); ++i) { + this->set_extra_arg_int(arg_id, i, arr.shape[i]); } } diff --git a/taichi/transforms/lower_ast.cpp b/taichi/transforms/lower_ast.cpp index 7920370e3444e..6712ab61de56a 100644 --- a/taichi/transforms/lower_ast.cpp +++ b/taichi/transforms/lower_ast.cpp @@ -313,16 +313,9 @@ class LowerAST : public IRVisitor { } else { auto tensor = stmt->global_var.cast(); std::vector shape; - if (tensor->element_dim > 0) { // Layout.SOA - for (int i = tensor->element_dim; i < tensor->dim; i++) { - shape.push_back(fctx.push_back( - i, tensor->arg_id)); - } - } else { // Layout.AOS - for (int i = 0; i < tensor->dim + tensor->element_dim; i++) { - 
shape.push_back(fctx.push_back( - i, tensor->arg_id)); - } + for (int i = 0; i < tensor->dim - abs(tensor->element_dim); i++) { + shape.push_back(fctx.push_back( + i, tensor->arg_id)); } Stmt *begin = fctx.push_back(TypedConstant(0)); Stmt *end = fctx.push_back(TypedConstant(1)); From 4f2b9e3fd84eb35a6b29088b6c90dc8f534d5fcd Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Wed, 8 Jun 2022 09:13:18 +0800 Subject: [PATCH 158/176] [llvm] [refactor] Move load_bit_pointer() to CodeGenLLVM (#5099) * [llvm] [refactor] Move load_bit_pointer() to CodeGenLLVM * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/codegen/codegen_llvm.cpp | 16 ++++++++++++++++ taichi/codegen/codegen_llvm.h | 2 ++ taichi/llvm/llvm_codegen_utils.h | 15 --------------- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 730b37feb69c6..9b9f312cd4f2b 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1541,6 +1541,22 @@ llvm::Value *CodeGenLLVM::offset_bit_ptr(llvm::Value *input_bit_ptr, return create_bit_ptr_struct(byte_ptr_base, new_bit_offset); } +std::tuple CodeGenLLVM::load_bit_pointer( + llvm::Value *ptr) { + // 1. load byte pointer + auto byte_ptr_in_bit_struct = + builder->CreateGEP(ptr, {tlctx->get_constant(0), tlctx->get_constant(0)}); + auto byte_ptr = builder->CreateLoad(byte_ptr_in_bit_struct); + TI_ASSERT(byte_ptr->getType()->getPointerElementType()->isIntegerTy(8)); + + // 2. 
load bit offset + auto bit_offset_in_bit_struct = + builder->CreateGEP(ptr, {tlctx->get_constant(0), tlctx->get_constant(1)}); + auto bit_offset = builder->CreateLoad(bit_offset_in_bit_struct); + TI_ASSERT(bit_offset->getType()->isIntegerTy(32)); + return std::make_tuple(byte_ptr, bit_offset); +} + void CodeGenLLVM::visit(SNodeLookupStmt *stmt) { llvm::Value *parent = nullptr; parent = llvm_val[stmt->input_snode]; diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 9252e78a0d769..5cbe35cd4ea9e 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -317,6 +317,8 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { llvm::Value *offset_bit_ptr(llvm::Value *input_bit_ptr, int bit_offset_delta); + std::tuple load_bit_pointer(llvm::Value *ptr); + void visit(SNodeLookupStmt *stmt) override; void visit(GetChStmt *stmt) override; diff --git a/taichi/llvm/llvm_codegen_utils.h b/taichi/llvm/llvm_codegen_utils.h index 17ae948854a00..073d35521f8c5 100644 --- a/taichi/llvm/llvm_codegen_utils.h +++ b/taichi/llvm/llvm_codegen_utils.h @@ -124,21 +124,6 @@ class LLVMModuleBuilder { llvm::Value *call(const std::string &func_name, Args &&...args) { return call(this->builder.get(), func_name, std::forward(args)...); } - - std::tuple load_bit_pointer(llvm::Value *ptr) { - // 1. load byte pointer - auto byte_ptr_in_bit_struct = builder->CreateGEP( - ptr, {tlctx->get_constant(0), tlctx->get_constant(0)}); - auto byte_ptr = builder->CreateLoad(byte_ptr_in_bit_struct); - TI_ASSERT(byte_ptr->getType()->getPointerElementType()->isIntegerTy(8)); - - // 2. 
load bit offset - auto bit_offset_in_bit_struct = builder->CreateGEP( - ptr, {tlctx->get_constant(0), tlctx->get_constant(1)}); - auto bit_offset = builder->CreateLoad(bit_offset_in_bit_struct); - TI_ASSERT(bit_offset->getType()->isIntegerTy(32)); - return std::make_tuple(byte_ptr, bit_offset); - } }; class RuntimeObject { From 66cc85267b5dbc4b73d3e655474df05d70b20c57 Mon Sep 17 00:00:00 2001 From: Ailing Date: Wed, 8 Jun 2022 09:15:57 +0800 Subject: [PATCH 159/176] [test] Save mpm88 graph in python and load in C++ test. (#5104) This is a simplified version of https://github.com/ailzhang/taichi-aot-demo/tree/mpm88_cgraph_demo which strips the GGUI rendering part. Let's add this as a test (as well as demo ;) ) in the codebase. We used to test the saving part of mpm88 btw and it was replaced with this e2e test. Huge thanks to @k-ye for help debugging the GGUI rendering issue! --- tests/cpp/aot/aot_save_load_test.cpp | 129 ++++++++++++++++++++ tests/cpp/aot/mpm88_graph_aot.py | 169 +++++++++++++++++++++++++++ tests/python/test_aot.py | 147 ----------------------- tests/test_utils.py | 2 + 4 files changed, 300 insertions(+), 147 deletions(-) create mode 100644 tests/cpp/aot/mpm88_graph_aot.py diff --git a/tests/cpp/aot/aot_save_load_test.cpp b/tests/cpp/aot/aot_save_load_test.cpp index f83f5e55e9c18..78d8d16eb0939 100644 --- a/tests/cpp/aot/aot_save_load_test.cpp +++ b/tests/cpp/aot/aot_save_load_test.cpp @@ -399,4 +399,133 @@ TEST(AotLoadGraph, Vulkan) { EXPECT_EQ(dst[2], 42); device_->dealloc_memory(devalloc_arr_); } + +TEST(AotLoadGraph, Mpm88) { + // Otherwise will segfault on macOS VM, + // where Vulkan is installed but no devices are present + if (!vulkan::is_vulkan_api_available()) { + return; + } + constexpr int NR_PARTICLES = 8192; + constexpr int N_GRID = 128; + + // API based on proposal https://github.com/taichi-dev/taichi/issues/3642 + // Initialize Vulkan program + taichi::uint64 *result_buffer{nullptr}; + taichi::lang::RuntimeContext host_ctx; + auto 
memory_pool = + std::make_unique(Arch::vulkan, nullptr); + result_buffer = (taichi::uint64 *)memory_pool->allocate( + sizeof(taichi::uint64) * taichi_result_buffer_entries, 8); + host_ctx.result_buffer = result_buffer; + + // Create Taichi Device for computation + lang::vulkan::VulkanDeviceCreator::Params evd_params; + evd_params.api_version = + taichi::lang::vulkan::VulkanEnvSettings::kApiVersion(); + auto embedded_device = + std::make_unique(evd_params); + taichi::lang::vulkan::VulkanDevice *device_ = + static_cast( + embedded_device->device()); + // Create Vulkan runtime + gfx::GfxRuntime::Params params; + params.host_result_buffer = result_buffer; + params.device = device_; + auto vulkan_runtime = + std::make_unique(std::move(params)); + + // Run AOT module loader + const auto folder_dir = getenv("TAICHI_AOT_FOLDER_PATH"); + std::stringstream ss; + ss << folder_dir; + gfx::AotModuleParams mod_params; + mod_params.module_path = ss.str(); + mod_params.runtime = vulkan_runtime.get(); + + std::unique_ptr vk_module = + aot::Module::load(Arch::vulkan, mod_params); + EXPECT_TRUE(vk_module); + + // Retrieve kernels/fields/etc from AOT module + auto root_size = vk_module->get_root_size(); + EXPECT_EQ(root_size, 0); + vulkan_runtime->add_root_buffer(root_size); + + auto g_init = vk_module->get_graph("init"); + auto g_update = vk_module->get_graph("update"); + + // Prepare Ndarray for model + taichi::lang::Device::AllocParams alloc_params; + alloc_params.host_write = false; + alloc_params.host_read = false; + alloc_params.size = NR_PARTICLES * 2 * sizeof(float); + alloc_params.usage = taichi::lang::AllocUsage::Storage; + + taichi::lang::DeviceAllocation devalloc_x = + device_->allocate_memory(alloc_params); + auto x = taichi::lang::Ndarray(devalloc_x, taichi::lang::PrimitiveType::f32, + {NR_PARTICLES}, {2}); + + taichi::lang::DeviceAllocation devalloc_v = + device_->allocate_memory(alloc_params); + auto v = taichi::lang::Ndarray(devalloc_v, 
taichi::lang::PrimitiveType::f32, + {NR_PARTICLES}, {2}); + + alloc_params.size = NR_PARTICLES * 3 * sizeof(float); + taichi::lang::DeviceAllocation devalloc_pos = + device_->allocate_memory(alloc_params); + auto pos = taichi::lang::Ndarray( + devalloc_pos, taichi::lang::PrimitiveType::f32, {NR_PARTICLES}, {3}); + + alloc_params.size = NR_PARTICLES * sizeof(float) * 2 * 2; + taichi::lang::DeviceAllocation devalloc_C = + device_->allocate_memory(alloc_params); + auto C = taichi::lang::Ndarray(devalloc_C, taichi::lang::PrimitiveType::f32, + {NR_PARTICLES}, {2, 2}); + + alloc_params.size = NR_PARTICLES * sizeof(float); + taichi::lang::DeviceAllocation devalloc_J = + device_->allocate_memory(alloc_params); + auto J = taichi::lang::Ndarray(devalloc_J, taichi::lang::PrimitiveType::f32, + {NR_PARTICLES}); + + alloc_params.size = N_GRID * N_GRID * 2 * sizeof(float); + taichi::lang::DeviceAllocation devalloc_grid_v = + device_->allocate_memory(alloc_params); + auto grid_v = taichi::lang::Ndarray( + devalloc_grid_v, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID}, {2}); + + alloc_params.size = N_GRID * N_GRID * sizeof(float); + taichi::lang::DeviceAllocation devalloc_grid_m = + device_->allocate_memory(alloc_params); + auto grid_m = taichi::lang::Ndarray( + devalloc_grid_m, taichi::lang::PrimitiveType::f32, {N_GRID, N_GRID}); + + std::unordered_map args; + args.insert({"x", taichi::lang::aot::IValue::create(x)}); + args.insert({"v", taichi::lang::aot::IValue::create(v)}); + args.insert({"J", taichi::lang::aot::IValue::create(J)}); + + g_init->run(args); + vulkan_runtime->synchronize(); + + args.insert({"C", taichi::lang::aot::IValue::create(C)}); + args.insert({"grid_v", taichi::lang::aot::IValue::create(grid_v)}); + args.insert({"grid_m", taichi::lang::aot::IValue::create(grid_m)}); + args.insert({"pos", taichi::lang::aot::IValue::create(pos)}); + + // Run update graph once. In real application this runs as long as window is + // alive. 
+ g_update->run(args); + vulkan_runtime->synchronize(); + + device_->dealloc_memory(devalloc_x); + device_->dealloc_memory(devalloc_v); + device_->dealloc_memory(devalloc_J); + device_->dealloc_memory(devalloc_C); + device_->dealloc_memory(devalloc_grid_v); + device_->dealloc_memory(devalloc_grid_m); + device_->dealloc_memory(devalloc_pos); +} #endif diff --git a/tests/cpp/aot/mpm88_graph_aot.py b/tests/cpp/aot/mpm88_graph_aot.py new file mode 100644 index 0000000000000..c2c2a66cf6f41 --- /dev/null +++ b/tests/cpp/aot/mpm88_graph_aot.py @@ -0,0 +1,169 @@ +import os + +import taichi as ti + + +def compile_mpm88_graph(): + ti.init(ti.vulkan) + if ti.lang.impl.current_cfg().arch != ti.vulkan: + return + n_particles = 8192 + n_grid = 128 + dx = 1 / n_grid + dt = 2e-4 + + p_rho = 1 + p_vol = (dx * 0.5)**2 + p_mass = p_vol * p_rho + gravity = 9.8 + bound = 3 + E = 400 + + @ti.kernel + def substep_reset_grid(grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for i, j in grid_m: + grid_v[i, j] = [0, 0] + grid_m[i, j] = 0 + + @ti.kernel + def substep_p2g(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), + grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for p in x: + Xp = x[p] / dx + base = int(Xp - 0.5) + fx = Xp - base + w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] + stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2 + affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p] + for i, j in ti.static(ti.ndrange(3, 3)): + offset = ti.Vector([i, j]) + dpos = (offset - fx) * dx + weight = w[i].x * w[j].y + grid_v[base + + offset] += weight * (p_mass * v[p] + affine @ dpos) + grid_m[base + offset] += weight * p_mass + + @ti.kernel + def substep_update_grid_v(grid_v: ti.any_arr(field_dim=2), + grid_m: ti.any_arr(field_dim=2)): + for i, j in grid_m: + if grid_m[i, j] > 0: + grid_v[i, j] /= grid_m[i, j] + grid_v[i, j].y -= dt * gravity + if i < bound 
and grid_v[i, j].x < 0: + grid_v[i, j].x = 0 + if i > n_grid - bound and grid_v[i, j].x > 0: + grid_v[i, j].x = 0 + if j < bound and grid_v[i, j].y < 0: + grid_v[i, j].y = 0 + if j > n_grid - bound and grid_v[i, j].y > 0: + grid_v[i, j].y = 0 + + @ti.kernel + def substep_g2p(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), + grid_v: ti.any_arr(field_dim=2), + pos: ti.any_arr(field_dim=1)): + for p in x: + Xp = x[p] / dx + base = int(Xp - 0.5) + fx = Xp - base + w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] + new_v = ti.Vector.zero(float, 2) + new_C = ti.Matrix.zero(float, 2, 2) + for i, j in ti.static(ti.ndrange(3, 3)): + offset = ti.Vector([i, j]) + dpos = (offset - fx) * dx + weight = w[i].x * w[j].y + g_v = grid_v[base + offset] + new_v += weight * g_v + new_C += 4 * weight * g_v.outer_product(dpos) / dx**2 + v[p] = new_v + x[p] += dt * v[p] + pos[p] = [x[p][0], x[p][1], 0] + J[p] *= 1 + dt * new_C.trace() + C[p] = new_C + + @ti.kernel + def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), + J: ti.any_arr(field_dim=1)): + for i in range(n_particles): + x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2] + v[i] = [0, -1] + J[i] = 1 + + N_ITER = 50 + + sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'x', + ti.f32, + element_shape=(2, )) + sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'v', + ti.f32, + element_shape=(2, )) + sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'C', + ti.f32, + element_shape=(2, 2)) + sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'J', + ti.f32, + element_shape=()) + sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'grid_v', + ti.f32, + element_shape=(2, )) + sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'grid_m', + ti.f32, + element_shape=()) + sym_pos = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'pos', + ti.f32, + element_shape=(3, )) + + g_init_builder = ti.graph.GraphBuilder() + 
g_init_builder.dispatch(init_particles, sym_x, sym_v, sym_J) + + g_update_builder = ti.graph.GraphBuilder() + substep = g_update_builder.create_sequential() + + substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) + substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, + sym_grid_m) + substep.dispatch(substep_update_grid_v, sym_grid_v, sym_grid_m) + substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v, + sym_pos) + + for i in range(N_ITER): + g_update_builder.append(substep) + + g_init = g_init_builder.compile() + g_update = g_update_builder.compile() + + # GGUI only supports vec3 vertex so we need an extra `pos` here + # This is not necessary if you're not going to render it using GGUI. + # Let's keep this hack here so that the shaders serialized by this + # script can be loaded and rendered in the provided script in taichi-aot-demo. + pos = ti.Vector.ndarray(3, ti.f32, n_particles) + x = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) + v = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) + + C = ti.Matrix.ndarray(2, 2, ti.f32, shape=(n_particles)) + J = ti.ndarray(ti.f32, shape=(n_particles)) + grid_v = ti.Vector.ndarray(2, ti.f32, shape=(n_grid, n_grid)) + grid_m = ti.ndarray(ti.f32, shape=(n_grid, n_grid)) + + assert "TAICHI_AOT_FOLDER_PATH" in os.environ.keys() + tmpdir = str(os.environ["TAICHI_AOT_FOLDER_PATH"]) + mod = ti.aot.Module(ti.vulkan) + mod.add_graph('init', g_init) + mod.add_graph('update', g_update) + mod.save(tmpdir, '') + + +compile_mpm88_graph() diff --git a/tests/python/test_aot.py b/tests/python/test_aot.py index 6cb0252fe3046..a3f50e2b9484a 100644 --- a/tests/python/test_aot.py +++ b/tests/python/test_aot.py @@ -619,150 +619,3 @@ def run(arr: ti.types.ndarray(), val1: ti.f32, val2: ti.template()): res = json.load(json_file) args_count = res['aot_data']['kernels']['run']['args_count'] assert args_count == 2, res # `arr` and `val1` - - -@test_utils.test(arch=ti.vulkan) -def 
test_mpm88_ndarray_graph_aot(): - n_particles = 8192 - n_grid = 128 - dx = 1 / n_grid - dt = 2e-4 - - p_rho = 1 - p_vol = (dx * 0.5)**2 - p_mass = p_vol * p_rho - gravity = 9.8 - bound = 3 - E = 400 - - @ti.kernel - def substep_reset_grid(grid_v: ti.any_arr(field_dim=2), - grid_m: ti.any_arr(field_dim=2)): - for i, j in grid_m: - grid_v[i, j] = [0, 0] - grid_m[i, j] = 0 - - @ti.kernel - def substep_p2g(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), - C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), - grid_v: ti.any_arr(field_dim=2), - grid_m: ti.any_arr(field_dim=2)): - for p in x: - Xp = x[p] / dx - base = int(Xp - 0.5) - fx = Xp - base - w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] - stress = -dt * 4 * E * p_vol * (J[p] - 1) / dx**2 - affine = ti.Matrix([[stress, 0], [0, stress]]) + p_mass * C[p] - for i, j in ti.static(ti.ndrange(3, 3)): - offset = ti.Vector([i, j]) - dpos = (offset - fx) * dx - weight = w[i].x * w[j].y - grid_v[base + - offset] += weight * (p_mass * v[p] + affine @ dpos) - grid_m[base + offset] += weight * p_mass - - @ti.kernel - def substep_update_grid_v(grid_v: ti.any_arr(field_dim=2), - grid_m: ti.any_arr(field_dim=2)): - for i, j in grid_m: - if grid_m[i, j] > 0: - grid_v[i, j] /= grid_m[i, j] - grid_v[i, j].y -= dt * gravity - if i < bound and grid_v[i, j].x < 0: - grid_v[i, j].x = 0 - if i > n_grid - bound and grid_v[i, j].x > 0: - grid_v[i, j].x = 0 - if j < bound and grid_v[i, j].y < 0: - grid_v[i, j].y = 0 - if j > n_grid - bound and grid_v[i, j].y > 0: - grid_v[i, j].y = 0 - - @ti.kernel - def substep_g2p(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), - C: ti.any_arr(field_dim=1), J: ti.any_arr(field_dim=1), - grid_v: ti.any_arr(field_dim=2)): - for p in x: - Xp = x[p] / dx - base = int(Xp - 0.5) - fx = Xp - base - w = [0.5 * (1.5 - fx)**2, 0.75 - (fx - 1)**2, 0.5 * (fx - 0.5)**2] - new_v = ti.Vector.zero(float, 2) - new_C = ti.Matrix.zero(float, 2, 2) - for i, j in 
ti.static(ti.ndrange(3, 3)): - offset = ti.Vector([i, j]) - dpos = (offset - fx) * dx - weight = w[i].x * w[j].y - g_v = grid_v[base + offset] - new_v += weight * g_v - new_C += 4 * weight * g_v.outer_product(dpos) / dx**2 - v[p] = new_v - x[p] += dt * v[p] - J[p] *= 1 + dt * new_C.trace() - C[p] = new_C - - @ti.kernel - def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), - J: ti.any_arr(field_dim=1)): - for i in range(n_particles): - x[i] = [ti.random() * 0.4 + 0.2, ti.random() * 0.4 + 0.2] - v[i] = [0, -1] - J[i] = 1 - - N_ITER = 50 - - sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, - 'x', - ti.f32, - element_shape=(2, )) - sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, - 'v', - ti.f32, - element_shape=(2, )) - sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, - 'C', - ti.f32, - element_shape=(2, 2)) - sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, - 'J', - ti.f32, - element_shape=()) - sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, - 'grid_v', - ti.f32, - element_shape=(2, )) - sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, - 'grid_m', - ti.f32, - element_shape=()) - g_init_builder = ti.graph.GraphBuilder() - g_init_builder.dispatch(init_particles, sym_x, sym_v, sym_J) - - g_update_builder = ti.graph.GraphBuilder() - substep = g_update_builder.create_sequential() - - substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) - substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, - sym_grid_m) - substep.dispatch(substep_update_grid_v, sym_grid_v, sym_grid_m) - substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v) - - for i in range(N_ITER): - g_update_builder.append(substep) - - g_init = g_init_builder.compile() - g_update = g_update_builder.compile() - - x = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) - v = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) - - C = ti.Matrix.ndarray(2, 2, ti.f32, shape=(n_particles)) - J = ti.ndarray(ti.f32, shape=(n_particles)) - grid_v = ti.Vector.ndarray(2, 
ti.f32, shape=(n_grid, n_grid)) - grid_m = ti.ndarray(ti.f32, shape=(n_grid, n_grid)) - - with tempfile.TemporaryDirectory() as tmpdir: - mod = ti.aot.Module(ti.vulkan) - mod.add_graph('init', g_init) - mod.add_graph('update', g_update) - mod.save(tmpdir, '') diff --git a/tests/test_utils.py b/tests/test_utils.py index 7974ce0709f6b..998f7cfa68b72 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,6 +17,8 @@ os.path.join('cpp', 'backends', 'llvm', 'cpu_aot.py'), "LlvmProgramTest.FullPipelineCUDA": os.path.join('cpp', 'backends', 'llvm', 'cuda_aot.py'), + "AotLoadGraph.Mpm88": + os.path.join('cpp', 'aot', 'mpm88_graph_aot.py'), } From d95e6b08d6233c2ea8092e29baa5389e871cb706 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Wed, 8 Jun 2022 10:21:09 +0800 Subject: [PATCH 160/176] [Example] Update visual effects of mass_spring_3d_ggui.py (#5081) * update scene for mass_spring simulation * update scene for mass_spring simulation * update scene for mass_spring simulation --- .../examples/ggui_examples/mass_spring_3d_ggui.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py index 80302a772c5f0..3579afcd3dd8c 100644 --- a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py +++ b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py @@ -21,6 +21,7 @@ num_triangles = (n - 1) * (n - 1) * 2 indices = ti.field(int, shape=num_triangles * 3) vertices = ti.Vector.field(3, dtype=float, shape=n * n) +colors = ti.Vector.field(3, dtype=float, shape=n * n) bending_springs = False @@ -49,6 +50,11 @@ def initialize_mesh_indices(): indices[quad_id * 6 + 4] = i * n + (j + 1) indices[quad_id * 6 + 5] = (i + 1) * n + j + for i, j in ti.ndrange(n, n): + if (i // 4 + j // 4) % 2 == 0: + colors[i * n + j] = (0.22, 0.72, 0.52) + else: + colors[i * n + j] = (1, 0.334, 0.52) initialize_mesh_indices() @@ -130,13 +136,14 @@ 
def update_vertices(): scene.set_camera(camera) scene.point_light(pos=(0, 1, 2), color=(1, 1, 1)) + scene.ambient_light((0.5, 0.5, 0.5)) scene.mesh(vertices, indices=indices, - color=(0.8, 0, 0), + per_vertex_color=colors, two_sided=True) # Draw a smaller ball to avoid visual penetration - scene.particles(ball_center, radius=ball_radius * 0.95, color=(0.2, 0.6, 1)) + scene.particles(ball_center, radius=ball_radius * 0.95, color=(0.5, 0.42, 0.8)) canvas.scene(scene) window.show() From 8791cdaeab8b08083f37c5d85009d1fb33f69c8f Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Wed, 8 Jun 2022 16:38:04 +0800 Subject: [PATCH 161/176] [type] [refactor] Remove redundant promotion for custom int in type_check (#5102) --- taichi/transforms/type_check.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index 87c7cc39723c4..a7cc5b60ecfb8 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -316,15 +316,6 @@ class TypeCheck : public IRVisitor { } if (stmt->lhs->ret_type != stmt->rhs->ret_type) { - auto promote_custom_int_type = [&](Stmt *stmt, Stmt *hs) { - if (auto cit = hs->ret_type->cast()) { - return insert_type_cast_before(stmt, hs, cit->get_compute_type()); - } - return hs; - }; - stmt->lhs = promote_custom_int_type(stmt, stmt->lhs); - stmt->rhs = promote_custom_int_type(stmt, stmt->rhs); - DataType ret_type; if (is_shift_op(stmt->op_type)) { // shift_ops does not follow the same type promotion rule as numerical From 9c4fa7372417cb14c783edaf239436d149602e96 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Wed, 8 Jun 2022 16:39:24 +0800 Subject: [PATCH 162/176] [llvm] [refactor] Replace cast_int() with LLVM native integer cast (#5110) * [llvm] [refactor] Use LLVM native integer cast * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> 
--- taichi/codegen/codegen_llvm.cpp | 27 ++------------------------- taichi/codegen/codegen_llvm.h | 2 -- taichi/codegen/codegen_llvm_quant.cpp | 3 ++- 3 files changed, 4 insertions(+), 28 deletions(-) diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 9b9f312cd4f2b..5e0489d5c9a39 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -324,28 +324,6 @@ CodeGenLLVM::CodeGenLLVM(Kernel *kernel, kernel_name = kernel->name + "_kernel"; } -llvm::Value *CodeGenLLVM::cast_int(llvm::Value *input_val, - Type *from, - Type *to) { - if (from == to) - return input_val; - auto from_size = 0; - if (from->is()) { - from_size = data_type_size(from->cast()->get_compute_type()); - } else { - from_size = data_type_size(from); - } - if (from_size < data_type_size(to)) { - if (is_signed(from)) { - return builder->CreateSExt(input_val, tlctx->get_data_type(to)); - } else { - return builder->CreateZExt(input_val, tlctx->get_data_type(to)); - } - } else { - return builder->CreateTrunc(input_val, tlctx->get_data_type(to)); - } -} - void CodeGenLLVM::visit(DecorationStmt *stmt) { } @@ -404,9 +382,8 @@ void CodeGenLLVM::visit(UnaryOpStmt *stmt) { } } } else if (!is_real(from) && !is_real(to)) { - // TODO: implement casting into custom integer type - TI_ASSERT(!to->is()); - llvm_val[stmt] = cast_int(llvm_val[stmt->operand], from, to); + llvm_val[stmt] = builder->CreateIntCast(llvm_val[stmt->operand], + llvm_type(to), is_signed(from)); } } else if (stmt->op_type == UnaryOpType::cast_bits) { TI_ASSERT(data_type_size(stmt->ret_type) == diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index 5cbe35cd4ea9e..d0d857275be0c 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -183,8 +183,6 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void visit(RandStmt *stmt) override; - llvm::Value *cast_int(llvm::Value *input_val, Type *from, Type *to); - virtual void 
emit_extra_unary(UnaryOpStmt *stmt); void visit(DecorationStmt *stmt) override; diff --git a/taichi/codegen/codegen_llvm_quant.cpp b/taichi/codegen/codegen_llvm_quant.cpp index f136f2fb8e970..d0eee7694141d 100644 --- a/taichi/codegen/codegen_llvm_quant.cpp +++ b/taichi/codegen/codegen_llvm_quant.cpp @@ -25,7 +25,8 @@ llvm::Value *CodeGenLLVM::atomic_add_custom_int(AtomicOpStmt *stmt, fmt::format("atomic_add_partial_bits_b{}", data_type_bits(physical_type)), {builder->CreateBitCast(byte_ptr, llvm_ptr_type(physical_type)), bit_offset, tlctx->get_constant(cit->get_num_bits()), - cast_int(llvm_val[stmt->val], stmt->val->ret_type, physical_type)}); + builder->CreateIntCast(llvm_val[stmt->val], llvm_type(physical_type), + is_signed(stmt->val->ret_type))}); } llvm::Value *CodeGenLLVM::atomic_add_custom_float(AtomicOpStmt *stmt, From 130e37c17ab07c0e9ec9c40b45955ae1a07f3168 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Thu, 9 Jun 2022 15:44:09 +0800 Subject: [PATCH 163/176] [type] [llvm] [refactor] Fix function names in codegen_llvm_quant (#5115) * [type] [llvm] [refactor] Fix function names in codegen_llvm_quant * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- taichi/backends/cuda/codegen_cuda.cpp | 4 +- taichi/backends/metal/codegen_metal.cpp | 2 +- taichi/codegen/codegen_llvm.cpp | 10 +- taichi/codegen/codegen_llvm.h | 74 +++++++------- taichi/codegen/codegen_llvm_quant.cpp | 127 ++++++++++++------------ 5 files changed, 106 insertions(+), 111 deletions(-) diff --git a/taichi/backends/cuda/codegen_cuda.cpp b/taichi/backends/cuda/codegen_cuda.cpp index 61cbffa52928d..fe63fd8cf8ed4 100644 --- a/taichi/backends/cuda/codegen_cuda.cpp +++ b/taichi/backends/cuda/codegen_cuda.cpp @@ -548,10 +548,10 @@ class CodeGenLLVMCUDA : public CodeGenLLVM { auto [data_ptr, bit_offset] = load_bit_pointer(llvm_val[stmt->src]); data_ptr = 
builder->CreateBitCast(data_ptr, llvm_ptr_type(dtype)); auto data = create_intrinsic_load(dtype, data_ptr); - llvm_val[stmt] = extract_custom_int(data, bit_offset, int_in_mem); + llvm_val[stmt] = extract_quant_int(data, bit_offset, int_in_mem); } else if (val_type->cast()) { // TODO: support __ldg - llvm_val[stmt] = load_custom_float(stmt->src); + llvm_val[stmt] = load_quant_fixed_or_quant_float(stmt->src); } else { TI_NOT_IMPLEMENTED; } diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index 5068a9239fb82..a020c4635d26a 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ -1010,7 +1010,7 @@ class KernelCodegenImpl : public IRVisitor { const auto loaded = construct_load_as_custom_int( stmt->src, cft->get_digits_type()->as()); // Computes `float(digits_expr) * scale` - // See LLVM backend's reconstruct_custom_float() + // See LLVM backend's reconstruct_quant_fixed() return fmt::format("(static_cast({}) * {})", loaded, cft->get_scale()); } diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 5e0489d5c9a39..ce41f82be2ff7 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1196,9 +1196,9 @@ llvm::Value *CodeGenLLVM::custom_type_atomic(AtomicOpStmt *stmt) { auto dst_type = stmt->dest->ret_type->as()->get_pointee_type(); if (auto cit = dst_type->cast()) { - return atomic_add_custom_int(stmt, cit); + return atomic_add_quant_int(stmt, cit); } else if (auto cft = dst_type->cast()) { - return atomic_add_custom_float(stmt, cft); + return atomic_add_quant_fixed(stmt, cft); } else { return nullptr; } @@ -1355,7 +1355,7 @@ void CodeGenLLVM::visit(GlobalStoreStmt *stmt) { llvm::Value *store_value = nullptr; auto *cit = pointee_type->as(); store_value = llvm_val[stmt->val]; - store_custom_int(llvm_val[stmt->dest], cit, store_value, /*atomic=*/true); + store_quant_int(llvm_val[stmt->dest], cit, store_value, 
/*atomic=*/true); } else { builder->CreateStore(llvm_val[stmt->val], llvm_val[stmt->dest]); } @@ -1368,10 +1368,10 @@ void CodeGenLLVM::visit(GlobalLoadStmt *stmt) { if (ptr_type->is_bit_pointer()) { auto val_type = ptr_type->get_pointee_type(); if (val_type->is()) { - llvm_val[stmt] = load_as_custom_int(llvm_val[stmt->src], val_type); + llvm_val[stmt] = load_quant_int(llvm_val[stmt->src], val_type); } else if (val_type->cast()) { TI_ASSERT(stmt->src->is()); - llvm_val[stmt] = load_custom_float(stmt->src); + llvm_val[stmt] = load_quant_fixed_or_quant_float(stmt->src); } else { TI_NOT_IMPLEMENTED } diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index d0d857275be0c..af7a0bf091740 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -219,14 +219,13 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void visit(SNodeOpStmt *stmt) override; - llvm::Value *atomic_add_custom_float(AtomicOpStmt *stmt, - CustomFloatType *cft); + llvm::Value *atomic_add_quant_fixed(AtomicOpStmt *stmt, CustomFloatType *cft); - llvm::Value *atomic_add_custom_int(AtomicOpStmt *stmt, CustomIntType *cit); + llvm::Value *atomic_add_quant_int(AtomicOpStmt *stmt, CustomIntType *cit); - llvm::Value *float_to_custom_int(CustomFloatType *cft, - CustomIntType *cit, - llvm::Value *real); + llvm::Value *quant_fixed_to_quant_int(CustomFloatType *cft, + CustomIntType *cit, + llvm::Value *real); virtual llvm::Value *optimized_reduction(AtomicOpStmt *stmt); @@ -247,16 +246,16 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void visit(PtrOffsetStmt *stmt) override; - void store_custom_int(llvm::Value *bit_ptr, - CustomIntType *cit, - llvm::Value *value, - bool atomic); + void store_quant_int(llvm::Value *bit_ptr, + CustomIntType *cit, + llvm::Value *value, + bool atomic); - void store_custom_int(llvm::Value *byte_ptr, - llvm::Value *bit_offset, - CustomIntType *cit, - llvm::Value *value, - bool atomic); + void 
store_quant_int(llvm::Value *byte_ptr, + llvm::Value *bit_offset, + CustomIntType *cit, + llvm::Value *value, + bool atomic); void store_masked(llvm::Value *byte_ptr, uint64 mask, @@ -272,31 +271,31 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void visit(BitStructStoreStmt *stmt) override; - void store_floats_with_shared_exponents(BitStructStoreStmt *stmt); + void store_quant_floats_with_shared_exponents(BitStructStoreStmt *stmt); - llvm::Value *reconstruct_float_from_bit_struct(llvm::Value *local_bit_struct, - SNode *digits); + llvm::Value *extract_quant_float(llvm::Value *local_bit_struct, + SNode *digits_snode); - llvm::Value *load_as_custom_int(llvm::Value *ptr, Type *load_type); + llvm::Value *load_quant_int(llvm::Value *ptr, Type *load_type); - llvm::Value *extract_custom_int(llvm::Value *physical_value, - llvm::Value *bit_offset, - Type *load_type); + llvm::Value *extract_quant_int(llvm::Value *physical_value, + llvm::Value *bit_offset, + Type *load_type); - llvm::Value *reconstruct_custom_float(llvm::Value *digits, - CustomFloatType *load_type); + llvm::Value *reconstruct_quant_fixed(llvm::Value *digits, + CustomFloatType *cft); - llvm::Value *load_custom_float_with_exponent(llvm::Value *digits_bit_ptr, - llvm::Value *exponent_bit_ptr, - CustomFloatType *cft, - bool shared_exponent); + llvm::Value *load_quant_float(llvm::Value *digits_bit_ptr, + llvm::Value *exponent_bit_ptr, + CustomFloatType *cft, + bool shared_exponent); - llvm::Value *reconstruct_custom_float_with_exponent(llvm::Value *digits, - llvm::Value *exponent_val, - CustomFloatType *cft, - bool shared_exponent); + llvm::Value *reconstruct_quant_float(llvm::Value *input_digits, + llvm::Value *input_exponent_val, + CustomFloatType *cft, + bool shared_exponent); - llvm::Value *load_custom_float(Stmt *ptr_stmt); + llvm::Value *load_quant_fixed_or_quant_float(Stmt *ptr_stmt); void visit(GlobalLoadStmt *stmt) override; @@ -396,12 +395,13 @@ class CodeGenLLVM : public IRVisitor, 
public LLVMModuleBuilder { llvm::Value *create_mesh_xlogue(std::unique_ptr &block); - llvm::Value *extract_exponent_from_float(llvm::Value *f); + llvm::Value *extract_exponent_from_f32(llvm::Value *f); - llvm::Value *extract_digits_from_float(llvm::Value *f, bool full); + llvm::Value *extract_digits_from_f32(llvm::Value *f, bool full); - llvm::Value *get_float_digits_with_shared_exponents(llvm::Value *f, - llvm::Value *shared_exp); + llvm::Value *extract_digits_from_quant_float_with_shared_exponent( + llvm::Value *f, + llvm::Value *shared_exp); llvm::Value *get_exponent_offset(llvm::Value *exponent, CustomFloatType *cft); diff --git a/taichi/codegen/codegen_llvm_quant.cpp b/taichi/codegen/codegen_llvm_quant.cpp index d0eee7694141d..ec3c764bfd78d 100644 --- a/taichi/codegen/codegen_llvm_quant.cpp +++ b/taichi/codegen/codegen_llvm_quant.cpp @@ -17,8 +17,8 @@ inline void update_mask(uint64 &mask, uint32 num_bits, uint32 offset) { } // namespace -llvm::Value *CodeGenLLVM::atomic_add_custom_int(AtomicOpStmt *stmt, - CustomIntType *cit) { +llvm::Value *CodeGenLLVM::atomic_add_quant_int(AtomicOpStmt *stmt, + CustomIntType *cit) { auto [byte_ptr, bit_offset] = load_bit_pointer(llvm_val[stmt->dest]); auto physical_type = cit->get_physical_type(); return create_call( @@ -29,11 +29,11 @@ llvm::Value *CodeGenLLVM::atomic_add_custom_int(AtomicOpStmt *stmt, is_signed(stmt->val->ret_type))}); } -llvm::Value *CodeGenLLVM::atomic_add_custom_float(AtomicOpStmt *stmt, - CustomFloatType *cft) { +llvm::Value *CodeGenLLVM::atomic_add_quant_fixed(AtomicOpStmt *stmt, + CustomFloatType *cft) { auto [byte_ptr, bit_offset] = load_bit_pointer(llvm_val[stmt->dest]); auto cit = cft->get_digits_type()->as(); - auto val_store = float_to_custom_int(cft, cit, llvm_val[stmt->val]); + auto val_store = quant_fixed_to_quant_int(cft, cit, llvm_val[stmt->val]); auto physical_type = cit->get_physical_type(); val_store = builder->CreateSExt(val_store, llvm_type(physical_type)); @@ -43,9 +43,9 @@ 
llvm::Value *CodeGenLLVM::atomic_add_custom_float(AtomicOpStmt *stmt, bit_offset, tlctx->get_constant(cit->get_num_bits()), val_store}); } -llvm::Value *CodeGenLLVM::float_to_custom_int(CustomFloatType *cft, - CustomIntType *cit, - llvm::Value *real) { +llvm::Value *CodeGenLLVM::quant_fixed_to_quant_int(CustomFloatType *cft, + CustomIntType *cit, + llvm::Value *real) { llvm::Value *s = nullptr; // Compute int(real * (1.0 / scale) + 0.5) @@ -69,19 +69,19 @@ llvm::Value *CodeGenLLVM::float_to_custom_int(CustomFloatType *cft, } } -void CodeGenLLVM::store_custom_int(llvm::Value *bit_ptr, - CustomIntType *cit, - llvm::Value *value, - bool atomic) { +void CodeGenLLVM::store_quant_int(llvm::Value *bit_ptr, + CustomIntType *cit, + llvm::Value *value, + bool atomic) { auto [byte_ptr, bit_offset] = load_bit_pointer(bit_ptr); - store_custom_int(byte_ptr, bit_offset, cit, value, atomic); + store_quant_int(byte_ptr, bit_offset, cit, value, atomic); } -void CodeGenLLVM::store_custom_int(llvm::Value *byte_ptr, - llvm::Value *bit_offset, - CustomIntType *cit, - llvm::Value *value, - bool atomic) { +void CodeGenLLVM::store_quant_int(llvm::Value *byte_ptr, + llvm::Value *bit_offset, + CustomIntType *cit, + llvm::Value *value, + bool atomic) { // TODO(type): CUDA only supports atomicCAS on 32- and 64-bit integers. // Try to support CustomInt/FloatType with 8/16-bit physical // types. @@ -135,7 +135,7 @@ llvm::Value *CodeGenLLVM::custom_type_to_bits(llvm::Value *val, if (auto cft = input_type->cast()) { TI_ASSERT(cft->get_exponent_type() == nullptr); cit = cft->get_digits_type()->as(); - val = float_to_custom_int(cft, cit, val); + val = quant_fixed_to_quant_int(cft, cit, val); } else { cit = input_type->as(); } @@ -176,7 +176,7 @@ void CodeGenLLVM::visit(BitStructStoreStmt *stmt) { // that don't own the shared exponent? 
if (has_shared_exponent) { - store_floats_with_shared_exponents(stmt); + store_quant_floats_with_shared_exponents(stmt); } llvm::Value *bit_struct_val = nullptr; @@ -186,7 +186,7 @@ void CodeGenLLVM::visit(BitStructStoreStmt *stmt) { auto &ch = bit_struct_snode->ch[ch_id]; if (has_shared_exponent && ch->exp_snode != nullptr && ch->exp_snode->exponent_users.size() > 1) { - // already handled in store_floats_with_shared_exponents + // already handled in store_quant_floats_with_shared_exponents continue; } auto dtype = ch->dt; @@ -282,7 +282,7 @@ void CodeGenLLVM::visit(BitStructStoreStmt *stmt) { auto &ch = bit_struct_snode->ch[ch_id]; if (has_shared_exponent && ch->exp_snode != nullptr && ch->exp_snode->exponent_users.size() > 1) { - // already handled in store_floats_with_shared_exponents + // already handled in store_quant_floats_with_shared_exponents continue; } auto dtype = ch->dt; @@ -306,7 +306,8 @@ void CodeGenLLVM::visit(BitStructStoreStmt *stmt) { } } -void CodeGenLLVM::store_floats_with_shared_exponents(BitStructStoreStmt *stmt) { +void CodeGenLLVM::store_quant_floats_with_shared_exponents( + BitStructStoreStmt *stmt) { // handle each exponent separately auto snode = stmt->get_bit_struct_snode(); auto bit_struct_physical_type = @@ -333,15 +334,14 @@ void CodeGenLLVM::store_floats_with_shared_exponents(BitStructStoreStmt *stmt) { input != stmt->ch_ids.end()) { floats.push_back(llvm_val[stmt->values[input - stmt->ch_ids.begin()]]); } else { - floats.push_back( - reconstruct_float_from_bit_struct(local_bit_struct, user)); + floats.push_back(extract_quant_float(local_bit_struct, user)); } } // convert to i32 for bit operations llvm::Value *max_exp_bits = nullptr; for (auto f : floats) { // TODO: we only support f32 here. 
- auto exp_bits = extract_exponent_from_float(f); + auto exp_bits = extract_exponent_from_f32(f); if (max_exp_bits) { max_exp_bits = create_call("max_u32", {max_exp_bits, exp_bits}); } else { @@ -374,8 +374,8 @@ void CodeGenLLVM::store_floats_with_shared_exponents(BitStructStoreStmt *stmt) { for (int c = 0; c < (int)exp->exponent_users.size(); c++) { auto user = exp->exponent_users[c]; auto ch_id = snode->child_id(user); - auto digits = - get_float_digits_with_shared_exponents(floats[c], max_exp_bits); + auto digits = extract_digits_from_quant_float_with_shared_exponent( + floats[c], max_exp_bits); auto digits_snode = snode->ch[ch_id].get(); auto cft = digits_snode->dt->as(); auto digits_bit_offset = digits_snode->bit_offset; @@ -418,14 +418,14 @@ void CodeGenLLVM::store_floats_with_shared_exponents(BitStructStoreStmt *stmt) { stmt->is_atomic); } -llvm::Value *CodeGenLLVM::extract_exponent_from_float(llvm::Value *f) { +llvm::Value *CodeGenLLVM::extract_exponent_from_f32(llvm::Value *f) { TI_ASSERT(f->getType() == llvm::Type::getFloatTy(*llvm_context)); f = builder->CreateBitCast(f, llvm::Type::getInt32Ty(*llvm_context)); auto exp_bits = builder->CreateLShr(f, tlctx->get_constant(23)); return builder->CreateAnd(exp_bits, tlctx->get_constant((1 << 8) - 1)); } -llvm::Value *CodeGenLLVM::extract_digits_from_float(llvm::Value *f, bool full) { +llvm::Value *CodeGenLLVM::extract_digits_from_f32(llvm::Value *f, bool full) { TI_ASSERT(f->getType() == llvm::Type::getFloatTy(*llvm_context)); f = builder->CreateBitCast(f, llvm::Type::getInt32Ty(*llvm_context)); auto digits = builder->CreateAnd(f, tlctx->get_constant((1 << 23) - 1)); @@ -435,10 +435,10 @@ llvm::Value *CodeGenLLVM::extract_digits_from_float(llvm::Value *f, bool full) { return digits; } -llvm::Value *CodeGenLLVM::get_float_digits_with_shared_exponents( +llvm::Value *CodeGenLLVM::extract_digits_from_quant_float_with_shared_exponent( llvm::Value *f, llvm::Value *shared_exp) { - auto exp = 
extract_exponent_from_float(f); + auto exp = extract_exponent_from_f32(f); auto exp_offset = builder->CreateSub(shared_exp, exp); // TODO: handle negative digits @@ -454,42 +454,40 @@ llvm::Value *CodeGenLLVM::get_float_digits_with_shared_exponents( builder->CreateZExt(exp_non_zero, llvm::Type::getInt32Ty(*llvm_context)); auto implicit_bit = builder->CreateShl(exp_non_zero, tlctx->get_constant(23)); - auto digits = extract_digits_from_float(f, true); + auto digits = extract_digits_from_f32(f, true); digits = builder->CreateOr(digits, implicit_bit); exp_offset = create_call("min_u32", {exp_offset, tlctx->get_constant(31)}); return builder->CreateLShr(digits, exp_offset); } -llvm::Value *CodeGenLLVM::reconstruct_float_from_bit_struct( - llvm::Value *local_bit_struct, - SNode *digits_snode) { +llvm::Value *CodeGenLLVM::extract_quant_float(llvm::Value *local_bit_struct, + SNode *digits_snode) { auto cft = digits_snode->dt->as(); auto exponent_type = cft->get_exponent_type()->as(); auto digits_type = cft->get_digits_type()->as(); - auto digits = extract_custom_int( - local_bit_struct, tlctx->get_constant(digits_snode->bit_offset), - digits_type); - auto exponent = extract_custom_int( + auto digits = extract_quant_int(local_bit_struct, + tlctx->get_constant(digits_snode->bit_offset), + digits_type); + auto exponent = extract_quant_int( local_bit_struct, tlctx->get_constant(digits_snode->exp_snode->bit_offset), exponent_type); - return reconstruct_custom_float_with_exponent( - digits, exponent, cft, digits_snode->owns_shared_exponent); + return reconstruct_quant_float(digits, exponent, cft, + digits_snode->owns_shared_exponent); } -llvm::Value *CodeGenLLVM::load_as_custom_int(llvm::Value *ptr, - Type *load_type) { +llvm::Value *CodeGenLLVM::load_quant_int(llvm::Value *ptr, Type *load_type) { auto *cit = load_type->as(); auto [byte_ptr, bit_offset] = load_bit_pointer(ptr); auto bit_level_container = builder->CreateLoad(builder->CreateBitCast( byte_ptr, 
llvm_ptr_type(cit->get_physical_type()))); - return extract_custom_int(bit_level_container, bit_offset, load_type); + return extract_quant_int(bit_level_container, bit_offset, load_type); } -llvm::Value *CodeGenLLVM::extract_custom_int(llvm::Value *physical_value, - llvm::Value *bit_offset, - Type *load_type) { +llvm::Value *CodeGenLLVM::extract_quant_int(llvm::Value *physical_value, + llvm::Value *bit_offset, + Type *load_type) { // bit shifting // first left shift `physical_type - (offset + num_bits)` // then right shift `physical_type - num_bits` @@ -515,8 +513,8 @@ llvm::Value *CodeGenLLVM::extract_custom_int(llvm::Value *physical_value, cit->get_is_signed()); } -llvm::Value *CodeGenLLVM::reconstruct_custom_float(llvm::Value *digits, - CustomFloatType *cft) { +llvm::Value *CodeGenLLVM::reconstruct_quant_fixed(llvm::Value *digits, + CustomFloatType *cft) { // Compute float(digits) * scale llvm::Value *cast = nullptr; auto compute_type = cft->get_compute_type()->as(); @@ -531,24 +529,22 @@ llvm::Value *CodeGenLLVM::reconstruct_custom_float(llvm::Value *digits, return builder->CreateFMul(cast, s); } -llvm::Value *CodeGenLLVM::load_custom_float_with_exponent( - llvm::Value *digits_bit_ptr, - llvm::Value *exponent_bit_ptr, - CustomFloatType *cft, - bool shared_exponent) { +llvm::Value *CodeGenLLVM::load_quant_float(llvm::Value *digits_bit_ptr, + llvm::Value *exponent_bit_ptr, + CustomFloatType *cft, + bool shared_exponent) { // TODO: we ignore "scale" for CustomFloatType with exponent for now. May need // to support this in the future. 
TI_ASSERT(cft->get_scale() == 1); - auto digits = load_as_custom_int(digits_bit_ptr, cft->get_digits_type()); + auto digits = load_quant_int(digits_bit_ptr, cft->get_digits_type()); - auto exponent_val = load_as_custom_int( + auto exponent_val = load_quant_int( exponent_bit_ptr, cft->get_exponent_type()->as()); - return reconstruct_custom_float_with_exponent(digits, exponent_val, cft, - shared_exponent); + return reconstruct_quant_float(digits, exponent_val, cft, shared_exponent); } -llvm::Value *CodeGenLLVM::reconstruct_custom_float_with_exponent( +llvm::Value *CodeGenLLVM::reconstruct_quant_float( llvm::Value *input_digits, llvm::Value *input_exponent_val, CustomFloatType *cft, @@ -647,7 +643,7 @@ llvm::Value *CodeGenLLVM::reconstruct_custom_float_with_exponent( } } -llvm::Value *CodeGenLLVM::load_custom_float(Stmt *ptr_stmt) { +llvm::Value *CodeGenLLVM::load_quant_fixed_or_quant_float(Stmt *ptr_stmt) { auto ptr = ptr_stmt->as(); auto cft = ptr->ret_type->as() ->get_pointee_type() @@ -661,12 +657,11 @@ llvm::Value *CodeGenLLVM::load_custom_float(Stmt *ptr_stmt) { TI_ASSERT(digits_snode->parent == exponent_snode->parent); auto exponent_bit_ptr = offset_bit_ptr( digits_bit_ptr, exponent_snode->bit_offset - digits_snode->bit_offset); - return load_custom_float_with_exponent(digits_bit_ptr, exponent_bit_ptr, - cft, - digits_snode->owns_shared_exponent); + return load_quant_float(digits_bit_ptr, exponent_bit_ptr, cft, + digits_snode->owns_shared_exponent); } else { - auto digits = load_as_custom_int(llvm_val[ptr], cft->get_digits_type()); - return reconstruct_custom_float(digits, cft); + auto digits = load_quant_int(llvm_val[ptr], cft->get_digits_type()); + return reconstruct_quant_fixed(digits, cft); } } From f3786e546cc028a90388999adf422a2171368dbb Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Thu, 9 Jun 2022 17:51:52 +0800 Subject: [PATCH 164/176] [bug] Fix build without llvm backend crash (#5113) * [bug] Fix build without llvm backend crash * Update 
taichi/python/export_lang.cpp Co-authored-by: yekuang Co-authored-by: yekuang --- python/taichi/_lib/utils.py | 4 ++-- taichi/python/export_lang.cpp | 8 ++++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/python/taichi/_lib/utils.py b/python/taichi/_lib/utils.py index 364fd86345278..fd90b9effd1c7 100644 --- a/python/taichi/_lib/utils.py +++ b/python/taichi/_lib/utils.py @@ -146,8 +146,8 @@ def _print_taichi_header(): except: pass - llvm_version = ti_core.get_llvm_version_string() - header += f'llvm {llvm_version}, ' + llvm_target_support = ti_core.get_llvm_target_support() + header += f'llvm {llvm_target_support}, ' commit_hash = ti_core.get_commit_hash() commit_hash = commit_hash[:8] diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index 228e57b67e4bc..cdc97b8f49b43 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -938,9 +938,13 @@ void export_lang(py::module &m) { m.def("get_version_major", get_version_major); m.def("get_version_minor", get_version_minor); m.def("get_version_patch", get_version_patch); -#if TI_WITH_LLVM - m.def("get_llvm_version_string", [] { return LLVM_VERSION_STRING; }); + m.def("get_llvm_target_support", [] { +#if defined(TI_WITH_LLVM) + return LLVM_VERSION_STRING; +#else + return "targets unsupported"; #endif + }); m.def("test_printf", [] { printf("test_printf\n"); }); m.def("test_logging", [] { TI_INFO("test_logging"); }); m.def("trigger_crash", [] { *(int *)(1) = 0; }); From 1acb8d15177e6da86f2191d3940eb73cfacbf18d Mon Sep 17 00:00:00 2001 From: Bo Qiao Date: Thu, 9 Jun 2022 17:53:40 +0800 Subject: [PATCH 165/176] [build] [refactor] Move Vulkan runtime out of backends dir (#5106) * Precommit fix * Add spirv source * Move device code back to backends * Expose glfw include in vulkan rhi * Fix llvm include * Fix include for test --- cmake/TaichiCore.cmake | 36 ++++++++++++----- cmake/TaichiTests.cmake | 5 +++ taichi/backends/vulkan/CMakeLists.txt | 39 
+++++++++++++++++++ taichi/backends/vulkan/vulkan_api.h | 2 +- taichi/backends/vulkan/vulkan_device.h | 2 +- taichi/program/program.cpp | 2 +- taichi/runtime/program_impls/CMakeLists.txt | 3 ++ .../program_impls/vulkan/CMakeLists.txt | 19 +++++++++ .../program_impls}/vulkan/vulkan_program.cpp | 2 +- .../program_impls}/vulkan/vulkan_program.h | 0 10 files changed, 96 insertions(+), 14 deletions(-) create mode 100644 taichi/backends/vulkan/CMakeLists.txt create mode 100644 taichi/runtime/program_impls/CMakeLists.txt create mode 100644 taichi/runtime/program_impls/vulkan/CMakeLists.txt rename taichi/{backends => runtime/program_impls}/vulkan/vulkan_program.cpp (99%) rename taichi/{backends => runtime/program_impls}/vulkan/vulkan_program.h (100%) diff --git a/cmake/TaichiCore.cmake b/cmake/TaichiCore.cmake index c3ccfe0ce47f0..0c9c41e59cdd3 100644 --- a/cmake/TaichiCore.cmake +++ b/cmake/TaichiCore.cmake @@ -90,6 +90,7 @@ endif() +# TODO 4832: Split source per target, do not include everything in taichi_core_source file(GLOB TAICHI_CORE_SOURCE "taichi/*/*/*/*.cpp" "taichi/*/*/*.cpp" "taichi/*/*.cpp" "taichi/*.cpp" "taichi/*/*/*/*.h" "taichi/*/*/*.h" "taichi/*/*.h" "taichi/*.h" "tests/cpp/task/*.cpp") @@ -103,7 +104,6 @@ file(GLOB TAICHI_METAL_SOURCE "taichi/backends/metal/*.h" "taichi/backends/metal file(GLOB TAICHI_OPENGL_SOURCE "taichi/backends/opengl/*.h" "taichi/backends/opengl/*.cpp" "taichi/backends/opengl/shaders/*") file(GLOB TAICHI_DX11_SOURCE "taichi/backends/dx/*.h" "taichi/backends/dx/*.cpp") file(GLOB TAICHI_CC_SOURCE "taichi/backends/cc/*.h" "taichi/backends/cc/*.cpp") -file(GLOB TAICHI_VULKAN_SOURCE "taichi/backends/vulkan/*.h" "taichi/backends/vulkan/*.cpp" "external/SPIRV-Reflect/spirv_reflect.c") file(GLOB TAICHI_INTEROP_SOURCE "taichi/backends/interop/*.cpp" "taichi/backends/interop/*.h") @@ -195,7 +195,6 @@ endif() if (TI_WITH_VULKAN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DTI_WITH_VULKAN") - list(APPEND TAICHI_CORE_SOURCE ${TAICHI_VULKAN_SOURCE}) 
endif() @@ -230,6 +229,18 @@ if (TAICHI_EMBIND_SOURCE) list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_EMBIND_SOURCE}) endif() + +# TODO(#4832), Remove vulkan runtime files from TAICHI_CORE_SOURCE +# Remove this after all sources are splitted into targets. +file(GLOB TAICHI_VULKAN_TEMP_SOURCE + "taichi/backends/vulkan/*.h" + "taichi/backends/vulkan/*.cpp" + "taichi/runtime/program_impls/vulkan/*.h" + "taichi/runtime/program_impls/vulkan/*.cpp" +) +list(REMOVE_ITEM TAICHI_CORE_SOURCE ${TAICHI_VULKAN_TEMP_SOURCE}) + + # TODO(#2196): Rename these CMAKE variables: # CORE_LIBRARY_NAME --> TAICHI_ISOLATED_CORE_LIB_NAME # CORE_WITH_PYBIND_LIBRARY_NAME --> TAICHI_CORE_LIB_NAME @@ -381,16 +392,9 @@ target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/SPIRV-Reflect) add_subdirectory(taichi/runtime/gfx) target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE gfx_runtime) + # Vulkan Device API if (TI_WITH_VULKAN) - target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/Vulkan-Headers/include) - - target_include_directories(${CORE_LIBRARY_NAME} PRIVATE external/volk) - - - # By specifying SYSTEM, we suppressed the warnings from third-party headers. 
- target_include_directories(${CORE_LIBRARY_NAME} SYSTEM PRIVATE external/VulkanMemoryAllocator/include) - if (APPLE) find_library(MOLTEN_VK libMoltenVK.dylib PATHS $HOMEBREW_CELLAR/molten-vk $VULKAN_SDK REQUIRED) configure_file(${MOLTEN_VK} ${CMAKE_BINARY_DIR}/libMoltenVK.dylib COPYONLY) @@ -399,6 +403,14 @@ if (TI_WITH_VULKAN) install(FILES ${CMAKE_BINARY_DIR}/libMoltenVK.dylib DESTINATION ${INSTALL_LIB_DIR}/runtime) endif() endif() + add_subdirectory(taichi/backends/vulkan) + + # TODO: this dependency is here because program.cpp includes vulkan_program.h + # Should be removed + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE vulkan_rhi) + + add_subdirectory(taichi/runtime/program_impls) + target_link_libraries(${CORE_LIBRARY_NAME} PRIVATE vulkan_program_impl) endif () @@ -492,6 +504,10 @@ if(TI_WITH_PYTHON AND NOT TI_EMSCRIPTENED) ${PROJECT_SOURCE_DIR}/external/imgui ${PROJECT_SOURCE_DIR}/external/imgui/backends ) + target_include_directories(${CORE_WITH_PYBIND_LIBRARY_NAME} SYSTEM + PRIVATE + ${PROJECT_SOURCE_DIR}/external/VulkanMemoryAllocator/include + ) if (NOT ANDROID) target_include_directories(${CORE_WITH_PYBIND_LIBRARY_NAME} diff --git a/cmake/TaichiTests.cmake b/cmake/TaichiTests.cmake index 6fd216b99fe37..498aa60475600 100644 --- a/cmake/TaichiTests.cmake +++ b/cmake/TaichiTests.cmake @@ -48,6 +48,11 @@ target_include_directories(${TESTS_NAME} ${PROJECT_SOURCE_DIR}/external/Vulkan-Headers/include ) +target_include_directories(${TESTS_NAME} SYSTEM + PRIVATE + ${PROJECT_SOURCE_DIR}/external/VulkanMemoryAllocator/include + ) + if (NOT ANDROID) target_include_directories(${TESTS_NAME} PRIVATE diff --git a/taichi/backends/vulkan/CMakeLists.txt b/taichi/backends/vulkan/CMakeLists.txt new file mode 100644 index 0000000000000..1c65ad750850b --- /dev/null +++ b/taichi/backends/vulkan/CMakeLists.txt @@ -0,0 +1,39 @@ +# ./taichi/backends/vulkan/CMakeLists.txt + +set(VULKAN_RHI vulkan_rhi) +add_library(${VULKAN_RHI}) +target_sources(${VULKAN_RHI} + PRIVATE + 
vulkan_api.cpp + vulkan_device.cpp + vulkan_device_creator.cpp + vulkan_loader.cpp + vulkan_memory_allocator.cpp + ${PROJECT_SOURCE_DIR}/external/SPIRV-Reflect/spirv_reflect.c + ) + +#TODO 4832, some dependencies here should not be required as they +# are build requirements of other targets. +# public dirs here are required by backends/device.cpp +target_include_directories(${VULKAN_RHI} + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/FP16/include + ${PROJECT_SOURCE_DIR}/external/SPIRV-Reflect + PRIVATE + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${LLVM_INCLUDE_DIRS} + PUBLIC + ${PROJECT_SOURCE_DIR}/external/volk + ${PROJECT_SOURCE_DIR}/external/Vulkan-Headers/include + ${PROJECT_SOURCE_DIR}/external/glfw/include + ) + +# By specifying SYSTEM, we suppressed the warnings from third-party headers. +# This is used to bypass unused variables in the header vk_mem_alloc.h +target_include_directories(${VULKAN_RHI} SYSTEM + PUBLIC + ${PROJECT_SOURCE_DIR}/external/VulkanMemoryAllocator/include + ) diff --git a/taichi/backends/vulkan/vulkan_api.h b/taichi/backends/vulkan/vulkan_api.h index ecc794d521b6b..2c080cd17a9a4 100644 --- a/taichi/backends/vulkan/vulkan_api.h +++ b/taichi/backends/vulkan/vulkan_api.h @@ -2,7 +2,7 @@ #include "taichi/backends/vulkan/vulkan_common.h" -#include +#include #include #include diff --git a/taichi/backends/vulkan/vulkan_device.h b/taichi/backends/vulkan/vulkan_device.h index 852486aed7a58..d983bb5174dd1 100644 --- a/taichi/backends/vulkan/vulkan_device.h +++ b/taichi/backends/vulkan/vulkan_device.h @@ -2,7 +2,7 @@ #include "taichi/backends/vulkan/vulkan_api.h" -#include +#include #ifdef ANDROID #include diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 994fcbfbdf351..d94b6b6ddff2b 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -29,7 +29,7 @@ #include 
"taichi/backends/cc/cc_program.h" #endif #ifdef TI_WITH_VULKAN -#include "taichi/backends/vulkan/vulkan_program.h" +#include "taichi/runtime/program_impls/vulkan/vulkan_program.h" #include "taichi/backends/vulkan/vulkan_loader.h" #endif #ifdef TI_WITH_DX11 diff --git a/taichi/runtime/program_impls/CMakeLists.txt b/taichi/runtime/program_impls/CMakeLists.txt new file mode 100644 index 0000000000000..ab14f3f71c843 --- /dev/null +++ b/taichi/runtime/program_impls/CMakeLists.txt @@ -0,0 +1,3 @@ +# ./taichi/runtime/program_impls/CMakeLists.txt + +add_subdirectory(vulkan) diff --git a/taichi/runtime/program_impls/vulkan/CMakeLists.txt b/taichi/runtime/program_impls/vulkan/CMakeLists.txt new file mode 100644 index 0000000000000..01b9f1b4351ff --- /dev/null +++ b/taichi/runtime/program_impls/vulkan/CMakeLists.txt @@ -0,0 +1,19 @@ +# ./taichi/runtime/program_impls/vulkan/CMakeLists.txt + +add_library(vulkan_program_impl) +target_sources(vulkan_program_impl + PRIVATE + vulkan_program.cpp + ) + +target_include_directories(vulkan_program_impl + PRIVATE + ${PROJECT_SOURCE_DIR} + ${PROJECT_SOURCE_DIR}/external/eigen + ${PROJECT_SOURCE_DIR}/external/spdlog/include + ${PROJECT_SOURCE_DIR}/external/SPIRV-Tools/include + ${LLVM_INCLUDE_DIRS} + ) + +target_link_libraries(vulkan_program_impl PRIVATE vulkan_rhi) +target_link_libraries(vulkan_program_impl PRIVATE gfx_runtime) diff --git a/taichi/backends/vulkan/vulkan_program.cpp b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp similarity index 99% rename from taichi/backends/vulkan/vulkan_program.cpp rename to taichi/runtime/program_impls/vulkan/vulkan_program.cpp index d7cac58be11b8..274f45892e080 100644 --- a/taichi/backends/vulkan/vulkan_program.cpp +++ b/taichi/runtime/program_impls/vulkan/vulkan_program.cpp @@ -1,4 +1,4 @@ -#include "taichi/backends/vulkan/vulkan_program.h" +#include "taichi/runtime/program_impls/vulkan/vulkan_program.h" #include "taichi/runtime/gfx/aot_module_builder_impl.h" #include 
"taichi/runtime/gfx/snode_tree_manager.h" diff --git a/taichi/backends/vulkan/vulkan_program.h b/taichi/runtime/program_impls/vulkan/vulkan_program.h similarity index 100% rename from taichi/backends/vulkan/vulkan_program.h rename to taichi/runtime/program_impls/vulkan/vulkan_program.h From 928aef1e1436ac642fd5d0d10338ab94a0087b9e Mon Sep 17 00:00:00 2001 From: Mingrui Zhang <33411325+erizmr@users.noreply.github.com> Date: Thu, 9 Jun 2022 22:56:32 +0800 Subject: [PATCH 166/176] [autodiff] Add forward mode pipeline for autodiff pass (#5098) * Add forward mode pipeline for autodiff pass * Replace the grad parameter with AutodiffMode to distinguish three kinds of kernels primal, forward ad and reverse ad --- cpp_examples/autograd.cpp | 9 ++- python/taichi/lang/enums.py | 3 +- python/taichi/lang/kernel_impl.py | 30 +++++--- taichi/analysis/offline_cache_util.cpp | 3 +- taichi/backends/cc/codegen_cc.cpp | 3 +- taichi/backends/opengl/codegen_opengl.cpp | 3 +- taichi/codegen/spirv/spirv_codegen.cpp | 3 +- taichi/inc/constants.h | 2 + taichi/ir/transforms.h | 7 +- taichi/program/function.cpp | 4 +- taichi/program/kernel.cpp | 42 ++++++----- taichi/program/kernel.h | 10 +-- taichi/program/program.h | 8 +- taichi/python/export_lang.cpp | 10 ++- taichi/transforms/auto_diff.cpp | 89 ++++++++++++++++++----- taichi/transforms/compile_to_offloads.cpp | 27 +++---- 16 files changed, 165 insertions(+), 88 deletions(-) diff --git a/cpp_examples/autograd.cpp b/cpp_examples/autograd.cpp index 81c5a0f4422bc..1f46a1e338ab3 100644 --- a/cpp_examples/autograd.cpp +++ b/cpp_examples/autograd.cpp @@ -118,7 +118,7 @@ void autograd() { std::make_unique(program, builder.extract_ir(), "init"); } - auto get_kernel_cal = [&](bool grad) -> Kernel * { + auto get_kernel_cal = [&](AutodiffMode autodiff_mode) -> Kernel * { IRBuilder builder; auto *loop = builder.create_struct_for(a, 0, 4); { @@ -132,10 +132,11 @@ void autograd() { std::make_unique(AtomicOpType::add, c_i, val)); } - return new 
Kernel(program, builder.extract_ir(), "cal", grad); + return new Kernel(program, builder.extract_ir(), "cal", autodiff_mode); }; - kernel_forward = std::unique_ptr(get_kernel_cal(false)); - kernel_backward = std::unique_ptr(get_kernel_cal(true)); + kernel_forward = std::unique_ptr(get_kernel_cal(AutodiffMode::kNone)); + kernel_backward = + std::unique_ptr(get_kernel_cal(AutodiffMode::kReverse)); { IRBuilder builder; diff --git a/python/taichi/lang/enums.py b/python/taichi/lang/enums.py index 1eb81f136a834..acdd28bfbd2ab 100644 --- a/python/taichi/lang/enums.py +++ b/python/taichi/lang/enums.py @@ -1,5 +1,6 @@ from taichi._lib import core as _ti_core Layout = _ti_core.Layout +AutodiffMode = _ti_core.AutodiffMode -__all__ = ['Layout'] +__all__ = ['Layout', 'AutodiffMode'] diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index 884298e9f99c7..f1d302b81f326 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -12,7 +12,7 @@ from taichi.lang.ast import (ASTTransformerContext, KernelSimplicityASTChecker, transform_tree) from taichi.lang.ast.ast_transformer_utils import ReturnStatus -from taichi.lang.enums import Layout +from taichi.lang.enums import AutodiffMode, Layout from taichi.lang.exception import (TaichiCompilationError, TaichiRuntimeError, TaichiRuntimeTypeError, TaichiSyntaxError, handle_exception_from_cpp) @@ -210,7 +210,8 @@ def __call__(self, *args, **kwargs): return self.func(*args) if self.is_real_function: - if impl.get_runtime().current_kernel.is_grad: + if impl.get_runtime( + ).current_kernel.autodiff_mode != AutodiffMode.NONE: raise TaichiSyntaxError( "Real function in gradient kernels unsupported.") instance_id, _ = self.mapper.lookup(args) @@ -400,11 +401,11 @@ def _get_global_vars(_func): class Kernel: counter = 0 - def __init__(self, _func, is_grad, _classkernel=False): + def __init__(self, _func, autodiff_mode, _classkernel=False): self.func = _func self.kernel_counter = 
Kernel.counter Kernel.counter += 1 - self.is_grad = is_grad + self.autodiff_mode = autodiff_mode self.grad = None self.arguments = [] self.return_type = None @@ -422,7 +423,7 @@ def __init__(self, _func, is_grad, _classkernel=False): def reset(self): self.runtime = impl.get_runtime() - if self.is_grad: + if self.autodiff_mode != AutodiffMode.NONE: self.compiled_functions = self.runtime.compiled_grad_functions else: self.compiled_functions = self.runtime.compiled_functions @@ -485,7 +486,7 @@ def materialize(self, key=None, args=None, arg_features=None): if key in self.compiled_functions: return grad_suffix = "" - if self.is_grad: + if self.autodiff_mode != AutodiffMode.NONE: grad_suffix = "_grad" kernel_name = f"{self.func.__name__}_c{self.kernel_counter}_{key[1]}{grad_suffix}" _logging.trace(f"Compiling kernel {kernel_name}...") @@ -496,7 +497,7 @@ def materialize(self, key=None, args=None, arg_features=None): excluded_parameters=self.template_slot_locations, arg_features=arg_features) - if self.is_grad: + if self.autodiff_mode != AutodiffMode.NONE: KernelSimplicityASTChecker(self.func).visit(tree) if impl.current_cfg().use_mesh: @@ -526,7 +527,7 @@ def taichi_ast_generator(kernel_cxx): self.runtime.current_kernel = None taichi_kernel = impl.get_runtime().prog.create_kernel( - taichi_ast_generator, kernel_name, self.is_grad) + taichi_ast_generator, kernel_name, self.autodiff_mode) self.kernel_cpp = taichi_kernel @@ -725,7 +726,7 @@ def func__(*args): # Both the class kernels and the plain-function kernels are unified now. # In both cases, |self.grad| is another Kernel instance that computes the # gradient. For class kernels, args[0] is always the kernel owner. 
- if not self.is_grad and self.runtime.target_tape and not self.runtime.grad_replaced: + if self.autodiff_mode == AutodiffMode.NONE and self.runtime.target_tape and not self.runtime.grad_replaced: self.runtime.target_tape.insert(self, args) if actual_argument_slot > 8 and ( @@ -797,7 +798,8 @@ def ensure_compiled(self, *args): @_shell_pop_print def __call__(self, *args, **kwargs): args = _process_args(self, args, kwargs) - if self.is_grad and impl.current_cfg().opt_level == 0: + if self.autodiff_mode != AutodiffMode.NONE and impl.current_cfg( + ).opt_level == 0: _logging.warn( """opt_level = 1 is enforced to enable gradient computation.""" ) @@ -845,8 +847,12 @@ def _kernel_impl(_func, level_of_class_stackframe, verbose=False): if verbose: print(f'kernel={_func.__name__} is_classkernel={is_classkernel}') - primal = Kernel(_func, is_grad=False, _classkernel=is_classkernel) - adjoint = Kernel(_func, is_grad=True, _classkernel=is_classkernel) + primal = Kernel(_func, + autodiff_mode=AutodiffMode.NONE, + _classkernel=is_classkernel) + adjoint = Kernel(_func, + autodiff_mode=AutodiffMode.REVERSE, + _classkernel=is_classkernel) # Having |primal| contains |grad| makes the tape work. primal.grad = adjoint diff --git a/taichi/analysis/offline_cache_util.cpp b/taichi/analysis/offline_cache_util.cpp index 2d4793d39c86e..66b344b184c30 100644 --- a/taichi/analysis/offline_cache_util.cpp +++ b/taichi/analysis/offline_cache_util.cpp @@ -181,7 +181,8 @@ std::string get_hashed_offline_cache_key(CompileConfig *config, hasher.finish(); auto res = picosha2::get_hash_hex_string(hasher); - res.insert(res.begin(), kernel->grad ? 'g' : 'n'); + res.insert(res.begin(), + kernel->autodiff_mode != AutodiffMode::kNone ? 
'g' : 'n'); return res; } diff --git a/taichi/backends/cc/codegen_cc.cpp b/taichi/backends/cc/codegen_cc.cpp index 838f859cbbb92..65cf4307c9923 100644 --- a/taichi/backends/cc/codegen_cc.cpp +++ b/taichi/backends/cc/codegen_cc.cpp @@ -48,7 +48,8 @@ class CCTransformer : public IRVisitor { auto ir = kernel_->ir.get(); auto config = kernel_->program->config; config.demote_dense_struct_fors = true; - irpass::compile_to_executable(ir, config, kernel_, kernel_->grad, + irpass::compile_to_executable(ir, config, kernel_, + /*autodiff_mode=*/kernel_->autodiff_mode, /*ad_use_stack=*/true, config.print_ir, /*lower_global_access*/ true); } diff --git a/taichi/backends/opengl/codegen_opengl.cpp b/taichi/backends/opengl/codegen_opengl.cpp index 910bde1882d65..6c9614764c810 100644 --- a/taichi/backends/opengl/codegen_opengl.cpp +++ b/taichi/backends/opengl/codegen_opengl.cpp @@ -1177,7 +1177,8 @@ void OpenglCodeGen::lower() { auto ir = kernel_->ir.get(); auto &config = kernel_->program->config; config.demote_dense_struct_fors = true; - irpass::compile_to_executable(ir, config, kernel_, kernel_->grad, + irpass::compile_to_executable(ir, config, kernel_, + /*autodiff_mode=*/kernel_->autodiff_mode, /*ad_use_stack=*/false, config.print_ir, /*lower_global_access=*/true, /*make_thread_local=*/config.make_thread_local); diff --git a/taichi/codegen/spirv/spirv_codegen.cpp b/taichi/codegen/spirv/spirv_codegen.cpp index f812fae204433..a57da19e44aba 100644 --- a/taichi/codegen/spirv/spirv_codegen.cpp +++ b/taichi/codegen/spirv/spirv_codegen.cpp @@ -2183,7 +2183,8 @@ void KernelCodegen::run(TaichiKernelAttributes &kernel_attribs, void lower(Kernel *kernel) { auto &config = kernel->program->config; config.demote_dense_struct_fors = true; - irpass::compile_to_executable(kernel->ir.get(), config, kernel, kernel->grad, + irpass::compile_to_executable(kernel->ir.get(), config, kernel, + kernel->autodiff_mode, /*ad_use_stack=*/false, config.print_ir, /*lower_global_access=*/true, 
/*make_thread_local=*/false); diff --git a/taichi/inc/constants.h b/taichi/inc/constants.h index 970091c2d3e28..6e6877db7b4c5 100644 --- a/taichi/inc/constants.h +++ b/taichi/inc/constants.h @@ -48,3 +48,5 @@ T taichi_union_cast(G g) { } enum class ExternalArrayLayout { kAOS, kSOA, kNull }; + +enum class AutodiffMode { kForward, kReverse, kNone }; diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h index a3b601b915f03..bf6be74a3f7a1 100644 --- a/taichi/ir/transforms.h +++ b/taichi/ir/transforms.h @@ -80,6 +80,7 @@ bool lower_access(IRNode *root, const LowerAccessPass::Args &args); void auto_diff(IRNode *root, const CompileConfig &config, + AutodiffMode autodiffMode, bool use_stack = false); /** * Determine all adaptive AD-stacks' size. This pass is idempotent, i.e., @@ -147,7 +148,7 @@ void compile_to_offloads(IRNode *ir, const CompileConfig &config, Kernel *kernel, bool verbose, - bool grad, + AutodiffMode autodiff_mode, bool ad_use_stack, bool start_from_ast); @@ -164,7 +165,7 @@ void offload_to_executable(IRNode *ir, void compile_to_executable(IRNode *ir, const CompileConfig &config, Kernel *kernel, - bool grad, + AutodiffMode autodiff_mode, bool ad_use_stack, bool verbose, bool lower_global_access = true, @@ -176,7 +177,7 @@ void compile_to_executable(IRNode *ir, void compile_function(IRNode *ir, const CompileConfig &config, Function *func, - bool grad, + AutodiffMode autodiff_mode, bool verbose, bool start_from_ast); } // namespace irpass diff --git a/taichi/program/function.cpp b/taichi/program/function.cpp index eb92ef22db84a..ef154208939c6 100644 --- a/taichi/program/function.cpp +++ b/taichi/program/function.cpp @@ -19,7 +19,7 @@ void Function::set_function_body(const std::function &func) { func(); } irpass::compile_function(ir.get(), program->config, this, - /*grad=*/false, + /*autodiff_mode=*/AutodiffMode::kNone, /*verbose=*/program->config.print_ir, /*start_from_ast=*/true); } @@ -27,7 +27,7 @@ void Function::set_function_body(const 
std::function &func) { void Function::set_function_body(std::unique_ptr func_body) { ir = std::move(func_body); irpass::compile_function(ir.get(), program->config, this, - /*grad=*/false, + /*autodiff_mode=*/AutodiffMode::kNone, /*verbose=*/program->config.print_ir, /*start_from_ast=*/false); } diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index 3b12af94ab526..b2747e2939acd 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -23,22 +23,22 @@ class Function; Kernel::Kernel(Program &program, const std::function &func, const std::string &primal_name, - bool grad) { - this->init(program, func, primal_name, grad); + AutodiffMode autodiff_mode) { + this->init(program, func, primal_name, autodiff_mode); } Kernel::Kernel(Program &program, const std::function &func, const std::string &primal_name, - bool grad) { - this->init(program, std::bind(func, this), primal_name, grad); + AutodiffMode autodiff_mode) { + this->init(program, std::bind(func, this), primal_name, autodiff_mode); } Kernel::Kernel(Program &program, std::unique_ptr &&ir, const std::string &primal_name, - bool grad) - : grad(grad), lowered_(false) { + AutodiffMode autodiff_mode) + : autodiff_mode(autodiff_mode), lowered_(false) { this->ir = std::move(ir); this->program = &program; is_accessor = false; @@ -49,10 +49,12 @@ Kernel::Kernel(Program &program, arch = program.config.arch; - if (!grad) { + if (autodiff_mode == AutodiffMode::kNone) { name = primal_name; - } else { - name = primal_name + "_grad"; + } else if (autodiff_mode == AutodiffMode::kForward) { + name = primal_name + "_forward_grad"; + } else if (autodiff_mode == AutodiffMode::kReverse) { + name = primal_name + "_reverse_grad"; } if (!program.config.lazy_compilation) @@ -89,15 +91,17 @@ void Kernel::lower(bool to_executable) { if (to_executable) { irpass::compile_to_executable( - ir.get(), config, this, grad, - /*ad_use_stack=*/true, verbose, /*lower_global_access=*/to_executable, + ir.get(), config, 
this, /*autodiff_mode=*/autodiff_mode, + /*ad_use_stack=*/true, verbose, + /*lower_global_access=*/to_executable, /*make_thread_local=*/config.make_thread_local, /*make_block_local=*/ is_extension_supported(config.arch, Extension::bls) && config.make_block_local, /*start_from_ast=*/ir_is_ast_); } else { - irpass::compile_to_offloads(ir.get(), config, this, verbose, grad, + irpass::compile_to_offloads(ir.get(), config, this, verbose, + /*autodiff_mode=*/autodiff_mode, /*ad_use_stack=*/true, /*start_from_ast=*/ir_is_ast_); } @@ -397,8 +401,8 @@ std::string Kernel::get_name() const { void Kernel::init(Program &program, const std::function &func, const std::string &primal_name, - bool grad) { - this->grad = grad; + AutodiffMode autodiff_mode) { + this->autodiff_mode = autodiff_mode; this->lowered_ = false; this->program = &program; #ifdef TI_WITH_LLVM @@ -415,10 +419,12 @@ void Kernel::init(Program &program, this->arch = program.config.arch; - if (!grad) { - this->name = primal_name; - } else { - this->name = primal_name + "_grad"; + if (autodiff_mode == AutodiffMode::kNone) { + name = primal_name; + } else if (autodiff_mode == AutodiffMode::kForward) { + name = primal_name + "_forward_grad"; + } else if (autodiff_mode == AutodiffMode::kReverse) { + name = primal_name + "_reverse_grad"; } { diff --git a/taichi/program/kernel.h b/taichi/program/kernel.h index 0d98252b8c499..e19ce1916e020 100644 --- a/taichi/program/kernel.h +++ b/taichi/program/kernel.h @@ -20,7 +20,7 @@ class TI_DLL_EXPORT Kernel : public Callable { bool is_accessor{false}; bool is_evaluator{false}; - bool grad{false}; + AutodiffMode autodiff_mode{AutodiffMode::kNone}; class LaunchContextBuilder { public: @@ -69,17 +69,17 @@ class TI_DLL_EXPORT Kernel : public Callable { Kernel(Program &program, const std::function &func, const std::string &name = "", - bool grad = false); + AutodiffMode autodiff_mode = AutodiffMode::kNone); Kernel(Program &program, const std::function &func, const std::string &name = 
"", - bool grad = false); + AutodiffMode autodiff_mode = AutodiffMode::kNone); Kernel(Program &program, std::unique_ptr &&ir, const std::string &name = "", - bool grad = false); + AutodiffMode autodiff_mode = AutodiffMode::kNone); bool lowered() const { return lowered_; @@ -136,7 +136,7 @@ class TI_DLL_EXPORT Kernel : public Callable { void init(Program &program, const std::function &func, const std::string &name = "", - bool grad = false); + AutodiffMode autodiff_mode = AutodiffMode::kNone); // True if |ir| is a frontend AST. False if it's already offloaded to CHI IR. bool ir_is_ast_{false}; diff --git a/taichi/program/program.h b/taichi/program/program.h index 70f3f21950a6f..48025f9b66109 100644 --- a/taichi/program/program.h +++ b/taichi/program/program.h @@ -173,9 +173,9 @@ class TI_DLL_EXPORT Program { Kernel &kernel(const std::function &body, const std::string &name = "", - bool grad = false) { + AutodiffMode autodiff_mode = AutodiffMode::kNone) { // Expr::set_allow_store(true); - auto func = std::make_unique(*this, body, name, grad); + auto func = std::make_unique(*this, body, name, autodiff_mode); // Expr::set_allow_store(false); kernels.emplace_back(std::move(func)); return *kernels.back(); @@ -183,9 +183,9 @@ class TI_DLL_EXPORT Program { Kernel &kernel(const std::function &body, const std::string &name = "", - bool grad = false) { + AutodiffMode autodiff_mode = AutodiffMode::kNone) { // Expr::set_allow_store(true); - auto func = std::make_unique(*this, body, name, grad); + auto func = std::make_unique(*this, body, name, autodiff_mode); // Expr::set_allow_store(false); kernels.emplace_back(std::move(func)); return *kernels.back(); diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index cdc97b8f49b43..ffb7161dda1eb 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -95,6 +95,12 @@ void export_lang(py::module &m) { .value("NULL", ExternalArrayLayout::kNull) .export_values(); + py::enum_(m, 
"AutodiffMode", py::arithmetic()) + .value("NONE", AutodiffMode::kNone) + .value("FORWARD", AutodiffMode::kForward) + .value("REVERSE", AutodiffMode::kReverse) + .export_values(); + // TODO(type): This should be removed py::class_(m, "DataType") .def(py::init()) @@ -365,9 +371,9 @@ void export_lang(py::module &m) { .def( "create_kernel", [](Program *program, const std::function &body, - const std::string &name, bool grad) -> Kernel * { + const std::string &name, AutodiffMode autodiff_mode) -> Kernel * { py::gil_scoped_release release; - return &program->kernel(body, name, grad); + return &program->kernel(body, name, autodiff_mode); }, py::return_value_policy::reference) .def("create_function", &Program::create_function, diff --git a/taichi/transforms/auto_diff.cpp b/taichi/transforms/auto_diff.cpp index 65ade4cd7289f..9103f37a2012a 100644 --- a/taichi/transforms/auto_diff.cpp +++ b/taichi/transforms/auto_diff.cpp @@ -1035,6 +1035,45 @@ class MakeAdjoint : public ADTransform { } }; +// Forward mode autodiff +class MakeDual : public ADTransform { + public: + using ADTransform::visit; + Stmt *current_stmt; + Block *current_block; + // Block *alloca_block; + std::map dual_stmt; + + MakeDual(Block *block) { + current_stmt = nullptr; + // alloca_block = block; + current_block = block; + } + + static void run(Block *block) { + auto p = MakeDual(block); + block->accept(&p); + } + + Stmt *insert_grad_stmt(std::unique_ptr &&stmt) override { + auto ptr = stmt.get(); + current_stmt = current_stmt->insert_after_me(std::move(stmt)); + return ptr; + } + + void visit(Block *block) override { + std::vector statements; + // always make a copy since the list can be modified. 
+ for (auto &stmt : block->statements) { + statements.push_back(stmt.get()); + } + for (auto stmt : statements) { + current_stmt = stmt; + stmt->accept(this); + } + } +}; + class BackupSSA : public BasicStmtVisitor { public: using BasicStmtVisitor::visit; @@ -1131,29 +1170,39 @@ class BackupSSA : public BasicStmtVisitor { namespace irpass { -void auto_diff(IRNode *root, const CompileConfig &config, bool use_stack) { +void auto_diff(IRNode *root, + const CompileConfig &config, + AutodiffMode autodiff_mode, + bool use_stack) { TI_AUTO_PROF; - if (use_stack) { - auto IB = IdentifyIndependentBlocks::run(root); - ReverseOuterLoops::run(root, IB); - - for (auto ib : IB) { - PromoteSSA2LocalVar::run(ib); - ReplaceLocalVarWithStacks replace(config.ad_stack_size); - ib->accept(&replace); - type_check(root, config); - MakeAdjoint::run(ib); + if (autodiff_mode == AutodiffMode::kReverse) { + if (use_stack) { + auto IB = IdentifyIndependentBlocks::run(root); + ReverseOuterLoops::run(root, IB); + + for (auto ib : IB) { + PromoteSSA2LocalVar::run(ib); + ReplaceLocalVarWithStacks replace(config.ad_stack_size); + ib->accept(&replace); + type_check(root, config); + MakeAdjoint::run(ib); + type_check(root, config); + BackupSSA::run(ib); + irpass::analysis::verify(root); + } + } else { + auto IB = IdentifyIndependentBlocks::run(root); + ReverseOuterLoops::run(root, IB); type_check(root, config); - BackupSSA::run(ib); - irpass::analysis::verify(root); - } - } else { - auto IB = IdentifyIndependentBlocks::run(root); - ReverseOuterLoops::run(root, IB); - type_check(root, config); - for (auto ib : IB) { - MakeAdjoint::run(ib); + for (auto ib : IB) { + MakeAdjoint::run(ib); + } } + } else if (autodiff_mode == AutodiffMode::kForward) { + // Forward mode autodiff + Block *block = root->as(); + PromoteSSA2LocalVar::run(block); + MakeDual::run(block); } type_check(root, config); irpass::analysis::verify(root); diff --git a/taichi/transforms/compile_to_offloads.cpp 
b/taichi/transforms/compile_to_offloads.cpp index 2bfc8f5e21065..cfe63000e7194 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -33,7 +33,7 @@ void compile_to_offloads(IRNode *ir, const CompileConfig &config, Kernel *kernel, bool verbose, - bool grad, + AutodiffMode autodiff_mode, bool ad_use_stack, bool start_from_ast) { TI_AUTO_PROF; @@ -41,7 +41,7 @@ void compile_to_offloads(IRNode *ir, auto print = make_pass_printer(verbose, kernel->get_name(), ir); print("Initial IR"); - if (grad) { + if (autodiff_mode == AutodiffMode::kReverse) { irpass::reverse_segments(ir); print("Segment reversed (for autodiff)"); } @@ -57,7 +57,7 @@ void compile_to_offloads(IRNode *ir, irpass::analysis::verify(ir); if (kernel->is_evaluator) { - TI_ASSERT(!grad); + TI_ASSERT(autodiff_mode == AutodiffMode::kNone); irpass::demote_operations(ir, config); print("Operations demoted"); @@ -85,12 +85,12 @@ void compile_to_offloads(IRNode *ir, irpass::analysis::gather_meshfor_relation_types(ir); } - if (grad) { + if (autodiff_mode != AutodiffMode::kNone) { // Remove local atomics here so that we don't have to handle their gradients irpass::demote_atomics(ir, config); irpass::full_simplify(ir, config, {false, kernel->program}); - irpass::auto_diff(ir, config, ad_use_stack); + irpass::auto_diff(ir, config, autodiff_mode, ad_use_stack); irpass::full_simplify(ir, config, {false, kernel->program}); print("Gradient"); irpass::analysis::verify(ir); @@ -256,7 +256,7 @@ void offload_to_executable(IRNode *ir, void compile_to_executable(IRNode *ir, const CompileConfig &config, Kernel *kernel, - bool grad, + AutodiffMode autodiff_mode, bool ad_use_stack, bool verbose, bool lower_global_access, @@ -265,19 +265,20 @@ void compile_to_executable(IRNode *ir, bool start_from_ast) { TI_AUTO_PROF; - compile_to_offloads(ir, config, kernel, verbose, grad, ad_use_stack, + compile_to_offloads(ir, config, kernel, verbose, autodiff_mode, ad_use_stack, 
start_from_ast); - offload_to_executable(ir, config, kernel, verbose, - /*determine_ad_stack_size=*/grad && ad_use_stack, - lower_global_access, make_thread_local, - make_block_local); + offload_to_executable( + ir, config, kernel, verbose, + /*determine_ad_stack_size=*/autodiff_mode == AutodiffMode::kReverse && + ad_use_stack, + lower_global_access, make_thread_local, make_block_local); } void compile_function(IRNode *ir, const CompileConfig &config, Function *func, - bool grad, + AutodiffMode autodiff_mode, bool verbose, bool start_from_ast) { TI_AUTO_PROF; @@ -285,7 +286,7 @@ void compile_function(IRNode *ir, auto print = make_pass_printer(verbose, func->get_name(), ir); print("Initial IR"); - if (grad) { + if (autodiff_mode == AutodiffMode::kReverse) { irpass::reverse_segments(ir); print("Segment reversed (for autodiff)"); } From 88f75a9959d01c0035c395f9d7fddfcfcf27af31 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 10 Jun 2022 10:09:12 +0800 Subject: [PATCH 167/176] [aot] [llvm] LLVM AOT Field #0: Implemented FieldCacheData & refactored initialize_llvm_runtime_snodes() (#5108) * [aot] [llvm] Implemented FieldCacheData and refactored initialize_llvm_runtime_snodes() * Addressed compilation erros * Added initialization for struct members * Minor fix --- taichi/llvm/llvm_offline_cache.h | 32 +++++++++++++- taichi/llvm/llvm_program.cpp | 71 +++++++++++++++++++++----------- taichi/llvm/llvm_program.h | 6 +-- 3 files changed, 82 insertions(+), 27 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index c82837e42521e..56de40d4c3276 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -42,7 +42,37 @@ struct LlvmOfflineCache { TI_IO_DEF(kernel_key, args, offloaded_task_list); }; - std::unordered_map kernels; + struct FieldCacheData { + struct SNodeCacheData { + int id{0}; + SNodeType type = SNodeType::undefined; + size_t cell_size_bytes{0}; + size_t chunk_size{0}; + + TI_IO_DEF(id, 
type, cell_size_bytes, chunk_size); + }; + + int tree_id{0}; + int root_id{0}; + size_t root_size{0}; + std::vector snode_metas; + + TI_IO_DEF(tree_id, root_id, root_size, snode_metas); + + // TODO(zhanlue) + // Serialize/Deserialize the llvm::Module from StructCompiler + // At runtime, make sure loaded Field-Modules and Kernel-Modules are linked + // altogether. + }; + + // TODO(zhanlue): we need a better identifier for each FieldCacheData + // (SNodeTree) Given that snode_tree_id is not continuous, it is ridiculous to + // ask the users to remember each of the snode_tree_ids + // ** Find a way to name each SNodeTree ** + std::unordered_map fields; // key = snode_tree_id + + std::unordered_map + kernels; // key = kernel_name TI_IO_DEF(kernels); }; diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index 65bc1a75e12e1..eea60dad165f7 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -158,9 +158,9 @@ LlvmProgramImpl::clone_struct_compiler_initial_context( return tlctx->clone_runtime_module(); } -void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, - StructCompiler *scomp, - uint64 *result_buffer) { +void LlvmProgramImpl::initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer) { TaichiLLVMContext *tlctx = nullptr; if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) @@ -175,15 +175,16 @@ void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, auto *const runtime_jit = tlctx->runtime_jit_module; // By the time this creator is called, "this" is already destroyed. // Therefore it is necessary to capture members by values. 
- const auto snodes = scomp->snodes; - const int root_id = tree->root()->id; + size_t root_size = field_cache_data.root_size; + const auto snode_metas = field_cache_data.snode_metas; + const int tree_id = field_cache_data.tree_id; + const int root_id = field_cache_data.root_id; - TI_TRACE("Allocating data structure of size {} bytes", scomp->root_size); - std::size_t rounded_size = - taichi::iroundup(scomp->root_size, taichi_page_size); + TI_TRACE("Allocating data structure of size {} bytes", root_size); + std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size); Ptr root_buffer = snode_tree_buffer_manager_->allocate( - runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree->id(), + runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id, result_buffer); if (config->arch == Arch::cuda) { #if defined(TI_WITH_CUDA) @@ -207,33 +208,33 @@ void LlvmProgramImpl::initialize_llvm_runtime_snodes(const SNodeTree *tree, alloc = cpu_device()->import_memory(root_buffer, rounded_size); } - snode_tree_allocs_[tree->id()] = alloc; + snode_tree_allocs_[tree_id] = alloc; bool all_dense = config->demote_dense_struct_fors; - for (int i = 0; i < (int)snodes.size(); i++) { - if (snodes[i]->type != SNodeType::dense && - snodes[i]->type != SNodeType::place && - snodes[i]->type != SNodeType::root) { + for (size_t i = 0; i < snode_metas.size(); i++) { + if (snode_metas[i].type != SNodeType::dense && + snode_metas[i].type != SNodeType::place && + snode_metas[i].type != SNodeType::root) { all_dense = false; break; } } runtime_jit->call( - "runtime_initialize_snodes", llvm_runtime_, scomp->root_size, root_id, - (int)snodes.size(), tree->id(), rounded_size, root_buffer, all_dense); + "runtime_initialize_snodes", llvm_runtime_, root_size, root_id, + (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense); - for (int i = 0; i < (int)snodes.size(); i++) { - if (is_gc_able(snodes[i]->type)) { - const auto snode_id = snodes[i]->id; + for (size_t i 
= 0; i < snode_metas.size(); i++) { + if (is_gc_able(snode_metas[i].type)) { + const auto snode_id = snode_metas[i].id; std::size_t node_size; - auto element_size = snodes[i]->cell_size_bytes; - if (snodes[i]->type == SNodeType::pointer) { + auto element_size = snode_metas[i].cell_size_bytes; + if (snode_metas[i].type == SNodeType::pointer) { // pointer. Allocators are for single elements node_size = element_size; } else { // dynamic. Allocators are for the chunks - node_size = sizeof(void *) + element_size * snodes[i]->chunk_size; + node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size; } TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id, node_size); @@ -275,10 +276,34 @@ void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { compile_snode_tree_types_impl(tree); } +static LlvmOfflineCache::FieldCacheData construct_filed_cache_data( + const SNodeTree &tree, + const StructCompiler &struct_compiler) { + LlvmOfflineCache::FieldCacheData ret; + ret.tree_id = tree.id(); + ret.root_id = tree.root()->id; + ret.root_size = struct_compiler.root_size; + + const auto &snodes = struct_compiler.snodes; + for (size_t i = 0; i < snodes.size(); i++) { + LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data; + snode_cache_data.id = snodes[i]->id; + snode_cache_data.type = snodes[i]->type; + snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes; + snode_cache_data.chunk_size = snodes[i]->chunk_size; + + ret.snode_metas.emplace_back(std::move(snode_cache_data)); + } + + return ret; +} + void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) { auto struct_compiler = compile_snode_tree_types_impl(tree); - initialize_llvm_runtime_snodes(tree, struct_compiler.get(), result_buffer); + + auto field_cache_data = construct_filed_cache_data(*tree, *struct_compiler); + initialize_llvm_runtime_snodes(field_cache_data, result_buffer); } uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 
*result_buffer) { diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index c9029bbcd85f0..69378ee660bf1 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -132,9 +132,9 @@ class LlvmProgramImpl : public ProgramImpl { /** * Initializes the SNodes for LLVM based backends. */ - void initialize_llvm_runtime_snodes(const SNodeTree *tree, - StructCompiler *scomp, - uint64 *result_buffer); + void initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer); uint64 fetch_result_uint64(int i, uint64 *result_buffer); From 31ec9c3ea26d8bdc959f75eec25369a4e6381059 Mon Sep 17 00:00:00 2001 From: Ailing Date: Fri, 10 Jun 2022 14:47:04 +0800 Subject: [PATCH 168/176] [aot][bug] Use cached compiled kernel pointer when it's added to graph (#5122) multiple times This bug was triggered when we tried to port stable_fluid demo so this PR also added a cgraph based stable fluid demo. ``` ti example stable_fluid_graph ``` Note it's not ideal to save both `FunctionType compiled_` as well as `aot::Kernel compiled_aot_kernel_` inside C++ `Kernel` class. But we plan to clean that up (likely by getting rid of `FunctionType compiled_`) in #5114. 
--- .../examples/graph/stable_fluid_graph.py | 340 ++++++++++++++++++ python/taichi/graph/_graph.py | 6 +- python/taichi/lang/kernel_impl.py | 4 + taichi/program/graph_builder.cpp | 6 +- taichi/program/graph_builder.h | 1 - taichi/program/kernel.cpp | 4 +- taichi/program/kernel.h | 11 +- 7 files changed, 362 insertions(+), 10 deletions(-) create mode 100644 python/taichi/examples/graph/stable_fluid_graph.py diff --git a/python/taichi/examples/graph/stable_fluid_graph.py b/python/taichi/examples/graph/stable_fluid_graph.py new file mode 100644 index 0000000000000..c4b0826e953f6 --- /dev/null +++ b/python/taichi/examples/graph/stable_fluid_graph.py @@ -0,0 +1,340 @@ +# References: +# http://developer.download.nvidia.com/books/HTML/gpugems/gpugems_ch38.html +# https://github.com/PavelDoGreat/WebGL-Fluid-Simulation +# https://www.bilibili.com/video/BV1ZK411H7Hc?p=4 +# https://github.com/ShaneFX/GAMES201/tree/master/HW01 + +import argparse + +import numpy as np +import taichi as ti + +ti.init(arch=ti.vulkan) + +res = 512 +dt = 0.03 +p_jacobi_iters = 500 # 40 for a quicker but less accurate result +f_strength = 10000.0 +curl_strength = 0 +time_c = 2 +maxfps = 60 +dye_decay = 1 - 1 / (maxfps * time_c) +force_radius = res / 2.0 +gravity = True +paused = False + + +class TexPair: + def __init__(self, cur, nxt): + self.cur = cur + self.nxt = nxt + + def swap(self): + self.cur, self.nxt = self.nxt, self.cur + + +@ti.func +def sample(qf: ti.template(), u, v): + I = ti.Vector([int(u), int(v)]) + I = max(0, min(res - 1, I)) + return qf[I] + + +@ti.func +def lerp(vl, vr, frac): + # frac: [0.0, 1.0] + return vl + frac * (vr - vl) + + +@ti.func +def bilerp(vf: ti.template(), p): + u, v = p + s, t = u - 0.5, v - 0.5 + # floor + iu, iv = ti.floor(s), ti.floor(t) + # fract + fu, fv = s - iu, t - iv + a = sample(vf, iu, iv) + b = sample(vf, iu + 1, iv) + c = sample(vf, iu, iv + 1) + d = sample(vf, iu + 1, iv + 1) + return lerp(lerp(a, b, fu), lerp(c, d, fu), fv) + + +# 3rd order 
Runge-Kutta +@ti.func +def backtrace(vf: ti.template(), p, dt: ti.template()): + v1 = bilerp(vf, p) + p1 = p - 0.5 * dt * v1 + v2 = bilerp(vf, p1) + p2 = p - 0.75 * dt * v2 + v3 = bilerp(vf, p2) + p -= dt * ((2 / 9) * v1 + (1 / 3) * v2 + (4 / 9) * v3) + return p + + +@ti.kernel +def advect(vf: ti.types.ndarray(field_dim=2), + qf: ti.types.ndarray(field_dim=2), + new_qf: ti.types.ndarray(field_dim=2)): + for i, j in vf: + p = ti.Vector([i, j]) + 0.5 + p = backtrace(vf, p, dt) + new_qf[i, j] = bilerp(qf, p) * dye_decay + + +@ti.kernel +def apply_impulse(vf: ti.types.ndarray(field_dim=2), + dyef: ti.types.ndarray(field_dim=2), + imp_data: ti.types.ndarray(field_dim=1)): + g_dir = -ti.Vector([0, 9.8]) * 300 + for i, j in vf: + omx, omy = imp_data[2], imp_data[3] + mdir = ti.Vector([imp_data[0], imp_data[1]]) + dx, dy = (i + 0.5 - omx), (j + 0.5 - omy) + d2 = dx * dx + dy * dy + # dv = F * dt + factor = ti.exp(-d2 / force_radius) + + dc = dyef[i, j] + a = dc.norm() + + momentum = (mdir * f_strength * factor + g_dir * a / (1 + a)) * dt + + v = vf[i, j] + vf[i, j] = v + momentum + # add dye + if mdir.norm() > 0.5: + dc += ti.exp(-d2 * (4 / (res / 15)**2)) * ti.Vector( + [imp_data[4], imp_data[5], imp_data[6]]) + + dyef[i, j] = dc + + +@ti.kernel +def divergence(vf: ti.types.ndarray(field_dim=2), + velocity_divs: ti.types.ndarray(field_dim=2)): + for i, j in vf: + vl = sample(vf, i - 1, j) + vr = sample(vf, i + 1, j) + vb = sample(vf, i, j - 1) + vt = sample(vf, i, j + 1) + vc = sample(vf, i, j) + if i == 0: + vl.x = -vc.x + if i == res - 1: + vr.x = -vc.x + if j == 0: + vb.y = -vc.y + if j == res - 1: + vt.y = -vc.y + velocity_divs[i, j] = (vr.x - vl.x + vt.y - vb.y) * 0.5 + + +@ti.kernel +def pressure_jacobi(pf: ti.types.ndarray(field_dim=2), + new_pf: ti.types.ndarray(field_dim=2), + velocity_divs: ti.types.ndarray(field_dim=2)): + for i, j in pf: + pl = sample(pf, i - 1, j) + pr = sample(pf, i + 1, j) + pb = sample(pf, i, j - 1) + pt = sample(pf, i, j + 1) + div = 
velocity_divs[i, j] + new_pf[i, j] = (pl + pr + pb + pt - div) * 0.25 + + +@ti.kernel +def subtract_gradient(vf: ti.types.ndarray(field_dim=2), + pf: ti.types.ndarray(field_dim=2)): + for i, j in vf: + pl = sample(pf, i - 1, j) + pr = sample(pf, i + 1, j) + pb = sample(pf, i, j - 1) + pt = sample(pf, i, j + 1) + vf[i, j] -= 0.5 * ti.Vector([pr - pl, pt - pb]) + + +def solve_pressure_jacobi(): + for _ in range(p_jacobi_iters): + pressure_jacobi(pressures_pair.cur, pressures_pair.nxt, _velocity_divs) + pressures_pair.swap() + + +def step_orig(mouse_data): + advect(velocities_pair.cur, velocities_pair.cur, velocities_pair.nxt) + advect(velocities_pair.cur, dyes_pair.cur, dyes_pair.nxt) + velocities_pair.swap() + dyes_pair.swap() + + apply_impulse(velocities_pair.cur, dyes_pair.cur, mouse_data) + + divergence(velocities_pair.cur, _velocity_divs) + + solve_pressure_jacobi() + + subtract_gradient(velocities_pair.cur, pressures_pair.cur) + + +mouse_data_ti = ti.ndarray(ti.f32, shape=(8, )) + + +class MouseDataGen(object): + def __init__(self): + self.prev_mouse = None + self.prev_color = None + + def __call__(self, gui): + # [0:2]: normalized delta direction + # [2:4]: current mouse xy + # [4:7]: color + mouse_data = np.zeros(8, dtype=np.float32) + if gui.is_pressed(ti.GUI.LMB): + mxy = np.array(gui.get_cursor_pos(), dtype=np.float32) * res + if self.prev_mouse is None: + self.prev_mouse = mxy + # Set lower bound to 0.3 to prevent too dark colors + self.prev_color = (np.random.rand(3) * 0.7) + 0.3 + else: + mdir = mxy - self.prev_mouse + mdir = mdir / (np.linalg.norm(mdir) + 1e-5) + mouse_data[0], mouse_data[1] = mdir[0], mdir[1] + mouse_data[2], mouse_data[3] = mxy[0], mxy[1] + mouse_data[4:7] = self.prev_color + self.prev_mouse = mxy + else: + self.prev_mouse = None + self.prev_color = None + mouse_data_ti.from_numpy(mouse_data) + return mouse_data_ti + + +def reset(): + velocities_pair.cur.fill(0) + pressures_pair.cur.fill(0) + dyes_pair.cur.fill(0) + +if __name__ == 
"__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + '--baseline', + action='store_true') + args, unknown = parser.parse_known_args() + + gui = ti.GUI('Stable Fluid', (res, res)) + md_gen = MouseDataGen() + + _velocities = ti.Vector.ndarray(2, float, shape=(res, res)) + _new_velocities = ti.Vector.ndarray(2, float, shape=(res, res)) + _velocity_divs = ti.ndarray(float, shape=(res, res)) + velocity_curls = ti.ndarray(float, shape=(res, res)) + _pressures = ti.ndarray(float, shape=(res, res)) + _new_pressures = ti.ndarray(float, shape=(res, res)) + _dye_buffer = ti.Vector.ndarray(3, float, shape=(res, res)) + _new_dye_buffer = ti.Vector.ndarray(3, float, shape=(res, res)) + + if args.baseline: + velocities_pair = TexPair(_velocities, _new_velocities) + pressures_pair = TexPair(_pressures, _new_pressures) + dyes_pair = TexPair(_dye_buffer, _new_dye_buffer) + else: + print('running in graph mode') + velocities_pair_cur = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'velocities_pair_cur', + ti.f32, + element_shape=(2, )) + velocities_pair_nxt = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'velocities_pair_nxt', + ti.f32, + element_shape=(2, )) + dyes_pair_cur = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'dyes_pair_cur', + ti.f32, + element_shape=(3, )) + dyes_pair_nxt = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'dyes_pair_nxt', + ti.f32, + element_shape=(3, )) + pressures_pair_cur = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'pressures_pair_cur', ti.f32) + pressures_pair_nxt = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'pressures_pair_nxt', ti.f32) + velocity_divs = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'velocity_divs', + ti.f32) + mouse_data = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'mouse_data', ti.f32) + + g1_builder = ti.graph.GraphBuilder() + g1_builder.dispatch(advect, velocities_pair_cur, velocities_pair_cur, + velocities_pair_nxt) + g1_builder.dispatch(advect, velocities_pair_cur, dyes_pair_cur, dyes_pair_nxt) + g1_builder.dispatch(apply_impulse, 
velocities_pair_nxt, dyes_pair_nxt, mouse_data) + g1_builder.dispatch(divergence, velocities_pair_nxt, velocity_divs) + # swap is unrolled in the loop so we only need p_jacobi_iters // 2 iterations. + for _ in range(p_jacobi_iters // 2): + g1_builder.dispatch(pressure_jacobi, pressures_pair_cur, pressures_pair_nxt, + velocity_divs) + g1_builder.dispatch(pressure_jacobi, pressures_pair_nxt, pressures_pair_cur, + velocity_divs) + g1_builder.dispatch(subtract_gradient, velocities_pair_nxt, pressures_pair_cur) + g1 = g1_builder.compile() + + g2_builder = ti.graph.GraphBuilder() + g2_builder.dispatch(advect, velocities_pair_nxt, velocities_pair_nxt, + velocities_pair_cur) + g2_builder.dispatch(advect, velocities_pair_nxt, dyes_pair_nxt, dyes_pair_cur) + g2_builder.dispatch(apply_impulse, velocities_pair_cur, dyes_pair_cur, mouse_data) + g2_builder.dispatch(divergence, velocities_pair_cur, velocity_divs) + for _ in range(p_jacobi_iters // 2): + g2_builder.dispatch(pressure_jacobi, pressures_pair_cur, pressures_pair_nxt, + velocity_divs) + g2_builder.dispatch(pressure_jacobi, pressures_pair_nxt, pressures_pair_cur, + velocity_divs) + g2_builder.dispatch(subtract_gradient, velocities_pair_cur, pressures_pair_cur) + g2 = g2_builder.compile() + + + swap = True + + while gui.running: + if gui.get_event(ti.GUI.PRESS): + e = gui.event + if e.key == ti.GUI.ESCAPE: + break + elif e.key == 'r': + paused = False + reset() + elif e.key == 's': + if curl_strength: + curl_strength = 0 + else: + curl_strength = 7 + elif e.key == 'g': + gravity = not gravity + elif e.key == 'p': + paused = not paused + + if not paused: + _mouse_data = md_gen(gui) + if args.baseline: + step_orig(_mouse_data) + gui.set_image(dyes_pair.cur.to_numpy()) + else: + invoke_args = { + 'mouse_data': _mouse_data, + 'velocities_pair_cur': _velocities, + 'velocities_pair_nxt': _new_velocities, + 'dyes_pair_cur': _dye_buffer, + 'dyes_pair_nxt': _new_dye_buffer, + 'pressures_pair_cur': _pressures, + 
'pressures_pair_nxt': _new_pressures, + 'velocity_divs': _velocity_divs + } + if swap: + g1.run(invoke_args) + gui.set_image(_dye_buffer.to_numpy()) + swap = False + else: + g2.run(invoke_args) + gui.set_image(_new_dye_buffer.to_numpy()) + swap = True + gui.show() diff --git a/python/taichi/graph/_graph.py b/python/taichi/graph/_graph.py index f9f06b32c8aa0..f5f476fe69a25 100644 --- a/python/taichi/graph/_graph.py +++ b/python/taichi/graph/_graph.py @@ -12,8 +12,8 @@ def gen_cpp_kernel(kernel_fn, args): kernel = kernel_fn._primal assert isinstance(kernel, kernel_impl.Kernel) injected_args = produce_injected_args(kernel, symbolic_args=args) - kernel.ensure_compiled(*injected_args) - return kernel.kernel_cpp + key = kernel.ensure_compiled(*injected_args) + return kernel.compiled_kernels[key] class Sequential: @@ -64,7 +64,7 @@ def run(self, args): arg_floats[k] = v else: raise TaichiRuntimeError( - 'Only python int, float and ti.Ndarray are supported as runtime arguments' + f'Only python int, float and ti.Ndarray are supported as runtime arguments but got {type(v)}' ) self._compiled_graph.run(arg_ptrs, arg_ints, arg_floats) diff --git a/python/taichi/lang/kernel_impl.py b/python/taichi/lang/kernel_impl.py index f1d302b81f326..ad9a7e1afbb84 100644 --- a/python/taichi/lang/kernel_impl.py +++ b/python/taichi/lang/kernel_impl.py @@ -420,6 +420,9 @@ def __init__(self, _func, autodiff_mode, _classkernel=False): impl.get_runtime().kernels.append(self) self.reset() self.kernel_cpp = None + # TODO[#5114]: get rid of compiled_functions and use compiled_kernels instead. + # Main motivation is that compiled_kernels can be potentially serialized in the AOT scenario. 
+ self.compiled_kernels = {} def reset(self): self.runtime = impl.get_runtime() @@ -533,6 +536,7 @@ def taichi_ast_generator(kernel_cxx): assert key not in self.compiled_functions self.compiled_functions[key] = self.get_function_body(taichi_kernel) + self.compiled_kernels[key] = taichi_kernel def get_torch_callbacks(self, v, has_torch, is_ndarray=True): callbacks = [] diff --git a/taichi/program/graph_builder.cpp b/taichi/program/graph_builder.cpp index 76c579d70e74b..8aba27cf3087e 100644 --- a/taichi/program/graph_builder.cpp +++ b/taichi/program/graph_builder.cpp @@ -6,11 +6,11 @@ namespace taichi { namespace lang { void Dispatch::compile( std::vector &compiled_dispatches) { - if (!compiled_kernel_) { - compiled_kernel_ = kernel_->compile_to_aot_kernel(); + if (kernel_->compiled_aot_kernel() == nullptr) { + kernel_->compile_to_aot_kernel(); } aot::CompiledDispatch dispatch{kernel_->get_name(), symbolic_args_, - compiled_kernel_.get()}; + kernel_->compiled_aot_kernel()}; compiled_dispatches.push_back(std::move(dispatch)); } diff --git a/taichi/program/graph_builder.h b/taichi/program/graph_builder.h index 129e5adc7b94b..8f9c8a341109b 100644 --- a/taichi/program/graph_builder.h +++ b/taichi/program/graph_builder.h @@ -36,7 +36,6 @@ class Dispatch : public Node { private: mutable bool serialized_{false}; Kernel *kernel_{nullptr}; - std::unique_ptr compiled_kernel_{nullptr}; std::vector symbolic_args_; }; diff --git a/taichi/program/kernel.cpp b/taichi/program/kernel.cpp index b2747e2939acd..b731728d8b7d6 100644 --- a/taichi/program/kernel.cpp +++ b/taichi/program/kernel.cpp @@ -66,8 +66,8 @@ void Kernel::compile() { compiled_ = program->compile(*this); } -std::unique_ptr Kernel::compile_to_aot_kernel() { - return program->make_aot_kernel(*this); +void Kernel::compile_to_aot_kernel() { + compiled_aot_kernel_ = program->make_aot_kernel(*this); } void Kernel::lower(bool to_executable) { diff --git a/taichi/program/kernel.h b/taichi/program/kernel.h index 
e19ce1916e020..ac300595caef9 100644 --- a/taichi/program/kernel.h +++ b/taichi/program/kernel.h @@ -87,7 +87,12 @@ class TI_DLL_EXPORT Kernel : public Callable { void compile(); - std::unique_ptr compile_to_aot_kernel(); + void compile_to_aot_kernel(); + + aot::Kernel *compiled_aot_kernel() { + return compiled_aot_kernel_.get(); + } + /** * Lowers |ir| to CHI IR level * @@ -142,6 +147,10 @@ class TI_DLL_EXPORT Kernel : public Callable { bool ir_is_ast_{false}; // The closure that, if invoked, lauches the backend kernel (shader) FunctionType compiled_{nullptr}; + // TODO[#5114]: It's kinda redundant to keep both compiled_ (used for JIT + // execution) as well as compiled_aot_kernel_. In fact we'd better unify + // everything around compiled_aot_kernel and rename it. + std::unique_ptr compiled_aot_kernel_{nullptr}; // A flag to record whether |ir| has been fully lowered. // lower inital AST all the way down to a bunch of // OffloadedStmt for async execution From 9c3da6517b0b8129344dedcea9203f7a2edb63b0 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Fri, 10 Jun 2022 14:50:52 +0800 Subject: [PATCH 169/176] [aot] [llvm] LLVM AOT Field #1: Adjust serialization/deserialization logics for FieldCacheData (#5111) * [aot] [llvm] Implemented FieldCacheData and refactored initialize_llvm_runtime_snodes() * Addressed compilation erros * [aot] [llvm] LLVM AOT Field #1: Adjust serialization/deserialization logics for FieldCacheData --- taichi/llvm/llvm_offline_cache.cpp | 14 +++++++++++++ taichi/llvm/llvm_offline_cache.h | 32 ++++++++++++++++++++++++++---- 2 files changed, 42 insertions(+), 4 deletions(-) diff --git a/taichi/llvm/llvm_offline_cache.cpp b/taichi/llvm/llvm_offline_cache.cpp index 542f02aebc2c7..92994fe9aa2f0 100644 --- a/taichi/llvm/llvm_offline_cache.cpp +++ b/taichi/llvm/llvm_offline_cache.cpp @@ -51,6 +51,20 @@ LlvmOfflineCacheFileReader::LlvmOfflineCacheFileReader( : path_(path), data_(std::move(data)), format_(format) { } +bool 
LlvmOfflineCacheFileReader::get_field_cache( + LlvmOfflineCache::FieldCacheData &res, + int snode_tree_id) { + auto itr = data_.fields.find(snode_tree_id); + if (itr == data_.fields.end()) { + TI_DEBUG("Cannot find field with snode_tree_id={}", snode_tree_id); + return false; + } + + const auto &loaded_field_cache = itr->second; + res = loaded_field_cache; // copy assign + return true; +} + bool LlvmOfflineCacheFileReader::get_kernel_cache( LlvmOfflineCache::KernelCacheData &res, const std::string &key, diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index 56de40d4c3276..b5403982b10ba 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -59,10 +59,31 @@ struct LlvmOfflineCache { TI_IO_DEF(tree_id, root_id, root_size, snode_metas); - // TODO(zhanlue) - // Serialize/Deserialize the llvm::Module from StructCompiler - // At runtime, make sure loaded Field-Modules and Kernel-Modules are linked - // altogether. + // TODO(zhanlue): refactor llvm::Modules + // + // struct_module will eventually get cloned into each kernel_module, + // so there's no need to serialize it here. + // + // We have three different types of llvm::Module + // 1. runtime_module: contains runtime functions. + // 2. struct_module: contains compiled SNodeTree in llvm::Type. + // 3. kernel_modules: contains compiled kernel codes. + // + // The way those modules work rely on a recursive clone mechanism: + // runtime_module = load("runtime.bc") + // struct_module = clone(runtime_module) + compiled-SNodeTree + // kernel_module = clone(struct_module) + compiled-Kernel + // + // As a result, every kernel_module contains a copy of struct_module + + // runtime_module. + // + // This recursive clone mechanism is super fragile, + // which potentially causes inconsistency between modules if not handled + // properly. 
+ // + // Let's turn to use llvm::link to bind the modules, + // and make runtime_module, struct_module, kernel_module independent of each + // other }; // TODO(zhanlue): we need a better identifier for each FieldCacheData @@ -83,6 +104,9 @@ class LlvmOfflineCacheFileReader { const std::string &key, llvm::LLVMContext &llvm_ctx); + bool get_field_cache(LlvmOfflineCache::FieldCacheData &res, + int snode_tree_id); + static std::unique_ptr make( const std::string &path, LlvmOfflineCache::Format format = LlvmOfflineCache::Format::LL); From a01c373c0f7b98a0e9c43966120b15a7d48e3ff1 Mon Sep 17 00:00:00 2001 From: Olinaaaloompa <106292061+Olinaaaloompa@users.noreply.github.com> Date: Sat, 11 Jun 2022 00:17:43 +0800 Subject: [PATCH 170/176] Editorial update (#5119) --- docs/lang/articles/reference.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/docs/lang/articles/reference.md b/docs/lang/articles/reference.md index 3af07ef06a1c3..14d6e65bae2c7 100644 --- a/docs/lang/articles/reference.md +++ b/docs/lang/articles/reference.md @@ -7,12 +7,11 @@ sidebar_position: 9998 This article describes the syntax and semantics of the Taichi programming language. -**To users**: If you have gone through user tutorials and still feel uncertain +**To users**: If you have gone through the user tutorials and still feel uncertain about your program behavior, then you are in the right place. If you find the -actual behavior different from the one described in this article, feel free to +actual behavior different from what is described in this article, feel free to create an [issue](https://github.com/taichi-dev/taichi/issues/new/choose). -Anything unspecified in this article is subject to change, so you should not -rely on it in your programs. +You should not rely solely on this article since things unspecified are subject to changes. **To contributors**: This article specifies what the language *should* be. 
That is, you should try to match the implementation of the Taichi compiler with this From e10a11b341748dc834a3e0f796169d9813388923 Mon Sep 17 00:00:00 2001 From: Bob Cao Date: Sat, 11 Jun 2022 00:04:56 -0700 Subject: [PATCH 171/176] [lang] Texture support 0/n: IR changes (#5134) --- taichi/analysis/gen_offline_cache_key.cpp | 13 ++++++ taichi/inc/expressions.inc.h | 2 + taichi/inc/statements.inc.h | 3 ++ taichi/ir/expression_printer.h | 10 +++++ taichi/ir/frontend_ir.cpp | 55 +++++++++++++++++++++++ taichi/ir/frontend_ir.h | 35 +++++++++++++++ taichi/ir/statements.h | 31 +++++++++++++ taichi/ir/stmt_op_types.cpp | 16 +++++++ taichi/ir/stmt_op_types.h | 4 ++ taichi/transforms/ir_printer.cpp | 10 +++++ 10 files changed, 179 insertions(+) diff --git a/taichi/analysis/gen_offline_cache_key.cpp b/taichi/analysis/gen_offline_cache_key.cpp index f3ecaae6171e7..f9723e99f0044 100644 --- a/taichi/analysis/gen_offline_cache_key.cpp +++ b/taichi/analysis/gen_offline_cache_key.cpp @@ -82,6 +82,18 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { emit(expr->arg_id); } + void visit(TexturePtrExpression *expr) override { + emit(ExprOpCode::TexturePtrExpression); + emit(expr->arg_id); + } + + void visit(TextureOpExpression *expr) override { + emit(ExprOpCode::TextureOpExpression); + emit(expr->op); + emit(expr->texture_ptr); + emit(expr->args.exprs); + } + void visit(RandExpression *expr) override { emit(ExprOpCode::RandExpression); emit(expr->dt); @@ -611,6 +623,7 @@ class ASTSerializer : public IRVisitor, public ExpressionVisitor { DEFINE_EMIT_ENUM(SNodeAccessFlag); DEFINE_EMIT_ENUM(MeshRelationAccessType); DEFINE_EMIT_ENUM(ExternalFuncType); + DEFINE_EMIT_ENUM(TextureOpType); DEFINE_EMIT_ENUM(mesh::MeshElementType); DEFINE_EMIT_ENUM(mesh::MeshRelationType); DEFINE_EMIT_ENUM(mesh::ConvType); diff --git a/taichi/inc/expressions.inc.h b/taichi/inc/expressions.inc.h index 4ec43c58357f9..d4ef5dbc6fa47 100644 --- a/taichi/inc/expressions.inc.h +++ 
b/taichi/inc/expressions.inc.h @@ -20,3 +20,5 @@ PER_EXPRESSION(MeshPatchIndexExpression) PER_EXPRESSION(MeshRelationAccessExpression) PER_EXPRESSION(MeshIndexConversionExpression) PER_EXPRESSION(ReferenceExpression) +PER_EXPRESSION(TextureOpExpression) +PER_EXPRESSION(TexturePtrExpression) diff --git a/taichi/inc/statements.inc.h b/taichi/inc/statements.inc.h index c40c89290afd8..fe12a8941f7f5 100644 --- a/taichi/inc/statements.inc.h +++ b/taichi/inc/statements.inc.h @@ -79,5 +79,8 @@ PER_STATEMENT(BlockLocalPtrStmt) // Special PER_STATEMENT(InternalFuncStmt) +PER_STATEMENT(TexturePtrStmt) +PER_STATEMENT(TextureOpStmt) + // Quantization PER_STATEMENT(BitStructStoreStmt) diff --git a/taichi/ir/expression_printer.h b/taichi/ir/expression_printer.h index 29f391d17334a..133b7e4eb27ba 100644 --- a/taichi/ir/expression_printer.h +++ b/taichi/ir/expression_printer.h @@ -41,6 +41,16 @@ class ExpressionHumanFriendlyPrinter : public ExpressionPrinter { fmt::format("arg[{}] (dt={})", expr->arg_id, data_type_name(expr->dt))); } + void visit(TexturePtrExpression *expr) override { + emit(fmt::format("(Texture *)(arg[{}])", expr->arg_id)); + } + + void visit(TextureOpExpression *expr) override { + emit(fmt::format("texture_{}(", texture_op_type_name(expr->op))); + visit(expr->args); + emit(")"); + } + void visit(RandExpression *expr) override { emit(fmt::format("rand<{}>()", data_type_name(expr->dt))); } diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 9ded4ffae1146..93fca8a78e6ef 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -126,6 +126,15 @@ void ArgLoadExpression::flatten(FlattenContext *ctx) { stmt = ctx->back_stmt(); } +void TexturePtrExpression::type_check(CompileConfig *config) { +} + +void TexturePtrExpression::flatten(FlattenContext *ctx) { + ctx->push_back(arg_id, PrimitiveType::f32, true); + ctx->push_back(ctx->back_stmt()); + stmt = ctx->back_stmt(); +} + void RandExpression::type_check(CompileConfig *) { 
TI_ASSERT_INFO(dt->is() && dt != PrimitiveType::unknown, "Invalid dt [{}] for RandExpression", dt->to_string()); @@ -589,6 +598,52 @@ void SNodeOpExpression::flatten(FlattenContext *ctx) { stmt = ctx->back_stmt(); } +void TextureOpExpression::type_check(CompileConfig *config) { + if (op == TextureOpType::sample_lod) { + // UV, Lod + TI_ASSERT_INFO(args.size() == 3, + "Invalid number of args for sample_lod Texture op"); + TI_ASSERT_TYPE_CHECKED(args[0]); + TI_ASSERT_TYPE_CHECKED(args[1]); + TI_ASSERT_TYPE_CHECKED(args[2]); + if (args[0].get_ret_type() != PrimitiveType::f32 || + args[1].get_ret_type() != PrimitiveType::f32 || + args[2].get_ret_type() != PrimitiveType::f32) { + throw TaichiTypeError( + fmt::format("All arguments to sample_lod Texture op must be FP32")); + } + } else if (op == TextureOpType::fetch_texel) { + // index, int LOD + TI_ASSERT_INFO(args.size() == 3, + "Invalid number of args for fetch_texel Texture op"); + TI_ASSERT_TYPE_CHECKED(args[0]); + TI_ASSERT_TYPE_CHECKED(args[1]); + TI_ASSERT_TYPE_CHECKED(args[2]); + if (args[0].get_ret_type() != PrimitiveType::i32 || + args[1].get_ret_type() != PrimitiveType::i32 || + args[2].get_ret_type() != PrimitiveType::i32) { + throw TaichiTypeError( + fmt::format("All arguments to fetch_texel Texture op must be i32")); + } + } else { + TI_ERROR("Invalid TextureOpType"); + } + ret_type = + TypeFactory::get_instance().get_pointer_type(PrimitiveType::f32, + /*is_bit_pointer=*/false); +} + +void TextureOpExpression::flatten(FlattenContext *ctx) { + flatten_rvalue(texture_ptr, ctx); + std::vector arg_stmts; + for (Expr &arg : args.exprs) { + flatten_rvalue(arg, ctx); + arg_stmts.push_back(arg->stmt); + } + ctx->push_back(op, texture_ptr->stmt, arg_stmts); + stmt = ctx->back_stmt(); +} + void ConstExpression::type_check(CompileConfig *) { TI_ASSERT_INFO( val.dt->is() && val.dt != PrimitiveType::unknown, diff --git a/taichi/ir/frontend_ir.h b/taichi/ir/frontend_ir.h index 6343de337e831..2c2eba5bd12f2 100644 --- 
a/taichi/ir/frontend_ir.h +++ b/taichi/ir/frontend_ir.h @@ -293,6 +293,22 @@ class ArgLoadExpression : public Expression { TI_DEFINE_ACCEPT_FOR_EXPRESSION }; +class Texture; + +class TexturePtrExpression : public Expression { + public: + int arg_id; + + TexturePtrExpression(int arg_id) : arg_id(arg_id) { + } + + void type_check(CompileConfig *config) override; + + void flatten(FlattenContext *ctx) override; + + TI_DEFINE_ACCEPT_FOR_EXPRESSION +}; + class RandExpression : public Expression { public: DataType dt; @@ -612,6 +628,25 @@ class SNodeOpExpression : public Expression { TI_DEFINE_ACCEPT_FOR_EXPRESSION }; +class TextureOpExpression : public Expression { + public: + TextureOpType op; + Expr texture_ptr; + ExprGroup args; + + explicit TextureOpExpression(TextureOpType op, + Expr texture_ptr, + const ExprGroup &args) + : op(op), texture_ptr(texture_ptr), args(args) { + } + + void type_check(CompileConfig *config) override; + + void flatten(FlattenContext *ctx) override; + + TI_DEFINE_ACCEPT_FOR_EXPRESSION +}; + class ConstExpression : public Expression { public: TypedConstant val; diff --git a/taichi/ir/statements.h b/taichi/ir/statements.h index d9099ccdcf785..25ceed9a47a9e 100644 --- a/taichi/ir/statements.h +++ b/taichi/ir/statements.h @@ -1446,6 +1446,37 @@ class InternalFuncStmt : public Stmt { TI_DEFINE_ACCEPT_AND_CLONE }; +class Texture; + +class TexturePtrStmt : public Stmt { + public: + Stmt *arg_load_stmt{nullptr}; + + explicit TexturePtrStmt(Stmt *stmt) : arg_load_stmt(stmt) { + TI_STMT_REG_FIELDS; + } + + TI_STMT_DEF_FIELDS(arg_load_stmt); + TI_DEFINE_ACCEPT_AND_CLONE +}; + +class TextureOpStmt : public Stmt { + public: + TextureOpType op; + Stmt *texture_ptr; + std::vector args; + + explicit TextureOpStmt(TextureOpType op, + Stmt *texture_ptr, + const std::vector &args) + : op(op), texture_ptr(texture_ptr), args(args) { + TI_STMT_REG_FIELDS; + } + + TI_STMT_DEF_FIELDS(op, texture_ptr, args); + TI_DEFINE_ACCEPT_AND_CLONE +}; + /** * A local AD-stack. 
*/ diff --git a/taichi/ir/stmt_op_types.cpp b/taichi/ir/stmt_op_types.cpp index a5f492f869c0a..0ab447c25f68a 100644 --- a/taichi/ir/stmt_op_types.cpp +++ b/taichi/ir/stmt_op_types.cpp @@ -146,5 +146,21 @@ std::string snode_op_type_name(SNodeOpType type) { } } +std::string texture_op_type_name(TextureOpType type) { + switch (type) { +#define REGISTER_TYPE(i) \ + case TextureOpType::i: \ + return #i; + + REGISTER_TYPE(sample_lod); + REGISTER_TYPE(fetch_texel); + REGISTER_TYPE(undefined); + +#undef REGISTER_TYPE + default: + TI_NOT_IMPLEMENTED + } +} + } // namespace lang } // namespace taichi diff --git a/taichi/ir/stmt_op_types.h b/taichi/ir/stmt_op_types.h index a71d5512cd6ed..12609d994fbc4 100644 --- a/taichi/ir/stmt_op_types.h +++ b/taichi/ir/stmt_op_types.h @@ -84,5 +84,9 @@ enum class SNodeOpType : int { std::string snode_op_type_name(SNodeOpType type); +enum class TextureOpType : int { sample_lod, fetch_texel, undefined }; + +std::string texture_op_type_name(TextureOpType type); + } // namespace lang } // namespace taichi diff --git a/taichi/transforms/ir_printer.cpp b/taichi/transforms/ir_printer.cpp index 7cb30527128d8..8a005ce45b082 100644 --- a/taichi/transforms/ir_printer.cpp +++ b/taichi/transforms/ir_printer.cpp @@ -424,6 +424,16 @@ class IRPrinter : public IRVisitor { print("{}{} = arg[{}]", stmt->type_hint(), stmt->name(), stmt->arg_id); } + void visit(TexturePtrStmt *stmt) override { + print("<*Texture> {} = {}", stmt->name(), stmt->arg_load_stmt->name()); + } + + void visit(TextureOpStmt *stmt) override { + print(" {} = texture_{}({}, {}, {})", stmt->name(), + texture_op_type_name(stmt->op), stmt->args[0]->name(), + stmt->args[1]->name(), stmt->args[2]->name()); + } + void visit(FrontendReturnStmt *stmt) override { print("{}{} : return [{}]", stmt->type_hint(), stmt->name(), expr_group_to_string(stmt->values)); From 573f89e6dcaf70c2ce4939588b6a7e701dd2c801 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Mon, 13 Jun 2022 01:31:17 +0800 Subject: 
[PATCH 172/176] fix mass_spring_3d_ggui backend (#5127) --- python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py index 3579afcd3dd8c..4be661f9fbead 100644 --- a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py +++ b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py @@ -1,5 +1,7 @@ import taichi as ti -ti.init(arch=ti.gpu) # Alternatively, ti.init(arch=ti.cpu) + +arch = ti.vulkan if ti._lib.core.with_vulkan() else ti.cuda +ti.init(arch=arch) n = 128 quad_size = 1.0 / n From a0bdb665dd81a6b2c65529dc025fa8a7a7ac6117 Mon Sep 17 00:00:00 2001 From: Zhao Liang Date: Mon, 13 Jun 2022 10:45:26 +0800 Subject: [PATCH 173/176] [Example] Fix block_dim warning in ggui (#5128) * fix block dim warning in ggui * fix block dim warning in ggui * fix block dim warning in ggui --- python/taichi/examples/ggui_examples/mpm3d_ggui.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/taichi/examples/ggui_examples/mpm3d_ggui.py b/python/taichi/examples/ggui_examples/mpm3d_ggui.py index e7675c333dd69..3c4edab01f639 100644 --- a/python/taichi/examples/ggui_examples/mpm3d_ggui.py +++ b/python/taichi/examples/ggui_examples/mpm3d_ggui.py @@ -55,7 +55,7 @@ def substep(g_x: float, g_y: float, g_z: float): for I in ti.grouped(grid_m): grid_v[I] = ti.zero(grid_v[I]) grid_m[I] = 0 - ti.block_dim(n_grid) + ti.loop_config(block_dim=n_grid) for p in x: if used[p] == 0: continue @@ -114,7 +114,7 @@ def substep(g_x: float, g_y: float, g_z: float): cond = (I < bound) & (grid_v[I] < 0) | \ (I > n_grid - bound) & (grid_v[I] > 0) grid_v[I] = 0 if cond else grid_v[I] - ti.block_dim(n_grid) + ti.loop_config(block_dim=n_grid) for p in x: if used[p] == 0: continue From aba2871e68f51e9f80b05d642812d2dbbe6c431c Mon Sep 17 00:00:00 2001 From: Ailing Date: Mon, 13 Jun 2022 
12:29:42 +0800 Subject: [PATCH 174/176] [ci] Enable yapf and isort on example files (#5140) Note we explicitly exclude running pylint on them as it requires a bunch of manual fixes first. --- .pre-commit-config.yaml | 3 +- .../examples/algorithm/mciso_advanced.py | 3 +- .../examples/autodiff/diff_sph/diff_sph.py | 268 ++++++++++++------ .../examples/features/io/export_mesh.py | 4 +- .../ggui_examples/mass_spring_3d_ggui.py | 8 +- python/taichi/examples/graph/mpm88_graph.py | 30 +- .../examples/graph/stable_fluid_graph.py | 48 ++-- tests/python/examples/__init__.py | 1 - tests/python/examples/autodiff/__init__.py | 1 - 9 files changed, 239 insertions(+), 127 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 7c0fdf9f995d9..0dc07c04fff9e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,7 +2,7 @@ ci: autoupdate_schedule: quarterly autoupdate_commit_msg: '[misc] Update pre-commit hooks' -exclude: ^((tests/python/test_exception|.*/examples/.*)\.py$|external/) +exclude: ^((tests/python/test_exception)\.py$|external/) repos: - repo: https://github.com/google/yapf rev: v0.32.0 @@ -33,3 +33,4 @@ repos: - id: pylint args: ['-rn', '-sn'] files: ^python/taichi/ + exclude: ^python/taichi/examples/.*.py diff --git a/python/taichi/examples/algorithm/mciso_advanced.py b/python/taichi/examples/algorithm/mciso_advanced.py index 991e56a0fbdc5..fd79563982751 100644 --- a/python/taichi/examples/algorithm/mciso_advanced.py +++ b/python/taichi/examples/algorithm/mciso_advanced.py @@ -489,7 +489,8 @@ def main(self): (0, 1)) if gui.is_pressed(gui.SPACE): num = ret.shape[0] - writer = ti.tools.PLYWriter(num_vertices=num * 3, num_faces=num) + writer = ti.tools.PLYWriter(num_vertices=num * 3, + num_faces=num) vertices = ret.reshape(num * 3, 3) * 2 - 1 writer.add_vertex_pos(vertices[:, 0], vertices[:, 1], vertices[:, 2]) diff --git a/python/taichi/examples/autodiff/diff_sph/diff_sph.py b/python/taichi/examples/autodiff/diff_sph/diff_sph.py 
index 3070b4aceaeea..8ee5495560e8f 100644 --- a/python/taichi/examples/autodiff/diff_sph/diff_sph.py +++ b/python/taichi/examples/autodiff/diff_sph/diff_sph.py @@ -1,23 +1,24 @@ # Smoothed-particle hydrodynamics (SPH) is a computational method used for simulating the mechanics of continuum media, such as solid mechanics and fluid flows. # Here we utilize SPH to simulate a fountain, who tries to hit a target given by the user. -# The SPH simulator here implemented using Taichi is differentiable. -# Therefore, it can be easily embedding into the training pipeline of a neural network modelled controller. +# The SPH simulator here implemented using Taichi is differentiable. +# Therefore, it can be easily embedding into the training pipeline of a neural network modelled controller. -import taichi as ti -import numpy as np -import matplotlib.pyplot as plt +import argparse import os import pickle as pkl -import argparse + +import matplotlib.pyplot as plt +import numpy as np + +import taichi as ti + parser = argparse.ArgumentParser() -parser.add_argument( - '--train', - action='store_true', - help='whether train model, default false') +parser.add_argument('--train', + action='store_true', + help='whether train model, default false') parser.add_argument('place_holder', nargs='*') args = parser.parse_args() - TRAIN = args.train TRAIN_OUTPUT_IMG = False TRAIN_VISUAL = False @@ -52,7 +53,15 @@ def zero_grad(self): @ti.data_oriented class Linear: - def __init__(self, n_models, batch_size, n_steps, n_input, n_hidden, n_output, needs_grad=False, activation=False): + def __init__(self, + n_models, + batch_size, + n_steps, + n_input, + n_hidden, + n_output, + needs_grad=False, + activation=False): self.n_models = n_models self.batch_size = batch_size self.n_steps = n_steps @@ -69,8 +78,12 @@ def __init__(self, n_models, batch_size, n_steps, n_input, n_hidden, n_output, n self.n_hidden_node = self.batch_node.dense(ti.j, self.n_hidden) self.weights1_node = 
self.n_hidden_node.dense(ti.k, self.n_input) - self.batch_node.dense(ti.axes(1, 2, 3), (self.n_steps, self.batch_size, self.n_hidden)).place(self.hidden) - self.batch_node.dense(ti.axes(1, 2, 3), (self.n_steps, self.batch_size, self.n_output)).place(self.output) + self.batch_node.dense( + ti.axes(1, 2, 3), + (self.n_steps, self.batch_size, self.n_hidden)).place(self.hidden) + self.batch_node.dense( + ti.axes(1, 2, 3), + (self.n_steps, self.batch_size, self.n_output)).place(self.output) self.weights1 = scalar() self.bias1 = scalar() @@ -87,19 +100,29 @@ def parameters(self): @ti.kernel def weights_init(self): q1 = ti.sqrt(6 / self.n_input) * 0.01 - for model_id, i, j in ti.ndrange(self.n_models, self.n_hidden, self.n_input): + for model_id, i, j in ti.ndrange(self.n_models, self.n_hidden, + self.n_input): self.weights1[model_id, i, j] = (ti.random() * 2 - 1) * q1 @ti.kernel def _forward(self, t: ti.i32, nn_input: ti.template()): - for model_id, k, i, j in ti.ndrange(self.n_models, self.batch_size, self.n_hidden, self.n_input): - self.hidden[model_id, t, k, i] += self.weights1[model_id, i, j] * nn_input[model_id, t, k, j] + for model_id, k, i, j in ti.ndrange(self.n_models, self.batch_size, + self.n_hidden, self.n_input): + self.hidden[model_id, t, k, + i] += self.weights1[model_id, i, + j] * nn_input[model_id, t, k, j] if ti.static(self.activation): - for model_id, k, i in ti.ndrange(self.n_models, self.batch_size, self.n_hidden): - self.output[model_id, t, k, i] = ti.tanh(self.hidden[model_id, t, k, i] + self.bias1[model_id, i]) + for model_id, k, i in ti.ndrange(self.n_models, self.batch_size, + self.n_hidden): + self.output[model_id, t, k, + i] = ti.tanh(self.hidden[model_id, t, k, i] + + self.bias1[model_id, i]) else: - for model_id, k, i in ti.ndrange(self.n_models, self.batch_size, self.n_hidden): - self.output[model_id, t, k, i] = self.hidden[model_id, t, k, i] + self.bias1[model_id, i] + for model_id, k, i in ti.ndrange(self.n_models, self.batch_size, + 
self.n_hidden): + self.output[model_id, t, k, + i] = self.hidden[model_id, t, k, + i] + self.bias1[model_id, i] @ti.kernel def clear(self): @@ -129,7 +152,8 @@ def load_weights_from_value(self, w_val, model_id=0): self.copy_from_numpy(w, val, model_id) @ti.kernel - def copy_from_numpy(self, dst: ti.template(), src: ti.ext_arr(), model_id: ti.i32): + def copy_from_numpy(self, dst: ti.template(), src: ti.ext_arr(), + model_id: ti.i32): for I in ti.grouped(src): dst[model_id, I] = src[I] @@ -144,14 +168,27 @@ def copy_from_numpy(self, dst: ti.template(), src: ti.ext_arr(), model_id: ti.i3 learning_rate = 1e-3 loss = ti.field(float, shape=(), needs_grad=True) - if TRAIN: batch_size = 16 - input_states = ti.field(float, shape=(model_num, steps, batch_size, n_input), needs_grad=True) - fc1 = Linear(n_models=model_num, batch_size=batch_size, n_steps=steps, n_input=n_input, n_hidden=n_hidden, - n_output=n_output, needs_grad=True, activation=False) - fc2 = Linear(n_models=model_num, batch_size=batch_size, n_steps=steps, n_input=n_output, n_hidden=n_hidden, - n_output=n_output_act, needs_grad=True, activation=True) + input_states = ti.field(float, + shape=(model_num, steps, batch_size, n_input), + needs_grad=True) + fc1 = Linear(n_models=model_num, + batch_size=batch_size, + n_steps=steps, + n_input=n_input, + n_hidden=n_hidden, + n_output=n_output, + needs_grad=True, + activation=False) + fc2 = Linear(n_models=model_num, + batch_size=batch_size, + n_steps=steps, + n_input=n_output, + n_hidden=n_hidden, + n_output=n_output_act, + needs_grad=True, + activation=True) fc1.weights_init() fc2.weights_init() NNs = [fc1, fc2] @@ -169,7 +206,8 @@ def copy_from_numpy(self, dst: ti.template(), src: ti.ext_arr(), model_id: ti.i3 def targets_generation(num, x_range, y_range, z_range): low = np.array([x_range[0], y_range[0], z_range[0]]) high = np.array([x_range[1], y_range[1], z_range[1]]) - return np.array([np.random.uniform(low=low, high=high) for _ in range(num)]) + return np.array( 
+ [np.random.uniform(low=low, high=high) for _ in range(num)]) np.random.seed(0) all_data = targets_generation(sample_num, x_range, y_range, z_range) @@ -179,11 +217,25 @@ def targets_generation(num, x_range, y_range, z_range): print("training data ", training_data.shape, "test data ", test_data.shape) else: batch_size = 1 - input_states = ti.field(float, shape=(model_num, steps, batch_size, n_input), needs_grad=False) - fc1 = Linear(n_models=model_num, batch_size=batch_size, n_steps=steps, n_input=n_input, n_hidden=n_hidden, - n_output=n_output, needs_grad=False, activation=False) - fc2 = Linear(n_models=model_num, batch_size=batch_size, n_steps=steps, n_input=n_output, n_hidden=n_hidden, - n_output=n_output_act, needs_grad=False, activation=True) + input_states = ti.field(float, + shape=(model_num, steps, batch_size, n_input), + needs_grad=False) + fc1 = Linear(n_models=model_num, + batch_size=batch_size, + n_steps=steps, + n_input=n_input, + n_hidden=n_hidden, + n_output=n_output, + needs_grad=False, + activation=False) + fc2 = Linear(n_models=model_num, + batch_size=batch_size, + n_steps=steps, + n_input=n_output, + n_hidden=n_hidden, + n_output=n_output_act, + needs_grad=False, + activation=True) file_dir_path = os.path.dirname(os.path.realpath(__file__)) fc1.load_weights(f"{file_dir_path}/fc1_pretrained.pkl", model_id=0) fc2.load_weights(f"{file_dir_path}/fc2_pretrained.pkl", model_id=0) @@ -200,13 +252,15 @@ def targets_generation(num, x_range, y_range, z_range): max_height = ti.field(float, shape=batch_size, needs_grad=True) max_left = ti.field(float, shape=batch_size, needs_grad=True) max_right = ti.field(float, shape=batch_size, needs_grad=True) -jet_force_max = ti.Vector([9.81*3, 9.81*10, 9.81*3]) +jet_force_max = ti.Vector([9.81 * 3, 9.81 * 10, 9.81 * 3]) # Simulation parameters particle_radius = 0.01 particle_diameter = particle_radius * 2 -N_np = ((spawn_box_np[1] - spawn_box_np[0]) / particle_diameter + 1).astype(int) -N_target_np = 
((target_box_np[1] - target_box_np[0]) / particle_diameter + 1).astype(int) +N_np = ((spawn_box_np[1] - spawn_box_np[0]) / particle_diameter + + 1).astype(int) +N_target_np = ((target_box_np[1] - target_box_np[0]) / particle_diameter + + 1).astype(int) h = 4.0 * particle_radius fluid_particle_num = N_np[0] * N_np[1] * N_np[2] @@ -217,8 +271,10 @@ def targets_generation(num, x_range, y_range, z_range): pos = ti.Vector.field(3, float) vel = ti.Vector.field(3, float) acc = ti.Vector.field(3, float) -jet_force = ti.Vector.field(3, float, shape=(steps, batch_size), needs_grad=True) - +jet_force = ti.Vector.field(3, + float, + shape=(steps, batch_size), + needs_grad=True) col = ti.Vector.field(3, float) material = ti.field(int) @@ -227,7 +283,8 @@ def targets_generation(num, x_range, y_range, z_range): pos_vis_buffer = ti.Vector.field(3, float, shape=particle_num) pos_output_buffer = ti.Vector.field(3, float, shape=(steps, particle_num)) -ti.root.dense(ti.ijk, (batch_size, steps, int(particle_num))).place(pos, vel, acc, den, pre) +ti.root.dense(ti.ijk, (batch_size, steps, int(particle_num))).place( + pos, vel, acc, den, pre) ti.root.dense(ti.i, int(particle_num)).place(material, col) ti.root.lazy_grad() @@ -292,16 +349,15 @@ def W_spiky_gradient(R, h): @ti.kernel -def initialize_fluid_particle(t: ti.int32, pos: ti.template(), N_fluid: ti.template()): +def initialize_fluid_particle(t: ti.int32, pos: ti.template(), + N_fluid: ti.template()): # Allocate fluid for bs, i in ti.ndrange(batch_size, fluid_particle_num): - pos[bs, t, i] = ( - ti.Vector( - [int(i % N_fluid[0]), int(i / N_fluid[0]) % N_fluid[1], int(i / N_fluid[0] / N_fluid[1] % N_fluid[2])] - ) - * particle_diameter - + spawn_box[0] - ) + pos[bs, t, i] = (ti.Vector([ + int(i % N_fluid[0]), + int(i / N_fluid[0]) % N_fluid[1], + int(i / N_fluid[0] / N_fluid[1] % N_fluid[2]) + ]) * particle_diameter + spawn_box[0]) vel[bs, t, i] = ti.Vector([0.0, 0.0, 0.0]) material[i] = 0 col[i] = ti.Vector([0.4, 0.7, 1.0]) @@ 
-320,16 +376,17 @@ def initialize_dists(): @ti.kernel -def initialize_target_particle(t: ti.int32, pos: ti.template(), N_target:ti.template(), current_pos: ti.int32): +def initialize_target_particle(t: ti.int32, pos: ti.template(), + N_target: ti.template(), current_pos: ti.int32): # Allocate target cube - for bs, i in ti.ndrange(batch_size, (fluid_particle_num, fluid_particle_num + target_particle_num)): - pos[bs, t, i] = ( - ti.Vector( - [int(i % N_target[0]), int(i / N_target[0]) % N_target[1], int(i / N_target[0] / N_target[1] % N_target[2])] - ) - * particle_diameter - + target_centers[current_pos] - ) + for bs, i in ti.ndrange( + batch_size, + (fluid_particle_num, fluid_particle_num + target_particle_num)): + pos[bs, t, i] = (ti.Vector([ + int(i % N_target[0]), + int(i / N_target[0]) % N_target[1], + int(i / N_target[0] / N_target[1] % N_target[2]) + ]) * particle_diameter + target_centers[current_pos]) vel[bs, t, i] = ti.Vector([0.0, 0.0, 0.0]) material[i] = 1 col[i] = ti.Vector([1.0, 0.65, 0.0]) @@ -355,7 +412,8 @@ def update_density(t: ti.int32): @ti.kernel def update_pressure(t: ti.int32): for bs, i in ti.ndrange(batch_size, particle_num): - pre[bs, t, i] = pressure_scale * max(pow(den[bs, t, i] / rest_density, gamma) - 1, 0) + pre[bs, t, i] = pressure_scale * max( + pow(den[bs, t, i] / rest_density, gamma) - 1, 0) @ti.kernel @@ -371,9 +429,14 @@ def apply_force(t: ti.int32): if material[i] == 1: acc[bs, t, i] = ti.Vector([0.0, 0.0, 0.0]) else: - if pos[bs, t, i][0] > 0.2 and pos[bs, t, i][0] < 0.3 and pos[bs, t, i][1] < 0.2 and pos[bs, t, i][2] > 0.2 and pos[bs, t, i][2] < 0.3: + if pos[bs, t, i][0] > 0.2 and pos[bs, t, i][0] < 0.3 and pos[ + bs, t, i][1] < 0.2 and pos[bs, t, + i][2] > 0.2 and pos[bs, t, + i][2] < 0.3: indicator = (steps - t) // (steps // 2) - acc[bs, t, i] = jet_force[t, bs] + gravity + indicator * (- gravity) * 0.1 + acc[bs, t, + i] = jet_force[t, + bs] + gravity + indicator * (-gravity) * 0.1 else: acc[bs, t, i] = gravity @@ -384,7 
+447,10 @@ def update_force(t: ti.int32): for j in range(particle_num): R = pos[bs, t, i] - pos[bs, t, j] # Pressure forces - acc[bs, t, i] += -mass * (pre[bs, t, i] / (den[bs, t, i] * den[bs, t, i]) + pre[bs, t, j] / (den[bs, t, j] * den[bs, t, j])) * W_gradient(R, h) + acc[bs, t, i] += -mass * ( + pre[bs, t, i] / + (den[bs, t, i] * den[bs, t, i]) + pre[bs, t, j] / + (den[bs, t, j] * den[bs, t, j])) * W_gradient(R, h) # Viscosity forces acc[bs, t, i] += viscosity_scale * mass \ @@ -415,7 +481,8 @@ def boundary_handle(t: ti.int32): collision_normal_length = collision_normal.norm() if collision_normal_length > eps: collision_normal /= collision_normal_length - vel[bs, t, i] -= (1.0 + damping) * collision_normal.dot(vel[bs, t, i]) * collision_normal + vel[bs, t, i] -= (1.0 + damping) * collision_normal.dot( + vel[bs, t, i]) * collision_normal @ti.kernel @@ -435,7 +502,9 @@ def compute_dist(t: ti.int32): @ti.kernel def compute_loss(t: ti.int32): for bs in range(batch_size): - max_dist[bs] = ti.sqrt((max_left[bs] - target_centers[bs][0])**2 + (max_right[bs] - target_centers[bs][2])**2 + (max_height[bs] - target_centers[bs][1])**2) + max_dist[bs] = ti.sqrt((max_left[bs] - target_centers[bs][0])**2 + + (max_right[bs] - target_centers[bs][2])**2 + + (max_height[bs] - target_centers[bs][1])**2) loss[None] += (min_dist[bs] + 0.2 * max_dist[bs]) / batch_size @@ -470,7 +539,7 @@ def copy_from_output_to_vis(t: ti.int32): @ti.kernel def fill_target_centers(current_pos: ti.int32, data: ti.any_arr()): - for i in range(current_pos, current_pos+batch_size): + for i in range(current_pos, current_pos + batch_size): for j in ti.static(range(3)): target_centers[i][j] = data[i, j] print('target_centers ', target_centers[current_pos]) @@ -478,7 +547,7 @@ def fill_target_centers(current_pos: ti.int32, data: ti.any_arr()): @ti.kernel def fill_input_states(current_pos: ti.int32): - for t, bs in ti.ndrange(steps, (current_pos, current_pos+batch_size)): + for t, bs in ti.ndrange(steps, 
(current_pos, current_pos + batch_size)): for j in ti.static(range(3)): input_states[0, t, bs, j] = target_centers[bs][j] @@ -505,12 +574,14 @@ def fill_input_states(current_pos: ti.int32): for opt_iter in range(opt_iters): loss_epoch = 0.0 cnt = 0 - for current_data_offset in range(0, training_sample_num, batch_size): + for current_data_offset in range(0, training_sample_num, + batch_size): fill_target_centers(current_data_offset, training_data) fill_input_states(current_data_offset) initialize_fluid_particle(0, pos, N_fluid) initialize_dists() - initialize_target_particle(0, pos, N_target, current_data_offset) + initialize_target_particle(0, pos, N_target, + current_data_offset) fc1.clear() fc2.clear() with ti.Tape(loss=loss): @@ -530,11 +601,14 @@ def fill_input_states(current_pos: ti.int32): compute_dist(i) compute_loss(steps - 1) optimizer.step() - print(f"current opt progress: {current_data_offset + batch_size}/{training_sample_num}, loss: {loss[None]}") + print( + f"current opt progress: {current_data_offset + batch_size}/{training_sample_num}, loss: {loss[None]}" + ) losses.append(loss[None]) loss_epoch += loss[None] cnt += 1 - print(f'opt iter {opt_iter} done. Average loss: {loss_epoch / cnt}') + print( + f'opt iter {opt_iter} done. 
Average loss: {loss_epoch / cnt}') losses_epoch_avg.append(loss_epoch / cnt) if TRAIN_VISUAL: @@ -544,21 +618,34 @@ def fill_input_states(current_pos: ti.int32): if i % substeps == 0: copy_from_output_to_vis(i) scene.set_camera(camera) - scene.point_light((2.0, 2.0, 2.0), color=(1.0, 1.0, 1.0)) - scene.particles(pos_vis_buffer, radius=particle_radius, per_vertex_color=col) + scene.point_light((2.0, 2.0, 2.0), + color=(1.0, 1.0, 1.0)) + scene.particles(pos_vis_buffer, + radius=particle_radius, + per_vertex_color=col) canvas.scene(scene) if TRAIN_OUTPUT_IMG: if i % substeps == 0: - window.write_image(f'output_img/{opt_iter}/{i:04}.png') + window.write_image( + f'output_img/{opt_iter}/{i:04}.png') if TRAIN_VISUAL_SHOW: window.show() if opt_iter % 2 == 0: os.makedirs(f"saved_models/{opt_iter}", exist_ok=True) - fc1.dump_weights(name=f"saved_models/{opt_iter}/fc1_{opt_iter:04}.pkl") - fc2.dump_weights(name=f"saved_models/{opt_iter}/fc2_{opt_iter:04}.pkl") - - plt.plot([i for i in range(len(losses))], losses, label='loss per iteration') - plt.plot([i * (training_sample_num // batch_size) for i in range(len(losses_epoch_avg))], losses_epoch_avg, label='loss epoch avg.') + fc1.dump_weights( + name=f"saved_models/{opt_iter}/fc1_{opt_iter:04}.pkl") + fc2.dump_weights( + name=f"saved_models/{opt_iter}/fc2_{opt_iter:04}.pkl") + + plt.plot([i for i in range(len(losses))], + losses, + label='loss per iteration') + plt.plot([ + i * (training_sample_num // batch_size) + for i in range(len(losses_epoch_avg)) + ], + losses_epoch_avg, + label='loss epoch avg.') plt.title("Training Loss") plt.xlabel("Training Iterations") plt.ylabel("Loss") @@ -579,20 +666,18 @@ def fill_input_states(current_pos: ti.int32): window.GUI.text("Space: pause") window.GUI.text("Set target positions:") - target_centers[current_data_offset][0] = window.GUI.slider_float("X", - target_centers[current_data_offset][0], - 0.05, 0.45) - target_centers[current_data_offset][1] = window.GUI.slider_float("Y", - 
target_centers[current_data_offset][1], - 0.4, 1.0) - target_centers[current_data_offset][2] = window.GUI.slider_float("Z", - target_centers[current_data_offset][2], - 0.05, 0.45) + target_centers[current_data_offset][0] = window.GUI.slider_float( + "X", target_centers[current_data_offset][0], 0.05, 0.45) + target_centers[current_data_offset][1] = window.GUI.slider_float( + "Y", target_centers[current_data_offset][1], 0.4, 1.0) + target_centers[current_data_offset][2] = window.GUI.slider_float( + "Z", target_centers[current_data_offset][2], 0.05, 0.45) window.GUI.end() if not paused[None]: fill_input_states(current_data_offset) - initialize_target_particle(0, pos, N_target, current_data_offset) + initialize_target_particle(0, pos, N_target, + current_data_offset) fc1.clear() fc2.clear() for i in range(1, substeps): @@ -623,15 +708,18 @@ def fill_input_states(current_pos: ti.int32): paused[None] = not paused[None] camera.position(*(camera.curr_position + position_change)) camera.lookat(*(camera.curr_lookat + position_change)) - camera.track_user_inputs(window, movement_speed=movement_speed, hold_key=ti.ui.RMB) + camera.track_user_inputs(window, + movement_speed=movement_speed, + hold_key=ti.ui.RMB) scene.set_camera(camera) scene.point_light((2.0, 2.0, 2.0), color=(1.0, 1.0, 1.0)) - scene.particles(pos_vis_buffer, radius=particle_radius, per_vertex_color=col) + scene.particles(pos_vis_buffer, + radius=particle_radius, + per_vertex_color=col) canvas.scene(scene) if INFER_OUTPUT_IMG: if cnt % 2 == 0: os.makedirs(f"demo_output_interactive/", exist_ok=True) window.write_image(f'demo_output_interactive/{cnt:04}.png') window.show() - diff --git a/python/taichi/examples/features/io/export_mesh.py b/python/taichi/examples/features/io/export_mesh.py index c2090b0e95b6f..21b2bfa495ac9 100644 --- a/python/taichi/examples/features/io/export_mesh.py +++ b/python/taichi/examples/features/io/export_mesh.py @@ -92,7 +92,9 @@ b = np.random.rand(20) alpha = np.random.rand(20) # 
re-fill - writer = ti.tools.PLYWriter(num_vertices=20, num_faces=12, face_type="quad") + writer = ti.tools.PLYWriter(num_vertices=20, + num_faces=12, + face_type="quad") writer.add_vertex_pos(x, y, z) writer.add_faces(indices) writer.add_vertex_channel("vdata1", "double", vdata) diff --git a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py index 4be661f9fbead..0f23939d12fb3 100644 --- a/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py +++ b/python/taichi/examples/ggui_examples/mass_spring_3d_ggui.py @@ -27,6 +27,7 @@ bending_springs = False + @ti.kernel def initialize_mass_points(): random_offset = ti.Vector([ti.random() - 0.5, ti.random() - 0.5]) * 0.1 @@ -58,6 +59,7 @@ def initialize_mesh_indices(): else: colors[i * n + j] = (1, 0.334, 0.52) + initialize_mesh_indices() spring_offsets = [] @@ -73,6 +75,7 @@ def initialize_mesh_indices(): if (i, j) != (0, 0) and abs(i) + abs(j) <= 2: spring_offsets.append(ti.Vector([i, j])) + @ti.kernel def substep(): for i in ti.grouped(x): @@ -105,7 +108,6 @@ def substep(): x[i] += dt * v[i] - @ti.kernel def update_vertices(): for i, j in ti.ndrange(n, n): @@ -145,7 +147,9 @@ def update_vertices(): two_sided=True) # Draw a smaller ball to avoid visual penetration - scene.particles(ball_center, radius=ball_radius * 0.95, color=(0.5, 0.42, 0.8)) + scene.particles(ball_center, + radius=ball_radius * 0.95, + color=(0.5, 0.42, 0.8)) canvas.scene(scene) window.show() diff --git a/python/taichi/examples/graph/mpm88_graph.py b/python/taichi/examples/graph/mpm88_graph.py index 50ba7554fa7b8..c59263d7ffb71 100644 --- a/python/taichi/examples/graph/mpm88_graph.py +++ b/python/taichi/examples/graph/mpm88_graph.py @@ -1,5 +1,7 @@ import argparse + import numpy as np + import taichi as ti ti.init(arch=ti.vulkan) @@ -17,7 +19,6 @@ N_ITER = 500 # Use 500 to make speed diff more obvious - @ti.kernel def substep_reset_grid(grid_v: ti.any_arr(field_dim=2), grid_m: 
ti.any_arr(field_dim=2)): @@ -95,6 +96,7 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), v[i] = [0, -1] J[i] = 1 + x = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) v = ti.Vector.ndarray(2, ti.f32, shape=(n_particles)) @@ -105,19 +107,29 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--baseline', - action='store_true') + parser.add_argument('--baseline', action='store_true') args, unknown = parser.parse_known_args() if not args.baseline: print('running in graph mode') # Build graph - sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'x', ti.f32, element_shape=(2, )) - sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'v', ti.f32, element_shape=(2, )) - sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'C', ti.f32, element_shape=(2, 2)) + sym_x = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'x', + ti.f32, + element_shape=(2, )) + sym_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'v', + ti.f32, + element_shape=(2, )) + sym_C = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'C', + ti.f32, + element_shape=(2, 2)) sym_J = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'J', ti.f32) - sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_v', ti.f32, element_shape=(2, )) + sym_grid_v = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, + 'grid_v', + ti.f32, + element_shape=(2, )) sym_grid_m = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'grid_m', ti.f32) g_init_builder = ti.graph.GraphBuilder() g_init_builder.dispatch(init_particles, sym_x, sym_v, sym_J) @@ -127,7 +139,7 @@ def init_particles(x: ti.any_arr(field_dim=1), v: ti.any_arr(field_dim=1), substep.dispatch(substep_reset_grid, sym_grid_v, sym_grid_m) substep.dispatch(substep_p2g, sym_x, sym_v, sym_C, sym_J, sym_grid_v, - sym_grid_m) + sym_grid_m) substep.dispatch(substep_update_grid_v, sym_grid_v, sym_grid_m) substep.dispatch(substep_g2p, sym_x, sym_v, sym_C, sym_J, sym_grid_v) diff --git 
a/python/taichi/examples/graph/stable_fluid_graph.py b/python/taichi/examples/graph/stable_fluid_graph.py index c4b0826e953f6..9a33ac5342fe2 100644 --- a/python/taichi/examples/graph/stable_fluid_graph.py +++ b/python/taichi/examples/graph/stable_fluid_graph.py @@ -7,6 +7,7 @@ import argparse import numpy as np + import taichi as ti ti.init(arch=ti.vulkan) @@ -214,11 +215,10 @@ def reset(): pressures_pair.cur.fill(0) dyes_pair.cur.fill(0) + if __name__ == "__main__": parser = argparse.ArgumentParser() - parser.add_argument( - '--baseline', - action='store_true') + parser.add_argument('--baseline', action='store_true') args, unknown = parser.parse_known_args() gui = ti.GUI('Stable Fluid', (res, res)) @@ -261,38 +261,44 @@ def reset(): 'pressures_pair_nxt', ti.f32) velocity_divs = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'velocity_divs', ti.f32) - mouse_data = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'mouse_data', ti.f32) + mouse_data = ti.graph.Arg(ti.graph.ArgKind.NDARRAY, 'mouse_data', + ti.f32) g1_builder = ti.graph.GraphBuilder() g1_builder.dispatch(advect, velocities_pair_cur, velocities_pair_cur, - velocities_pair_nxt) - g1_builder.dispatch(advect, velocities_pair_cur, dyes_pair_cur, dyes_pair_nxt) - g1_builder.dispatch(apply_impulse, velocities_pair_nxt, dyes_pair_nxt, mouse_data) + velocities_pair_nxt) + g1_builder.dispatch(advect, velocities_pair_cur, dyes_pair_cur, + dyes_pair_nxt) + g1_builder.dispatch(apply_impulse, velocities_pair_nxt, dyes_pair_nxt, + mouse_data) g1_builder.dispatch(divergence, velocities_pair_nxt, velocity_divs) # swap is unrolled in the loop so we only need p_jacobi_iters // 2 iterations. 
for _ in range(p_jacobi_iters // 2): - g1_builder.dispatch(pressure_jacobi, pressures_pair_cur, pressures_pair_nxt, - velocity_divs) - g1_builder.dispatch(pressure_jacobi, pressures_pair_nxt, pressures_pair_cur, - velocity_divs) - g1_builder.dispatch(subtract_gradient, velocities_pair_nxt, pressures_pair_cur) + g1_builder.dispatch(pressure_jacobi, pressures_pair_cur, + pressures_pair_nxt, velocity_divs) + g1_builder.dispatch(pressure_jacobi, pressures_pair_nxt, + pressures_pair_cur, velocity_divs) + g1_builder.dispatch(subtract_gradient, velocities_pair_nxt, + pressures_pair_cur) g1 = g1_builder.compile() g2_builder = ti.graph.GraphBuilder() g2_builder.dispatch(advect, velocities_pair_nxt, velocities_pair_nxt, - velocities_pair_cur) - g2_builder.dispatch(advect, velocities_pair_nxt, dyes_pair_nxt, dyes_pair_cur) - g2_builder.dispatch(apply_impulse, velocities_pair_cur, dyes_pair_cur, mouse_data) + velocities_pair_cur) + g2_builder.dispatch(advect, velocities_pair_nxt, dyes_pair_nxt, + dyes_pair_cur) + g2_builder.dispatch(apply_impulse, velocities_pair_cur, dyes_pair_cur, + mouse_data) g2_builder.dispatch(divergence, velocities_pair_cur, velocity_divs) for _ in range(p_jacobi_iters // 2): - g2_builder.dispatch(pressure_jacobi, pressures_pair_cur, pressures_pair_nxt, - velocity_divs) - g2_builder.dispatch(pressure_jacobi, pressures_pair_nxt, pressures_pair_cur, - velocity_divs) - g2_builder.dispatch(subtract_gradient, velocities_pair_cur, pressures_pair_cur) + g2_builder.dispatch(pressure_jacobi, pressures_pair_cur, + pressures_pair_nxt, velocity_divs) + g2_builder.dispatch(pressure_jacobi, pressures_pair_nxt, + pressures_pair_cur, velocity_divs) + g2_builder.dispatch(subtract_gradient, velocities_pair_cur, + pressures_pair_cur) g2 = g2_builder.compile() - swap = True while gui.running: diff --git a/tests/python/examples/__init__.py b/tests/python/examples/__init__.py index 8b137891791fe..e69de29bb2d1d 100644 --- a/tests/python/examples/__init__.py +++ 
b/tests/python/examples/__init__.py @@ -1 +0,0 @@ - diff --git a/tests/python/examples/autodiff/__init__.py b/tests/python/examples/autodiff/__init__.py index 8b137891791fe..e69de29bb2d1d 100644 --- a/tests/python/examples/autodiff/__init__.py +++ b/tests/python/examples/autodiff/__init__.py @@ -1 +0,0 @@ - From 6349d60487ee5a1db408f03b93a007b4382de0b2 Mon Sep 17 00:00:00 2001 From: Yi Xu Date: Mon, 13 Jun 2022 12:48:00 +0800 Subject: [PATCH 175/176] [type] [refactor] Misc improvements to quant codegen (#5129) * Replace is_custom_type() with is_quant() * Rename two functions * Use get_constant() if possible * Rename two metal functions * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> --- python/taichi/lang/matrix.py | 4 +-- taichi/backends/metal/codegen_metal.cpp | 22 ++++++++-------- .../metal/shaders/snode_bit_pointer.metal.h | 2 +- taichi/codegen/codegen_llvm.h | 8 +++--- taichi/codegen/codegen_llvm_quant.cpp | 25 +++++++++---------- taichi/ir/frontend_ir.cpp | 6 ++--- taichi/ir/type_utils.h | 2 +- taichi/python/export_lang.cpp | 2 +- taichi/transforms/type_check.cpp | 2 +- 9 files changed, 35 insertions(+), 38 deletions(-) diff --git a/python/taichi/lang/matrix.py b/python/taichi/lang/matrix.py index 1c7add5cd19c3..103246daeb381 100644 --- a/python/taichi/lang/matrix.py +++ b/python/taichi/lang/matrix.py @@ -1457,8 +1457,8 @@ def _calc_dynamic_index_stride(self): return length = len(paths[0]) if any( - len(path) != length or ti_core.is_custom_type(path[length - - 1]._dtype) + len(path) != length or ti_core.is_quant(path[length - + 1]._dtype) for path in paths): return for i in range(length): diff --git a/taichi/backends/metal/codegen_metal.cpp b/taichi/backends/metal/codegen_metal.cpp index a020c4635d26a..e3c9aa95f282d 100644 --- a/taichi/backends/metal/codegen_metal.cpp +++ b/taichi/backends/metal/codegen_metal.cpp @@ 
-981,7 +981,7 @@ class KernelCodegenImpl : public IRVisitor { validate_cft_for_metal(cft); auto *digits_cit = cft->get_digits_type()->as(); cit = digits_cit; - store_value_expr = construct_float_to_custom_int_expr( + store_value_expr = construct_quant_fixed_to_quant_int_expr( stmt->val, cft->get_scale(), digits_cit); } else { TI_NOT_IMPLEMENTED; @@ -1004,10 +1004,10 @@ class KernelCodegenImpl : public IRVisitor { TI_ASSERT(ptr_type->is_bit_pointer()); auto *pointee_type = ptr_type->get_pointee_type(); if (auto *cit = pointee_type->cast()) { - return construct_load_as_custom_int(stmt->src, cit); + return construct_load_quant_int(stmt->src, cit); } else if (auto *cft = pointee_type->cast()) { validate_cft_for_metal(cft); - const auto loaded = construct_load_as_custom_int( + const auto loaded = construct_load_quant_int( stmt->src, cft->get_digits_type()->as()); // Computes `float(digits_expr) * scale` // See LLVM backend's reconstruct_quant_fixed() @@ -1033,8 +1033,8 @@ class KernelCodegenImpl : public IRVisitor { val_expr = stmt->val->raw_name(); } else if (auto *cft = pointee_type->cast()) { cit = cft->get_digits_type()->as(); - val_expr = - construct_float_to_custom_int_expr(stmt->val, cft->get_scale(), cit); + val_expr = construct_quant_fixed_to_quant_int_expr(stmt->val, + cft->get_scale(), cit); } else { TI_NOT_IMPLEMENTED; } @@ -1051,7 +1051,7 @@ class KernelCodegenImpl : public IRVisitor { } // Returns the expression of `int(val_stmt * (1.0f / scale) + 0.5f)` - std::string construct_float_to_custom_int_expr( + std::string construct_quant_fixed_to_quant_int_expr( const Stmt *val_stmt, float64 scale, CustomIntType *digits_cit) const { @@ -1062,14 +1062,14 @@ class KernelCodegenImpl : public IRVisitor { // variables) because |val_stmt| could be used multiple times. If the // intermediate variables are named based on |val_stmt|, it would result in // symbol redefinitions. 
- return fmt::format("mtl_float_to_custom_int<{}>(/*inv_scale=*/{} * {})", - metal_data_type_name(compute_dt), inv_scale, - val_stmt->raw_name()); + return fmt::format( + "mtl_quant_fixed_to_quant_int<{}>(/*inv_scale=*/{} * {})", + metal_data_type_name(compute_dt), inv_scale, val_stmt->raw_name()); } // Returns expression of the loaded integer. - std::string construct_load_as_custom_int(const Stmt *bit_ptr_stmt, - CustomIntType *cit) const { + std::string construct_load_quant_int(const Stmt *bit_ptr_stmt, + CustomIntType *cit) const { DataType compute_dt(cit->get_compute_type()->as()); const auto num_bits = cit->get_num_bits(); if (is_full_bits(num_bits)) { diff --git a/taichi/backends/metal/shaders/snode_bit_pointer.metal.h b/taichi/backends/metal/shaders/snode_bit_pointer.metal.h index 63310fe1ff39f..4b390e5ceac42 100644 --- a/taichi/backends/metal/shaders/snode_bit_pointer.metal.h +++ b/taichi/backends/metal/shaders/snode_bit_pointer.metal.h @@ -38,7 +38,7 @@ STR( // |f| should already be scaled. |C| is the compute type. template - C mtl_float_to_custom_int(float f) { + C mtl_quant_fixed_to_quant_int(float f) { // Branch free implementation of `f + sign(f) * 0.5`. 
// See rounding_prepare_f* in taichi/runtime/llvm/runtime.cpp const int32_t delta_bits = diff --git a/taichi/codegen/codegen_llvm.h b/taichi/codegen/codegen_llvm.h index af7a0bf091740..662ef2d9d7199 100644 --- a/taichi/codegen/codegen_llvm.h +++ b/taichi/codegen/codegen_llvm.h @@ -265,9 +265,9 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { void visit(GlobalStoreStmt *stmt) override; - llvm::Value *custom_type_to_bits(llvm::Value *val, - Type *input_type, - Type *output_type); + llvm::Value *quant_int_or_quant_fixed_to_bits(llvm::Value *val, + Type *input_type, + Type *output_type); void visit(BitStructStoreStmt *stmt) override; @@ -399,7 +399,7 @@ class CodeGenLLVM : public IRVisitor, public LLVMModuleBuilder { llvm::Value *extract_digits_from_f32(llvm::Value *f, bool full); - llvm::Value *extract_digits_from_quant_float_with_shared_exponent( + llvm::Value *extract_digits_from_f32_with_shared_exponent( llvm::Value *f, llvm::Value *shared_exp); diff --git a/taichi/codegen/codegen_llvm_quant.cpp b/taichi/codegen/codegen_llvm_quant.cpp index ec3c764bfd78d..6e1aaa175ebde 100644 --- a/taichi/codegen/codegen_llvm_quant.cpp +++ b/taichi/codegen/codegen_llvm_quant.cpp @@ -51,9 +51,8 @@ llvm::Value *CodeGenLLVM::quant_fixed_to_quant_int(CustomFloatType *cft, // Compute int(real * (1.0 / scale) + 0.5) auto s_numeric = 1.0 / cft->get_scale(); auto compute_type = cft->get_compute_type(); - s = builder->CreateFPCast( - llvm::ConstantFP::get(*llvm_context, llvm::APFloat(s_numeric)), - llvm_type(compute_type)); + s = builder->CreateFPCast(tlctx->get_constant(s_numeric), + llvm_type(compute_type)); auto input_real = builder->CreateFPCast(real, llvm_type(compute_type)); auto scaled = builder->CreateFMul(input_real, s); @@ -128,9 +127,9 @@ llvm::Value *CodeGenLLVM::get_exponent_offset(llvm::Value *exponent, tlctx->get_constant(0)); } -llvm::Value *CodeGenLLVM::custom_type_to_bits(llvm::Value *val, - Type *input_type, - Type *output_type) { +llvm::Value 
*CodeGenLLVM::quant_int_or_quant_fixed_to_bits(llvm::Value *val, + Type *input_type, + Type *output_type) { CustomIntType *cit = nullptr; if (auto cft = input_type->cast()) { TI_ASSERT(cft->get_exponent_type() == nullptr); @@ -262,7 +261,8 @@ void CodeGenLLVM::visit(BitStructStoreStmt *stmt) { val = builder->CreateBitCast(val, llvm_type(bit_struct_physical_type)); val = builder->CreateShl(val, digits_snode->bit_offset); } else { - val = custom_type_to_bits(val, dtype, bit_struct_physical_type); + val = quant_int_or_quant_fixed_to_bits(val, dtype, + bit_struct_physical_type); val = builder->CreateShl(val, bit_struct_snode->ch[ch_id]->bit_offset); } @@ -374,8 +374,8 @@ void CodeGenLLVM::store_quant_floats_with_shared_exponents( for (int c = 0; c < (int)exp->exponent_users.size(); c++) { auto user = exp->exponent_users[c]; auto ch_id = snode->child_id(user); - auto digits = extract_digits_from_quant_float_with_shared_exponent( - floats[c], max_exp_bits); + auto digits = + extract_digits_from_f32_with_shared_exponent(floats[c], max_exp_bits); auto digits_snode = snode->ch[ch_id].get(); auto cft = digits_snode->dt->as(); auto digits_bit_offset = digits_snode->bit_offset; @@ -435,7 +435,7 @@ llvm::Value *CodeGenLLVM::extract_digits_from_f32(llvm::Value *f, bool full) { return digits; } -llvm::Value *CodeGenLLVM::extract_digits_from_quant_float_with_shared_exponent( +llvm::Value *CodeGenLLVM::extract_digits_from_f32_with_shared_exponent( llvm::Value *f, llvm::Value *shared_exp) { auto exp = extract_exponent_from_f32(f); @@ -518,13 +518,12 @@ llvm::Value *CodeGenLLVM::reconstruct_quant_fixed(llvm::Value *digits, // Compute float(digits) * scale llvm::Value *cast = nullptr; auto compute_type = cft->get_compute_type()->as(); - if (cft->get_digits_type()->cast()->get_is_signed()) { + if (cft->get_is_signed()) { cast = builder->CreateSIToFP(digits, llvm_type(compute_type)); } else { cast = builder->CreateUIToFP(digits, llvm_type(compute_type)); } - llvm::Value *s = - 
llvm::ConstantFP::get(*llvm_context, llvm::APFloat(cft->get_scale())); + llvm::Value *s = tlctx->get_constant(cft->get_scale()); s = builder->CreateFPCast(s, llvm_type(compute_type)); return builder->CreateFMul(cast, s); } diff --git a/taichi/ir/frontend_ir.cpp b/taichi/ir/frontend_ir.cpp index 93fca8a78e6ef..46cf48778ce1b 100644 --- a/taichi/ir/frontend_ir.cpp +++ b/taichi/ir/frontend_ir.cpp @@ -519,10 +519,8 @@ void AtomicOpExpression::type_check(CompileConfig *) { }; if (!val->ret_type->is()) error(); - if (auto cit = dest->ret_type->cast()) { - ret_type = cit->get_compute_type(); - } else if (auto cft = dest->ret_type->cast()) { - ret_type = cft->get_compute_type(); + if (is_quant(dest->ret_type)) { + ret_type = dest->ret_type->get_compute_type(); } else if (dest->ret_type->is()) { ret_type = dest->ret_type; } else { diff --git a/taichi/ir/type_utils.h b/taichi/ir/type_utils.h index b5e10ed0f29c5..da1087bc9d7e5 100644 --- a/taichi/ir/type_utils.h +++ b/taichi/ir/type_utils.h @@ -73,7 +73,7 @@ inline PrimitiveTypeID get_primitive_data_type() { } } -inline bool is_custom_type(DataType dt) { +inline bool is_quant(DataType dt) { return dt->is() || dt->is(); } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index ffb7161dda1eb..fd1443c9f849b 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -865,7 +865,7 @@ void export_lang(py::module &m) { #undef PER_TYPE m.def("data_type_size", data_type_size); - m.def("is_custom_type", is_custom_type); + m.def("is_quant", is_quant); m.def("is_integral", is_integral); m.def("is_signed", is_signed); m.def("is_real", is_real); diff --git a/taichi/transforms/type_check.cpp b/taichi/transforms/type_check.cpp index a7cc5b60ecfb8..10132bafd3d40 100644 --- a/taichi/transforms/type_check.cpp +++ b/taichi/transforms/type_check.cpp @@ -23,7 +23,7 @@ class TypeCheck : public IRVisitor { Stmt *&val, const std::string &stmt_name) { auto dst_type = dst->ret_type.ptr_removed(); - if 
(dst_type->is() || dst_type->is()) { + if (is_quant(dst_type)) { // We force the value type to be the compute_type of the bit pointer. // Casting from compute_type to physical_type is handled in codegen. dst_type = dst_type->get_compute_type(); From fae94a2111ec20e10cf77d0aaff99e2a46bca372 Mon Sep 17 00:00:00 2001 From: Zhanlue Yang Date: Mon, 13 Jun 2022 17:21:17 +0800 Subject: [PATCH 176/176] [aot] [llvm] LLVM AOT Field #2: Updated LLVM AOTModuleLoader & AOTModuleBuilder to support Fields (#5120) * [aot] [llvm] Implemented FieldCacheData and refactored initialize_llvm_runtime_snodes() * Addressed compilation erros * [aot] [llvm] LLVM AOT Field #1: Adjust serialization/deserialization logics for FieldCacheData * [llvm] [aot] Added Field support for LLVM AOT * [aot] [llvm] LLVM AOT Field #2: Updated LLVM AOTModuleLoader & AOTModuleBuilder to support Fields * Fixed merge issues * Stopped abusing Program* --- taichi/backends/cpu/aot_module_builder_impl.h | 5 ++ .../backends/cpu/aot_module_loader_impl.cpp | 5 -- .../backends/cuda/aot_module_builder_impl.h | 5 ++ .../backends/cuda/aot_module_loader_impl.cpp | 5 -- taichi/ir/snode.cpp | 2 +- taichi/ir/snode.h | 2 +- taichi/llvm/llvm_aot_module_builder.cpp | 33 ++++++++++ taichi/llvm/llvm_aot_module_builder.h | 12 ++++ taichi/llvm/llvm_aot_module_loader.cpp | 55 ++++++++++++++++ taichi/llvm/llvm_aot_module_loader.h | 21 ++++++ taichi/llvm/llvm_offline_cache.h | 2 +- taichi/llvm/llvm_program.cpp | 66 +++++++++++-------- taichi/llvm/llvm_program.h | 23 +++++-- 13 files changed, 190 insertions(+), 46 deletions(-) diff --git a/taichi/backends/cpu/aot_module_builder_impl.h b/taichi/backends/cpu/aot_module_builder_impl.h index 1d81fa41d7c2e..039174aa88503 100644 --- a/taichi/backends/cpu/aot_module_builder_impl.h +++ b/taichi/backends/cpu/aot_module_builder_impl.h @@ -9,6 +9,11 @@ namespace lang { namespace cpu { class AotModuleBuilderImpl : public LlvmAotModuleBuilder { + public: + explicit 
AotModuleBuilderImpl(LlvmProgramImpl *prog) + : LlvmAotModuleBuilder(prog) { + } + private: CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) override; }; diff --git a/taichi/backends/cpu/aot_module_loader_impl.cpp b/taichi/backends/cpu/aot_module_loader_impl.cpp index e2ff3b2ecf0f6..16c297dced325 100644 --- a/taichi/backends/cpu/aot_module_loader_impl.cpp +++ b/taichi/backends/cpu/aot_module_loader_impl.cpp @@ -44,11 +44,6 @@ class AotModuleImpl : public LlvmAotModule { TI_NOT_IMPLEMENTED; return nullptr; } - - std::unique_ptr make_new_field(const std::string &name) override { - TI_NOT_IMPLEMENTED; - return nullptr; - } }; } // namespace diff --git a/taichi/backends/cuda/aot_module_builder_impl.h b/taichi/backends/cuda/aot_module_builder_impl.h index f0fdc74e14f9c..94ac89380d1e0 100644 --- a/taichi/backends/cuda/aot_module_builder_impl.h +++ b/taichi/backends/cuda/aot_module_builder_impl.h @@ -9,6 +9,11 @@ namespace lang { namespace cuda { class AotModuleBuilderImpl : public LlvmAotModuleBuilder { + public: + explicit AotModuleBuilderImpl(LlvmProgramImpl *prog) + : LlvmAotModuleBuilder(prog) { + } + private: CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) override; }; diff --git a/taichi/backends/cuda/aot_module_loader_impl.cpp b/taichi/backends/cuda/aot_module_loader_impl.cpp index b08efdc9632da..69bf52d749772 100644 --- a/taichi/backends/cuda/aot_module_loader_impl.cpp +++ b/taichi/backends/cuda/aot_module_loader_impl.cpp @@ -44,11 +44,6 @@ class AotModuleImpl : public LlvmAotModule { TI_NOT_IMPLEMENTED; return nullptr; } - - std::unique_ptr make_new_field(const std::string &name) override { - TI_NOT_IMPLEMENTED; - return nullptr; - } }; } // namespace diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp index 1a583cda431b5..f36511cb27b5f 100644 --- a/taichi/ir/snode.cpp +++ b/taichi/ir/snode.cpp @@ -326,7 +326,7 @@ void SNode::set_snode_tree_id(int id) { snode_tree_id_ = id; } -int SNode::get_snode_tree_id() { +int SNode::get_snode_tree_id() 
const { return snode_tree_id_; } diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h index 8a21721c2a7bc..da7560501d97f 100644 --- a/taichi/ir/snode.h +++ b/taichi/ir/snode.h @@ -354,7 +354,7 @@ class SNode { void set_snode_tree_id(int id); - int get_snode_tree_id(); + int get_snode_tree_id() const; static void reset_counter() { counter = 0; diff --git a/taichi/llvm/llvm_aot_module_builder.cpp b/taichi/llvm/llvm_aot_module_builder.cpp index d23ee5c47c564..664ee933893c9 100644 --- a/taichi/llvm/llvm_aot_module_builder.cpp +++ b/taichi/llvm/llvm_aot_module_builder.cpp @@ -2,6 +2,7 @@ #include #include "taichi/llvm/launch_arg_info.h" +#include "taichi/llvm/llvm_program.h" namespace taichi { namespace lang { @@ -34,5 +35,37 @@ void LlvmAotModuleBuilder::add_per_backend(const std::string &identifier, cache_.kernels[identifier] = std::move(kcache); } +void LlvmAotModuleBuilder::add_field_per_backend(const std::string &identifier, + const SNode *rep_snode, + bool is_scalar, + DataType dt, + std::vector shape, + int row_num, + int column_num) { + // Field refers to a leaf node(Place SNode) in a SNodeTree. + // It makes no sense to just serialize the leaf node or its corresponding + // branch. Instead, the minimal unit we have to serialize is the entire + // SNodeTree. Note that SNodeTree's uses snode_tree_id as its identifier, + // rather than the field's name. (multiple fields may end up referring to the + // same SNodeTree) + + // 1. Find snode_tree_id + int snode_tree_id = rep_snode->get_snode_tree_id(); + + // 2. Fetch Cache from the Program + // Kernel compilation is not allowed until all the Fields are finalized, + // so we finished SNodeTree compilation during AOTModuleBuilder construction. + // + // By the time "add_field_per_backend()" is called, + // SNodeTrees should have already been finalized, + // with compiled info stored in LlvmProgramImpl::cache_data_. 
+ TI_ASSERT(prog_ != nullptr); + LlvmOfflineCache::FieldCacheData field_cache = + prog_->get_cached_field(snode_tree_id); + + // 3. Update AOT Cache + cache_.fields[snode_tree_id] = std::move(field_cache); +} + } // namespace lang } // namespace taichi diff --git a/taichi/llvm/llvm_aot_module_builder.h b/taichi/llvm/llvm_aot_module_builder.h index b88133a761783..857f237c4a73c 100644 --- a/taichi/llvm/llvm_aot_module_builder.h +++ b/taichi/llvm/llvm_aot_module_builder.h @@ -9,6 +9,9 @@ namespace lang { class LlvmAotModuleBuilder : public AotModuleBuilder { public: + explicit LlvmAotModuleBuilder(LlvmProgramImpl *prog) : prog_(prog) { + } + void dump(const std::string &output_dir, const std::string &filename) const override; @@ -16,8 +19,17 @@ class LlvmAotModuleBuilder : public AotModuleBuilder { void add_per_backend(const std::string &identifier, Kernel *kernel) override; virtual CodeGenLLVM::CompiledData compile_kernel(Kernel *kernel) = 0; + void add_field_per_backend(const std::string &identifier, + const SNode *rep_snode, + bool is_scalar, + DataType dt, + std::vector shape, + int row_num, + int column_num) override; + private: mutable LlvmOfflineCache cache_; + LlvmProgramImpl *prog_ = nullptr; }; } // namespace lang diff --git a/taichi/llvm/llvm_aot_module_loader.cpp b/taichi/llvm/llvm_aot_module_loader.cpp index 5d725927388d7..99ca51f665363 100644 --- a/taichi/llvm/llvm_aot_module_loader.cpp +++ b/taichi/llvm/llvm_aot_module_loader.cpp @@ -17,6 +17,24 @@ class KernelImpl : public aot::Kernel { FunctionType fn_; }; +class FieldImpl : public aot::Field { + public: + explicit FieldImpl(const LlvmOfflineCache::FieldCacheData &field) + : field_(field) { + } + + explicit FieldImpl(LlvmOfflineCache::FieldCacheData &&field) + : field_(std::move(field)) { + } + + LlvmOfflineCache::FieldCacheData get_field() const { + return field_; + } + + private: + LlvmOfflineCache::FieldCacheData field_; +}; + } // namespace LlvmOfflineCache::KernelCacheData 
LlvmAotModule::load_kernel_from_cache( @@ -37,5 +55,42 @@ std::unique_ptr LlvmAotModule::make_new_kernel( return std::make_unique(fn); } +std::unique_ptr LlvmAotModule::make_new_field( + const std::string &name) { + // Check if "name" represents snode_tree_id. + // Avoid using std::atoi due to its poor error handling. + char *end; + int snode_tree_id = static_cast(strtol(name.c_str(), &end, 10 /*base*/)); + + TI_ASSERT(end != name.c_str()); + TI_ASSERT(*end == '\0'); + + // Load FieldCache + LlvmOfflineCache::FieldCacheData loaded; + auto ok = cache_reader_->get_field_cache(loaded, snode_tree_id); + TI_ERROR_IF(!ok, "Failed to load field with id={}", snode_tree_id); + + return std::make_unique(std::move(loaded)); +} + +void finalize_aot_field(aot::Module *aot_module, + aot::Field *aot_field, + uint64 *result_buffer) { + auto *llvm_aot_module = dynamic_cast(aot_module); + auto *aot_field_impl = dynamic_cast(aot_field); + + TI_ASSERT(llvm_aot_module != nullptr); + TI_ASSERT(aot_field_impl != nullptr); + + auto *llvm_prog = llvm_aot_module->get_program(); + const auto &field_cache = aot_field_impl->get_field(); + + int snode_tree_id = field_cache.tree_id; + if (!llvm_aot_module->is_snode_tree_initialized(snode_tree_id)) { + llvm_prog->initialize_llvm_runtime_snodes(field_cache, result_buffer); + llvm_aot_module->set_initialized_snode_tree(snode_tree_id); + } +} + } // namespace lang } // namespace taichi diff --git a/taichi/llvm/llvm_aot_module_loader.h b/taichi/llvm/llvm_aot_module_loader.h index b5e8f527cea67..1e4e093bcfc2c 100644 --- a/taichi/llvm/llvm_aot_module_loader.h +++ b/taichi/llvm/llvm_aot_module_loader.h @@ -6,6 +6,10 @@ namespace taichi { namespace lang { +TI_DLL_EXPORT void finalize_aot_field(aot::Module *aot_module, + aot::Field *aot_field, + uint64 *result_buffer); + class LlvmAotModule : public aot::Module { public: explicit LlvmAotModule(const std::string &module_path, @@ -27,6 +31,18 @@ class LlvmAotModule : public aot::Module { return 0; } + 
LlvmProgramImpl *const get_program() { + return program_; + } + + void set_initialized_snode_tree(int snode_tree_id) { + initialized_snode_tree_ids.insert(snode_tree_id); + } + + bool is_snode_tree_initialized(int snode_tree_id) { + return initialized_snode_tree_ids.count(snode_tree_id); + } + protected: virtual FunctionType convert_module_to_function( const std::string &name, @@ -38,8 +54,13 @@ class LlvmAotModule : public aot::Module { std::unique_ptr make_new_kernel( const std::string &name) override; + std::unique_ptr make_new_field(const std::string &name) override; + LlvmProgramImpl *const program_{nullptr}; std::unique_ptr cache_reader_{nullptr}; + + // To prevent repeated SNodeTree initialization + std::unordered_set initialized_snode_tree_ids; }; } // namespace lang diff --git a/taichi/llvm/llvm_offline_cache.h b/taichi/llvm/llvm_offline_cache.h index b5403982b10ba..54b7356903eb0 100644 --- a/taichi/llvm/llvm_offline_cache.h +++ b/taichi/llvm/llvm_offline_cache.h @@ -95,7 +95,7 @@ struct LlvmOfflineCache { std::unordered_map kernels; // key = kernel_name - TI_IO_DEF(kernels); + TI_IO_DEF(fields, kernels); }; class LlvmOfflineCacheFileReader { diff --git a/taichi/llvm/llvm_program.cpp b/taichi/llvm/llvm_program.cpp index eea60dad165f7..a805ade265e37 100644 --- a/taichi/llvm/llvm_program.cpp +++ b/taichi/llvm/llvm_program.cpp @@ -273,37 +273,22 @@ std::unique_ptr LlvmProgramImpl::compile_snode_tree_types_impl( } void LlvmProgramImpl::compile_snode_tree_types(SNodeTree *tree) { - compile_snode_tree_types_impl(tree); -} - -static LlvmOfflineCache::FieldCacheData construct_filed_cache_data( - const SNodeTree &tree, - const StructCompiler &struct_compiler) { - LlvmOfflineCache::FieldCacheData ret; - ret.tree_id = tree.id(); - ret.root_id = tree.root()->id; - ret.root_size = struct_compiler.root_size; - - const auto &snodes = struct_compiler.snodes; - for (size_t i = 0; i < snodes.size(); i++) { - LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data; 
- snode_cache_data.id = snodes[i]->id; - snode_cache_data.type = snodes[i]->type; - snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes; - snode_cache_data.chunk_size = snodes[i]->chunk_size; - - ret.snode_metas.emplace_back(std::move(snode_cache_data)); - } + auto struct_compiler = compile_snode_tree_types_impl(tree); + int snode_tree_id = tree->id(); + int root_id = tree->root()->id; - return ret; + // Add compiled result to Cache + cache_field(snode_tree_id, root_id, *struct_compiler); } void LlvmProgramImpl::materialize_snode_tree(SNodeTree *tree, uint64 *result_buffer) { - auto struct_compiler = compile_snode_tree_types_impl(tree); + compile_snode_tree_types(tree); + int snode_tree_id = tree->id(); - auto field_cache_data = construct_filed_cache_data(*tree, *struct_compiler); - initialize_llvm_runtime_snodes(field_cache_data, result_buffer); + TI_ASSERT(cache_data_.fields.find(snode_tree_id) != cache_data_.fields.end()); + initialize_llvm_runtime_snodes(cache_data_.fields.at(snode_tree_id), + result_buffer); } uint64 LlvmProgramImpl::fetch_result_uint64(int i, uint64 *result_buffer) { @@ -365,12 +350,12 @@ void LlvmProgramImpl::print_list_manager_info(void *list_manager, std::unique_ptr LlvmProgramImpl::make_aot_module_builder() { if (config->arch == Arch::x64 || config->arch == Arch::arm64) { - return std::make_unique(); + return std::make_unique(this); } #if defined(TI_WITH_CUDA) if (config->arch == Arch::cuda) { - return std::make_unique(); + return std::make_unique(this); } #endif @@ -701,6 +686,33 @@ void LlvmProgramImpl::cache_kernel( kernel_cache.offloaded_task_list = std::move(offloaded_task_list); } +void LlvmProgramImpl::cache_field(int snode_tree_id, + int root_id, + const StructCompiler &struct_compiler) { + if (cache_data_.fields.find(snode_tree_id) != cache_data_.fields.end()) { + // [TODO] check and update the Cache, instead of simply return. 
+ return; + } + + LlvmOfflineCache::FieldCacheData ret; + ret.tree_id = snode_tree_id; + ret.root_id = root_id; + ret.root_size = struct_compiler.root_size; + + const auto &snodes = struct_compiler.snodes; + for (size_t i = 0; i < snodes.size(); i++) { + LlvmOfflineCache::FieldCacheData::SNodeCacheData snode_cache_data; + snode_cache_data.id = snodes[i]->id; + snode_cache_data.type = snodes[i]->type; + snode_cache_data.cell_size_bytes = snodes[i]->cell_size_bytes; + snode_cache_data.chunk_size = snodes[i]->chunk_size; + + ret.snode_metas.emplace_back(std::move(snode_cache_data)); + } + + cache_data_.fields[snode_tree_id] = std::move(ret); +} + void LlvmProgramImpl::dump_cache_data_to_disk() { if (config->offline_cache && !cache_data_.kernels.empty()) { LlvmOfflineCacheFileWriter writer{}; diff --git a/taichi/llvm/llvm_program.h b/taichi/llvm/llvm_program.h index 69378ee660bf1..2eec64dd8e7bd 100644 --- a/taichi/llvm/llvm_program.h +++ b/taichi/llvm/llvm_program.h @@ -118,10 +118,27 @@ class LlvmProgramImpl : public ProgramImpl { std::vector &&offloaded_task_list); + void cache_field(int snode_tree_id, + int root_id, + const StructCompiler &struct_compiler); + + LlvmOfflineCache::FieldCacheData get_cached_field(int snode_tree_id) const { + TI_ASSERT(cache_data_.fields.find(snode_tree_id) != + cache_data_.fields.end()); + return cache_data_.fields.at(snode_tree_id); + } + Device *get_compute_device() override { return device_.get(); } + /** + * Initializes the SNodes for LLVM based backends. + */ + void initialize_llvm_runtime_snodes( + const LlvmOfflineCache::FieldCacheData &field_cache_data, + uint64 *result_buffer); + private: std::unique_ptr clone_struct_compiler_initial_context( bool has_multiple_snode_trees, @@ -129,12 +146,6 @@ class LlvmProgramImpl : public ProgramImpl { std::unique_ptr compile_snode_tree_types_impl( SNodeTree *tree); - /** - * Initializes the SNodes for LLVM based backends. 
- */ - void initialize_llvm_runtime_snodes( - const LlvmOfflineCache::FieldCacheData &field_cache_data, - uint64 *result_buffer); uint64 fetch_result_uint64(int i, uint64 *result_buffer);