From f688b8eaf5878d6e412972ce382da178217455e4 Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Thu, 16 Jun 2022 15:49:40 +0800
Subject: [PATCH 1/3] Add adaptive block_dim for cpu

---
 taichi/backends/cpu/codegen_cpu.cpp | 11 +++++++++++
 taichi/program/compile_config.cpp   |  1 +
 taichi/program/compile_config.h     |  1 +
 3 files changed, 13 insertions(+)

diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp
index 61af40c075c0e..52d5c333ebd67 100644
--- a/taichi/backends/cpu/codegen_cpu.cpp
+++ b/taichi/backends/cpu/codegen_cpu.cpp
@@ -57,6 +57,17 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
 
     llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue);
     auto [begin, end] = get_range_for_bounds(stmt);
+
+    // adaptive block_dim
+    if (prog->config.cpu_block_dim_adaptive) {
+      int num_items = (end - begin) / std::abs(step);
+      // ensure each thread has at least ~32 tasks for load balancing
+      // and each task has at least 512 items to amortize scheduler overhead
+      int num_threads = stmt->num_cpu_threads;
+      stmt->block_dim =
+          std::min(512, std::max(1, num_items / (num_threads * 32)));
+    }
+
     create_call(
         "cpu_parallel_range_for",
         {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), begin, end,
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index ac0b2784d2651..a6ac8fca9a5d8 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -34,6 +34,7 @@ CompileConfig::CompileConfig() {
   verbose_kernel_launches = false;
   kernel_profiler = false;
   default_cpu_block_dim = 32;
+  cpu_block_dim_adaptive = true;
   default_gpu_block_dim = 128;
   gpu_max_reg = 0;  // 0 means using the default value from the CUDA driver.
   verbose = true;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 12ec3f621ba26..ee9648e65847c 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -48,6 +48,7 @@ struct CompileConfig {
   DataType default_ip;
   std::string extra_flags;
   int default_cpu_block_dim;
+  bool cpu_block_dim_adaptive;
   int default_gpu_block_dim;
   int gpu_max_reg;
   int ad_stack_size{0};  // 0 = adaptive

From 108413195f8186b0cde6b5c787dfbddc37d2b678 Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Thu, 16 Jun 2022 20:13:25 +0800
Subject: [PATCH 2/3] Add block_dim_adaptive switch

---
 python/taichi/lang/misc.py              | 16 +++++++++++++++-
 taichi/backends/cpu/codegen_cpu.cpp     | 10 +++++-----
 taichi/python/export_lang.cpp           |  2 ++
 taichi/runtime/llvm/runtime.cpp         |  7 -------
 tests/python/test_parallel_range_for.py | 18 ++++++++++++++++++
 5 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index fb69fe8080c7d..60d93fc409bd6 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -586,13 +586,24 @@ def _block_dim(dim):
     get_runtime().prog.current_ast_builder().block_dim(dim)
 
 
-def loop_config(*, block_dim=None, serialize=False, parallelize=None):
+def _block_dim_adaptive(block_dim_adaptive):
+    """Enable/Disable backends to set block_dim adaptively.
+    """
+    get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive
+
+
+def loop_config(*,
+                block_dim=None,
+                serialize=False,
+                parallelize=None,
+                block_dim_adaptive=True):
     """Sets directives for the next loop
 
     Args:
         block_dim (int): The number of threads in a block on GPU
         serialize (bool): Whether to let the for loop execute serially, `serialize=True` equals to `parallelize=1`
         parallelize (int): The number of threads to use on CPU
+        block_dim_adaptive (bool): Whether to allow backends to set block_dim adaptively, enabled by default
 
     Examples::
 
@@ -626,6 +637,9 @@ def fill():
     elif parallelize is not None:
         _parallelize(parallelize)
 
+    if not block_dim_adaptive:
+        _block_dim_adaptive(block_dim_adaptive)
+
 
 def global_thread_idx():
     """Returns the global thread id of this running thread,
diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp
index 52d5c333ebd67..74f7708476ace 100644
--- a/taichi/backends/cpu/codegen_cpu.cpp
+++ b/taichi/backends/cpu/codegen_cpu.cpp
@@ -60,12 +60,12 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
 
     // adaptive block_dim
     if (prog->config.cpu_block_dim_adaptive) {
-      int num_items = (end - begin) / std::abs(step);
-      // ensure each thread has at least ~32 tasks for load balancing
-      // and each task has at least 512 items to amortize scheduler overhead
+      int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step);
       int num_threads = stmt->num_cpu_threads;
-      stmt->block_dim =
-          std::min(512, std::max(1, num_items / (num_threads * 32)));
+      int items_per_thread = std::max(1, num_items / (num_threads * 32));
+      // make sure each task has at least 512 items to amortize scheduler overhead
+      // and cap the block size at 1024 for better load balancing
+      stmt->block_dim = std::min(1024, std::max(512, items_per_thread));
     }
 
     create_call(
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 7c58541537b02..049760bbad31a 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -162,6 +162,8 @@ void export_lang(py::module &m) {
                      &CompileConfig::move_loop_invariant_outside_if)
       .def_readwrite("default_cpu_block_dim",
                      &CompileConfig::default_cpu_block_dim)
+      .def_readwrite("cpu_block_dim_adaptive",
+                     &CompileConfig::cpu_block_dim_adaptive)
       .def_readwrite("default_gpu_block_dim",
                      &CompileConfig::default_gpu_block_dim)
       .def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg)
diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
index 1da9e16ef6e77..08cef5ad64eaf 100644
--- a/taichi/runtime/llvm/runtime.cpp
+++ b/taichi/runtime/llvm/runtime.cpp
@@ -1467,13 +1467,6 @@ void cpu_parallel_range_for(RuntimeContext *context,
     taichi_printf(context->runtime, "step must not be %d\n", step);
     exit(-1);
   }
-  if (block_dim == 0) {
-    // adaptive block dim
-    auto num_items = (ctx.end - ctx.begin) / std::abs(step);
-    // ensure each thread has at least ~32 tasks for load balancing
-    // and each task has at least 512 items to amortize scheduler overhead
-    block_dim = std::min(512, std::max(1, num_items / (num_threads * 32)));
-  }
   ctx.block_size = block_dim;
   auto runtime = context->runtime;
   runtime->parallel_for(runtime->thread_pool,
diff --git a/tests/python/test_parallel_range_for.py b/tests/python/test_parallel_range_for.py
index 303c14af760ad..eb61cf13d2a5c 100644
--- a/tests/python/test_parallel_range_for.py
+++ b/tests/python/test_parallel_range_for.py
@@ -69,3 +69,21 @@ def foo() -> ti.i32:
         return a
 
     assert foo() == 50
+
+
+@test_utils.test()
+def test_loop_config_block_dim_adaptive():
+    n = 4096
+    val = ti.field(ti.i32, shape=(n))
+
+    @ti.kernel
+    def fill():
+        ti.loop_config(block_dim_adaptive=False)
+        for i in range(n):
+            val[i] = i
+
+    fill()
+    # Convert to numpy once to speed up the element-wise check
+    val_np = val.to_numpy()
+    for i in range(n):
+        assert val_np[i] == i

From 6b92a523e159691daa717d55d3b65e9fd0880567 Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Thu, 16 Jun 2022 21:05:07 +0800
Subject: [PATCH 3/3] Pylint

---
 python/taichi/lang/misc.py              | 5 ++++-
 tests/python/test_parallel_range_for.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index 60d93fc409bd6..984cb64f7afbc 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -589,7 +589,10 @@ def _block_dim(dim):
 def _block_dim_adaptive(block_dim_adaptive):
     """Enable/Disable backends to set block_dim adaptively.
     """
-    get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive
+    if get_runtime().prog.config.arch != cpu:
+        _logging.warn('Adaptive block_dim is supported on CPU backend only')
+    else:
+        get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive
 
 
 def loop_config(*,
diff --git a/tests/python/test_parallel_range_for.py b/tests/python/test_parallel_range_for.py
index eb61cf13d2a5c..45339b905059c 100644
--- a/tests/python/test_parallel_range_for.py
+++ b/tests/python/test_parallel_range_for.py
@@ -71,7 +71,7 @@ def foo() -> ti.i32:
     assert foo() == 50
 
 
-@test_utils.test()
+@test_utils.test(arch=[ti.cpu])
 def test_loop_config_block_dim_adaptive():
     n = 4096
     val = ti.field(ti.i32, shape=(n))
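
The heuristic this series converges on in PATCH 2 can be read independently of the codegen: compute the per-task item count assuming roughly 32 tasks per thread, then clamp it to the [512, 1024] range. The Python sketch below illustrates that computation only; the helper name adaptive_cpu_block_dim and the example sizes are illustrative and not part of the patches.

def adaptive_cpu_block_dim(num_items, num_threads):
    # Aim for roughly 32 tasks per thread for load balancing.
    items_per_thread = max(1, num_items // (num_threads * 32))
    # Keep each task at >= 512 items to amortize scheduler overhead,
    # and cap it at 1024 items.
    return min(1024, max(512, items_per_thread))

# 1,000,000 items on 8 threads: 1000000 // 256 = 3906, capped to 1024.
assert adaptive_cpu_block_dim(1_000_000, 8) == 1024
# 4,096 items on 8 threads: 4096 // 256 = 16, raised to the 512 floor.
assert adaptive_cpu_block_dim(4096, 8) == 512

The adaptive mode is on by default for the CPU backend (cpu_block_dim_adaptive = true in CompileConfig) and can be disabled per loop from Python with ti.loop_config(block_dim_adaptive=False), which is exactly what the new test exercises.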