[refactor] Enable adaptive block_dim selection for CPU backend (#5190)
* Add adaptive block_dim for cpu

* Add block_dim_adaptive switch

* Pylint
qiao-bo authored Jun 17, 2022
1 parent d3827f5 commit 1ef6a31
Showing 7 changed files with 51 additions and 8 deletions.
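In user code, the switch is the new `block_dim_adaptive` keyword of `ti.loop_config`. A minimal usage sketch, modeled on the test added in this commit (the field name and loop size are illustrative, not part of the commit):

    import taichi as ti

    ti.init(arch=ti.cpu)
    x = ti.field(ti.i32, shape=8192)

    @ti.kernel
    def fill():
        # opt out of the adaptive heuristic for the loop below;
        # the default (block_dim_adaptive=True) needs no call at all
        ti.loop_config(block_dim_adaptive=False)
        for i in range(8192):
            x[i] = i

    fill()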
19 changes: 18 additions & 1 deletion python/taichi/lang/misc.py
@@ -586,13 +586,27 @@ def _block_dim(dim):
    get_runtime().prog.current_ast_builder().block_dim(dim)


def loop_config(*, block_dim=None, serialize=False, parallelize=None):
def _block_dim_adaptive(block_dim_adaptive):
    """Enable/disable adaptive block_dim selection on the backend."""
    if get_runtime().prog.config.arch != cpu:
        _logging.warn('Adaptive block_dim is supported on the CPU backend only')
    else:
        get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive


def loop_config(*,
                block_dim=None,
                serialize=False,
                parallelize=None,
                block_dim_adaptive=True):
    """Sets directives for the next loop.

    Args:
        block_dim (int): The number of threads in a block on GPU.
        serialize (bool): Whether to let the for loop execute serially; `serialize=True` is equivalent to `parallelize=1`.
        parallelize (int): The number of threads to use on CPU.
        block_dim_adaptive (bool): Whether to allow the backend to set block_dim adaptively; enabled by default.

    Examples::
@@ -626,6 +640,9 @@ def fill():
    elif parallelize is not None:
        _parallelize(parallelize)

    if not block_dim_adaptive:
        _block_dim_adaptive(block_dim_adaptive)


def global_thread_idx():
"""Returns the global thread id of this running thread,
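On non-CPU backends the new switch is a no-op apart from a log message. A sketch of that path, assuming a CUDA-capable build (the field and size are again illustrative):

    import taichi as ti

    ti.init(arch=ti.cuda)  # any non-CPU backend
    x = ti.field(ti.i32, shape=16)

    @ti.kernel
    def fill():
        # logs "Adaptive block_dim is supported on the CPU backend only"
        # and leaves the config untouched
        ti.loop_config(block_dim_adaptive=False)
        for i in range(16):
            x[i] = i

    fill()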
11 changes: 11 additions & 0 deletions taichi/backends/cpu/codegen_cpu.cpp
@@ -57,6 +57,17 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
    llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue);

    auto [begin, end] = get_range_for_bounds(stmt);

    // adaptive block_dim
    if (prog->config.cpu_block_dim_adaptive) {
      int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step);
      int num_threads = stmt->num_cpu_threads;
      int items_per_thread = std::max(1, num_items / (num_threads * 32));
      // ensure each task gets at least 512 items to amortize scheduler
      // overhead; also cap the value at 1024 for better load balancing
      stmt->block_dim = std::min(1024, std::max(512, items_per_thread));
    }

    create_call(
        "cpu_parallel_range_for",
        {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), begin, end,
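A quick way to sanity-check the heuristic is to transcribe the arithmetic to Python. A sketch with a worked example (the function name, loop size, and thread count are made up; `//` mirrors the C++ integer division for positive operands):

    def adaptive_block_dim(num_items, num_threads):
        # aim for roughly 32 tasks per thread, then clamp the per-task
        # size into [512, 1024] as the codegen comment describes
        items_per_thread = max(1, num_items // (num_threads * 32))
        return min(1024, max(512, items_per_thread))

    # a 4096-iteration loop on 8 threads:
    # items_per_thread = max(1, 4096 // 256) = 16 -> clamped up to 512
    print(adaptive_block_dim(4096, 8))  # 512
    # a very large loop saturates at the cap:
    print(adaptive_block_dim(10_000_000, 8))  # 1024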
1 change: 1 addition & 0 deletions taichi/program/compile_config.cpp
@@ -34,6 +34,7 @@ CompileConfig::CompileConfig() {
  verbose_kernel_launches = false;
  kernel_profiler = false;
  default_cpu_block_dim = 32;
  cpu_block_dim_adaptive = true;
  default_gpu_block_dim = 128;
  gpu_max_reg = 0;  // 0 means using the default value from the CUDA driver.
  verbose = true;
1 change: 1 addition & 0 deletions taichi/program/compile_config.h
@@ -48,6 +48,7 @@ struct CompileConfig {
  DataType default_ip;
  std::string extra_flags;
  int default_cpu_block_dim;
  bool cpu_block_dim_adaptive;
  int default_gpu_block_dim;
  int gpu_max_reg;
  int ad_stack_size{0};  // 0 = adaptive
2 changes: 2 additions & 0 deletions taichi/python/export_lang.cpp
@@ -162,6 +162,8 @@ void export_lang(py::module &m) {
&CompileConfig::move_loop_invariant_outside_if)
.def_readwrite("default_cpu_block_dim",
&CompileConfig::default_cpu_block_dim)
.def_readwrite("cpu_block_dim_adaptive",
&CompileConfig::cpu_block_dim_adaptive)
.def_readwrite("default_gpu_block_dim",
&CompileConfig::default_gpu_block_dim)
.def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg)
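With the binding above, the flag is also reachable from Python through the runtime's program config. A minimal sketch, assuming the internal `get_runtime` helper that misc.py itself uses:

    import taichi as ti
    from taichi.lang import impl

    ti.init(arch=ti.cpu)
    cfg = impl.get_runtime().prog.config
    print(cfg.cpu_block_dim_adaptive)   # True by default (compile_config.cpp)
    cfg.cpu_block_dim_adaptive = False  # same effect as loop_config(block_dim_adaptive=False)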
7 changes: 0 additions & 7 deletions taichi/runtime/llvm/runtime.cpp
@@ -1467,13 +1467,6 @@ void cpu_parallel_range_for(RuntimeContext *context,
    taichi_printf(context->runtime, "step must not be %d\n", step);
    exit(-1);
  }
  if (block_dim == 0) {
    // adaptive block dim
    auto num_items = (ctx.end - ctx.begin) / std::abs(step);
    // ensure each thread has at least ~32 tasks for load balancing
    // and each task has at least 512 items to amortize scheduler overhead
    block_dim = std::min(512, std::max(1, num_items / (num_threads * 32)));
  }
  ctx.block_size = block_dim;
  auto runtime = context->runtime;
  runtime->parallel_for(runtime->thread_pool,
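For contrast with the codegen sketch above: the rule removed here clamped in the opposite direction (per-task size capped at 512) and only fired when block_dim was passed as 0. A hypothetical transcription:

    def old_block_dim(num_items, num_threads):
        # removed runtime-side heuristic: floor 1, cap 512
        return min(512, max(1, num_items // (num_threads * 32)))

    # the same 4096-iteration loop on 8 threads:
    print(old_block_dim(4096, 8))  # 16, versus 512 under the new codegen rule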
18 changes: 18 additions & 0 deletions tests/python/test_parallel_range_for.py
@@ -69,3 +69,21 @@ def foo() -> ti.i32:
        return a

    assert foo() == 50


@test_utils.test(arch=[ti.cpu])
def test_loop_config_block_dim_adaptive():
    n = 4096
    val = ti.field(ti.i32, shape=(n))

    @ti.kernel
    def fill():
        ti.loop_config(block_dim_adaptive=False)
        for i in range(n):
            val[i] = i

    fill()
    # read back through numpy to speed up the per-element check
    val_np = val.to_numpy()
    for i in range(n):
        assert val_np[i] == i
