From f688b8eaf5878d6e412972ce382da178217455e4 Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Thu, 16 Jun 2022 15:49:40 +0800
Subject: [PATCH 1/3] Add adaptive block_dim for cpu

---
 taichi/backends/cpu/codegen_cpu.cpp | 11 +++++++++++
 taichi/program/compile_config.cpp   |  1 +
 taichi/program/compile_config.h     |  1 +
 3 files changed, 13 insertions(+)

diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp
index 61af40c075c0e..52d5c333ebd67 100644
--- a/taichi/backends/cpu/codegen_cpu.cpp
+++ b/taichi/backends/cpu/codegen_cpu.cpp
@@ -57,6 +57,17 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
 
     llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue);
     auto [begin, end] = get_range_for_bounds(stmt);
+
+    // adaptive block_dim
+    if (prog->config.cpu_block_dim_adaptive) {
+      int num_items = (end - begin) / std::abs(step);
+      // ensure each thread has at least ~32 tasks for load balancing
+      // and each task has at least 512 items to amortize scheduler overhead
+      int num_threads = stmt->num_cpu_threads;
+      stmt->block_dim =
+          std::min(512, std::max(1, num_items / (num_threads * 32)));
+    }
+
     create_call(
         "cpu_parallel_range_for",
         {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), begin, end,
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index ac0b2784d2651..a6ac8fca9a5d8 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -34,6 +34,7 @@ CompileConfig::CompileConfig() {
   verbose_kernel_launches = false;
   kernel_profiler = false;
   default_cpu_block_dim = 32;
+  cpu_block_dim_adaptive = true;
   default_gpu_block_dim = 128;
   gpu_max_reg = 0;  // 0 means using the default value from the CUDA driver.
   verbose = true;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index 12ec3f621ba26..ee9648e65847c 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -48,6 +48,7 @@ struct CompileConfig {
   DataType default_ip;
   std::string extra_flags;
   int default_cpu_block_dim;
+  bool cpu_block_dim_adaptive;
   int default_gpu_block_dim;
   int gpu_max_reg;
   int ad_stack_size{0};  // 0 = adaptive

From 108413195f8186b0cde6b5c787dfbddc37d2b678 Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Thu, 16 Jun 2022 20:13:25 +0800
Subject: [PATCH 2/3] Add block_dim_adaptive switch

---
 python/taichi/lang/misc.py              | 16 +++++++++++++++-
 taichi/backends/cpu/codegen_cpu.cpp     | 10 +++++-----
 taichi/python/export_lang.cpp           |  2 ++
 taichi/runtime/llvm/runtime.cpp         |  7 -------
 tests/python/test_parallel_range_for.py | 18 ++++++++++++++++++
 5 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index fb69fe8080c7d..60d93fc409bd6 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -586,13 +586,24 @@ def _block_dim(dim):
     get_runtime().prog.current_ast_builder().block_dim(dim)
 
 
-def loop_config(*, block_dim=None, serialize=False, parallelize=None):
+def _block_dim_adaptive(block_dim_adaptive):
+    """Enable/Disable backends to set block_dim adaptively.
+    """
+    get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive
+
+
+def loop_config(*,
+                block_dim=None,
+                serialize=False,
+                parallelize=None,
+                block_dim_adaptive=True):
     """Sets directives for the next loop
 
     Args:
         block_dim (int): The number of threads in a block on GPU
         serialize (bool): Whether to let the for loop execute serially, `serialize=True` equals to `parallelize=1`
         parallelize (int): The number of threads to use on CPU
+        block_dim_adaptive (bool): Whether to allow backends to set block_dim adaptively, enabled by default
 
     Examples::
 
@@ -626,6 +637,9 @@ def fill():
     elif parallelize is not None:
         _parallelize(parallelize)
 
+    if not block_dim_adaptive:
+        _block_dim_adaptive(block_dim_adaptive)
+
 
 def global_thread_idx():
     """Returns the global thread id of this running thread,
diff --git a/taichi/backends/cpu/codegen_cpu.cpp b/taichi/backends/cpu/codegen_cpu.cpp
index 52d5c333ebd67..74f7708476ace 100644
--- a/taichi/backends/cpu/codegen_cpu.cpp
+++ b/taichi/backends/cpu/codegen_cpu.cpp
@@ -60,12 +60,12 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
 
     // adaptive block_dim
     if (prog->config.cpu_block_dim_adaptive) {
-      int num_items = (end - begin) / std::abs(step);
-      // ensure each thread has at least ~32 tasks for load balancing
-      // and each task has at least 512 items to amortize scheduler overhead
+      int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step);
       int num_threads = stmt->num_cpu_threads;
-      stmt->block_dim =
-          std::min(512, std::max(1, num_items / (num_threads * 32)));
+      int items_per_thread = std::max(1, num_items / (num_threads * 32));
+      // make sure each task has at least 512 items to amortize scheduler overhead
+      // and cap the block size at 1024 for better load balancing
+      stmt->block_dim = std::min(1024, std::max(512, items_per_thread));
     }
 
     create_call(
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index 7c58541537b02..049760bbad31a 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -162,6 +162,8 @@ void export_lang(py::module &m) {
                      &CompileConfig::move_loop_invariant_outside_if)
       .def_readwrite("default_cpu_block_dim",
                      &CompileConfig::default_cpu_block_dim)
+      .def_readwrite("cpu_block_dim_adaptive",
+                     &CompileConfig::cpu_block_dim_adaptive)
       .def_readwrite("default_gpu_block_dim",
                      &CompileConfig::default_gpu_block_dim)
       .def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg)
diff --git a/taichi/runtime/llvm/runtime.cpp b/taichi/runtime/llvm/runtime.cpp
index 1da9e16ef6e77..08cef5ad64eaf 100644
--- a/taichi/runtime/llvm/runtime.cpp
+++ b/taichi/runtime/llvm/runtime.cpp
@@ -1467,13 +1467,6 @@ void cpu_parallel_range_for(RuntimeContext *context,
     taichi_printf(context->runtime, "step must not be %d\n", step);
     exit(-1);
   }
-  if (block_dim == 0) {
-    // adaptive block dim
-    auto num_items = (ctx.end - ctx.begin) / std::abs(step);
-    // ensure each thread has at least ~32 tasks for load balancing
-    // and each task has at least 512 items to amortize scheduler overhead
-    block_dim = std::min(512, std::max(1, num_items / (num_threads * 32)));
-  }
   ctx.block_size = block_dim;
   auto runtime = context->runtime;
   runtime->parallel_for(runtime->thread_pool,
diff --git a/tests/python/test_parallel_range_for.py b/tests/python/test_parallel_range_for.py
index 303c14af760ad..eb61cf13d2a5c 100644
--- a/tests/python/test_parallel_range_for.py
+++ b/tests/python/test_parallel_range_for.py
@@ -69,3 +69,21 @@ def foo() -> ti.i32:
         return a
 
     assert foo() == 50
+
+
+@test_utils.test()
+def test_loop_config_block_dim_adaptive():
+    n = 4096
+    val = ti.field(ti.i32, shape=(n))
+
+    @ti.kernel
+    def fill():
+        ti.loop_config(block_dim_adaptive=False)
+        for i in range(n):
+            val[i] = i
+
+    fill()
+    # Convert to numpy once to speed up the element-wise check
+    val_np = val.to_numpy()
+    for i in range(n):
+        assert val_np[i] == i

From 6b92a523e159691daa717d55d3b65e9fd0880567 Mon Sep 17 00:00:00 2001
From: Bo Qiao
Date: Thu, 16 Jun 2022 21:05:07 +0800
Subject: [PATCH 3/3] Pylint

---
 python/taichi/lang/misc.py              | 5 ++++-
 tests/python/test_parallel_range_for.py | 2 +-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/taichi/lang/misc.py b/python/taichi/lang/misc.py
index 60d93fc409bd6..984cb64f7afbc 100644
--- a/python/taichi/lang/misc.py
+++ b/python/taichi/lang/misc.py
@@ -589,7 +589,10 @@ def _block_dim(dim):
 def _block_dim_adaptive(block_dim_adaptive):
     """Enable/Disable backends to set block_dim adaptively.
     """
-    get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive
+    if get_runtime().prog.config.arch != cpu:
+        _logging.warn('Adaptive block_dim is supported on CPU backend only')
+    else:
+        get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive
 
 
 def loop_config(*,
diff --git a/tests/python/test_parallel_range_for.py b/tests/python/test_parallel_range_for.py
index eb61cf13d2a5c..45339b905059c 100644
--- a/tests/python/test_parallel_range_for.py
+++ b/tests/python/test_parallel_range_for.py
@@ -71,7 +71,7 @@ def foo() -> ti.i32:
     assert foo() == 50
 
 
-@test_utils.test()
+@test_utils.test(arch=[ti.cpu])
 def test_loop_config_block_dim_adaptive():
     n = 4096
     val = ti.field(ti.i32, shape=(n))
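
The heuristic this series converges on in PATCH 2 can be read independently of the codegen: compute the per-task item count assuming roughly 32 tasks per thread, then clamp it to the [512, 1024] range. The Python sketch below illustrates that computation only; the helper name adaptive_cpu_block_dim and the example sizes are illustrative and not part of the patches.

def adaptive_cpu_block_dim(num_items, num_threads):
    # Aim for roughly 32 tasks per thread for load balancing.
    items_per_thread = max(1, num_items // (num_threads * 32))
    # Keep each task at >= 512 items to amortize scheduler overhead,
    # and cap it at 1024 items.
    return min(1024, max(512, items_per_thread))

# 1,000,000 items on 8 threads: 1000000 // 256 = 3906, capped to 1024.
assert adaptive_cpu_block_dim(1_000_000, 8) == 1024
# 4,096 items on 8 threads: 4096 // 256 = 16, raised to the 512 floor.
assert adaptive_cpu_block_dim(4096, 8) == 512

The adaptive mode is on by default for the CPU backend (cpu_block_dim_adaptive = true in CompileConfig) and can be disabled per loop from Python with ti.loop_config(block_dim_adaptive=False), which is exactly what the new test exercises.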