[refactor] Enable adaptive block_dim selection for CPU backend (#5190)
* Add adaptive block_dim for cpu

* Add block_dim_adaptive switch

* Pylint
qiao-bo authored Jun 17, 2022
1 parent d3827f5 commit 1ef6a31
Showing 7 changed files with 51 additions and 8 deletions.
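In user code, the switch is the new `block_dim_adaptive` keyword of `ti.loop_config`. A minimal usage sketch, modeled on the test added in this commit (the field name and loop size are illustrative, not part of the commit):

    import taichi as ti

    ti.init(arch=ti.cpu)
    x = ti.field(ti.i32, shape=8192)

    @ti.kernel
    def fill():
        # opt out of the adaptive heuristic for the loop below;
        # the default (block_dim_adaptive=True) needs no call at all
        ti.loop_config(block_dim_adaptive=False)
        for i in range(8192):
            x[i] = i

    fill()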
19 changes: 18 additions & 1 deletion python/taichi/lang/misc.py
@@ -586,13 +586,27 @@ def _block_dim(dim):
    get_runtime().prog.current_ast_builder().block_dim(dim)


def loop_config(*, block_dim=None, serialize=False, parallelize=None):
def _block_dim_adaptive(block_dim_adaptive):
    """Enable/disable adaptive block_dim selection on the backend."""
    if get_runtime().prog.config.arch != cpu:
        _logging.warn('Adaptive block_dim is supported on the CPU backend only')
    else:
        get_runtime().prog.config.cpu_block_dim_adaptive = block_dim_adaptive


def loop_config(*,
                block_dim=None,
                serialize=False,
                parallelize=None,
                block_dim_adaptive=True):
    """Sets directives for the next loop.

    Args:
        block_dim (int): The number of threads in a block on GPU.
        serialize (bool): Whether to let the for loop execute serially; `serialize=True` is equivalent to `parallelize=1`.
        parallelize (int): The number of threads to use on CPU.
        block_dim_adaptive (bool): Whether to allow the backend to set block_dim adaptively; enabled by default.

    Examples::
@@ -626,6 +640,9 @@ def fill():
    elif parallelize is not None:
        _parallelize(parallelize)

    if not block_dim_adaptive:
        _block_dim_adaptive(block_dim_adaptive)


def global_thread_idx():
"""Returns the global thread id of this running thread,
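On non-CPU backends the new switch is a no-op apart from a log message. A sketch of that path, assuming a CUDA-capable build (the field and size are again illustrative):

    import taichi as ti

    ti.init(arch=ti.cuda)  # any non-CPU backend
    x = ti.field(ti.i32, shape=16)

    @ti.kernel
    def fill():
        # logs "Adaptive block_dim is supported on the CPU backend only"
        # and leaves the config untouched
        ti.loop_config(block_dim_adaptive=False)
        for i in range(16):
            x[i] = i

    fill()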
11 changes: 11 additions & 0 deletions taichi/backends/cpu/codegen_cpu.cpp
@@ -57,6 +57,17 @@ class CodeGenLLVMCPU : public CodeGenLLVM {
    llvm::Value *epilogue = create_xlogue(stmt->tls_epilogue);

    auto [begin, end] = get_range_for_bounds(stmt);

    // adaptive block_dim
    if (prog->config.cpu_block_dim_adaptive) {
      int num_items = (stmt->end_value - stmt->begin_value) / std::abs(step);
      int num_threads = stmt->num_cpu_threads;
      int items_per_thread = std::max(1, num_items / (num_threads * 32));
      // ensure each task gets at least 512 items to amortize scheduler
      // overhead; also cap the value at 1024 for better load balancing
      stmt->block_dim = std::min(1024, std::max(512, items_per_thread));
    }

    create_call(
        "cpu_parallel_range_for",
        {get_arg(0), tlctx->get_constant(stmt->num_cpu_threads), begin, end,
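A quick way to sanity-check the heuristic is to transcribe the arithmetic to Python. A sketch with a worked example (the function name, loop size, and thread count are made up; `//` mirrors the C++ integer division for positive operands):

    def adaptive_block_dim(num_items, num_threads):
        # aim for roughly 32 tasks per thread, then clamp the per-task
        # size into [512, 1024] as the codegen comment describes
        items_per_thread = max(1, num_items // (num_threads * 32))
        return min(1024, max(512, items_per_thread))

    # a 4096-iteration loop on 8 threads:
    # items_per_thread = max(1, 4096 // 256) = 16 -> clamped up to 512
    print(adaptive_block_dim(4096, 8))  # 512
    # a very large loop saturates at the cap:
    print(adaptive_block_dim(10_000_000, 8))  # 1024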
1 change: 1 addition & 0 deletions taichi/program/compile_config.cpp
@@ -34,6 +34,7 @@ CompileConfig::CompileConfig() {
  verbose_kernel_launches = false;
  kernel_profiler = false;
  default_cpu_block_dim = 32;
  cpu_block_dim_adaptive = true;
  default_gpu_block_dim = 128;
  gpu_max_reg = 0;  // 0 means using the default value from the CUDA driver.
  verbose = true;
1 change: 1 addition & 0 deletions taichi/program/compile_config.h
@@ -48,6 +48,7 @@ struct CompileConfig {
  DataType default_ip;
  std::string extra_flags;
  int default_cpu_block_dim;
  bool cpu_block_dim_adaptive;
  int default_gpu_block_dim;
  int gpu_max_reg;
  int ad_stack_size{0};  // 0 = adaptive
2 changes: 2 additions & 0 deletions taichi/python/export_lang.cpp
@@ -162,6 +162,8 @@ void export_lang(py::module &m) {
&CompileConfig::move_loop_invariant_outside_if)
.def_readwrite("default_cpu_block_dim",
&CompileConfig::default_cpu_block_dim)
.def_readwrite("cpu_block_dim_adaptive",
&CompileConfig::cpu_block_dim_adaptive)
.def_readwrite("default_gpu_block_dim",
&CompileConfig::default_gpu_block_dim)
.def_readwrite("gpu_max_reg", &CompileConfig::gpu_max_reg)
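With the binding above, the flag is also reachable from Python through the runtime's program config. A minimal sketch, assuming the internal `get_runtime` helper that misc.py itself uses:

    import taichi as ti
    from taichi.lang import impl

    ti.init(arch=ti.cpu)
    cfg = impl.get_runtime().prog.config
    print(cfg.cpu_block_dim_adaptive)   # True by default (compile_config.cpp)
    cfg.cpu_block_dim_adaptive = False  # same effect as loop_config(block_dim_adaptive=False)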
7 changes: 0 additions & 7 deletions taichi/runtime/llvm/runtime.cpp
@@ -1467,13 +1467,6 @@ void cpu_parallel_range_for(RuntimeContext *context,
    taichi_printf(context->runtime, "step must not be %d\n", step);
    exit(-1);
  }
  if (block_dim == 0) {
    // adaptive block dim
    auto num_items = (ctx.end - ctx.begin) / std::abs(step);
    // ensure each thread has at least ~32 tasks for load balancing
    // and each task has at least 512 items to amortize scheduler overhead
    block_dim = std::min(512, std::max(1, num_items / (num_threads * 32)));
  }
  ctx.block_size = block_dim;
  auto runtime = context->runtime;
  runtime->parallel_for(runtime->thread_pool,
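For contrast with the codegen sketch above: the rule removed here clamped in the opposite direction (per-task size capped at 512) and only fired when block_dim was passed as 0. A hypothetical transcription:

    def old_block_dim(num_items, num_threads):
        # removed runtime-side heuristic: floor 1, cap 512
        return min(512, max(1, num_items // (num_threads * 32)))

    # the same 4096-iteration loop on 8 threads:
    print(old_block_dim(4096, 8))  # 16, versus 512 under the new codegen rule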
18 changes: 18 additions & 0 deletions tests/python/test_parallel_range_for.py
@@ -69,3 +69,21 @@ def foo() -> ti.i32:
        return a

    assert foo() == 50


@test_utils.test(arch=[ti.cpu])
def test_loop_config_block_dim_adaptive():
    n = 4096
    val = ti.field(ti.i32, shape=(n))

    @ti.kernel
    def fill():
        ti.loop_config(block_dim_adaptive=False)
        for i in range(n):
            val[i] = i

    fill()
    # read back through numpy to speed up the per-element check
    val_np = val.to_numpy()
    for i in range(n):
        assert val_np[i] == i
