From 75d5eccfe40091ba5a259cc2cc391e916e574954 Mon Sep 17 00:00:00 2001 From: Yuanming Hu Date: Thu, 1 Oct 2020 10:23:39 -0400 Subject: [PATCH] [async] Add advection benchmark (#1914) --- benchmarks/async_advection.py | 125 +++++++++++++++++++++++++++++ benchmarks/async_cases.py | 30 ++----- benchmarks/benchmark_async.py | 10 +-- benchmarks/utils.py | 21 +++++ examples/mpm_lagrangian_forces.py | 3 +- python/taichi/lang/__init__.py | 16 ++-- taichi/program/kernel_profiler.cpp | 13 ++- taichi/program/kernel_profiler.h | 6 +- taichi/program/program.cpp | 2 + taichi/python/export_lang.cpp | 2 + 10 files changed, 184 insertions(+), 44 deletions(-) create mode 100644 benchmarks/async_advection.py create mode 100644 benchmarks/utils.py diff --git a/benchmarks/async_advection.py b/benchmarks/async_advection.py new file mode 100644 index 0000000000000..f6dafeace73c8 --- /dev/null +++ b/benchmarks/async_advection.py @@ -0,0 +1,125 @@ +import taichi as ti +import math + +from utils import benchmark_async + +# TODO: staggerred grid + + +@benchmark_async +def advection_2d(scale): + n = 128 * 2**int((math.log(scale, 2)) // 2) + x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n)) + new_x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n)) + v = ti.Vector.field(2, dtype=ti.f32, shape=(n, n)) + dx = 1 / n + inv_dx = 1 / dx + dt = 0.01 + + stagger = ti.Vector([0.5, 0.5]) + + @ti.func + def Vector2(x, y): + return ti.Vector([x, y]) + + @ti.kernel + def init(): + for i, j in v: + v[i, j] = ti.Vector([j / n - 0.5, 0.5 - i / n]) + + for i, j in ti.ndrange(n * 4, n * 4): + ret = ti.taichi_logo(ti.Vector([i, j]) / (n * 4)) + x[i // 4, j // 4][0] += ret / 16 + x[i // 4, j // 4][1] += ret / 16 + x[i // 4, j // 4][2] += ret / 16 + + @ti.func + def vec(x, y): + return ti.Vector([x, y]) + + @ti.func + def clamp(p): + for d in ti.static(range(p.n)): + p[d] = min(1 - 1e-4 - dx + stagger[d] * dx, + max(p[d], stagger[d] * dx)) + return p + + @ti.func + def sample_bilinear(x, p): + p = clamp(p) + + p_grid = p * inv_dx - stagger + + I = ti.cast(ti.floor(p_grid), ti.i32) + f = p_grid - I + g = 1 - f + + return x[I] * (g[0] * g[1]) + x[I + vec(1, 0)] * (f[0] * g[1]) + x[ + I + vec(0, 1)] * (g[0] * f[1]) + x[I + vec(1, 1)] * (f[0] * f[1]) + + @ti.func + def velocity(p): + return sample_bilinear(v, p) + + @ti.func + def sample_min(x, p): + p = clamp(p) + p_grid = p * inv_dx - stagger + I = ti.cast(ti.floor(p_grid), ti.i32) + + return min(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)]) + + @ti.func + def sample_max(x, p): + p = clamp(p) + p_grid = p * inv_dx - stagger + I = ti.cast(ti.floor(p_grid), ti.i32) + + return max(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)]) + + @ti.func + def backtrace(I, dt): # RK3 + p = (I + stagger) * dx + v1 = velocity(p) + p1 = p - 0.5 * dt * v1 + v2 = velocity(p1) + p2 = p - 0.75 * dt * v2 + v3 = velocity(p2) + p -= dt * (2 / 9 * v1 + 1 / 3 * v2 + 4 / 9 * v3) + return p + + @ti.func + def semi_lagrangian(x, new_x, dt): + for I in ti.grouped(x): + new_x[I] = sample_bilinear(x, backtrace(I, dt)) + + @ti.kernel + def advect(): + semi_lagrangian(x(0), new_x(0), dt) + semi_lagrangian(x(1), new_x(1), dt) + semi_lagrangian(x(2), new_x(2), dt) + + for I in ti.grouped(x): + x[I] = new_x[I] + + init() + + def task(): + for i in range(10): + advect() + + ti.benchmark(task, repeat=100) + + visualize = False + + if visualize: + gui = ti.GUI('Advection schemes', (n, n)) + for i in range(10): + for _ in range(10): + advect() + gui.set_image(x.to_numpy()) + gui.show() + + +if __name__ == '__main__': + advection_2d() diff --git a/benchmarks/async_cases.py b/benchmarks/async_cases.py index c91b3aa61146c..aa720f66f258b 100644 --- a/benchmarks/async_cases.py +++ b/benchmarks/async_cases.py @@ -1,36 +1,18 @@ import taichi as ti import os import sys -import functools sys.path.append(os.path.join(ti.core.get_repo_dir(), 'tests', 'python')) from fuse_test_template import template_fuse_dense_x2y2z, \ template_fuse_reduction - -# Note: this is a short-term solution. In the long run we need to think about how to reuse pytest -def benchmark_async(func): - @functools.wraps(func) - def body(): - for arch in [ti.cpu, ti.cuda]: - for async_mode in [True, False]: - os.environ['TI_CURRENT_BENCHMARK'] = func.__name__ - ti.init(arch=arch, async_mode=async_mode) - if arch == ti.cpu: - scale = 2 - else: - # Use more data to hide compilation overhead - # (since CUDA runs much faster than CPUs) - scale = 64 - func(scale) - - return body +from utils import * @benchmark_async def fuse_dense_x2y2z(scale): - template_fuse_dense_x2y2z(size=scale * 10 * 1024**2, + template_fuse_dense_x2y2z(size=scale * 1024**2, repeat=1, benchmark_repeat=100, benchmark=True) @@ -38,7 +20,7 @@ def fuse_dense_x2y2z(scale): @benchmark_async def fuse_reduction(scale): - template_fuse_reduction(size=scale * 10 * 1024**2, + template_fuse_reduction(size=scale * 1024**2, repeat=10, benchmark_repeat=10, benchmark=True) @@ -46,7 +28,7 @@ def fuse_reduction(scale): @benchmark_async def fill_1d(scale): - a = ti.field(dtype=ti.f32, shape=scale * 10 * 1024**2) + a = ti.field(dtype=ti.f32, shape=scale * 1024**2) @ti.kernel def fill(): @@ -81,7 +63,7 @@ def sparse_numpy(scale): a = ti.field(dtype=ti.f32) b = ti.field(dtype=ti.f32) - block_count = 2**int((math.log(scale, 2)) // 2) * 64 + block_count = 2**int((math.log(scale, 2)) // 2) * 4 block_size = 32 # a, b always share the same sparsity ti.root.pointer(ti.ij, block_count).dense(ti.ij, block_size).place(a, b) @@ -145,7 +127,7 @@ def stencil_reduction(scale): b = ti.field(dtype=ti.f32) total = ti.field(dtype=ti.f32, shape=()) - block_count = scale * 512 + block_count = scale * 64 block_size = 1024 # a, b always share the same sparsity ti.root.pointer(ti.i, block_count).dense(ti.i, block_size).place(a, b) diff --git a/benchmarks/benchmark_async.py b/benchmarks/benchmark_async.py index 373f3b11a636d..658eb9cf54b94 100644 --- a/benchmarks/benchmark_async.py +++ b/benchmarks/benchmark_async.py @@ -1,17 +1,13 @@ import taichi as ti from async_cases import * +from async_advection import * rerun = True cases = [ - fuse_dense_x2y2z, - fuse_reduction, - fill_1d, - sparse_numpy, - autodiff, - stencil_reduction, - # mpm_splitted, + fuse_dense_x2y2z, fuse_reduction, fill_1d, sparse_numpy, autodiff, + stencil_reduction, mpm_splitted, advection_2d ] if rerun: diff --git a/benchmarks/utils.py b/benchmarks/utils.py new file mode 100644 index 0000000000000..640277b46956b --- /dev/null +++ b/benchmarks/utils.py @@ -0,0 +1,21 @@ +import taichi as ti +import functools +import os + + +def benchmark_async(func): + @functools.wraps(func) + def body(): + for arch in [ti.cpu, ti.cuda]: + for async_mode in [True, False]: + os.environ['TI_CURRENT_BENCHMARK'] = func.__name__ + ti.init(arch=arch, async_mode=async_mode, kernel_profiler=True) + if arch == ti.cpu: + scale = 2 + else: + # Use more data to hide compilation overhead + # (since CUDA runs much faster than CPUs) + scale = 64 + func(scale) + + return body diff --git a/examples/mpm_lagrangian_forces.py b/examples/mpm_lagrangian_forces.py index 6c9af88a83588..62147eac0eb6f 100644 --- a/examples/mpm_lagrangian_forces.py +++ b/examples/mpm_lagrangian_forces.py @@ -1,7 +1,7 @@ import taichi as ti import numpy as np -ti.init(arch=ti.gpu) +ti.init(arch=ti.gpu, kernel_profiler=True) dim = 2 quality = 8 # Use a larger integral number for higher quality @@ -187,6 +187,7 @@ def main(): color=0xFFFFFF, radius=3) gui.show() + ti.kernel_profiler_print() if __name__ == '__main__': diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py index 5414c47e8098d..fdc96f622c806 100644 --- a/python/taichi/lang/__init__.py +++ b/python/taichi/lang/__init__.py @@ -42,10 +42,10 @@ cc = core.cc gpu = [cuda, metal, opengl] cpu = core.host_arch() -kernel_profiler_print = lambda: core.get_current_program( -).kernel_profiler_print() -kernel_profiler_clear = lambda: core.get_current_program( -).kernel_profiler_clear() +kernel_profiler_print = lambda: get_runtime().prog.kernel_profiler_print() +kernel_profiler_clear = lambda: get_runtime().prog.kernel_profiler_clear() +kernel_profiler_total_time = lambda: get_runtime( +).prog.kernel_profiler_total_time() def memory_profiler_print(): @@ -331,19 +331,23 @@ def run_benchmark(): ti.stat_write('offloaded_tasks', b) elif a == 'launched_tasks': ti.stat_write('launched_tasks', b) - # The reason why we run 3 more times is to warm up + + # Use 3 initial iterations to warm up # instruction/data caches. Discussion: # https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136 for i in range(3): func(*args) ti.sync() + ti.kernel_profiler_clear() t = time.time() for n in range(repeat): func(*args) ti.sync() elapsed = time.time() - t avg = elapsed / repeat - ti.stat_write('running_time', avg) + ti.stat_write('clock_time', avg) + device_time = ti.kernel_profiler_total_time() + ti.stat_write('device_time', device_time) run_benchmark() diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp index 8601457958e56..5e0ad6561b61d 100644 --- a/taichi/program/kernel_profiler.cpp +++ b/taichi/program/kernel_profiler.cpp @@ -42,7 +42,7 @@ void KernelProfilerBase::print() { "name\n"); std::sort(records.begin(), records.end()); for (auto &rec : records) { - auto fraction = rec.total / total_time * 100.0f; + auto fraction = rec.total / total_time_ms * 100.0f; fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n", fraction, rec.total / 1000.0f, rec.counter, rec.min, rec.total / rec.counter, rec.max, rec.name); @@ -53,12 +53,17 @@ void KernelProfilerBase::print() { fmt::print( "[100.00%] Total kernel execution time: {:7.3f} s number of records: " "{}\n", - total_time / 1000.0f, records.size()); + get_total_time(), records.size()); + fmt::print( "========================================================================" "=\n"); } +double KernelProfilerBase::get_total_time() const { + return total_time_ms / 1000.0; +} + namespace { // A simple profiler that uses Time::get_time() class DefaultProfiler : public KernelProfilerBase { @@ -90,7 +95,7 @@ class DefaultProfiler : public KernelProfilerBase { it = std::prev(records.end()); } it->insert_sample(ms); - total_time += ms; + total_time_ms += ms; } private: @@ -150,7 +155,7 @@ class KernelProfilerCUDA : public KernelProfilerBase { it = std::prev(records.end()); } it->insert_sample(ms); - total_time += ms; + total_time_ms += ms; } } outstanding_events.clear(); diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h index 26c9fcc924340..5059906611acc 100644 --- a/taichi/program/kernel_profiler.h +++ b/taichi/program/kernel_profiler.h @@ -30,14 +30,14 @@ struct KernelProfileRecord { class KernelProfilerBase { protected: std::vector records; - double total_time; + double total_time_ms; public: // Needed for the CUDA backend since we need to know which task to "stop" using TaskHandle = void *; void clear() { - total_time = 0; + total_time_ms = 0; records.clear(); } @@ -62,6 +62,8 @@ class KernelProfilerBase { void print(); + double get_total_time() const; + virtual ~KernelProfilerBase() { } }; diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 320fd3a66124a..2c32bd373681b 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -472,6 +472,8 @@ void Program::synchronize() { if (config.async_mode) { async_engine->synchronize(); } + if (profiler) + profiler->sync(); device_synchronize(); sync = true; } diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index c4b29a6be7b6f..adf530e8503bf 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -153,6 +153,8 @@ void export_lang(py::module &m) { .def(py::init<>()) .def_readonly("config", &Program::config) .def("kernel_profiler_print", &Program::kernel_profiler_print) + .def("kernel_profiler_total_time", + [](Program *program) { return program->profiler->get_total_time(); }) .def("kernel_profiler_clear", &Program::kernel_profiler_clear) .def("print_memory_profiler_info", &Program::print_memory_profiler_info) .def("finalize", &Program::finalize)