From 75d5eccfe40091ba5a259cc2cc391e916e574954 Mon Sep 17 00:00:00 2001
From: Yuanming Hu <yuanming-hu@users.noreply.github.com>
Date: Thu, 1 Oct 2020 10:23:39 -0400
Subject: [PATCH] [async] Add advection benchmark (#1914)

---
 benchmarks/async_advection.py      | 125 +++++++++++++++++++++++++++++
 benchmarks/async_cases.py          |  30 ++-----
 benchmarks/benchmark_async.py      |  10 +--
 benchmarks/utils.py                |  21 +++++
 examples/mpm_lagrangian_forces.py  |   3 +-
 python/taichi/lang/__init__.py     |  16 ++--
 taichi/program/kernel_profiler.cpp |  13 ++-
 taichi/program/kernel_profiler.h   |   6 +-
 taichi/program/program.cpp         |   2 +
 taichi/python/export_lang.cpp      |   2 +
 10 files changed, 184 insertions(+), 44 deletions(-)
 create mode 100644 benchmarks/async_advection.py
 create mode 100644 benchmarks/utils.py

diff --git a/benchmarks/async_advection.py b/benchmarks/async_advection.py
new file mode 100644
index 0000000000000..f6dafeace73c8
--- /dev/null
+++ b/benchmarks/async_advection.py
@@ -0,0 +1,125 @@
+import taichi as ti
+import math
+
+from utils import benchmark_async
+
+# TODO: staggered grid
+
+
+@benchmark_async
+def advection_2d(scale):
+    n = 128 * 2**int((math.log(scale, 2)) // 2)
+    x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
+    new_x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
+    v = ti.Vector.field(2, dtype=ti.f32, shape=(n, n))
+    dx = 1 / n
+    inv_dx = 1 / dx
+    dt = 0.01
+
+    stagger = ti.Vector([0.5, 0.5])
+
+    @ti.func
+    def Vector2(x, y):
+        return ti.Vector([x, y])
+
+    @ti.kernel
+    def init():
+        for i, j in v:
+            v[i, j] = ti.Vector([j / n - 0.5, 0.5 - i / n])
+
+        for i, j in ti.ndrange(n * 4, n * 4):
+            ret = ti.taichi_logo(ti.Vector([i, j]) / (n * 4))
+            x[i // 4, j // 4][0] += ret / 16
+            x[i // 4, j // 4][1] += ret / 16
+            x[i // 4, j // 4][2] += ret / 16
+
+    @ti.func
+    def vec(x, y):
+        return ti.Vector([x, y])
+
+    @ti.func
+    def clamp(p):
+        for d in ti.static(range(p.n)):
+            p[d] = min(1 - 1e-4 - dx + stagger[d] * dx,
+                       max(p[d], stagger[d] * dx))
+        return p
+
+    @ti.func
+    def sample_bilinear(x, p):
+        p = clamp(p)
+
+        p_grid = p * inv_dx - stagger
+
+        I = ti.cast(ti.floor(p_grid), ti.i32)
+        f = p_grid - I
+        g = 1 - f
+
+        return x[I] * (g[0] * g[1]) + x[I + vec(1, 0)] * (f[0] * g[1]) + x[
+            I + vec(0, 1)] * (g[0] * f[1]) + x[I + vec(1, 1)] * (f[0] * f[1])
+
+    @ti.func
+    def velocity(p):
+        return sample_bilinear(v, p)
+
+    @ti.func
+    def sample_min(x, p):
+        p = clamp(p)
+        p_grid = p * inv_dx - stagger
+        I = ti.cast(ti.floor(p_grid), ti.i32)
+
+        return min(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)])
+
+    @ti.func
+    def sample_max(x, p):
+        p = clamp(p)
+        p_grid = p * inv_dx - stagger
+        I = ti.cast(ti.floor(p_grid), ti.i32)
+
+        return max(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)])
+
+    @ti.func
+    def backtrace(I, dt):  # RK3
+        p = (I + stagger) * dx
+        v1 = velocity(p)
+        p1 = p - 0.5 * dt * v1
+        v2 = velocity(p1)
+        p2 = p - 0.75 * dt * v2
+        v3 = velocity(p2)
+        p -= dt * (2 / 9 * v1 + 1 / 3 * v2 + 4 / 9 * v3)
+        return p
+
+    @ti.func
+    def semi_lagrangian(x, new_x, dt):
+        for I in ti.grouped(x):
+            new_x[I] = sample_bilinear(x, backtrace(I, dt))
+
+    @ti.kernel
+    def advect():
+        semi_lagrangian(x(0), new_x(0), dt)
+        semi_lagrangian(x(1), new_x(1), dt)
+        semi_lagrangian(x(2), new_x(2), dt)
+
+        for I in ti.grouped(x):
+            x[I] = new_x[I]
+
+    init()
+
+    def task():
+        for i in range(10):
+            advect()
+
+    ti.benchmark(task, repeat=100)
+
+    visualize = False
+
+    if visualize:
+        gui = ti.GUI('Advection schemes', (n, n))
+        for i in range(10):
+            for _ in range(10):
+                advect()
+            gui.set_image(x.to_numpy())
+            gui.show()
+
+
+if __name__ == '__main__':
+    advection_2d()
diff --git a/benchmarks/async_cases.py b/benchmarks/async_cases.py
index c91b3aa61146c..aa720f66f258b 100644
--- a/benchmarks/async_cases.py
+++ b/benchmarks/async_cases.py
@@ -1,36 +1,18 @@
 import taichi as ti
 import os
 import sys
-import functools
 
 sys.path.append(os.path.join(ti.core.get_repo_dir(), 'tests', 'python'))
 
 from fuse_test_template import template_fuse_dense_x2y2z, \
     template_fuse_reduction
 
-
-# Note: this is a short-term solution. In the long run we need to think about how to reuse pytest
-def benchmark_async(func):
-    @functools.wraps(func)
-    def body():
-        for arch in [ti.cpu, ti.cuda]:
-            for async_mode in [True, False]:
-                os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
-                ti.init(arch=arch, async_mode=async_mode)
-                if arch == ti.cpu:
-                    scale = 2
-                else:
-                    # Use more data to hide compilation overhead
-                    # (since CUDA runs much faster than CPUs)
-                    scale = 64
-                func(scale)
-
-    return body
+from utils import *
 
 
 @benchmark_async
 def fuse_dense_x2y2z(scale):
-    template_fuse_dense_x2y2z(size=scale * 10 * 1024**2,
+    template_fuse_dense_x2y2z(size=scale * 1024**2,
                               repeat=1,
                               benchmark_repeat=100,
                               benchmark=True)
@@ -38,7 +20,7 @@ def fuse_dense_x2y2z(scale):
 
 @benchmark_async
 def fuse_reduction(scale):
-    template_fuse_reduction(size=scale * 10 * 1024**2,
+    template_fuse_reduction(size=scale * 1024**2,
                             repeat=10,
                             benchmark_repeat=10,
                             benchmark=True)
@@ -46,7 +28,7 @@ def fuse_reduction(scale):
 
 @benchmark_async
 def fill_1d(scale):
-    a = ti.field(dtype=ti.f32, shape=scale * 10 * 1024**2)
+    a = ti.field(dtype=ti.f32, shape=scale * 1024**2)
 
     @ti.kernel
     def fill():
@@ -81,7 +63,7 @@ def sparse_numpy(scale):
     a = ti.field(dtype=ti.f32)
     b = ti.field(dtype=ti.f32)
 
-    block_count = 2**int((math.log(scale, 2)) // 2) * 64
+    block_count = 2**int((math.log(scale, 2)) // 2) * 4
     block_size = 32
     # a, b always share the same sparsity
     ti.root.pointer(ti.ij, block_count).dense(ti.ij, block_size).place(a, b)
@@ -145,7 +127,7 @@ def stencil_reduction(scale):
     b = ti.field(dtype=ti.f32)
     total = ti.field(dtype=ti.f32, shape=())
 
-    block_count = scale * 512
+    block_count = scale * 64
     block_size = 1024
     # a, b always share the same sparsity
     ti.root.pointer(ti.i, block_count).dense(ti.i, block_size).place(a, b)
diff --git a/benchmarks/benchmark_async.py b/benchmarks/benchmark_async.py
index 373f3b11a636d..658eb9cf54b94 100644
--- a/benchmarks/benchmark_async.py
+++ b/benchmarks/benchmark_async.py
@@ -1,17 +1,13 @@
 import taichi as ti
 
 from async_cases import *
+from async_advection import *
 
 rerun = True
 
 cases = [
-    fuse_dense_x2y2z,
-    fuse_reduction,
-    fill_1d,
-    sparse_numpy,
-    autodiff,
-    stencil_reduction,
-    # mpm_splitted,
+    fuse_dense_x2y2z, fuse_reduction, fill_1d, sparse_numpy, autodiff,
+    stencil_reduction, mpm_splitted, advection_2d
 ]
 
 if rerun:
diff --git a/benchmarks/utils.py b/benchmarks/utils.py
new file mode 100644
index 0000000000000..640277b46956b
--- /dev/null
+++ b/benchmarks/utils.py
@@ -0,0 +1,21 @@
+import taichi as ti
+import functools
+import os
+
+
+def benchmark_async(func):
+    @functools.wraps(func)
+    def body():
+        for arch in [ti.cpu, ti.cuda]:
+            for async_mode in [True, False]:
+                os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
+                ti.init(arch=arch, async_mode=async_mode, kernel_profiler=True)
+                if arch == ti.cpu:
+                    scale = 2
+                else:
+                    # Use more data to hide compilation overhead
+                    # (since CUDA runs much faster than CPUs)
+                    scale = 64
+                func(scale)
+
+    return body
diff --git a/examples/mpm_lagrangian_forces.py b/examples/mpm_lagrangian_forces.py
index 6c9af88a83588..62147eac0eb6f 100644
--- a/examples/mpm_lagrangian_forces.py
+++ b/examples/mpm_lagrangian_forces.py
@@ -1,7 +1,7 @@
 import taichi as ti
 import numpy as np
 
-ti.init(arch=ti.gpu)
+ti.init(arch=ti.gpu, kernel_profiler=True)
 
 dim = 2
 quality = 8  # Use a larger integral number for higher quality
@@ -187,6 +187,7 @@ def main():
                  color=0xFFFFFF,
                  radius=3)
         gui.show()
+        ti.kernel_profiler_print()
 
 
 if __name__ == '__main__':
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index 5414c47e8098d..fdc96f622c806 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -42,10 +42,10 @@
 cc = core.cc
 gpu = [cuda, metal, opengl]
 cpu = core.host_arch()
-kernel_profiler_print = lambda: core.get_current_program(
-).kernel_profiler_print()
-kernel_profiler_clear = lambda: core.get_current_program(
-).kernel_profiler_clear()
+kernel_profiler_print = lambda: get_runtime().prog.kernel_profiler_print()
+kernel_profiler_clear = lambda: get_runtime().prog.kernel_profiler_clear()
+kernel_profiler_total_time = lambda: get_runtime(
+).prog.kernel_profiler_total_time()
 
 
 def memory_profiler_print():
@@ -331,19 +331,23 @@ def run_benchmark():
                 ti.stat_write('offloaded_tasks', b)
             elif a == 'launched_tasks':
                 ti.stat_write('launched_tasks', b)
-        # The reason why we run 3 more times is to warm up
+
+        # Use 3 initial iterations to warm up
         # instruction/data caches. Discussion:
         # https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
         for i in range(3):
             func(*args)
             ti.sync()
+        ti.kernel_profiler_clear()
         t = time.time()
         for n in range(repeat):
             func(*args)
             ti.sync()
         elapsed = time.time() - t
         avg = elapsed / repeat
-        ti.stat_write('running_time', avg)
+        ti.stat_write('clock_time', avg)
+        # Per-run average, so device_time is comparable to clock_time above
+        ti.stat_write('device_time', ti.kernel_profiler_total_time() / repeat)
 
     run_benchmark()
 
diff --git a/taichi/program/kernel_profiler.cpp b/taichi/program/kernel_profiler.cpp
index 8601457958e56..5e0ad6561b61d 100644
--- a/taichi/program/kernel_profiler.cpp
+++ b/taichi/program/kernel_profiler.cpp
@@ -42,7 +42,7 @@ void KernelProfilerBase::print() {
       "name\n");
   std::sort(records.begin(), records.end());
   for (auto &rec : records) {
-    auto fraction = rec.total / total_time * 100.0f;
+    auto fraction = rec.total / total_time_ms * 100.0f;
     fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
                fraction, rec.total / 1000.0f, rec.counter, rec.min,
                rec.total / rec.counter, rec.max, rec.name);
@@ -53,12 +53,17 @@ void KernelProfilerBase::print() {
   fmt::print(
       "[100.00%] Total kernel execution time: {:7.3f} s   number of records: "
       "{}\n",
-      total_time / 1000.0f, records.size());
+      get_total_time(), records.size());
+
   fmt::print(
       "========================================================================"
       "=\n");
 }
 
+double KernelProfilerBase::get_total_time() const {
+  return total_time_ms / 1000.0;
+}
+
 namespace {
 // A simple profiler that uses Time::get_time()
 class DefaultProfiler : public KernelProfilerBase {
@@ -90,7 +95,7 @@ class DefaultProfiler : public KernelProfilerBase {
       it = std::prev(records.end());
     }
     it->insert_sample(ms);
-    total_time += ms;
+    total_time_ms += ms;
   }
 
  private:
@@ -150,7 +155,7 @@ class KernelProfilerCUDA : public KernelProfilerBase {
           it = std::prev(records.end());
         }
         it->insert_sample(ms);
-        total_time += ms;
+        total_time_ms += ms;
       }
     }
     outstanding_events.clear();
diff --git a/taichi/program/kernel_profiler.h b/taichi/program/kernel_profiler.h
index 26c9fcc924340..5059906611acc 100644
--- a/taichi/program/kernel_profiler.h
+++ b/taichi/program/kernel_profiler.h
@@ -30,14 +30,14 @@ struct KernelProfileRecord {
 class KernelProfilerBase {
  protected:
   std::vector<KernelProfileRecord> records;
-  double total_time;
+  double total_time_ms;
 
  public:
   // Needed for the CUDA backend since we need to know which task to "stop"
   using TaskHandle = void *;
 
   void clear() {
-    total_time = 0;
+    total_time_ms = 0;
     records.clear();
   }
 
@@ -62,6 +62,8 @@ class KernelProfilerBase {
 
   void print();
 
+  double get_total_time() const;
+
   virtual ~KernelProfilerBase() {
   }
 };
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 320fd3a66124a..2c32bd373681b 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -472,6 +472,8 @@ void Program::synchronize() {
     if (config.async_mode) {
       async_engine->synchronize();
     }
+    if (profiler)
+      profiler->sync();
     device_synchronize();
     sync = true;
   }
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index c4b29a6be7b6f..adf530e8503bf 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -153,6 +153,8 @@ void export_lang(py::module &m) {
       .def(py::init<>())
       .def_readonly("config", &Program::config)
       .def("kernel_profiler_print", &Program::kernel_profiler_print)
+      .def("kernel_profiler_total_time",
+           [](Program *program) { return program->profiler->get_total_time(); })
       .def("kernel_profiler_clear", &Program::kernel_profiler_clear)
       .def("print_memory_profiler_info", &Program::print_memory_profiler_info)
       .def("finalize", &Program::finalize)