[benchmark] Upgrade to Time Performance Regression Test (TPRT) (#1002)
* [skip ci] Upgrade benchmark for TPRT

* [skip ci] add my comments

* [skip ci] pybenchmark :)

* [skip ci] tmp save (mature)

* [skip ci] fix typo

* [skip ci] add 'repeat='

* [skip ci] nit

* [skip ci] run 4 times to warm up

* [skip ci] fix typo
archibate authored May 19, 2020
1 parent 8454986 commit 660e973
Showing 11 changed files with 104 additions and 91 deletions.
Empty file removed benchmarks/__init__.py
9 changes: 0 additions & 9 deletions benchmarks/benchmark_baseline.txt

This file was deleted.

34 changes: 16 additions & 18 deletions benchmarks/fill_dense.py
@@ -3,6 +3,7 @@
# originally by @KLozes


+@ti.all_archs
def benchmark_flat_struct():
    N = 4096
    a = ti.var(dt=ti.f32, shape=(N, N))
@@ -12,9 +13,10 @@ def fill():
        for i, j in a:
            a[i, j] = 2.0

-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=500)


+@ti.all_archs
def benchmark_flat_range():
    N = 4096
    a = ti.var(dt=ti.f32, shape=(N, N))
@@ -24,9 +26,10 @@ def fill():
        for i, j in ti.ndrange(N, N):
            a[i, j] = 2.0

-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)


+@ti.all_archs
def benchmark_nested_struct():
    a = ti.var(dt=ti.f32)
    N = 512
@@ -40,11 +43,10 @@ def fill():
        for i, j in a:
            a[i, j] = 2.0

-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)


+@ti.all_archs
def benchmark_nested_struct_listgen_8x8():
    a = ti.var(dt=ti.f32)
    ti.cfg.demote_dense_struct_fors = False
@@ -59,11 +61,10 @@ def fill():
        for i, j in a:
            a[i, j] = 2.0

-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)


+@ti.all_archs
def benchmark_nested_struct_listgen_16x16():
    a = ti.var(dt=ti.f32)
    ti.cfg.demote_dense_struct_fors = False
@@ -78,11 +79,10 @@ def fill():
        for i, j in a:
            a[i, j] = 2.0

-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)


+@ti.all_archs
def benchmark_nested_range_blocked():
    a = ti.var(dt=ti.f32)
    N = 512
@@ -97,11 +97,10 @@ def fill():
            for Y in range(64):
                a[X // N * 8 + Y // 8, X % N * 8 + Y % 8] = 2.0

-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=800)


+@ti.all_archs
def benchmark_nested_range():
    a = ti.var(dt=ti.f32)
    N = 512
@@ -116,9 +115,10 @@ def fill():
            for i in range(N * 8):
                a[i, j] = 2.0

-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)


+@ti.all_archs
def benchmark_root_listgen():
    a = ti.var(dt=ti.f32)
    ti.cfg.demote_dense_struct_fors = False
@@ -133,9 +133,7 @@ def fill():
        for i, j in a.parent():
            a[i, j] = 2.0

-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=800)


'''
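The per-case repeat= counts above feed the reworked ti.benchmark helper (see the python/taichi/lang/__init__.py diff at the bottom of this page). As a rough, self-contained model of its timing logic (a few warm-up calls, then an averaged timed loop reported in milliseconds), here is a plain-Python sketch; the dummy workload and the omitted device synchronization are deliberate simplifications:

import time

def benchmark_model(func, repeat=300, warmup=4):
    # Model of the scheme in this PR: warm-up calls first (JIT compilation
    # plus instruction/data caches), then an averaged timed loop.
    for _ in range(warmup):
        func()
    t = time.time()
    for _ in range(repeat):
        func()
    elapsed = time.time() - t
    return elapsed / repeat * 1000  # average milliseconds per call

# Example: time a trivial CPU workload 500 times.
print(f'{benchmark_model(lambda: sum(range(10000)), repeat=500):.4f} ms')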
2 changes: 2 additions & 0 deletions benchmarks/fill_sparse.py
@@ -1,6 +1,7 @@
import taichi as ti


+@ti.archs_support_sparse
def benchmark_nested_struct():
    a = ti.var(dt=ti.f32)
    N = 512
@@ -19,6 +20,7 @@ def fill():
    return ti.benchmark(fill)


+@ti.archs_support_sparse
def benchmark_nested_struct_fill_and_clear():
    a = ti.var(dt=ti.f32)
    N = 512
4 changes: 4 additions & 0 deletions benchmarks/memory_bound.py
@@ -4,6 +4,7 @@


# 4 B/it
+@ti.archs_excluding(ti.opengl)
def benchmark_memset():
    a = ti.var(dt=ti.f32, shape=N)

@@ -16,6 +17,7 @@ def memset():


# 8 B/it
+@ti.archs_excluding(ti.opengl)
def benchmark_sscal():
    a = ti.var(dt=ti.f32, shape=N)

@@ -28,6 +30,7 @@ def task():


# 8 B/it
+@ti.archs_excluding(ti.opengl)
def benchmark_memcpy():
    a = ti.var(dt=ti.f32, shape=N)
    b = ti.var(dt=ti.f32, shape=N)
@@ -41,6 +44,7 @@ def memcpy():


# 12 B/it
+@ti.archs_excluding(ti.opengl)
def benchmark_saxpy():
    x = ti.var(dt=ti.f32, shape=N)
    y = ti.var(dt=ti.f32, shape=N)
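The # 4 B/it, # 8 B/it and # 12 B/it comments record bytes moved per loop iteration, so a reported average time converts directly into achieved memory bandwidth. A minimal sketch of that conversion (the helper and its argument names are ours, not part of this PR):

def bandwidth_gb_per_s(avg_ms, bytes_per_iter, n_iters):
    # e.g. saxpy moves 12 bytes per f32 iteration: read x, read y, write y.
    seconds = avg_ms / 1000
    return bytes_per_iter * n_iters / seconds / 2**30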
3 changes: 2 additions & 1 deletion benchmarks/minimal.py
@@ -1,11 +1,12 @@
import taichi as ti


+@ti.all_archs
def benchmark_fill_scalar():
    a = ti.var(dt=ti.f32, shape=())

    @ti.kernel
    def fill():
        a[None] = 1.0

-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
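minimal.py is the smallest instance of the new pattern: decorate the case, build the field, hand the kernel to ti.benchmark. Assuming @ti.all_archs behaves like Taichi's test helpers of this era (calling the body once per supported backend), a single case could presumably be driven by hand as below; this invocation is hypothetical and not part of the PR:

import os

# Name the case so ti.stat_write knows which .dat file to produce.
os.environ['TI_CURRENT_BENCHMARK'] = 'benchmark_fill_scalar'

from minimal import benchmark_fill_scalar  # the module shown above

# Runs once per supported arch, writing e.g.
# benchmark_fill_scalar__arch_x64.dat into the current directory.
benchmark_fill_scalar()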
13 changes: 8 additions & 5 deletions benchmarks/mpm2d.py
@@ -1,9 +1,10 @@
+import time
import taichi as ti
import numpy as np
-import time


+@ti.all_archs
def benchmark_range():
-    import taichi as ti
    quality = 1  # Use a larger value for higher-res simulations
    n_particles, n_grid = 9000 * quality**2, 128 * quality
    dx, inv_dx = 1 / n_grid, float(n_grid)
@@ -124,11 +125,12 @@ def substep():
        # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
        # gui.show()  # Change to gui.show(f'{frame:06d}.png') to write images to disk
    ti.get_runtime().sync()
-    return (time.time() - t) / 4000
+    avg = (time.time() - t) / 4000 * 1000  # milliseconds
+    ti.stat_write(avg)


+@ti.all_archs
def benchmark_struct():
-    import taichi as ti
    quality = 1  # Use a larger value for higher-res simulations
    n_particles, n_grid = 9000 * quality**2, 128 * quality
    dx, inv_dx = 1 / n_grid, float(n_grid)
@@ -251,4 +253,5 @@ def substep():
        # gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
        # gui.show()  # Change to gui.show(f'{frame:06d}.png') to write images to disk
    ti.get_runtime().sync()
-    return (time.time() - t) / 4000
+    avg = (time.time() - t) / 4000 * 1000  # milliseconds
+    ti.stat_write(avg)
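Unlike the fill benchmarks, the two mpm2d cases time their substep loops by hand and then report through the same ti.stat_write channel. A hedged sketch of that reporting pattern, factored into a helper (the helper is ours; 4000 matches the substep count divided out in the diff, and the ti.* calls assume the 0.6-era API used throughout this PR):

import time
import taichi as ti

def report_substep_timing(substep, n_substeps=4000):
    # Time the whole loop, wait for outstanding device work,
    # then report the average in milliseconds per substep.
    t = time.time()
    for _ in range(n_substeps):
        substep()
    ti.get_runtime().sync()
    avg = (time.time() - t) / n_substeps * 1000  # milliseconds
    ti.stat_write(avg)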
47 changes: 14 additions & 33 deletions benchmarks/run.py
@@ -2,6 +2,10 @@
import taichi as ti


+def get_benchmark_dir():
+    return os.path.join(ti.core.get_repo_dir(), 'benchmarks')
+
+
class Case:
    def __init__(self, name, func):
        self.name = name
@@ -14,20 +18,10 @@ def __lt__(self, other):
    def __eq__(self, other):
        return self.name == other.name

-    def pprint(self):
-        print(f' * {self.name[10:]:33}', end='')
-        for i, arch in enumerate(sorted(self.records.keys())):
-            ms = self.records[arch] * 1000
-            arch_name = str(arch)[5:]
-            print(f' {arch_name:8} {ms:7.3f} ms', end='')
-            if i < len(self.records) - 1:
-                print(' ', end='')
-        print()
-
-    def run(self, arch):
-        ti.init(arch=arch)
-        t = self.func()
-        self.records[arch] = t
+    def run(self):
+        print(f'==> {self.name}:')
+        os.environ['TI_CURRENT_BENCHMARK'] = self.name
+        self.func()


class Suite:
@@ -42,38 +36,25 @@ def __init__(self, filename):
            sorted(filter(lambda x: x.startswith('benchmark_'), dir(suite))))
        self.cases = [Case(k, getattr(suite, k)) for k in case_keys]

-    def print(self):
-        print(f'{self.name}:')
-        for b in self.cases:
-            b.pprint()
-
-    def run(self, arch):
+    def run(self):
        print(f'{self.name}:')
        for case in sorted(self.cases):
-            case.run(arch)
+            case.run()


class TaichiBenchmark:
    def __init__(self):
        self.suites = []
-        benchmark_dir = os.path.dirname(__file__)
+        benchmark_dir = get_benchmark_dir()
        for f in map(os.path.basename, sorted(os.listdir(benchmark_dir))):
            if f != 'run.py' and f.endswith('.py') and f[0] != '_':
                self.suites.append(Suite(f))

-    def pprint(self):
-        for s in self.suites:
-            s.print()
-
-    def run(self, arch):
+    def run(self):
        print("Running...")
        for s in self.suites:
-            s.run(arch)
+            s.run()


b = TaichiBenchmark()
-b.pprint()
-b.run(ti.x64)
-b.run(ti.cuda)
-print()
-b.pprint()
+b.run()
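With per-case timing delegated to ti.benchmark and ti.stat_write, the runner only names each case (via TI_CURRENT_BENCHMARK) and calls it. One plausible way to invoke it while collecting the .dat files into a chosen directory; the directory path here is an assumption, and TI_BENCHMARK_OUTPUT_DIR is the variable stat_write reads in the lang/__init__.py diff below:

import os
import subprocess

out = '/tmp/taichi_benchmark'
os.makedirs(out, exist_ok=True)
env = dict(os.environ, TI_BENCHMARK_OUTPUT_DIR=out)

# Runs every suite; each case leaves one '<case>__arch_<arch>.dat' file in `out`.
subprocess.run(['python', 'benchmarks/run.py'], env=env, check=True)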
12 changes: 0 additions & 12 deletions docs/utilities.rst
@@ -55,19 +55,7 @@ For example, this is part of the output by ``ti regression`` after enabling cons
Discussion at: https://github.com/taichi-dev/taichi/issue/948


-<<<<<<< HEAD
-The suggested workflow for the performance related PR author to run the regression tests is:
-=======
The suggested workflow for **the PR author** to run the regression tests is:

-* When a performance related PR is ready, checkout that PR locally.
-
-* Run ``ti benchmark && ti regression`` to obtain the result.
-
-* Decide wheater to approve or request change, depends on the result.
-
-* Right before merge, run ``ti baseline`` to save the benchmark result as new baseline.
->>>>>>> master

* Run ``ti benchmark && ti baseline`` in ``master`` to save the current performance as baseline.

21 changes: 18 additions & 3 deletions python/taichi/lang/__init__.py
@@ -247,18 +247,33 @@ def static_print(*args, __p=print, **kwargs):
    __p(*args, **kwargs)


-def benchmark(func, repeat=100, args=()):
+def benchmark(func, repeat=300, args=()):
    import taichi as ti
    import time
-    for i in range(repeat // 3):
+    # The reason why we run 4 times is to warm up instruction/data caches.
+    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
+    for i in range(4):
        func(*args)  # compile the kernel first
        ti.sync()
    t = time.time()
    for n in range(repeat):
        func(*args)
    ti.get_runtime().sync()
    elapsed = time.time() - t
-    return elapsed / repeat
+    avg = elapsed / repeat * 1000  # milliseconds
+    ti.stat_write(avg)
+
+
+def stat_write(avg):
+    name = os.environ.get('TI_CURRENT_BENCHMARK')
+    if name is None:
+        return
+    import taichi as ti
+    arch_name = ti.core.arch_name(ti.cfg.arch)
+    output_dir = os.environ.get('TI_BENCHMARK_OUTPUT_DIR', '.')
+    filename = f'{output_dir}/{name}__arch_{arch_name}.dat'
+    with open(filename, 'w') as f:
+        f.write(f'time_avg: {avg:.4f}')


def supported_archs():
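Each run therefore leaves one file per case and backend, named <case>__arch_<arch>.dat and holding a single time_avg: <ms> line. A regression checker (such as the ti regression flow mentioned in docs/utilities.rst above) could read them back along these lines; a minimal sketch assuming only the file format shown in stat_write:

import glob
import os

def load_results(output_dir='.'):
    # Collect {(case, arch): average_ms} from the files stat_write produces.
    results = {}
    for path in glob.glob(os.path.join(output_dir, '*__arch_*.dat')):
        stem = os.path.basename(path)[:-len('.dat')]
        case, arch = stem.split('__arch_')
        with open(path) as f:
            key, value = f.read().split(':')
            assert key == 'time_avg'
            results[(case, arch)] = float(value)  # milliseconds
    return results

# Example: flag cases that got more than 5% slower than a saved baseline.
def regressions(current, baseline, tolerance=1.05):
    return {k: (baseline[k], v) for k, v in current.items()
            if k in baseline and v > baseline[k] * tolerance}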