diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/benchmarks/benchmark_baseline.txt b/benchmarks/benchmark_baseline.txt
deleted file mode 100644
index bd2645c70a6fa..0000000000000
--- a/benchmarks/benchmark_baseline.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-fill_dense:
- * flat_range            x86_64  83.230 ms   cuda   1.284 ms
- * flat_struct           x86_64   4.662 ms   cuda   2.192 ms
- * nested_range          x86_64  11.367 ms   cuda   1.222 ms
- * nested_range_blocked  x86_64   4.242 ms   cuda   6.570 ms
- * nested_struct         x86_64  12.860 ms   cuda  17.888 ms
-mpm2d:
- * range                 x86_64   0.764 ms   cuda   0.045 ms
- * struct                x86_64   0.747 ms   cuda   0.206 ms
diff --git a/benchmarks/fill_dense.py b/benchmarks/fill_dense.py
index e316492d8ccb3..3e0f758b8fc8a 100644
--- a/benchmarks/fill_dense.py
+++ b/benchmarks/fill_dense.py
@@ -3,6 +3,7 @@
 # originally by @KLozes
 
 
+@ti.all_archs
 def benchmark_flat_struct():
     N = 4096
     a = ti.var(dt=ti.f32, shape=(N, N))
@@ -12,9 +13,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=500)
 
 
+@ti.all_archs
 def benchmark_flat_range():
     N = 4096
     a = ti.var(dt=ti.f32, shape=(N, N))
@@ -24,9 +26,10 @@ def fill():
         for i, j in ti.ndrange(N, N):
             a[i, j] = 2.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)
 
 
+@ti.all_archs
 def benchmark_nested_struct():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -40,11 +43,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)
 
 
+@ti.all_archs
def benchmark_nested_struct_listgen_8x8():
     a = ti.var(dt=ti.f32)
     ti.cfg.demote_dense_struct_fors = False
@@ -59,11 +61,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
 
 
+@ti.all_archs
 def benchmark_nested_struct_listgen_16x16():
     a = ti.var(dt=ti.f32)
     ti.cfg.demote_dense_struct_fors = False
@@ -78,11 +79,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)
 
 
+@ti.all_archs
 def benchmark_nested_range_blocked():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -97,11 +97,10 @@ def fill():
             for Y in range(64):
                 a[X // N * 8 + Y // 8, X % N * 8 + Y % 8] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=800)
 
 
+@ti.all_archs
 def benchmark_nested_range():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -116,9 +115,10 @@ def fill():
             for i in range(N * 8):
                 a[i, j] = 2.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
 
 
+@ti.all_archs
 def benchmark_root_listgen():
     a = ti.var(dt=ti.f32)
     ti.cfg.demote_dense_struct_fors = False
@@ -133,9 +133,7 @@ def fill():
         for i, j in a.parent():
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=800)
 
 
 '''
diff --git a/benchmarks/fill_sparse.py b/benchmarks/fill_sparse.py
index ee974040262fa..05670476c21dd 100644
--- a/benchmarks/fill_sparse.py
+++ b/benchmarks/fill_sparse.py
@@ -1,6 +1,7 @@
 import taichi as ti
 
 
+@ti.archs_support_sparse
 def benchmark_nested_struct():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -19,6 +20,7 @@ def fill():
     return ti.benchmark(fill)
 
 
+@ti.archs_support_sparse
 def benchmark_nested_struct_fill_and_clear():
     a = ti.var(dt=ti.f32)
     N = 512
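Note: the same refactoring pattern repeats in every suite above. Each benchmark_* function gains an arch-selection decorator and passes an explicit repeat count to ti.benchmark, which now records the averaged time through ti.stat_write (added later in this patch) instead of returning seconds. A minimal sketch of one such benchmark, not part of the patch; the function name and sizes are made up, and the legacy ti.var API used throughout this patch is assumed:

import taichi as ti


@ti.all_archs  # or ti.archs_support_sparse / ti.archs_excluding(...), as in the suites above
def benchmark_fill_2d():
    N = 1024
    a = ti.var(dt=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for i, j in a:
            a[i, j] = 1.0

    # Warm-up and timing happen inside ti.benchmark(); the averaged result is
    # recorded per arch rather than returned to the caller.
    return ti.benchmark(fill, repeat=500)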
diff --git a/benchmarks/memory_bound.py b/benchmarks/memory_bound.py
index df74e34d791fc..f004e0218af3f 100644
--- a/benchmarks/memory_bound.py
+++ b/benchmarks/memory_bound.py
@@ -4,6 +4,7 @@
 
 
 # 4 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_memset():
     a = ti.var(dt=ti.f32, shape=N)
 
@@ -16,6 +17,7 @@ def memset():
 
 
 # 8 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_sscal():
     a = ti.var(dt=ti.f32, shape=N)
 
@@ -28,6 +30,7 @@ def task():
 
 
 # 8 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_memcpy():
     a = ti.var(dt=ti.f32, shape=N)
     b = ti.var(dt=ti.f32, shape=N)
@@ -41,6 +44,7 @@ def memcpy():
 
 
 # 12 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_saxpy():
     x = ti.var(dt=ti.f32, shape=N)
     y = ti.var(dt=ti.f32, shape=N)
diff --git a/benchmarks/minimal.py b/benchmarks/minimal.py
index 9dbb24b234ffc..b3df395938c52 100644
--- a/benchmarks/minimal.py
+++ b/benchmarks/minimal.py
@@ -1,6 +1,7 @@
 import taichi as ti
 
 
+@ti.all_archs
 def benchmark_fill_scalar():
     a = ti.var(dt=ti.f32, shape=())
 
@@ -8,4 +9,4 @@ def benchmark_fill_scalar():
     def fill():
         a[None] = 1.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py
index 958dcbfd5c39b..4c27613deb135 100644
--- a/benchmarks/mpm2d.py
+++ b/benchmarks/mpm2d.py
@@ -1,9 +1,10 @@
-import time
+import taichi as ti
 import numpy as np
+import time
 
 
+@ti.all_archs
 def benchmark_range():
-    import taichi as ti
     quality = 1  # Use a larger value for higher-res simulations
     n_particles, n_grid = 9000 * quality**2, 128 * quality
     dx, inv_dx = 1 / n_grid, float(n_grid)
@@ -124,11 +125,12 @@ def substep():
     #     gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
     #     gui.show()  # Change to gui.show(f'{frame:06d}.png') to write images to disk
     ti.get_runtime().sync()
-    return (time.time() - t) / 4000
+    avg = (time.time() - t) / 4000 * 1000  # milliseconds
+    ti.stat_write(avg)
 
 
+@ti.all_archs
 def benchmark_struct():
-    import taichi as ti
     quality = 1  # Use a larger value for higher-res simulations
     n_particles, n_grid = 9000 * quality**2, 128 * quality
     dx, inv_dx = 1 / n_grid, float(n_grid)
@@ -251,4 +253,5 @@ def substep():
     #     gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
     #     gui.show()  # Change to gui.show(f'{frame:06d}.png') to write images to disk
     ti.get_runtime().sync()
-    return (time.time() - t) / 4000
+    avg = (time.time() - t) / 4000 * 1000  # milliseconds
+    ti.stat_write(avg)
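Note: unlike the single-kernel benchmarks, the two mpm2d cases time a full 4000-substep run by hand and pass the per-substep average (in milliseconds) to ti.stat_write. A rough sketch of that manual-timing pattern, not taken from the patch; the benchmark name and the trivial stand-in kernel are made up:

import time

import taichi as ti


@ti.all_archs
def benchmark_many_substeps():  # hypothetical name, for illustration only
    x = ti.var(dt=ti.f32, shape=16)

    @ti.kernel
    def substep():
        for i in x:
            x[i] += 1.0

    steps = 4000
    for _ in range(4):
        substep()  # compile and warm up first, mirroring ti.benchmark()
    ti.get_runtime().sync()
    t = time.time()
    for _ in range(steps):
        substep()
    ti.get_runtime().sync()
    avg = (time.time() - t) / steps * 1000  # milliseconds per substep
    ti.stat_write(avg)  # picked up by the stat_write() helper added later in this patch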
diff --git a/benchmarks/run.py b/benchmarks/run.py
index e6373378e1a5a..47b55529dc99f 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -2,6 +2,10 @@
 import taichi as ti
 
 
+def get_benchmark_dir():
+    return os.path.join(ti.core.get_repo_dir(), 'benchmarks')
+
+
 class Case:
     def __init__(self, name, func):
         self.name = name
@@ -14,20 +18,10 @@ def __lt__(self, other):
     def __eq__(self, other):
         return self.name == other.name
 
-    def pprint(self):
-        print(f' * {self.name[10:]:33}', end='')
-        for i, arch in enumerate(sorted(self.records.keys())):
-            ms = self.records[arch] * 1000
-            arch_name = str(arch)[5:]
-            print(f' {arch_name:8} {ms:7.3f} ms', end='')
-            if i < len(self.records) - 1:
-                print(' ', end='')
-        print()
-
-    def run(self, arch):
-        ti.init(arch=arch)
-        t = self.func()
-        self.records[arch] = t
+    def run(self):
+        print(f'==> {self.name}:')
+        os.environ['TI_CURRENT_BENCHMARK'] = self.name
+        self.func()
 
 
 class Suite:
@@ -42,38 +36,25 @@ def __init__(self, filename):
             sorted(filter(lambda x: x.startswith('benchmark_'), dir(suite))))
         self.cases = [Case(k, getattr(suite, k)) for k in case_keys]
 
-    def print(self):
-        print(f'{self.name}:')
-        for b in self.cases:
-            b.pprint()
-
-    def run(self, arch):
+    def run(self):
         print(f'{self.name}:')
         for case in sorted(self.cases):
-            case.run(arch)
+            case.run()
 
 
 class TaichiBenchmark:
     def __init__(self):
         self.suites = []
-        benchmark_dir = os.path.dirname(__file__)
+        benchmark_dir = get_benchmark_dir()
         for f in map(os.path.basename, sorted(os.listdir(benchmark_dir))):
             if f != 'run.py' and f.endswith('.py') and f[0] != '_':
                 self.suites.append(Suite(f))
 
-    def pprint(self):
-        for s in self.suites:
-            s.print()
-
-    def run(self, arch):
+    def run(self):
         print("Running...")
         for s in self.suites:
-            s.run(arch)
+            s.run()
 
 
 b = TaichiBenchmark()
-b.pprint()
-b.run(ti.x64)
-b.run(ti.cuda)
-print()
-b.pprint()
+b.run()
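Note (not part of the patch): after this rewrite run.py no longer times or prints results itself; it only discovers benchmark_* functions, exports TI_CURRENT_BENCHMARK, and calls them, so all reporting goes through ti.stat_write. Roughly equivalent manual steps for a single case are sketched below; the output directory is made up, and the decorator is assumed to re-initialize Taichi once per wanted arch:

import os

import taichi as ti
import fill_dense  # assumes the benchmarks/ directory is on sys.path

output_dir = '/tmp/ti_benchmark_demo'  # hypothetical; `ti benchmark` sets TI_BENCHMARK_OUTPUT_DIR itself
os.makedirs(output_dir, exist_ok=True)
os.environ['TI_BENCHMARK_OUTPUT_DIR'] = output_dir
os.environ['TI_CURRENT_BENCHMARK'] = 'benchmark_flat_struct'  # what Case.run() exports

fill_dense.benchmark_flat_struct()
# Each wanted arch should leave behind a file such as
#   /tmp/ti_benchmark_demo/benchmark_flat_struct__arch_x64.dat
# containing a single record like `time_avg: 3.1416` (value invented here),
# written by the stat_write() helper added in python/taichi/lang/__init__.py below.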
diff --git a/docs/utilities.rst b/docs/utilities.rst
index a6d707c465f48..777abc64e5d6a 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -55,19 +55,7 @@ For example, this is part of the output by ``ti regression`` after enabling cons
 
 Discussion at: https://github.com/taichi-dev/taichi/issue/948
 
-<<<<<<< HEAD
 The suggested workflow for the performance related PR author to run the regression tests is:
-=======
-The suggested workflow for **the PR author** to run the regression tests is:
-
-* When a performance related PR is ready, checkout that PR locally.
-
-* Run ``ti benchmark && ti regression`` to obtain the result.
-
-* Decide wheater to approve or request change, depends on the result.
-
-* Right before merge, run ``ti baseline`` to save the benchmark result as new baseline.
->>>>>>> master
 
 * Run ``ti benchmark && ti baseline`` in ``master`` to save the current performance as baseline.
 
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index edeb53633e01e..c9db4fa2c0521 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -247,10 +247,12 @@ def static_print(*args, __p=print, **kwargs):
         __p(*args, **kwargs)
 
 
-def benchmark(func, repeat=100, args=()):
+def benchmark(func, repeat=300, args=()):
     import taichi as ti
     import time
-    for i in range(repeat // 3):
+    # The reason why we run 4 times is to warm up instruction/data caches.
+    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
+    for i in range(4):
         func(*args)  # compile the kernel first
     ti.sync()
     t = time.time()
@@ -258,7 +260,20 @@ def benchmark(func, repeat=100, args=()):
         func(*args)
     ti.get_runtime().sync()
     elapsed = time.time() - t
-    return elapsed / repeat
+    avg = elapsed / repeat * 1000  # milliseconds
+    ti.stat_write(avg)
+
+
+def stat_write(avg):
+    name = os.environ.get('TI_CURRENT_BENCHMARK')
+    if name is None:
+        return
+    import taichi as ti
+    arch_name = ti.core.arch_name(ti.cfg.arch)
+    output_dir = os.environ.get('TI_BENCHMARK_OUTPUT_DIR', '.')
+    filename = f'{output_dir}/{name}__arch_{arch_name}.dat'
+    with open(filename, 'w') as f:
+        f.write(f'time_avg: {avg:.4f}')
 
 
 def supported_archs():
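Note (not part of the patch): stat_write() silently does nothing when TI_CURRENT_BENCHMARK is unset, so the instrumented ti.benchmark() remains safe to call outside the `ti benchmark` harness. A small sketch, assuming the same legacy API as above:

import os

import taichi as ti

os.environ.pop('TI_CURRENT_BENCHMARK', None)  # not running under benchmarks/run.py
ti.init(arch=ti.x64)
a = ti.var(dt=ti.f32, shape=1024)


@ti.kernel
def fill():
    for i in a:
        a[i] = 1.0


# The kernel is still warmed up and timed, but stat_write() returns early and
# no *.dat file is written because TI_CURRENT_BENCHMARK is not set.
ti.benchmark(fill, repeat=100)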
diff --git a/python/taichi/main.py b/python/taichi/main.py
index 3e341d115b93a..c370e045c5276 100644
--- a/python/taichi/main.py
+++ b/python/taichi/main.py
@@ -88,11 +88,19 @@ def parse_dat(file):
         for line in open(file).readlines():
             try:
                 a, b = line.strip().split(':')
             except:
                 continue
-            dict[a.strip()] = int(float(b))
+            b = float(b)
+            if abs(b % 1.0) < 1e-5:  # codegen_*
+                b = int(b)
+            dict[a.strip()] = b
         return dict
     def parse_name(file):
-        return file[5:-4].replace('__test_', '::', 1)
+        if file[0:5] == 'test_':
+            return file[5:-4].replace('__test_', '::', 1)
+        elif file[0:10] == 'benchmark_':
+            return '::'.join(reversed(file[10:-4].split('__arch_')))
+        else:
+            raise Exception(f'bad benchmark file name {file}')
     def get_dats(dir):
         list = []
@@ -123,7 +131,7 @@ def plot_in_gui(scatter):
     single_line = spec and len(spec) == 1
     xs, ys = get_dats(xd), get_dats(yd)
     scatter = defaultdict(list)
-    for name in set(xs.keys()).union(ys.keys()):
+    for name in reversed(sorted(set(xs.keys()).union(ys.keys()))):
         file, func = name.split('::')
         u, v = xs.get(name, {}), ys.get(name, {})
         ret = ''
@@ -131,19 +139,33 @@ def plot_in_gui(scatter):
             if spec and key not in spec:
                 continue
             a, b = u.get(key, 0), v.get(key, 0)
-            res = b / a if a != 0 else math.inf
+            if a == 0:
+                if b == 0:
+                    res = 1.0
+                else:
+                    res = math.inf
+            else:
+                res = b / a
             scatter[key].append(res)
             if res == 1:
                 continue
             if single_line:
                 ret += f'{file:_<24}{func:_<42}'
             else:
-                ret += f'{key:<43}'
+                ret += f'{key:<38}'
             res -= 1
             color = Fore.RESET
             if res > 0:
                 color = Fore.RED
             elif res < 0:
                 color = Fore.GREEN
-            ret += f'{Fore.MAGENTA}{a:>5}{Fore.RESET} -> '
-            ret += f'{Fore.CYAN}{b:>5} {color}{res:>+8.1%}{Fore.RESET}\n'
+            if isinstance(a, float):
+                a = f'{a:>7.2}'
+            else:
+                a = f'{a:>7}'
+            if isinstance(b, float):
+                b = f'{b:>7.2}'
+            else:
+                b = f'{b:>7}'
+            ret += f'{Fore.MAGENTA}{a}{Fore.RESET} -> '
+            ret += f'{Fore.CYAN}{b} {color}{res:>+9.1%}{Fore.RESET}\n'
         if ret != '':
             if not single_line:
                 print(f'{file:_<24}{func:_<42}')
@@ -173,6 +195,10 @@ def make_argument_parser():
         '--gui',
         action='store_true',
         help='Display benchmark regression result in GUI')
+    parser.add_argument('-T',
+                        '--tprt',
+                        action='store_true',
+                        help='Benchmark for time performance')
     parser.add_argument(
         '-a',
         '--arch',
@@ -285,10 +311,14 @@ def main(debug=False):
             shutil.rmtree(output_dir, True)
         os.mkdir(output_dir)
         os.environ['TI_BENCHMARK_OUTPUT_DIR'] = output_dir
-        if os.environ.get('TI_WANTED_ARCHS') is None:
-            # since we only do number-of-statements benchmark
-            os.environ['TI_WANTED_ARCHS'] = 'x64'
-        test_python(args)
+        if os.environ.get('TI_WANTED_ARCHS') is None and not args.tprt:
+            # since we only do number-of-statements benchmark for SPRT
+            os.environ['TI_WANTED_ARCHS'] = 'x64'
+        if args.tprt:
+            os.system('python benchmarks/run.py')
+            # TODO: benchmark_python(args)
+        else:
+            test_python(args)
     elif mode == "baseline":
         import shutil
         baseline_dir = get_benchmark_baseline_dir()
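Note: to make the round trip concrete (this example is not part of the patch; the file name and value are invented but follow the format ti.stat_write() uses), parse_name() turns a benchmark result file into an `arch::case` key, and parse_dat() keeps the new time_avg records as floats while still collapsing integer-valued codegen_* counters to int:

def parse_name(file):
    # Same logic as the helper added above, repeated here so the example runs standalone.
    if file[0:5] == 'test_':
        return file[5:-4].replace('__test_', '::', 1)
    elif file[0:10] == 'benchmark_':
        return '::'.join(reversed(file[10:-4].split('__arch_')))
    else:
        raise Exception(f'bad benchmark file name {file}')


# A file written by ti.stat_write() for the fill_dense flat-struct case on CUDA would be
# named 'benchmark_flat_struct__arch_cuda.dat' and contain a line such as 'time_avg: 3.1416'.
print(parse_name('benchmark_flat_struct__arch_cuda.dat'))  # -> 'cuda::flat_struct'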