diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py
deleted file mode 100644
index e69de29bb2d1d..0000000000000
diff --git a/benchmarks/benchmark_baseline.txt b/benchmarks/benchmark_baseline.txt
deleted file mode 100644
index bd2645c70a6fa..0000000000000
--- a/benchmarks/benchmark_baseline.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-fill_dense:
- * flat_range            x86_64  83.230 ms   cuda   1.284 ms
- * flat_struct           x86_64   4.662 ms   cuda   2.192 ms
- * nested_range          x86_64  11.367 ms   cuda   1.222 ms
- * nested_range_blocked  x86_64   4.242 ms   cuda   6.570 ms
- * nested_struct         x86_64  12.860 ms   cuda  17.888 ms
-mpm2d:
- * range                 x86_64   0.764 ms   cuda   0.045 ms
- * struct                x86_64   0.747 ms   cuda   0.206 ms
diff --git a/benchmarks/fill_dense.py b/benchmarks/fill_dense.py
index e316492d8ccb3..3e0f758b8fc8a 100644
--- a/benchmarks/fill_dense.py
+++ b/benchmarks/fill_dense.py
@@ -3,6 +3,7 @@
 # originally by @KLozes
 
 
+@ti.all_archs
 def benchmark_flat_struct():
     N = 4096
     a = ti.var(dt=ti.f32, shape=(N, N))
@@ -12,9 +13,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=500)
 
 
+@ti.all_archs
 def benchmark_flat_range():
     N = 4096
     a = ti.var(dt=ti.f32, shape=(N, N))
@@ -24,9 +26,10 @@ def fill():
         for i, j in ti.ndrange(N, N):
             a[i, j] = 2.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)
 
 
+@ti.all_archs
 def benchmark_nested_struct():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -40,11 +43,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)
 
 
+@ti.all_archs
def benchmark_nested_struct_listgen_8x8():
     a = ti.var(dt=ti.f32)
     ti.cfg.demote_dense_struct_fors = False
@@ -59,11 +61,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
 
 
+@ti.all_archs
 def benchmark_nested_struct_listgen_16x16():
     a = ti.var(dt=ti.f32)
     ti.cfg.demote_dense_struct_fors = False
@@ -78,11 +79,10 @@ def fill():
         for i, j in a:
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=700)
 
 
+@ti.all_archs
 def benchmark_nested_range_blocked():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -97,11 +97,10 @@ def fill():
             for Y in range(64):
                 a[X // N * 8 + Y // 8, X % N * 8 + Y % 8] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=800)
 
 
+@ti.all_archs
 def benchmark_nested_range():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -116,9 +115,10 @@ def fill():
             for i in range(N * 8):
                 a[i, j] = 2.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
 
 
+@ti.all_archs
 def benchmark_root_listgen():
     a = ti.var(dt=ti.f32)
     ti.cfg.demote_dense_struct_fors = False
@@ -133,9 +133,7 @@ def fill():
         for i, j in a.parent():
             a[i, j] = 2.0
 
-    fill()
-
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=800)
 
 
 '''
diff --git a/benchmarks/fill_sparse.py b/benchmarks/fill_sparse.py
index ee974040262fa..05670476c21dd 100644
--- a/benchmarks/fill_sparse.py
+++ b/benchmarks/fill_sparse.py
@@ -1,6 +1,7 @@
 import taichi as ti
 
 
+@ti.archs_support_sparse
 def benchmark_nested_struct():
     a = ti.var(dt=ti.f32)
     N = 512
@@ -19,6 +20,7 @@ def fill():
     return ti.benchmark(fill)
 
 
+@ti.archs_support_sparse
 def benchmark_nested_struct_fill_and_clear():
     a = ti.var(dt=ti.f32)
     N = 512
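Note: the same refactoring pattern repeats in every suite above. Each benchmark_* function gains an arch-selection decorator and passes an explicit repeat count to ti.benchmark, which now records the averaged time through ti.stat_write (added later in this patch) instead of returning seconds. A minimal sketch of one such benchmark, not part of the patch; the function name and sizes are made up, and the legacy ti.var API used throughout this patch is assumed:

import taichi as ti


@ti.all_archs  # or ti.archs_support_sparse / ti.archs_excluding(...), as in the suites above
def benchmark_fill_2d():
    N = 1024
    a = ti.var(dt=ti.f32, shape=(N, N))

    @ti.kernel
    def fill():
        for i, j in a:
            a[i, j] = 1.0

    # Warm-up and timing happen inside ti.benchmark(); the averaged result is
    # recorded per arch rather than returned to the caller.
    return ti.benchmark(fill, repeat=500)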
diff --git a/benchmarks/memory_bound.py b/benchmarks/memory_bound.py
index df74e34d791fc..f004e0218af3f 100644
--- a/benchmarks/memory_bound.py
+++ b/benchmarks/memory_bound.py
@@ -4,6 +4,7 @@
 
 
 # 4 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_memset():
     a = ti.var(dt=ti.f32, shape=N)
 
@@ -16,6 +17,7 @@ def memset():
 
 
 # 8 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_sscal():
     a = ti.var(dt=ti.f32, shape=N)
 
@@ -28,6 +30,7 @@ def task():
 
 
 # 8 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_memcpy():
     a = ti.var(dt=ti.f32, shape=N)
     b = ti.var(dt=ti.f32, shape=N)
@@ -41,6 +44,7 @@ def memcpy():
 
 
 # 12 B/it
+@ti.archs_excluding(ti.opengl)
 def benchmark_saxpy():
     x = ti.var(dt=ti.f32, shape=N)
     y = ti.var(dt=ti.f32, shape=N)
diff --git a/benchmarks/minimal.py b/benchmarks/minimal.py
index 9dbb24b234ffc..b3df395938c52 100644
--- a/benchmarks/minimal.py
+++ b/benchmarks/minimal.py
@@ -1,6 +1,7 @@
 import taichi as ti
 
 
+@ti.all_archs
 def benchmark_fill_scalar():
     a = ti.var(dt=ti.f32, shape=())
 
@@ -8,4 +9,4 @@ def benchmark_fill_scalar():
     def fill():
         a[None] = 1.0
 
-    return ti.benchmark(fill)
+    return ti.benchmark(fill, repeat=1000)
diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py
index 958dcbfd5c39b..4c27613deb135 100644
--- a/benchmarks/mpm2d.py
+++ b/benchmarks/mpm2d.py
@@ -1,9 +1,10 @@
-import time
+import taichi as ti
 import numpy as np
+import time
 
 
+@ti.all_archs
 def benchmark_range():
-    import taichi as ti
     quality = 1  # Use a larger value for higher-res simulations
     n_particles, n_grid = 9000 * quality**2, 128 * quality
     dx, inv_dx = 1 / n_grid, float(n_grid)
@@ -124,11 +125,12 @@ def substep():
     #     gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
     #     gui.show()  # Change to gui.show(f'{frame:06d}.png') to write images to disk
     ti.get_runtime().sync()
-    return (time.time() - t) / 4000
+    avg = (time.time() - t) / 4000 * 1000  # milliseconds
+    ti.stat_write(avg)
 
 
+@ti.all_archs
 def benchmark_struct():
-    import taichi as ti
     quality = 1  # Use a larger value for higher-res simulations
     n_particles, n_grid = 9000 * quality**2, 128 * quality
     dx, inv_dx = 1 / n_grid, float(n_grid)
@@ -251,4 +253,5 @@ def substep():
     #     gui.circles(x.to_numpy(), radius=1.5, color=colors[material.to_numpy()])
     #     gui.show()  # Change to gui.show(f'{frame:06d}.png') to write images to disk
     ti.get_runtime().sync()
-    return (time.time() - t) / 4000
+    avg = (time.time() - t) / 4000 * 1000  # milliseconds
+    ti.stat_write(avg)
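Note: unlike the single-kernel benchmarks, the two mpm2d cases time a full 4000-substep run by hand and pass the per-substep average (in milliseconds) to ti.stat_write. A rough sketch of that manual-timing pattern, not taken from the patch; the benchmark name and the trivial stand-in kernel are made up:

import time

import taichi as ti


@ti.all_archs
def benchmark_many_substeps():  # hypothetical name, for illustration only
    x = ti.var(dt=ti.f32, shape=16)

    @ti.kernel
    def substep():
        for i in x:
            x[i] += 1.0

    steps = 4000
    for _ in range(4):
        substep()  # compile and warm up first, mirroring ti.benchmark()
    ti.get_runtime().sync()
    t = time.time()
    for _ in range(steps):
        substep()
    ti.get_runtime().sync()
    avg = (time.time() - t) / steps * 1000  # milliseconds per substep
    ti.stat_write(avg)  # picked up by the stat_write() helper added later in this patch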
diff --git a/benchmarks/run.py b/benchmarks/run.py
index e6373378e1a5a..47b55529dc99f 100644
--- a/benchmarks/run.py
+++ b/benchmarks/run.py
@@ -2,6 +2,10 @@
 import taichi as ti
 
 
+def get_benchmark_dir():
+    return os.path.join(ti.core.get_repo_dir(), 'benchmarks')
+
+
 class Case:
     def __init__(self, name, func):
         self.name = name
@@ -14,20 +18,10 @@ def __lt__(self, other):
     def __eq__(self, other):
         return self.name == other.name
 
-    def pprint(self):
-        print(f' * {self.name[10:]:33}', end='')
-        for i, arch in enumerate(sorted(self.records.keys())):
-            ms = self.records[arch] * 1000
-            arch_name = str(arch)[5:]
-            print(f' {arch_name:8} {ms:7.3f} ms', end='')
-            if i < len(self.records) - 1:
-                print(' ', end='')
-        print()
-
-    def run(self, arch):
-        ti.init(arch=arch)
-        t = self.func()
-        self.records[arch] = t
+    def run(self):
+        print(f'==> {self.name}:')
+        os.environ['TI_CURRENT_BENCHMARK'] = self.name
+        self.func()
 
 
 class Suite:
@@ -42,38 +36,25 @@ def __init__(self, filename):
             sorted(filter(lambda x: x.startswith('benchmark_'), dir(suite))))
         self.cases = [Case(k, getattr(suite, k)) for k in case_keys]
 
-    def print(self):
-        print(f'{self.name}:')
-        for b in self.cases:
-            b.pprint()
-
-    def run(self, arch):
+    def run(self):
         print(f'{self.name}:')
         for case in sorted(self.cases):
-            case.run(arch)
+            case.run()
 
 
 class TaichiBenchmark:
     def __init__(self):
         self.suites = []
-        benchmark_dir = os.path.dirname(__file__)
+        benchmark_dir = get_benchmark_dir()
         for f in map(os.path.basename, sorted(os.listdir(benchmark_dir))):
             if f != 'run.py' and f.endswith('.py') and f[0] != '_':
                 self.suites.append(Suite(f))
 
-    def pprint(self):
-        for s in self.suites:
-            s.print()
-
-    def run(self, arch):
+    def run(self):
         print("Running...")
         for s in self.suites:
-            s.run(arch)
+            s.run()
 
 
 b = TaichiBenchmark()
-b.pprint()
-b.run(ti.x64)
-b.run(ti.cuda)
-print()
-b.pprint()
+b.run()
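Note (not part of the patch): after this rewrite run.py no longer times or prints results itself; it only discovers benchmark_* functions, exports TI_CURRENT_BENCHMARK, and calls them, so all reporting goes through ti.stat_write. Roughly equivalent manual steps for a single case are sketched below; the output directory is made up, and the decorator is assumed to re-initialize Taichi once per wanted arch:

import os

import taichi as ti
import fill_dense  # assumes the benchmarks/ directory is on sys.path

output_dir = '/tmp/ti_benchmark_demo'  # hypothetical; `ti benchmark` sets TI_BENCHMARK_OUTPUT_DIR itself
os.makedirs(output_dir, exist_ok=True)
os.environ['TI_BENCHMARK_OUTPUT_DIR'] = output_dir
os.environ['TI_CURRENT_BENCHMARK'] = 'benchmark_flat_struct'  # what Case.run() exports

fill_dense.benchmark_flat_struct()
# Each wanted arch should leave behind a file such as
#   /tmp/ti_benchmark_demo/benchmark_flat_struct__arch_x64.dat
# containing a single record like `time_avg: 3.1416` (value invented here),
# written by the stat_write() helper added in python/taichi/lang/__init__.py below.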
diff --git a/docs/utilities.rst b/docs/utilities.rst
index a6d707c465f48..777abc64e5d6a 100644
--- a/docs/utilities.rst
+++ b/docs/utilities.rst
@@ -55,19 +55,7 @@ For example, this is part of the output by ``ti regression`` after enabling cons
 
 Discussion at: https://github.com/taichi-dev/taichi/issue/948
 
-<<<<<<< HEAD
 The suggested workflow for the performance related PR author to run the regression tests is:
-=======
-The suggested workflow for **the PR author** to run the regression tests is:
-
-* When a performance related PR is ready, checkout that PR locally.
-
-* Run ``ti benchmark && ti regression`` to obtain the result.
-
-* Decide wheater to approve or request change, depends on the result.
-
-* Right before merge, run ``ti baseline`` to save the benchmark result as new baseline.
->>>>>>> master
 
 * Run ``ti benchmark && ti baseline`` in ``master`` to save the current performance as baseline.
 
diff --git a/python/taichi/lang/__init__.py b/python/taichi/lang/__init__.py
index edeb53633e01e..c9db4fa2c0521 100644
--- a/python/taichi/lang/__init__.py
+++ b/python/taichi/lang/__init__.py
@@ -247,10 +247,12 @@ def static_print(*args, __p=print, **kwargs):
         __p(*args, **kwargs)
 
 
-def benchmark(func, repeat=100, args=()):
+def benchmark(func, repeat=300, args=()):
     import taichi as ti
     import time
-    for i in range(repeat // 3):
+    # The reason why we run 4 times is to warm up instruction/data caches.
+    # Discussion: https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
+    for i in range(4):
         func(*args)  # compile the kernel first
     ti.sync()
     t = time.time()
@@ -258,7 +260,20 @@ def benchmark(func, repeat=100, args=()):
         func(*args)
     ti.get_runtime().sync()
     elapsed = time.time() - t
-    return elapsed / repeat
+    avg = elapsed / repeat * 1000  # milliseconds
+    ti.stat_write(avg)
+
+
+def stat_write(avg):
+    name = os.environ.get('TI_CURRENT_BENCHMARK')
+    if name is None:
+        return
+    import taichi as ti
+    arch_name = ti.core.arch_name(ti.cfg.arch)
+    output_dir = os.environ.get('TI_BENCHMARK_OUTPUT_DIR', '.')
+    filename = f'{output_dir}/{name}__arch_{arch_name}.dat'
+    with open(filename, 'w') as f:
+        f.write(f'time_avg: {avg:.4f}')
 
 
 def supported_archs():
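Note (not part of the patch): stat_write() silently does nothing when TI_CURRENT_BENCHMARK is unset, so the instrumented ti.benchmark() remains safe to call outside the `ti benchmark` harness. A small sketch, assuming the same legacy API as above:

import os

import taichi as ti

os.environ.pop('TI_CURRENT_BENCHMARK', None)  # not running under benchmarks/run.py
ti.init(arch=ti.x64)
a = ti.var(dt=ti.f32, shape=1024)


@ti.kernel
def fill():
    for i in a:
        a[i] = 1.0


# The kernel is still warmed up and timed, but stat_write() returns early and
# no *.dat file is written because TI_CURRENT_BENCHMARK is not set.
ti.benchmark(fill, repeat=100)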
diff --git a/python/taichi/main.py b/python/taichi/main.py
index 3e341d115b93a..c370e045c5276 100644
--- a/python/taichi/main.py
+++ b/python/taichi/main.py
@@ -88,11 +88,19 @@ def parse_dat(file):
         for line in open(file).readlines():
             try:
                 a, b = line.strip().split(':')
             except:
                 continue
-            dict[a.strip()] = int(float(b))
+            b = float(b)
+            if abs(b % 1.0) < 1e-5:  # codegen_*
+                b = int(b)
+            dict[a.strip()] = b
         return dict
     def parse_name(file):
-        return file[5:-4].replace('__test_', '::', 1)
+        if file[0:5] == 'test_':
+            return file[5:-4].replace('__test_', '::', 1)
+        elif file[0:10] == 'benchmark_':
+            return '::'.join(reversed(file[10:-4].split('__arch_')))
+        else:
+            raise Exception(f'bad benchmark file name {file}')
     def get_dats(dir):
         list = []
@@ -123,7 +131,7 @@ def plot_in_gui(scatter):
     single_line = spec and len(spec) == 1
     xs, ys = get_dats(xd), get_dats(yd)
     scatter = defaultdict(list)
-    for name in set(xs.keys()).union(ys.keys()):
+    for name in reversed(sorted(set(xs.keys()).union(ys.keys()))):
         file, func = name.split('::')
         u, v = xs.get(name, {}), ys.get(name, {})
         ret = ''
@@ -131,19 +139,33 @@ def plot_in_gui(scatter):
             if spec and key not in spec:
                 continue
             a, b = u.get(key, 0), v.get(key, 0)
-            res = b / a if a != 0 else math.inf
+            if a == 0:
+                if b == 0:
+                    res = 1.0
+                else:
+                    res = math.inf
+            else:
+                res = b / a
             scatter[key].append(res)
             if res == 1:
                 continue
             if single_line:
                 ret += f'{file:_<24}{func:_<42}'
             else:
-                ret += f'{key:<43}'
+                ret += f'{key:<38}'
             res -= 1
             color = Fore.RESET
             if res > 0:
                 color = Fore.RED
             elif res < 0:
                 color = Fore.GREEN
-            ret += f'{Fore.MAGENTA}{a:>5}{Fore.RESET} -> '
-            ret += f'{Fore.CYAN}{b:>5} {color}{res:>+8.1%}{Fore.RESET}\n'
+            if isinstance(a, float):
+                a = f'{a:>7.2}'
+            else:
+                a = f'{a:>7}'
+            if isinstance(b, float):
+                b = f'{b:>7.2}'
+            else:
+                b = f'{b:>7}'
+            ret += f'{Fore.MAGENTA}{a}{Fore.RESET} -> '
+            ret += f'{Fore.CYAN}{b} {color}{res:>+9.1%}{Fore.RESET}\n'
         if ret != '':
             if not single_line:
                 print(f'{file:_<24}{func:_<42}')
@@ -173,6 +195,10 @@ def make_argument_parser():
         '--gui',
         action='store_true',
         help='Display benchmark regression result in GUI')
+    parser.add_argument('-T',
+                        '--tprt',
+                        action='store_true',
+                        help='Benchmark for time performance')
     parser.add_argument(
         '-a',
         '--arch',
@@ -285,10 +311,14 @@ def main(debug=False):
             shutil.rmtree(output_dir, True)
         os.mkdir(output_dir)
         os.environ['TI_BENCHMARK_OUTPUT_DIR'] = output_dir
-        if os.environ.get('TI_WANTED_ARCHS') is None:
-            # since we only do number-of-statements benchmark
-            os.environ['TI_WANTED_ARCHS'] = 'x64'
-        test_python(args)
+        if os.environ.get('TI_WANTED_ARCHS') is None and not args.tprt:
+            # since we only do number-of-statements benchmark for SPRT
+            os.environ['TI_WANTED_ARCHS'] = 'x64'
+        if args.tprt:
+            os.system('python benchmarks/run.py')
+            # TODO: benchmark_python(args)
+        else:
+            test_python(args)
     elif mode == "baseline":
         import shutil
         baseline_dir = get_benchmark_baseline_dir()
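Note: to make the round trip concrete (this example is not part of the patch; the file name and value are invented but follow the format ti.stat_write() uses), parse_name() turns a benchmark result file into an `arch::case` key, and parse_dat() keeps the new time_avg records as floats while still collapsing integer-valued codegen_* counters to int:

def parse_name(file):
    # Same logic as the helper added above, repeated here so the example runs standalone.
    if file[0:5] == 'test_':
        return file[5:-4].replace('__test_', '::', 1)
    elif file[0:10] == 'benchmark_':
        return '::'.join(reversed(file[10:-4].split('__arch_')))
    else:
        raise Exception(f'bad benchmark file name {file}')


# A file written by ti.stat_write() for the fill_dense flat-struct case on CUDA would be
# named 'benchmark_flat_struct__arch_cuda.dat' and contain a line such as 'time_avg: 3.1416'.
print(parse_name('benchmark_flat_struct__arch_cuda.dat'))  # -> 'cuda::flat_struct'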