[async] Add advection benchmark #1914

Merged · 9 commits · Oct 1, 2020
125 changes: 125 additions & 0 deletions benchmarks/async_advection.py
@@ -0,0 +1,125 @@
import taichi as ti
import math

from utils import benchmark_async

# TODO: staggered grid


@benchmark_async
def advection_2d(scale):
n = 128 * 2**int((math.log(scale, 2)) // 2)
x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
new_x = ti.Vector.field(3, dtype=ti.f32, shape=(n, n))
v = ti.Vector.field(2, dtype=ti.f32, shape=(n, n))
dx = 1 / n
inv_dx = 1 / dx
dt = 0.01

stagger = ti.Vector([0.5, 0.5])

@ti.func
def Vector2(x, y):
return ti.Vector([x, y])

@ti.kernel
def init():
for i, j in v:
v[i, j] = ti.Vector([j / n - 0.5, 0.5 - i / n])

for i, j in ti.ndrange(n * 4, n * 4):
ret = ti.taichi_logo(ti.Vector([i, j]) / (n * 4))
x[i // 4, j // 4][0] += ret / 16
x[i // 4, j // 4][1] += ret / 16
x[i // 4, j // 4][2] += ret / 16

@ti.func
def vec(x, y):
return ti.Vector([x, y])

@ti.func
def clamp(p):
for d in ti.static(range(p.n)):
p[d] = min(1 - 1e-4 - dx + stagger[d] * dx,
max(p[d], stagger[d] * dx))
return p

@ti.func
def sample_bilinear(x, p):
p = clamp(p)

p_grid = p * inv_dx - stagger

I = ti.cast(ti.floor(p_grid), ti.i32)
f = p_grid - I
g = 1 - f

return x[I] * (g[0] * g[1]) + x[I + vec(1, 0)] * (f[0] * g[1]) + x[
I + vec(0, 1)] * (g[0] * f[1]) + x[I + vec(1, 1)] * (f[0] * f[1])

@ti.func
def velocity(p):
return sample_bilinear(v, p)

@ti.func
def sample_min(x, p):
p = clamp(p)
p_grid = p * inv_dx - stagger
I = ti.cast(ti.floor(p_grid), ti.i32)

return min(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)])

@ti.func
def sample_max(x, p):
p = clamp(p)
p_grid = p * inv_dx - stagger
I = ti.cast(ti.floor(p_grid), ti.i32)

return max(x[I], x[I + vec(1, 0)], x[I + vec(0, 1)], x[I + vec(1, 1)])

@ti.func
def backtrace(I, dt): # RK3
p = (I + stagger) * dx
v1 = velocity(p)
p1 = p - 0.5 * dt * v1
v2 = velocity(p1)
p2 = p - 0.75 * dt * v2
v3 = velocity(p2)
p -= dt * (2 / 9 * v1 + 1 / 3 * v2 + 4 / 9 * v3)
return p

@ti.func
def semi_lagrangian(x, new_x, dt):
for I in ti.grouped(x):
new_x[I] = sample_bilinear(x, backtrace(I, dt))

@ti.kernel
def advect():
semi_lagrangian(x(0), new_x(0), dt)
semi_lagrangian(x(1), new_x(1), dt)
semi_lagrangian(x(2), new_x(2), dt)

for I in ti.grouped(x):
x[I] = new_x[I]

init()

def task():
for i in range(10):
advect()

ti.benchmark(task, repeat=100)

visualize = False

if visualize:
gui = ti.GUI('Advection schemes', (n, n))
for i in range(10):
for _ in range(10):
advect()
gui.set_image(x.to_numpy())
gui.show()


if __name__ == '__main__':
advection_2d()
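
Note: the backtrace function above is Ralston's third-order Runge-Kutta scheme run backward in time, and semi_lagrangian resamples the advected field at the backtraced position with bilinear interpolation. A minimal standalone sketch of the backtrace arithmetic (plain NumPy; the velocity callable is a hypothetical stand-in for the bilinear sampler of v) could look like this:

# Sketch only: RK3 (Ralston) backtrace, outside Taichi.
import numpy as np

def backtrace_rk3(p, dt, velocity):
    v1 = velocity(p)
    p1 = p - 0.5 * dt * v1
    v2 = velocity(p1)
    p2 = p - 0.75 * dt * v2
    v3 = velocity(p2)
    # Ralston's weights: 2/9, 1/3, 4/9
    return p - dt * (2 / 9 * v1 + 1 / 3 * v2 + 4 / 9 * v3)

# init() above sets up a rigid rotation about the domain center:
rotation = lambda q: np.array([q[1] - 0.5, 0.5 - q[0]])
print(backtrace_rk3(np.array([0.25, 0.75]), 0.01, rotation))

In the benchmark the same arithmetic runs inside the advect kernel on Taichi fields; the sketch only isolates the integrator.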
30 changes: 6 additions & 24 deletions benchmarks/async_cases.py
@@ -1,52 +1,34 @@
import taichi as ti
import os
import sys
import functools

sys.path.append(os.path.join(ti.core.get_repo_dir(), 'tests', 'python'))

from fuse_test_template import template_fuse_dense_x2y2z, \
template_fuse_reduction


# Note: this is a short-term solution. In the long run we need to think about how to reuse pytest
def benchmark_async(func):
@functools.wraps(func)
def body():
for arch in [ti.cpu, ti.cuda]:
for async_mode in [True, False]:
os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
ti.init(arch=arch, async_mode=async_mode)
if arch == ti.cpu:
scale = 2
else:
# Use more data to hide compilation overhead
# (since CUDA runs much faster than CPUs)
scale = 64
func(scale)

return body
from utils import *


@benchmark_async
def fuse_dense_x2y2z(scale):
template_fuse_dense_x2y2z(size=scale * 10 * 1024**2,
template_fuse_dense_x2y2z(size=scale * 1024**2,
repeat=1,
benchmark_repeat=100,
benchmark=True)


@benchmark_async
def fuse_reduction(scale):
template_fuse_reduction(size=scale * 10 * 1024**2,
template_fuse_reduction(size=scale * 1024**2,
repeat=10,
benchmark_repeat=10,
benchmark=True)


@benchmark_async
def fill_1d(scale):
a = ti.field(dtype=ti.f32, shape=scale * 10 * 1024**2)
a = ti.field(dtype=ti.f32, shape=scale * 1024**2)

@ti.kernel
def fill():
@@ -81,7 +63,7 @@ def sparse_numpy(scale):
a = ti.field(dtype=ti.f32)
b = ti.field(dtype=ti.f32)

block_count = 2**int((math.log(scale, 2)) // 2) * 64
block_count = 2**int((math.log(scale, 2)) // 2) * 4
block_size = 32
# a, b always share the same sparsity
ti.root.pointer(ti.ij, block_count).dense(ti.ij, block_size).place(a, b)
@@ -145,7 +127,7 @@ def stencil_reduction(scale):
b = ti.field(dtype=ti.f32)
total = ti.field(dtype=ti.f32, shape=())

block_count = scale * 512
block_count = scale * 64
block_size = 1024
# a, b always share the same sparsity
ti.root.pointer(ti.i, block_count).dense(ti.i, block_size).place(a, b)
10 changes: 3 additions & 7 deletions benchmarks/benchmark_async.py
@@ -1,17 +1,13 @@
import taichi as ti

from async_cases import *
from async_advection import *

rerun = True

cases = [
fuse_dense_x2y2z,
fuse_reduction,
fill_1d,
sparse_numpy,
autodiff,
stencil_reduction,
# mpm_splitted,
fuse_dense_x2y2z, fuse_reduction, fill_1d, sparse_numpy, autodiff,
stencil_reduction, mpm_splitted, advection_2d
]

if rerun:
21 changes: 21 additions & 0 deletions benchmarks/utils.py
@@ -0,0 +1,21 @@
import taichi as ti
import functools
import os


def benchmark_async(func):
@functools.wraps(func)
def body():
for arch in [ti.cpu, ti.cuda]:
for async_mode in [True, False]:
os.environ['TI_CURRENT_BENCHMARK'] = func.__name__
ti.init(arch=arch, async_mode=async_mode, kernel_profiler=True)
if arch == ti.cpu:
scale = 2
else:
# Use more data to hide compilation overhead
# (since CUDA runs much faster than CPUs)
scale = 64
func(scale)

return body
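
For reference, the decorated benchmarks are called with no arguments: benchmark_async runs the body once per (arch, async_mode) combination, picking a smaller scale on CPU and a larger one on CUDA, and calls ti.init itself. A hedged usage sketch (the benchmark body below is illustrative, modeled on fill_1d in async_cases.py):

import taichi as ti
from utils import benchmark_async

@benchmark_async
def fill_example(scale):
    a = ti.field(dtype=ti.f32, shape=scale * 1024**2)

    @ti.kernel
    def fill():
        for i in a:
            a[i] = 1.0

    ti.benchmark(fill, repeat=100)

# Runs the body under ti.cpu and ti.cuda, each in async and sync mode.
fill_example()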
3 changes: 2 additions & 1 deletion examples/mpm_lagrangian_forces.py
@@ -1,7 +1,7 @@
import taichi as ti
import numpy as np

ti.init(arch=ti.gpu)
ti.init(arch=ti.gpu, kernel_profiler=True)

dim = 2
quality = 8 # Use a larger integral number for higher quality
@@ -187,6 +187,7 @@ def main():
color=0xFFFFFF,
radius=3)
gui.show()
ti.kernel_profiler_print()


if __name__ == '__main__':
16 changes: 10 additions & 6 deletions python/taichi/lang/__init__.py
@@ -42,10 +42,10 @@
cc = core.cc
gpu = [cuda, metal, opengl]
cpu = core.host_arch()
kernel_profiler_print = lambda: core.get_current_program(
).kernel_profiler_print()
kernel_profiler_clear = lambda: core.get_current_program(
).kernel_profiler_clear()
kernel_profiler_print = lambda: get_runtime().prog.kernel_profiler_print()
kernel_profiler_clear = lambda: get_runtime().prog.kernel_profiler_clear()
kernel_profiler_total_time = lambda: get_runtime(
).prog.kernel_profiler_total_time()


def memory_profiler_print():
@@ -331,19 +331,23 @@ def run_benchmark():
ti.stat_write('offloaded_tasks', b)
elif a == 'launched_tasks':
ti.stat_write('launched_tasks', b)
# The reason why we run 3 more times is to warm up

# Use 3 initial iterations to warm up
# instruction/data caches. Discussion:
# https://github.com/taichi-dev/taichi/pull/1002#discussion_r426312136
for i in range(3):
func(*args)
ti.sync()
ti.kernel_profiler_clear()
t = time.time()
for n in range(repeat):
func(*args)
ti.sync()
elapsed = time.time() - t
avg = elapsed / repeat
ti.stat_write('running_time', avg)
ti.stat_write('clock_time', avg)
device_time = ti.kernel_profiler_total_time()
ti.stat_write('device_time', device_time)

run_benchmark()
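
The intent of this change: clock_time is the wall-clock average per call (host-side overhead included), while device_time is the total kernel execution time reported by the kernel profiler over the timed loop. A minimal sketch of the same measurement outside ti.benchmark, assuming kernel_profiler=True was passed to ti.init (as the benchmarks now do):

import time
import taichi as ti

ti.init(arch=ti.cpu, kernel_profiler=True)

x = ti.field(dtype=ti.f32, shape=1024 * 1024)

@ti.kernel
def fill():
    for i in x:
        x[i] = 0.5

repeat = 100
ti.kernel_profiler_clear()
t = time.time()
for _ in range(repeat):
    fill()
ti.sync()
clock_time = (time.time() - t) / repeat        # average wall-clock seconds per call
device_time = ti.kernel_profiler_total_time()  # total kernel time in seconds (total_time_ms / 1000)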

13 changes: 9 additions & 4 deletions taichi/program/kernel_profiler.cpp
@@ -42,7 +42,7 @@ void KernelProfilerBase::print() {
"name\n");
std::sort(records.begin(), records.end());
for (auto &rec : records) {
auto fraction = rec.total / total_time * 100.0f;
auto fraction = rec.total / total_time_ms * 100.0f;
fmt::print("[{:6.2f}% {:7.3f} s {:6d}x |{:9.3f} {:9.3f} {:9.3f} ms] {}\n",
fraction, rec.total / 1000.0f, rec.counter, rec.min,
rec.total / rec.counter, rec.max, rec.name);
@@ -53,12 +53,17 @@
fmt::print(
"[100.00%] Total kernel execution time: {:7.3f} s number of records: "
"{}\n",
total_time / 1000.0f, records.size());
get_total_time(), records.size());

fmt::print(
"========================================================================"
"=\n");
}

double KernelProfilerBase::get_total_time() const {
return total_time_ms / 1000.0;
}

namespace {
// A simple profiler that uses Time::get_time()
class DefaultProfiler : public KernelProfilerBase {
@@ -90,7 +95,7 @@ class DefaultProfiler : public KernelProfilerBase {
it = std::prev(records.end());
}
it->insert_sample(ms);
total_time += ms;
total_time_ms += ms;
}

private:
@@ -150,7 +155,7 @@ class KernelProfilerCUDA : public KernelProfilerBase {
it = std::prev(records.end());
}
it->insert_sample(ms);
total_time += ms;
total_time_ms += ms;
}
}
outstanding_events.clear();
6 changes: 4 additions & 2 deletions taichi/program/kernel_profiler.h
@@ -30,14 +30,14 @@ struct KernelProfileRecord {
class KernelProfilerBase {
protected:
std::vector<KernelProfileRecord> records;
double total_time;
double total_time_ms;

public:
// Needed for the CUDA backend since we need to know which task to "stop"
using TaskHandle = void *;

void clear() {
total_time = 0;
total_time_ms = 0;
records.clear();
}

@@ -62,6 +62,8 @@

void print();

double get_total_time() const;

virtual ~KernelProfilerBase() {
}
};
2 changes: 2 additions & 0 deletions taichi/program/program.cpp
@@ -472,6 +472,8 @@ void Program::synchronize() {
if (config.async_mode) {
async_engine->synchronize();
}
if (profiler)
profiler->sync();
device_synchronize();
sync = true;
}
2 changes: 2 additions & 0 deletions taichi/python/export_lang.cpp
@@ -153,6 +153,8 @@ void export_lang(py::module &m) {
.def(py::init<>())
.def_readonly("config", &Program::config)
.def("kernel_profiler_print", &Program::kernel_profiler_print)
.def("kernel_profiler_total_time",
[](Program *program) { return program->profiler->get_total_time(); })
.def("kernel_profiler_clear", &Program::kernel_profiler_clear)
.def("print_memory_profiler_info", &Program::print_memory_profiler_info)
.def("finalize", &Program::finalize)