diff --git a/.gitignore b/.gitignore index 2601a517e145f..3d963a91d7799 100644 --- a/.gitignore +++ b/.gitignore @@ -64,3 +64,4 @@ _build *.bin *.gif *.mp4 +*.dat diff --git a/benchmarks/fill_dense.py b/benchmarks/fill_dense.py index 3e0f758b8fc8a..a7dd4130d44fa 100644 --- a/benchmarks/fill_dense.py +++ b/benchmarks/fill_dense.py @@ -34,9 +34,7 @@ def benchmark_nested_struct(): a = ti.var(dt=ti.f32) N = 512 - @ti.layout - def place(): - ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): @@ -52,9 +50,7 @@ def benchmark_nested_struct_listgen_8x8(): ti.cfg.demote_dense_struct_fors = False N = 512 - @ti.layout - def place(): - ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): @@ -70,9 +66,7 @@ def benchmark_nested_struct_listgen_16x16(): ti.cfg.demote_dense_struct_fors = False N = 256 - @ti.layout - def place(): - ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(a) + ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(a) @ti.kernel def fill(): @@ -87,9 +81,7 @@ def benchmark_nested_range_blocked(): a = ti.var(dt=ti.f32) N = 512 - @ti.layout - def place(): - ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): @@ -105,9 +97,7 @@ def benchmark_nested_range(): a = ti.var(dt=ti.f32) N = 512 - @ti.layout - def place(): - ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): @@ -124,9 +114,7 @@ def benchmark_root_listgen(): ti.cfg.demote_dense_struct_fors = False N = 512 - @ti.layout - def place(): - ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): diff --git a/benchmarks/fill_sparse.py b/benchmarks/fill_sparse.py index 05670476c21dd..35bbfd9f77ba3 100644 --- a/benchmarks/fill_sparse.py +++ b/benchmarks/fill_sparse.py @@ -6,9 +6,7 @@ def benchmark_nested_struct(): a = ti.var(dt=ti.f32) N = 512 - @ti.layout - def place(): - ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): @@ -25,9 +23,7 @@ def benchmark_nested_struct_fill_and_clear(): a = ti.var(dt=ti.f32) N = 512 - @ti.layout - def place(): - ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) + ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a) @ti.kernel def fill(): diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py index 7db302a1a4ef3..12ac6bc2205b6 100644 --- a/benchmarks/mpm2d.py +++ b/benchmarks/mpm2d.py @@ -98,7 +98,7 @@ def substep(): g_v = grid_v[base + ti.Vector([i, j])] weight = w[i][0] * w[j][1] new_v += weight * g_v - new_C += 4 * inv_dx * weight * ti.outer_product(g_v, dpos) + new_C += 4 * inv_dx * weight * g_v.outer_product(dpos) v[p], C[p] = new_v, new_C x[p] += dt * v[p] # advection @@ -226,7 +226,7 @@ def substep(): g_v = grid_v[base + ti.Vector([i, j])] weight = w[i][0] * w[j][1] new_v += weight * g_v - new_C += 4 * inv_dx * weight * ti.outer_product(g_v, dpos) + new_C += 4 * inv_dx * weight * g_v.outer_product(dpos) v[p], C[p] = new_v, new_C x[p] += dt * v[p] # advection diff --git a/misc/baselines/Makefile b/misc/baselines/Makefile index dcab730264305..018b59ac8ec95 100644 --- a/misc/baselines/Makefile +++ b/misc/baselines/Makefile @@ -11,7 +11,7 @@ assert: assert.cu clang++-8 -std=c++14 assert.cu -S -emit-llvm --cuda-gpu-arch=sm_61 gpu_memory_bound: gpu_memory_bound.cu - nvcc -std=c++14 gpu_memory_bound.cu -O3 -o gpu_memory_bound + nvcc -std=c++14 gpu_memory_bound.cu -O3 -o gpu_memory_bound --gpu-architecture=compute_61 --gpu-code=sm_61,compute_61 cpu_memory_bound: cpu_memory_bound.cpp g++ -std=c++14 cpu_memory_bound.cpp -O3 -o cpu_memory_bound diff --git a/misc/baselines/gpu_memory_bound.cu b/misc/baselines/gpu_memory_bound.cu index ae00dd58bc767..211c1f23ddfdf 100644 --- a/misc/baselines/gpu_memory_bound.cu +++ b/misc/baselines/gpu_memory_bound.cu @@ -4,31 +4,42 @@ #include #include "get_time.h" -__global__ void cpy(float *a, float *b, int *c, int n) { +__global__ void cpy(float *a, float *b, int n) { unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; - a[i] = b[i]; + if (i < n) + a[i] = b[i]; } int main() { - int n = 1024 * 1024 * 1024 / 4; + int n = 1024 * 1024 * 1024; float *a, *b; - int *c; cudaMalloc(&a, n * sizeof(float)); cudaMalloc(&b, n * sizeof(float)); - cudaMalloc(&c, n * sizeof(float)); - for (auto bs : {16, 32, 64, 128, 256}) { + + int repeat = 25; + + double t; + t = get_time(); + for (int i = 0; i < repeat; i++) { + cudaMemcpyAsync(a, b, n * sizeof(float), cudaMemcpyDeviceToDevice, 0); + } + cudaDeviceSynchronize(); + t = (get_time() - t) / repeat; + printf("cuMemcpyAsync 8GB data bw %.3f GB/s\n", + n * 8.0 / t / (1024 * 1024 * 1024.0f)); + + for (auto bs : {32, 64, 128, 256}) { for (int i = 0; i < 10; i++) { - cpy<<>>(a, b, c, n); + cpy<<>>(a, b, n); } cudaDeviceSynchronize(); - int repeat = 100; - auto t = get_time(); + t = get_time(); for (int i = 0; i < repeat; i++) { - cpy<<>>(a, b, c, n); + cpy<<>>(a, b, n); } cudaDeviceSynchronize(); t = (get_time() - t) / repeat; - printf("memcpy 1GB data, block_size %d, %.2f ms bw %.3f GB/s\n", bs, + printf("memcpy 8GB data, block_dim %d, %.2f ms bw %.3f GB/s\n", bs, t * 1000, n * 8.0 / t / (1024 * 1024 * 1024.0f)); } }