Skip to content

Commit

Permalink
[misc] Remove usage of deprecated functions in benchmarks (#1193)
Browse files · Browse the repository at this point in the history
  • Loading branch information
yuanming-hu authored Jun 9, 2020
1 parent 4dcc638 commit 49a48df
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 38 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,4 @@ _build
*.bin
*.gif
*.mp4
*.dat
24 changes: 6 additions & 18 deletions benchmarks/fill_dense.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ def benchmark_nested_struct():
a = ti.var(dt=ti.f32)
N = 512

@ti.layout
def place():
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand All @@ -52,9 +50,7 @@ def benchmark_nested_struct_listgen_8x8():
ti.cfg.demote_dense_struct_fors = False
N = 512

@ti.layout
def place():
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand All @@ -70,9 +66,7 @@ def benchmark_nested_struct_listgen_16x16():
ti.cfg.demote_dense_struct_fors = False
N = 256

@ti.layout
def place():
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(a)
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(a)

@ti.kernel
def fill():
Expand All @@ -87,9 +81,7 @@ def benchmark_nested_range_blocked():
a = ti.var(dt=ti.f32)
N = 512

@ti.layout
def place():
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand All @@ -105,9 +97,7 @@ def benchmark_nested_range():
a = ti.var(dt=ti.f32)
N = 512

@ti.layout
def place():
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand All @@ -124,9 +114,7 @@ def benchmark_root_listgen():
ti.cfg.demote_dense_struct_fors = False
N = 512

@ti.layout
def place():
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand Down
8 changes: 2 additions & 6 deletions benchmarks/fill_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,7 @@ def benchmark_nested_struct():
a = ti.var(dt=ti.f32)
N = 512

@ti.layout
def place():
ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand All @@ -25,9 +23,7 @@ def benchmark_nested_struct_fill_and_clear():
a = ti.var(dt=ti.f32)
N = 512

@ti.layout
def place():
ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)

@ti.kernel
def fill():
Expand Down
4 changes: 2 additions & 2 deletions benchmarks/mpm2d.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ def substep():
g_v = grid_v[base + ti.Vector([i, j])]
weight = w[i][0] * w[j][1]
new_v += weight * g_v
new_C += 4 * inv_dx * weight * ti.outer_product(g_v, dpos)
new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
v[p], C[p] = new_v, new_C
x[p] += dt * v[p] # advection

Expand Down Expand Up @@ -226,7 +226,7 @@ def substep():
g_v = grid_v[base + ti.Vector([i, j])]
weight = w[i][0] * w[j][1]
new_v += weight * g_v
new_C += 4 * inv_dx * weight * ti.outer_product(g_v, dpos)
new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
v[p], C[p] = new_v, new_C
x[p] += dt * v[p] # advection

Expand Down
2 changes: 1 addition & 1 deletion misc/baselines/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ assert: assert.cu
clang++-8 -std=c++14 assert.cu -S -emit-llvm --cuda-gpu-arch=sm_61

gpu_memory_bound: gpu_memory_bound.cu
nvcc -std=c++14 gpu_memory_bound.cu -O3 -o gpu_memory_bound
nvcc -std=c++14 gpu_memory_bound.cu -O3 -o gpu_memory_bound --gpu-architecture=compute_61 --gpu-code=sm_61,compute_61

cpu_memory_bound: cpu_memory_bound.cpp
g++ -std=c++14 cpu_memory_bound.cpp -O3 -o cpu_memory_bound
Expand Down
33 changes: 22 additions & 11 deletions misc/baselines/gpu_memory_bound.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4,31 +4,42 @@
#include <cuda_runtime.h>
#include "get_time.h"

__global__ void cpy(float *a, float *b, int *c, int n) {
__global__ void cpy(float *a, float *b, int n) {
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
a[i] = b[i];
if (i < n)
a[i] = b[i];
}

int main() {
int n = 1024 * 1024 * 1024 / 4;
int n = 1024 * 1024 * 1024;
float *a, *b;
int *c;
cudaMalloc(&a, n * sizeof(float));
cudaMalloc(&b, n * sizeof(float));
cudaMalloc(&c, n * sizeof(float));
for (auto bs : {16, 32, 64, 128, 256}) {

int repeat = 25;

double t;
t = get_time();
for (int i = 0; i < repeat; i++) {
cudaMemcpyAsync(a, b, n * sizeof(float), cudaMemcpyDeviceToDevice, 0);
}
cudaDeviceSynchronize();
t = (get_time() - t) / repeat;
printf("cuMemcpyAsync 8GB data bw %.3f GB/s\n",
n * 8.0 / t / (1024 * 1024 * 1024.0f));

for (auto bs : {32, 64, 128, 256}) {
for (int i = 0; i < 10; i++) {
cpy<<<n / bs, bs>>>(a, b, c, n);
cpy<<<n / bs, bs>>>(a, b, n);
}
cudaDeviceSynchronize();
int repeat = 100;
auto t = get_time();
t = get_time();
for (int i = 0; i < repeat; i++) {
cpy<<<n / bs, bs>>>(a, b, c, n);
cpy<<<n / bs, bs>>>(a, b, n);
}
cudaDeviceSynchronize();
t = (get_time() - t) / repeat;
printf("memcpy 1GB data, block_size %d, %.2f ms bw %.3f GB/s\n", bs,
printf("memcpy 8GB data, block_dim %d, %.2f ms bw %.3f GB/s\n", bs,
t * 1000, n * 8.0 / t / (1024 * 1024 * 1024.0f));
}
}

0 comments on commit 49a48df

Please sign in to comment.