diff --git a/.gitignore b/.gitignore
index 2601a517e145f..3d963a91d7799 100644
--- a/.gitignore
+++ b/.gitignore
@@ -64,3 +64,4 @@ _build
 *.bin
 *.gif
 *.mp4
+*.dat
diff --git a/benchmarks/fill_dense.py b/benchmarks/fill_dense.py
index 3e0f758b8fc8a..a7dd4130d44fa 100644
--- a/benchmarks/fill_dense.py
+++ b/benchmarks/fill_dense.py
@@ -34,9 +34,7 @@ def benchmark_nested_struct():
     a = ti.var(dt=ti.f32)
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
@@ -52,9 +50,7 @@ def benchmark_nested_struct_listgen_8x8():
     ti.cfg.demote_dense_struct_fors = False
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
@@ -70,9 +66,7 @@ def benchmark_nested_struct_listgen_16x16():
     ti.cfg.demote_dense_struct_fors = False
     N = 256
 
-    @ti.layout
-    def place():
-        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(a)
+    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [16, 16]).place(a)
 
     @ti.kernel
     def fill():
@@ -87,9 +81,7 @@ def benchmark_nested_range_blocked():
     a = ti.var(dt=ti.f32)
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
@@ -105,9 +97,7 @@ def benchmark_nested_range():
     a = ti.var(dt=ti.f32)
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
@@ -124,9 +114,7 @@ def benchmark_root_listgen():
     ti.cfg.demote_dense_struct_fors = False
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.dense(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
diff --git a/benchmarks/fill_sparse.py b/benchmarks/fill_sparse.py
index 05670476c21dd..35bbfd9f77ba3 100644
--- a/benchmarks/fill_sparse.py
+++ b/benchmarks/fill_sparse.py
@@ -6,9 +6,7 @@ def benchmark_nested_struct():
     a = ti.var(dt=ti.f32)
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
@@ -25,9 +23,7 @@ def benchmark_nested_struct_fill_and_clear():
     a = ti.var(dt=ti.f32)
     N = 512
 
-    @ti.layout
-    def place():
-        ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
+    ti.root.pointer(ti.ij, [N, N]).dense(ti.ij, [8, 8]).place(a)
 
     @ti.kernel
     def fill():
diff --git a/benchmarks/mpm2d.py b/benchmarks/mpm2d.py
index 7db302a1a4ef3..12ac6bc2205b6 100644
--- a/benchmarks/mpm2d.py
+++ b/benchmarks/mpm2d.py
@@ -98,7 +98,7 @@ def substep():
                 g_v = grid_v[base + ti.Vector([i, j])]
                 weight = w[i][0] * w[j][1]
                 new_v += weight * g_v
-                new_C += 4 * inv_dx * weight * ti.outer_product(g_v, dpos)
+                new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
             v[p], C[p] = new_v, new_C
             x[p] += dt * v[p]  # advection
 
@@ -226,7 +226,7 @@ def substep():
                 g_v = grid_v[base + ti.Vector([i, j])]
                 weight = w[i][0] * w[j][1]
                 new_v += weight * g_v
-                new_C += 4 * inv_dx * weight * ti.outer_product(g_v, dpos)
+                new_C += 4 * inv_dx * weight * g_v.outer_product(dpos)
             v[p], C[p] = new_v, new_C
             x[p] += dt * v[p]  # advection
 
diff --git a/misc/baselines/Makefile b/misc/baselines/Makefile
index dcab730264305..018b59ac8ec95 100644
--- a/misc/baselines/Makefile
+++ b/misc/baselines/Makefile
@@ -11,7 +11,7 @@ assert: assert.cu
 	clang++-8 -std=c++14 assert.cu -S -emit-llvm --cuda-gpu-arch=sm_61
 
 gpu_memory_bound: gpu_memory_bound.cu
-	nvcc -std=c++14 gpu_memory_bound.cu -O3 -o gpu_memory_bound
+	nvcc -std=c++14 gpu_memory_bound.cu -O3 -o gpu_memory_bound --gpu-architecture=compute_61 --gpu-code=sm_61,compute_61
 
 cpu_memory_bound: cpu_memory_bound.cpp
 	g++ -std=c++14 cpu_memory_bound.cpp -O3 -o cpu_memory_bound
diff --git a/misc/baselines/gpu_memory_bound.cu b/misc/baselines/gpu_memory_bound.cu
index ae00dd58bc767..211c1f23ddfdf 100644
--- a/misc/baselines/gpu_memory_bound.cu
+++ b/misc/baselines/gpu_memory_bound.cu
@@ -4,31 +4,42 @@
 #include <cuda_runtime.h>
 #include "get_time.h"
 
-__global__ void cpy(float *a, float *b, int *c, int n) {
+__global__ void cpy(float *a, float *b, int n) {
   unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
-  a[i] = b[i];
+  if (i < n)
+    a[i] = b[i];
 }
 
 int main() {
-  int n = 1024 * 1024 * 1024 / 4;
+  int n = 1024 * 1024 * 1024;
   float *a, *b;
-  int *c;
   cudaMalloc(&a, n * sizeof(float));
   cudaMalloc(&b, n * sizeof(float));
-  cudaMalloc(&c, n * sizeof(float));
-  for (auto bs : {16, 32, 64, 128, 256}) {
+
+  int repeat = 25;
+
+  double t;
+  t = get_time();
+  for (int i = 0; i < repeat; i++) {
+    cudaMemcpyAsync(a, b, n * sizeof(float), cudaMemcpyDeviceToDevice, 0);
+  }
+  cudaDeviceSynchronize();
+  t = (get_time() - t) / repeat;
+  printf("cuMemcpyAsync 8GB data bw %.3f GB/s\n",
+         n * 8.0 / t / (1024 * 1024 * 1024.0f));
+
+  for (auto bs : {32, 64, 128, 256}) {
     for (int i = 0; i < 10; i++) {
-      cpy<<<n / bs, bs>>>(a, b, c, n);
+      cpy<<<n / bs, bs>>>(a, b, n);
     }
     cudaDeviceSynchronize();
-    int repeat = 100;
-    auto t = get_time();
+    t = get_time();
     for (int i = 0; i < repeat; i++) {
-      cpy<<<n / bs, bs>>>(a, b, c, n);
+      cpy<<<n / bs, bs>>>(a, b, n);
     }
     cudaDeviceSynchronize();
     t = (get_time() - t) / repeat;
-    printf("memcpy 1GB data, block_size %d, %.2f ms   bw %.3f GB/s\n", bs,
+    printf("memcpy 8GB data, block_dim %d, %.2f ms   bw %.3f GB/s\n", bs,
            t * 1000, n * 8.0 / t / (1024 * 1024 * 1024.0f));
   }
 }