[cuda] Improve kernel return value performance when unified memory is available (#965)

* [cuda] Improve kernel return value performance when unified memory is available

* synchronize before fetching results
yuanming-hu authored May 13, 2020
1 parent bcd8560 commit 979ec63
14 changes: 8 additions & 6 deletions taichi/program/program.cpp
@@ -485,14 +485,16 @@ Kernel &Program::get_snode_writer(SNode *snode) {
 uint64 Program::fetch_result_uint64(int i) {
   uint64 ret;
   auto arch = config.arch;
+  synchronize();
   if (arch == Arch::cuda) {
-    // TODO: refactor
-    // We use a `memcpy_device_to_host` call here even if we have unified
-    // memory. This simplifies code. Also note that a unified memory (4KB) page
-    // fault is rather expensive for reading 4-8 bytes.
 #if defined(TI_WITH_CUDA)
-    CUDADriver::get_instance().memcpy_device_to_host(
-        &ret, (uint64 *)result_buffer + i, sizeof(uint64));
+    if (config.use_unified_memory) {
+      // More efficient than a cudaMemcpy call in practice
+      ret = ((uint64 *)result_buffer)[i];
+    } else {
+      CUDADriver::get_instance().memcpy_device_to_host(
+          &ret, (uint64 *)result_buffer + i, sizeof(uint64));
+    }
 #else
     TI_NOT_IMPLEMENTED;
 #endif

