[refactor] Add ArrayMetadata to store the array runtime size #4950

Merged
8 commits merged on May 11, 2022
Changes from 3 commits
25 changes: 16 additions & 9 deletions taichi/backends/cuda/codegen_cuda.cpp
@@ -65,13 +65,16 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
bool transferred = false;
for (int i = 0; i < (int)args.size(); i++) {
if (args[i].is_array) {
if (args[i].size == 0)
const auto &arr_meta = context.array_metadata[i];
const auto arr_runtime_sz = arr_meta.runtime_size;
if (arr_runtime_sz == 0) {
continue;
}
arg_buffers[i] = context.get_arg<void *>(i);
if (!context.is_device_allocation[i]) {
if (!arr_meta.is_device_allocation) {
// Note: both numpy and PyTorch support arrays/tensors with zeros
// in shapes, e.g., shape=(0) or shape=(100, 0, 200). This makes
// args[i].size = 0.
// `arr_runtime_sz` zero.
unsigned int attr_val = 0;
uint32_t ret_code =
CUDADriver::get_instance().mem_get_attribute.call(
@@ -87,18 +90,18 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
// See CUDA driver API `cuPointerGetAttribute` for more details.
transferred = true;
CUDADriver::get_instance().malloc(&device_buffers[i],
args[i].size);
arr_runtime_sz);
CUDADriver::get_instance().memcpy_host_to_device(
(void *)device_buffers[i], arg_buffers[i], args[i].size);
(void *)device_buffers[i], arg_buffers[i], arr_runtime_sz);
} else {
device_buffers[i] = arg_buffers[i];
}
// device_buffers[i] saves a raw ptr on CUDA device.
ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i],
args[i].size,
arr_runtime_sz,
/*is_device_allocation=*/false);

} else if (args[i].size > 0) {
} else if (arr_runtime_sz > 0) {
// arg_buffers[i] is a DeviceAllocation*
// TODO: Unwraps DeviceAllocation* can be done at CodeGenLLVM since
// it's shared by cpu and cuda.
@@ -110,11 +113,13 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
// if transfer happened.
// TODO: this logic can be improved but I'll leave it to a followup
// PR.
// FIXME: What if after the kernel launch, we still need the old
// `arg_buffers[i]`?
arg_buffers[i] = device_buffers[i];

// device_buffers[i] saves the unwrapped raw ptr from arg_buffers[i]
ctx_builder.set_arg_external_array(i, (uint64)device_buffers[i],
args[i].size,
arr_runtime_sz,
/*is_device_allocation=*/false);
}
}
@@ -134,8 +139,10 @@ class CodeGenLLVMCUDA : public CodeGenLLVM {
CUDADriver::get_instance().stream_synchronize(nullptr);
for (int i = 0; i < (int)args.size(); i++) {
if (device_buffers[i] != arg_buffers[i]) {
TI_ASSERT(args[i].is_array);
CUDADriver::get_instance().memcpy_device_to_host(
arg_buffers[i], (void *)device_buffers[i], args[i].size);
arg_buffers[i], (void *)device_buffers[i],
context.array_metadata[i].runtime_size);
CUDADriver::get_instance().mem_free((void *)device_buffers[i]);
}
}
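For readers following the CUDA changes above, here is a minimal, self-contained sketch of the per-argument decision the launch path now makes from ArrayMetadata. The CUDA driver calls are replaced by hypothetical host-backed stubs so the sketch compiles on its own, and the cuPointerGetAttribute check from the real code is omitted; only the branching on runtime_size and is_device_allocation mirrors the diff.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Hypothetical stand-ins for CUDADriver::malloc / memcpy_host_to_device,
// backed by host memory so this sketch builds without CUDA.
static void *fake_cuda_malloc(std::uint64_t size) {
  return std::malloc(size);
}
static void fake_cuda_memcpy_h2d(void *dst, const void *src,
                                 std::uint64_t size) {
  std::memcpy(dst, src, size);
}

struct ArrayMetadata {
  std::uint64_t runtime_size{0};
  bool is_device_allocation{false};
};

// Returns the pointer the kernel should see for one array argument, copying
// host data to a (fake) device buffer when needed. `transferred` reports
// whether a copy-back will be required after the launch.
void *prepare_array_arg(const ArrayMetadata &meta, void *arg_buffer,
                        bool &transferred) {
  if (meta.runtime_size == 0) {
    return nullptr;  // zero-sized shapes such as (100, 0, 200) are skipped
  }
  if (meta.is_device_allocation) {
    return arg_buffer;  // already a DeviceAllocation*, unwrapped elsewhere
  }
  transferred = true;
  void *device_buffer = fake_cuda_malloc(meta.runtime_size);
  fake_cuda_memcpy_h2d(device_buffer, arg_buffer, meta.runtime_size);
  return device_buffer;
}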
8 changes: 5 additions & 3 deletions taichi/codegen/codegen_llvm.cpp
@@ -2296,14 +2296,16 @@ FunctionType CodeGenLLVM::compile_module_to_executable() {
// For taichi ndarrays, context.args saves a pointer to the
// |DeviceAllocation|; the CPU backend actually wants to use the raw ptr here.
for (int i = 0; i < (int)args.size(); i++) {
if (args[i].is_array && context.is_device_allocation[i] &&
args[i].size > 0) {
const auto &arr_meta = context.array_metadata[i];
if (args[i].is_array && arr_meta.is_device_allocation &&
arr_meta.runtime_size > 0) {
DeviceAllocation *ptr =
static_cast<DeviceAllocation *>(context.get_arg<void *>(i));
uint64 host_ptr = (uint64)kernel->program->get_llvm_program_impl()
->get_ndarray_alloc_info_ptr(*ptr);
context.set_arg(i, host_ptr);
context.set_device_allocation(i, false);
context.set_array_is_device_allocation(i,
/*is_device_allocation=*/false);
}
}
for (auto task : offloaded_tasks_local) {
1 change: 0 additions & 1 deletion taichi/program/callable.h
@@ -30,7 +30,6 @@ class TI_DLL_EXPORT Callable {
std::vector<int> element_shape = {})
: dt(dt),
is_array(is_array),
size(size),
total_dim(total_dim),
element_shape(std::move(element_shape)) {
}
20 changes: 14 additions & 6 deletions taichi/program/context.h
@@ -23,7 +23,11 @@ struct RuntimeContext {
int32 extra_args[taichi_max_num_args_extra][taichi_max_num_indices];
int32 cpu_thread_id;
// |is_device_allocation| is true iff args[i] is a DeviceAllocation*.
bool is_device_allocation[taichi_max_num_args_total]{false};
struct ArrayMetadata {
uint64 runtime_size{0};
bool is_device_allocation{false};
};
ArrayMetadata array_metadata[taichi_max_num_args_total];
// We move the pointer of result buffer from LLVMRuntime to RuntimeContext
// because each real function needs a place to store its result, but
// LLVMRuntime is shared among functions. So we moved the pointer to
@@ -45,11 +49,15 @@ struct RuntimeContext {
template <typename T>
void set_arg(int i, T v) {
args[i] = taichi_union_cast_with_different_sizes<uint64>(v);
set_device_allocation(i, false);
set_array_is_device_allocation(i, /*is_device_allocation=*/false);
}

void set_device_allocation(int i, bool is_device_allocation_) {
is_device_allocation[i] = is_device_allocation_;
void set_array_runtime_size(int i, uint64 size) {
array_metadata[i].runtime_size = size;
}

void set_array_is_device_allocation(int i, bool is_device_allocation) {
array_metadata[i].is_device_allocation = is_device_allocation;
}

template <typename T>
Expand All @@ -61,7 +69,7 @@ struct RuntimeContext {
DeviceAllocation &alloc,
const std::vector<int> &shape) {
args[arg_id] = taichi_union_cast_with_different_sizes<uint64>(&alloc);
set_device_allocation(arg_id, true);
set_array_is_device_allocation(arg_id, /*is_device_allocation=*/true);
TI_ASSERT(shape.size() <= taichi_max_num_indices);
for (int i = 0; i < shape.size(); i++) {
extra_args[arg_id][i] = shape[i];
Expand All @@ -73,7 +81,7 @@ struct RuntimeContext {
const std::vector<int> &shape,
const std::vector<int> &element_shape) {
args[arg_id] = taichi_union_cast_with_different_sizes<uint64>(&alloc);
set_device_allocation(arg_id, true);
set_array_is_device_allocation(arg_id, /*is_device_allocation=*/true);
TI_ASSERT(shape.size() + element_shape.size() <= taichi_max_num_indices);
for (int i = 0; i < shape.size(); i++) {
extra_args[arg_id][i] = shape[i];
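As a quick reference, the snippet below reduces the new RuntimeContext pieces to a self-contained sketch: the per-argument ArrayMetadata slot plus its two setters. The argument-count bound and the uint64 alias are stand-ins for Taichi's real definitions.

#include <cstdint>

namespace sketch {

using uint64 = std::uint64_t;
constexpr int kMaxNumArgs = 64;  // stand-in for taichi_max_num_args_total

struct RuntimeContext {
  uint64 args[kMaxNumArgs]{};

  // Runtime facts about an array argument that are only known at launch time.
  struct ArrayMetadata {
    uint64 runtime_size{0};            // byte size of the array payload
    bool is_device_allocation{false};  // true iff args[i] is a DeviceAllocation*
  };
  ArrayMetadata array_metadata[kMaxNumArgs];

  void set_array_runtime_size(int i, uint64 size) {
    array_metadata[i].runtime_size = size;
  }
  void set_array_is_device_allocation(int i, bool is_device_allocation) {
    array_metadata[i].is_device_allocation = is_device_allocation;
  }
};

}  // namespace sketch

int main() {
  sketch::RuntimeContext ctx;
  // A host-side ndarray of 100 floats passed as argument 0:
  ctx.set_array_runtime_size(0, 100 * sizeof(float));
  ctx.set_array_is_device_allocation(0, /*is_device_allocation=*/false);
  return ctx.array_metadata[0].runtime_size == 400 ? 0 : 1;
}

Grouping runtime_size and is_device_allocation in one struct means the size and the allocation kind of an argument are set and read together, instead of living in a separate is_device_allocation array and an args[i].size field.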
11 changes: 8 additions & 3 deletions taichi/program/kernel.cpp
@@ -238,17 +238,21 @@ void Kernel::LaunchContextBuilder::set_arg_external_array(
ActionArg("address", fmt::format("0x{:x}", ptr)),
ActionArg("array_size_in_bytes", (int64)size)});

// FIXME(https://github.com/taichi-dev/taichi/issues/4949): Make the Metal
// backend support Ndarray, then remove this line below.
kernel_->args[arg_id].size = size;
ctx_->set_arg(arg_id, ptr);
ctx_->set_device_allocation(arg_id, is_device_allocation);
ctx_->set_array_runtime_size(arg_id, size);
ctx_->set_array_is_device_allocation(arg_id, is_device_allocation);
}

void Kernel::LaunchContextBuilder::set_arg_external_array_with_shape(
int arg_id,
uintptr_t ptr,
uint64 size,
const std::vector<int64> &shape) {
this->set_arg_external_array(arg_id, ptr, size, false);
this->set_arg_external_array(arg_id, ptr, size,
/*is_device_allocation=*/false);
TI_ASSERT_INFO(shape.size() <= taichi_max_num_indices,
"External array cannot have > {max_num_indices} indices");
for (uint64 i = 0; i < shape.size(); ++i) {
@@ -260,7 +264,8 @@ void Kernel::LaunchContextBuilder::set_arg_ndarray(int arg_id,
const Ndarray &arr) {
intptr_t ptr = arr.get_device_allocation_ptr_as_int();
uint64 arr_size = arr.get_element_size() * arr.get_nelement();
this->set_arg_external_array(arg_id, ptr, arr_size, true);
this->set_arg_external_array(arg_id, ptr, arr_size,
/*is_device_allocation=*/true);
TI_ASSERT_INFO(arr.shape.size() <= taichi_max_num_indices,
"External array cannot have > {max_num_indices} indices");
for (uint64 i = 0; i < arr.shape.size(); ++i) {
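The byte size recorded for an ndarray is simply element size times element count. The self-contained illustration below uses stand-in names (not the Taichi API); the resulting value is what set_arg_ndarray passes to set_arg_external_array and what ultimately lands in set_array_runtime_size.

#include <cstdint>
#include <numeric>
#include <vector>

// Stand-in for the pieces of Ndarray used in the size computation above.
struct NdarraySketch {
  std::vector<int> shape;
  std::size_t element_size;  // bytes per element, e.g. sizeof(float)

  std::size_t nelement() const {
    return std::accumulate(shape.begin(), shape.end(), std::size_t{1},
                           [](std::size_t acc, int dim) {
                             return acc * static_cast<std::size_t>(dim);
                           });
  }
};

int main() {
  NdarraySketch arr{{128, 3}, sizeof(float)};
  // Mirrors `arr.get_element_size() * arr.get_nelement()` in the diff above.
  std::uint64_t arr_size =
      static_cast<std::uint64_t>(arr.element_size) * arr.nelement();
  return arr_size == 128u * 3u * sizeof(float) ? 0 : 1;
}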
23 changes: 14 additions & 9 deletions taichi/runtime/opengl/opengl_api.cpp
@@ -232,6 +232,7 @@ void CompiledTaichiKernel::init_args(Kernel *kernel) {
for (int i = 0; i < arg_count; i++) {
const auto dtype_name = kernel->args[i].dt.to_string();
if (kernel->args[i].is_array) {
constexpr uint64 kUnkownRuntimeSize = 0;
arr_args[i] = CompiledArrayArg(
{/*dtype_enum=*/to_gl_dtype_enum(kernel->args[i].dt), dtype_name,
/*field_dim=*/kernel->args[i].total_dim -
@@ -240,7 +241,7 @@ void CompiledTaichiKernel::init_args(Kernel *kernel) {
/*element_shape=*/kernel->args[i].element_shape,
/*shape_offset_in_bytes_in_args_buf=*/taichi_opengl_extra_args_base +
i * taichi_max_num_indices * sizeof(int),
/*total_size=*/kernel->args[i].size});
kUnkownRuntimeSize});
} else {
scalar_args[i] = ScalarArg(
{dtype_name, /*offset_in_bytes_in_args_buf=*/i * sizeof(uint64_t)});
@@ -400,23 +401,26 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
for (auto &item : program_.arr_args) {
int i = item.first;
TI_ASSERT(args[i].is_array);
if (args[i].size == 0 || ctx.is_device_allocation[i])
const auto &arr_meta = ctx.array_metadata[i];
const auto arr_runtime_sz = arr_meta.runtime_size;
if ((arr_runtime_sz == 0) || arr_meta.is_device_allocation) {
continue;
}
has_ext_arr = true;
if (args[i].size != item.second.total_size ||
if (arr_runtime_sz != item.second.runtime_size ||
ext_arr_bufs_[i] == kDeviceNullAllocation) {
if (ext_arr_bufs_[i] != kDeviceNullAllocation) {
device_->dealloc_memory(ext_arr_bufs_[i]);
}
ext_arr_bufs_[i] = device_->allocate_memory(
{args[i].size, /*host_write=*/true, /*host_read=*/true,
{arr_runtime_sz, /*host_write=*/true, /*host_read=*/true,
/*export_sharing=*/false});
item.second.total_size = args[i].size;
item.second.runtime_size = arr_runtime_sz;
}
void *host_ptr = (void *)ctx.args[i];
void *baseptr = device_->map(ext_arr_bufs_[i]);
if (program_.check_ext_arr_read(i)) {
std::memcpy((char *)baseptr, host_ptr, args[i].size);
std::memcpy((char *)baseptr, host_ptr, arr_runtime_sz);
}
device_->unmap(ext_arr_bufs_[i]);
}
@@ -468,7 +472,7 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
// On most devices this number is 8. But I need to look up how
// to query this information, so currently this is thrown from OpenGL.
for (const auto [arg_id, bind_id] : program_.used.arr_arg_to_bind_idx) {
if (ctx.is_device_allocation[arg_id]) {
if (ctx.array_metadata[arg_id].is_device_allocation) {
DeviceAllocation *ptr =
static_cast<DeviceAllocation *>((void *)ctx.args[arg_id]);

@@ -503,9 +507,10 @@ void DeviceCompiledTaichiKernel::launch(RuntimeContext &ctx,
if (has_ext_arr) {
for (auto &item : program_.arr_args) {
int i = item.first;
if (args[i].size != 0 && !ctx.is_device_allocation[i]) {
const auto &arr_meta = ctx.array_metadata[i];
if (arr_meta.runtime_size != 0 && !arr_meta.is_device_allocation) {
uint8_t *baseptr = (uint8_t *)device_->map(ext_arr_bufs_[i]);
memcpy((void *)ctx.args[i], baseptr, args[i].size);
memcpy((void *)ctx.args[i], baseptr, arr_meta.runtime_size);
device_->unmap(ext_arr_bufs_[i]);
}
}
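The OpenGL path above keeps one staging buffer per external array argument and reallocates it only when the runtime size changes. A simplified, self-contained sketch of that reuse policy follows; allocation is stubbed with malloc/free, whereas the real code goes through Device::allocate_memory, dealloc_memory, map, and unmap.

#include <cstdint>
#include <cstdlib>
#include <cstring>

// Simplified model of the per-argument external-array staging buffer.
struct ExtArrStagingBuffer {
  void *buffer{nullptr};
  std::uint64_t cached_size{0};

  // Reallocate only when the runtime size changed, then copy the host data in.
  void upload(const void *host_ptr, std::uint64_t runtime_size) {
    if (runtime_size != cached_size || buffer == nullptr) {
      std::free(buffer);
      buffer = std::malloc(runtime_size);
      cached_size = runtime_size;
    }
    std::memcpy(buffer, host_ptr, runtime_size);
  }

  // Copy the (possibly kernel-modified) contents back to the host pointer.
  void download(void *host_ptr) const {
    std::memcpy(host_ptr, buffer, cached_size);
  }

  ~ExtArrStagingBuffer() { std::free(buffer); }
};

int main() {
  float host[4] = {1, 2, 3, 4};
  ExtArrStagingBuffer staging;
  staging.upload(host, sizeof(host));  // first launch: allocates
  staging.upload(host, sizeof(host));  // same size: buffer is reused
  staging.download(host);
  return 0;
}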
2 changes: 1 addition & 1 deletion taichi/runtime/opengl/opengl_api.h
@@ -66,7 +66,7 @@ struct CompiledArrayArg {
bool is_scalar{false};
std::vector<int> element_shape;
size_t shape_offset_in_bytes_in_args_buf{0};
size_t total_size{0}; // Runtime information
size_t runtime_size{0}; // Runtime information

TI_IO_DEF(field_dim,
is_scalar,
15 changes: 10 additions & 5 deletions taichi/runtime/vulkan/runtime.cpp
@@ -79,7 +79,8 @@ class HostDeviceContextBlitter {
char *device_ptr = device_base + arg.offset_in_mem;
do {
if (arg.is_array) {
if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) {
if (!host_ctx_->array_metadata[i].is_device_allocation &&
ext_arr_size.at(i)) {
// Only need to blit ext arrs (host array)
DeviceAllocation buffer = ext_arrays.at(i);
char *const device_arr_ptr =
@@ -150,7 +151,8 @@ class HostDeviceContextBlitter {
for (int i = 0; i < ctx_attribs_->args().size(); ++i) {
const auto &arg = ctx_attribs_->args()[i];
if (arg.is_array) {
if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) {
if (!host_ctx_->array_metadata[i].is_device_allocation &&
ext_arr_size.at(i)) {
require_sync = true;
}
}
@@ -166,7 +168,8 @@ class HostDeviceContextBlitter {
for (int i = 0; i < ctx_attribs_->args().size(); ++i) {
const auto &arg = ctx_attribs_->args()[i];
if (arg.is_array) {
if (!host_ctx_->is_device_allocation[i] && ext_arr_size.at(i)) {
if (!host_ctx_->array_metadata[i].is_device_allocation &&
ext_arr_size.at(i)) {
// Only need to blit ext arrs (host array)
DeviceAllocation buffer = ext_arrays.at(i);
char *const device_arr_ptr =
@@ -455,7 +458,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
const auto &args = ti_kernel->ti_kernel_attribs().ctx_attribs.args();
for (auto &arg : args) {
if (arg.is_array) {
if (host_ctx->is_device_allocation[i]) {
if (host_ctx->array_metadata[i].is_device_allocation) {
// NDArray
if (host_ctx->args[i]) {
any_arrays[i] = *(DeviceAllocation *)(host_ctx->args[i]);
Expand All @@ -464,6 +467,8 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
}
} else {
// Compute ext arr sizes
// TODO: Consider using `arr_metadata.runtime_size` instead of
// computing on our own?
size_t size = arg.stride;
bool has_zero_axis = false;

@@ -546,7 +551,7 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
// Dealloc external arrays
for (auto pair : any_arrays) {
if (pair.second != kDeviceNullAllocation) {
if (!host_ctx->is_device_allocation[pair.first]) {
if (!host_ctx->array_metadata[pair.first].is_device_allocation) {
device_->dealloc_memory(pair.second);
}
}
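The "Compute ext arr sizes" branch above multiplies the element stride by each axis length and treats any zero axis as an empty array. A minimal sketch of that computation, with stand-in names, is shown below; per the TODO in the diff, it may eventually be replaced by reading array_metadata[i].runtime_size directly.

#include <cstddef>
#include <vector>

// Sketch of the external-array size computation: stride times every axis,
// with any zero axis (e.g. shape=(100, 0, 200)) collapsing the size to zero.
std::size_t compute_ext_arr_size(std::size_t stride,
                                 const std::vector<int> &shape) {
  std::size_t size = stride;
  for (int axis : shape) {
    if (axis == 0) {
      return 0;  // nothing to allocate or blit for this argument
    }
    size *= static_cast<std::size_t>(axis);
  }
  return size;
}

int main() {
  bool ok = compute_ext_arr_size(sizeof(float), {100, 0, 200}) == 0 &&
            compute_ext_arr_size(sizeof(float), {128, 3}) ==
                128 * 3 * sizeof(float);
  return ok ? 0 : 1;
}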