Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vulkan] Device API explicit semaphores #4852

Merged
merged 21 commits into from
Apr 28, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2ea6888
Device API explicit semaphores
bobcao3 Apr 23, 2022
3029d37
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2022
1ffe3f0
fix
bobcao3 Apr 23, 2022
f99b166
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 23, 2022
2af99ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2022
f767b3d
Destroy the semaphore before the context
bobcao3 Apr 24, 2022
2554deb
Fix type warnings
bobcao3 Apr 24, 2022
73fe3ec
fix nits
bobcao3 Apr 25, 2022
7b966da
return nullptr for devices that don't need semaphores
bobcao3 Apr 25, 2022
a977c03
test out no semaphores between same queue
bobcao3 Apr 26, 2022
d5d508a
Use native command list instead of emulated for dx11
bobcao3 Apr 26, 2022
75116c0
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 26, 2022
7f51dc8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2022
e6c86f0
remove the in-queue semaphore
bobcao3 Apr 26, 2022
7fe9a54
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 26, 2022
630541a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2022
14c0aaf
Use flush instead of sync in places
bobcao3 Apr 26, 2022
e522bae
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 26, 2022
6286db7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2022
2158f20
Fix possible null semaphore
bobcao3 Apr 26, 2022
65dc483
merge
bobcao3 Apr 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions taichi/aot/module_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ class TargetDevice : public Device {
Stream *get_compute_stream() override {
TI_NOT_IMPLEMENTED;
}
// Not implemented: the AOT TargetDevice is a placeholder that records no
// executable work, so every Stream/Device entry point traps here.
void wait_idle() override {
  TI_NOT_IMPLEMENTED;
}
};

} // namespace aot
Expand Down
11 changes: 9 additions & 2 deletions taichi/backends/cpu/cpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,13 @@ class CpuStream : public Stream {
~CpuStream() override{};

std::unique_ptr<CommandList> new_command_list() override{TI_NOT_IMPLEMENTED};
void submit(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
void submit_synced(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
// Not implemented: the CPU backend does not go through command lists.
// NOTE(review): presumably CPU kernels execute directly on the host — confirm.
StreamSemaphore submit(CommandList *cmdlist,
                       const std::vector<StreamSemaphore> &wait_semaphores =
                           {}) override{TI_NOT_IMPLEMENTED};
// Not implemented; see submit().
StreamSemaphore submit_synced(
    CommandList *cmdlist,
    const std::vector<StreamSemaphore> &wait_semaphores = {}) override{
    TI_NOT_IMPLEMENTED};

void command_sync() override{TI_NOT_IMPLEMENTED};
};
Expand Down Expand Up @@ -111,6 +116,8 @@ class CpuDevice : public LlvmDevice {

Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED};

void wait_idle() override{TI_NOT_IMPLEMENTED};

private:
std::vector<AllocInfo> allocations_;
std::unordered_map<int, std::unique_ptr<VirtualMemoryAllocator>>
Expand Down
11 changes: 9 additions & 2 deletions taichi/backends/cuda/cuda_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,13 @@ class CudaStream : public Stream {
~CudaStream() override{};

std::unique_ptr<CommandList> new_command_list() override{TI_NOT_IMPLEMENTED};
void submit(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
void submit_synced(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
// Not implemented: the CUDA backend does not go through command lists.
// NOTE(review): presumably CUDA launches bypass the Device API stream — confirm.
StreamSemaphore submit(CommandList *cmdlist,
                       const std::vector<StreamSemaphore> &wait_semaphores =
                           {}) override{TI_NOT_IMPLEMENTED};
// Not implemented; see submit().
StreamSemaphore submit_synced(
    CommandList *cmdlist,
    const std::vector<StreamSemaphore> &wait_semaphores = {}) override{
    TI_NOT_IMPLEMENTED};

void command_sync() override{TI_NOT_IMPLEMENTED};
};
Expand Down Expand Up @@ -123,6 +128,8 @@ class CudaDevice : public LlvmDevice {

Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED};

void wait_idle() override{TI_NOT_IMPLEMENTED};

private:
std::vector<AllocInfo> allocations_;
void validate_device_alloc(const DeviceAllocation alloc) {
Expand Down
27 changes: 22 additions & 5 deletions taichi/backends/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,13 +397,26 @@ inline bool operator&(AllocUsage a, AllocUsage b) {
return static_cast<int>(a) & static_cast<int>(b);
}

// Base class for backend-specific semaphore payloads. Backends may subclass
// this to carry their native synchronization handle (e.g. a VkSemaphore); the
// virtual destructor makes deletion through the base pointer safe.
class StreamSemaphoreObject {
 public:
  virtual ~StreamSemaphoreObject() = default;
};

// Shared handle to a semaphore: reference counting keeps the underlying
// object alive until every stream that waits on it has consumed it.
using StreamSemaphore = std::shared_ptr<StreamSemaphoreObject>;

// A single in-order execution queue on a Device. Work is recorded into a
// CommandList obtained from new_command_list() and handed back via
// submit()/submit_synced(); explicit StreamSemaphores order submissions
// across streams.
class Stream {
 public:
  virtual ~Stream() {
  }

  // Creates an empty command list for recording on this stream.
  virtual std::unique_ptr<CommandList> new_command_list() = 0;

  // Queues `cmdlist` after every semaphore in `wait_semaphores` has signaled.
  // Returns a semaphore that signals once this submission completes (backends
  // with serial execution may return an already-satisfied semaphore).
  virtual StreamSemaphore submit(
      CommandList *cmdlist,
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;

  // Same as submit(), but additionally blocks the host until the submitted
  // work has finished executing.
  virtual StreamSemaphore submit_synced(
      CommandList *cmdlist,
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;

  // Blocks the host until all prior submissions on this stream complete.
  virtual void command_sync() = 0;
};
Expand Down Expand Up @@ -457,6 +470,9 @@ class Device {
// Each thread will acquire its own stream
virtual Stream *get_compute_stream() = 0;

// Wait for all tasks to complete (task from all streams)
virtual void wait_idle() = 0;

// Mapping can fail and will return nullptr
virtual void *map_range(DevicePtr ptr, uint64_t size) = 0;
virtual void *map(DeviceAllocation alloc) = 0;
Expand Down Expand Up @@ -498,8 +514,9 @@ class Surface {
virtual ~Surface() {
}

virtual DeviceAllocation get_target_image() = 0;
virtual void present_image() = 0;
virtual std::pair<DeviceAllocation, StreamSemaphore> get_target_image() = 0;
bobcao3 marked this conversation as resolved.
Show resolved Hide resolved
virtual void present_image(
const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;
virtual std::pair<uint32_t, uint32_t> get_size() = 0;
virtual int get_image_count() = 0;
virtual BufferFormat image_format() = 0;
Expand Down
15 changes: 13 additions & 2 deletions taichi/backends/dx/dx_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,9 @@ void Dx11Device::image_to_buffer(DevicePtr dst_buf,
TI_NOT_IMPLEMENTED;
}

// Intentionally a no-op: Dx11Stream::submit() replays commands synchronously
// via run_commands(), so no device work is outstanding when this is called.
// NOTE(review): assumes nothing is queued outside the immediate context —
// confirm if deferred contexts are ever introduced.
void Dx11Device::wait_idle() {
}

ID3D11Buffer *Dx11Device::alloc_id_to_buffer(uint32_t alloc_id) {
return alloc_id_to_buffer_.at(alloc_id);
}
Expand Down Expand Up @@ -805,15 +808,23 @@ std::unique_ptr<CommandList> Dx11Stream::new_command_list() {
return std::make_unique<Dx11CommandList>(device_);
}

void Dx11Stream::submit(CommandList *cmdlist) {
StreamSemaphore Dx11Stream::submit(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
Dx11CommandList *dx_cmd_list = static_cast<Dx11CommandList *>(cmdlist);
dx_cmd_list->run_commands();

return std::make_shared<StreamSemaphoreObject>();
}

// No difference for DX11
void Dx11Stream::submit_synced(CommandList *cmdlist) {
StreamSemaphore Dx11Stream::submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
Dx11CommandList *dx_cmd_list = static_cast<Dx11CommandList *>(cmdlist);
dx_cmd_list->run_commands();

return std::make_shared<StreamSemaphoreObject>();
bobcao3 marked this conversation as resolved.
Show resolved Hide resolved
}

void Dx11Stream::command_sync() {
Expand Down
9 changes: 7 additions & 2 deletions taichi/backends/dx/dx_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,12 @@ class Dx11Stream : public Stream {
~Dx11Stream() override;

std::unique_ptr<CommandList> new_command_list() override;
void submit(CommandList *cmdlist) override;
void submit_synced(CommandList *cmdlist) override;
StreamSemaphore submit(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
StreamSemaphore submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
void command_sync() override;

private:
Expand Down Expand Up @@ -247,6 +251,7 @@ class Dx11Device : public GraphicsDevice {
DeviceAllocation src_img,
ImageLayout img_layout,
const BufferImageCopyParams &params) override;
void wait_idle() override;

int live_dx11_object_count();
ID3D11DeviceContext *d3d11_context() {
Expand Down
24 changes: 19 additions & 5 deletions taichi/backends/opengl/opengl_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,14 +431,24 @@ std::unique_ptr<CommandList> GLStream::new_command_list() {
return std::make_unique<GLCommandList>();
}

void GLStream::submit(CommandList *_cmdlist) {
StreamSemaphore GLStream::submit(
CommandList *_cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
GLCommandList *cmdlist = static_cast<GLCommandList *>(_cmdlist);
cmdlist->run_commands();

// OpenGL is fully serial
return std::make_shared<StreamSemaphoreObject>();
}

void GLStream::submit_synced(CommandList *cmdlist) {
StreamSemaphore GLStream::submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
submit(cmdlist);
glFinish();

// OpenGL is fully serial
return std::make_shared<StreamSemaphoreObject>();
}
void GLStream::command_sync() {
glFinish();
Expand Down Expand Up @@ -559,6 +569,9 @@ Stream *GLDevice::get_graphics_stream() {
return nullptr;
}

// Intentionally a no-op: GLStream::submit() runs commands synchronously and
// command_sync() already drains the pipeline with glFinish(), so there is no
// asynchronous device work to wait on.
void GLDevice::wait_idle() {
}

std::unique_ptr<Surface> GLDevice::create_surface(const SurfaceConfig &config) {
TI_NOT_IMPLEMENTED;
return nullptr;
Expand Down Expand Up @@ -634,12 +647,13 @@ GLSurface::~GLSurface() {
TI_NOT_IMPLEMENTED;
}

DeviceAllocation GLSurface::get_target_image() {
std::pair<DeviceAllocation, StreamSemaphore> GLSurface::get_target_image() {
TI_NOT_IMPLEMENTED;
return kDeviceNullAllocation;
return std::make_pair(kDeviceNullAllocation, nullptr);
}

void GLSurface::present_image() {
void GLSurface::present_image(
const std::vector<StreamSemaphore> &wait_semaphores) {
TI_NOT_IMPLEMENTED;
}

Expand Down
15 changes: 11 additions & 4 deletions taichi/backends/opengl/opengl_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,12 @@ class GLStream : public Stream {
~GLStream() override;

std::unique_ptr<CommandList> new_command_list() override;
void submit(CommandList *cmdlist) override;
void submit_synced(CommandList *cmdlist) override;
StreamSemaphore submit(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
StreamSemaphore submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;

void command_sync() override;
};
Expand Down Expand Up @@ -237,6 +241,8 @@ class GLDevice : public GraphicsDevice {

Stream *get_graphics_stream() override;

void wait_idle() override;

std::unique_ptr<Surface> create_surface(const SurfaceConfig &config) override;
DeviceAllocation create_image(const ImageParams &params) override;
void destroy_image(DeviceAllocation handle) override;
Expand Down Expand Up @@ -272,8 +278,9 @@ class GLSurface : public Surface {
public:
~GLSurface() override;

DeviceAllocation get_target_image() override;
void present_image() override;
std::pair<DeviceAllocation, StreamSemaphore> get_target_image() override;
void present_image(
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
std::pair<uint32_t, uint32_t> get_size() override;
BufferFormat image_format() override;
void resize(uint32_t width, uint32_t height) override;
Expand Down
25 changes: 19 additions & 6 deletions taichi/backends/vulkan/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ class HostDeviceContextBlitter {
bool device_to_host(
CommandList *cmdlist,
const std::unordered_map<int, DeviceAllocation> &ext_arrays,
const std::unordered_map<int, size_t> &ext_arr_size) {
const std::unordered_map<int, size_t> &ext_arr_size,
const std::vector<StreamSemaphore> &wait_semaphore) {
if (ctx_attribs_->empty()) {
return false;
}
Expand All @@ -157,7 +158,7 @@ class HostDeviceContextBlitter {
}

if (require_sync) {
device_->get_compute_stream()->submit_synced(cmdlist);
device_->get_compute_stream()->submit_synced(cmdlist, wait_semaphore);
bobcao3 marked this conversation as resolved.
Show resolved Hide resolved
} else {
return false;
}
Expand Down Expand Up @@ -520,9 +521,14 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
}

// If we need to host sync, sync and remove in-flight references
std::vector<StreamSemaphore> wait_semaphore;
if (last_semaphore_) {
wait_semaphore.push_back(last_semaphore_);
}

if (ctx_blitter) {
if (ctx_blitter->device_to_host(current_cmdlist_.get(), any_arrays,
ext_array_size)) {
ext_array_size, wait_semaphore)) {
current_cmdlist_ = nullptr;
ctx_buffers_.clear();
}
Expand All @@ -536,7 +542,8 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
auto duration = high_res_clock::now() - current_cmdlist_pending_since_;
if (std::chrono::duration_cast<std::chrono::microseconds>(duration)
.count() > max_pending_time) {
device_->get_compute_stream()->submit(current_cmdlist_.get());
last_semaphore_ = device_->get_compute_stream()->submit(
current_cmdlist_.get(), wait_semaphore);
current_cmdlist_ = nullptr;
}
}
Expand All @@ -553,10 +560,16 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {

// Flushes any pending command list (chained after the previous submission's
// semaphore), blocks the host until the device is fully idle, then releases
// the transient context buffers that backed completed launches.
void VkRuntime::synchronize() {
  if (current_cmdlist_) {
    // Order the final flush after the last in-flight submission, if any.
    std::vector<StreamSemaphore> wait_semaphore;
    if (last_semaphore_) {
      wait_semaphore.push_back(last_semaphore_);
    }
    device_->get_compute_stream()->submit(current_cmdlist_.get(),
                                          wait_semaphore);
    current_cmdlist_ = nullptr;
    last_semaphore_ = nullptr;
  }
  // Wait across all queues/streams, not just the compute stream.
  device_->wait_idle();
  ctx_buffers_.clear();
}

Expand Down
1 change: 1 addition & 0 deletions taichi/backends/vulkan/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ class TI_DLL_EXPORT VkRuntime {
std::vector<std::unique_ptr<DeviceAllocationGuard>> ctx_buffers_;

std::unique_ptr<CommandList> current_cmdlist_{nullptr};
StreamSemaphore last_semaphore_{nullptr};
high_res_clock::time_point current_cmdlist_pending_since_;

std::vector<std::unique_ptr<CompiledTaichiKernel>> ti_kernels_;
Expand Down
Loading