Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[vulkan] Device API explicit semaphores #4852

Merged
merged 21 commits into from
Apr 28, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
2ea6888
Device API explicit semaphores
bobcao3 Apr 23, 2022
3029d37
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2022
1ffe3f0
fix
bobcao3 Apr 23, 2022
f99b166
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 23, 2022
2af99ef
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 23, 2022
f767b3d
Destroy the semaphore before the context
bobcao3 Apr 24, 2022
2554deb
Fix type warnings
bobcao3 Apr 24, 2022
73fe3ec
fix nits
bobcao3 Apr 25, 2022
7b966da
return nullptr for devices that don't need semaphores
bobcao3 Apr 25, 2022
a977c03
test out no semaphores between same queue
bobcao3 Apr 26, 2022
d5d508a
Use native command list instead of emulated for dx11
bobcao3 Apr 26, 2022
75116c0
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 26, 2022
7f51dc8
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2022
e6c86f0
remove the in-queue semaphore
bobcao3 Apr 26, 2022
7fe9a54
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 26, 2022
630541a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2022
14c0aaf
Use flush instead of sync in places
bobcao3 Apr 26, 2022
e522bae
Merge branch 'master' of https://github.com/bobcao3/taichi
bobcao3 Apr 26, 2022
6286db7
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Apr 26, 2022
2158f20
Fix possible null semaphore
bobcao3 Apr 26, 2022
65dc483
merge
bobcao3 Apr 28, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions taichi/aot/module_loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,9 @@ class TargetDevice : public Device {
Stream *get_compute_stream() override {
TI_NOT_IMPLEMENTED;
}
// Not implemented: the AOT TargetDevice is a placeholder that records no
// executable work, so every Stream/Device entry point traps here.
void wait_idle() override {
  TI_NOT_IMPLEMENTED;
}
};

} // namespace aot
Expand Down
11 changes: 9 additions & 2 deletions taichi/backends/cpu/cpu_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,13 @@ class CpuStream : public Stream {
~CpuStream() override{};

std::unique_ptr<CommandList> new_command_list() override{TI_NOT_IMPLEMENTED};
void submit(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
void submit_synced(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
// Not implemented: the CPU backend does not go through command lists.
// NOTE(review): presumably CPU kernels execute directly on the host — confirm.
StreamSemaphore submit(CommandList *cmdlist,
                       const std::vector<StreamSemaphore> &wait_semaphores =
                           {}) override{TI_NOT_IMPLEMENTED};
// Not implemented; see submit().
StreamSemaphore submit_synced(
    CommandList *cmdlist,
    const std::vector<StreamSemaphore> &wait_semaphores = {}) override{
    TI_NOT_IMPLEMENTED};

void command_sync() override{TI_NOT_IMPLEMENTED};
};
Expand Down Expand Up @@ -111,6 +116,8 @@ class CpuDevice : public LlvmDevice {

Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED};

void wait_idle() override{TI_NOT_IMPLEMENTED};

private:
std::vector<AllocInfo> allocations_;
std::unordered_map<int, std::unique_ptr<VirtualMemoryAllocator>>
Expand Down
11 changes: 9 additions & 2 deletions taichi/backends/cuda/cuda_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,13 @@ class CudaStream : public Stream {
~CudaStream() override{};

std::unique_ptr<CommandList> new_command_list() override{TI_NOT_IMPLEMENTED};
void submit(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
void submit_synced(CommandList *cmdlist) override{TI_NOT_IMPLEMENTED};
// Not implemented: the CUDA backend does not go through command lists.
// NOTE(review): presumably CUDA launches bypass the Device API stream — confirm.
StreamSemaphore submit(CommandList *cmdlist,
                       const std::vector<StreamSemaphore> &wait_semaphores =
                           {}) override{TI_NOT_IMPLEMENTED};
// Not implemented; see submit().
StreamSemaphore submit_synced(
    CommandList *cmdlist,
    const std::vector<StreamSemaphore> &wait_semaphores = {}) override{
    TI_NOT_IMPLEMENTED};

void command_sync() override{TI_NOT_IMPLEMENTED};
};
Expand Down Expand Up @@ -123,6 +128,8 @@ class CudaDevice : public LlvmDevice {

Stream *get_compute_stream() override{TI_NOT_IMPLEMENTED};

void wait_idle() override{TI_NOT_IMPLEMENTED};

private:
std::vector<AllocInfo> allocations_;
void validate_device_alloc(const DeviceAllocation alloc) {
Expand Down
27 changes: 22 additions & 5 deletions taichi/backends/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -397,13 +397,26 @@ inline bool operator&(AllocUsage a, AllocUsage b) {
return static_cast<int>(a) & static_cast<int>(b);
}

// Base class for backend-specific semaphore payloads. Backends may subclass
// this to carry their native synchronization handle (e.g. a VkSemaphore); the
// virtual destructor makes deletion through the base pointer safe.
class StreamSemaphoreObject {
 public:
  virtual ~StreamSemaphoreObject() = default;
};

// Shared handle to a semaphore: reference counting keeps the underlying
// object alive until every stream that waits on it has consumed it.
using StreamSemaphore = std::shared_ptr<StreamSemaphoreObject>;

// A single in-order execution queue on a Device. Work is recorded into a
// CommandList obtained from new_command_list() and handed back via
// submit()/submit_synced(); explicit StreamSemaphores order submissions
// across streams.
class Stream {
 public:
  virtual ~Stream() {
  }

  // Creates an empty command list for recording on this stream.
  virtual std::unique_ptr<CommandList> new_command_list() = 0;

  // Queues `cmdlist` after every semaphore in `wait_semaphores` has signaled.
  // Returns a semaphore that signals once this submission completes (backends
  // with serial execution may return an already-satisfied semaphore).
  virtual StreamSemaphore submit(
      CommandList *cmdlist,
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;

  // Same as submit(), but additionally blocks the host until the submitted
  // work has finished executing.
  virtual StreamSemaphore submit_synced(
      CommandList *cmdlist,
      const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;

  // Blocks the host until all prior submissions on this stream complete.
  virtual void command_sync() = 0;
};
Expand Down Expand Up @@ -457,6 +470,9 @@ class Device {
// Each thread will acquire its own stream
virtual Stream *get_compute_stream() = 0;

// Wait for all tasks to complete (task from all streams)
virtual void wait_idle() = 0;

// Mapping can fail and will return nullptr
virtual void *map_range(DevicePtr ptr, uint64_t size) = 0;
virtual void *map(DeviceAllocation alloc) = 0;
Expand Down Expand Up @@ -498,8 +514,9 @@ class Surface {
virtual ~Surface() {
}

virtual DeviceAllocation get_target_image() = 0;
virtual void present_image() = 0;
virtual std::pair<DeviceAllocation, StreamSemaphore> get_target_image() = 0;
bobcao3 marked this conversation as resolved.
Show resolved Hide resolved
virtual void present_image(
const std::vector<StreamSemaphore> &wait_semaphores = {}) = 0;
virtual std::pair<uint32_t, uint32_t> get_size() = 0;
virtual int get_image_count() = 0;
virtual BufferFormat image_format() = 0;
Expand Down
15 changes: 13 additions & 2 deletions taichi/backends/dx/dx_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -739,6 +739,9 @@ void Dx11Device::image_to_buffer(DevicePtr dst_buf,
TI_NOT_IMPLEMENTED;
}

// Intentionally a no-op: Dx11Stream::submit() replays commands synchronously
// via run_commands(), so no device work is outstanding when this is called.
// NOTE(review): assumes nothing is queued outside the immediate context —
// confirm if deferred contexts are ever introduced.
void Dx11Device::wait_idle() {
}

ID3D11Buffer *Dx11Device::alloc_id_to_buffer(uint32_t alloc_id) {
return alloc_id_to_buffer_.at(alloc_id);
}
Expand Down Expand Up @@ -805,15 +808,23 @@ std::unique_ptr<CommandList> Dx11Stream::new_command_list() {
return std::make_unique<Dx11CommandList>(device_);
}

void Dx11Stream::submit(CommandList *cmdlist) {
StreamSemaphore Dx11Stream::submit(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
Dx11CommandList *dx_cmd_list = static_cast<Dx11CommandList *>(cmdlist);
dx_cmd_list->run_commands();

return std::make_shared<StreamSemaphoreObject>();
}

// No difference for DX11
void Dx11Stream::submit_synced(CommandList *cmdlist) {
StreamSemaphore Dx11Stream::submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
Dx11CommandList *dx_cmd_list = static_cast<Dx11CommandList *>(cmdlist);
dx_cmd_list->run_commands();

return std::make_shared<StreamSemaphoreObject>();
bobcao3 marked this conversation as resolved.
Show resolved Hide resolved
}

void Dx11Stream::command_sync() {
Expand Down
9 changes: 7 additions & 2 deletions taichi/backends/dx/dx_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,12 @@ class Dx11Stream : public Stream {
~Dx11Stream() override;

std::unique_ptr<CommandList> new_command_list() override;
void submit(CommandList *cmdlist) override;
void submit_synced(CommandList *cmdlist) override;
StreamSemaphore submit(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
StreamSemaphore submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
void command_sync() override;

private:
Expand Down Expand Up @@ -247,6 +251,7 @@ class Dx11Device : public GraphicsDevice {
DeviceAllocation src_img,
ImageLayout img_layout,
const BufferImageCopyParams &params) override;
void wait_idle() override;

int live_dx11_object_count();
ID3D11DeviceContext *d3d11_context() {
Expand Down
24 changes: 19 additions & 5 deletions taichi/backends/opengl/opengl_device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -431,14 +431,24 @@ std::unique_ptr<CommandList> GLStream::new_command_list() {
return std::make_unique<GLCommandList>();
}

void GLStream::submit(CommandList *_cmdlist) {
StreamSemaphore GLStream::submit(
CommandList *_cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
GLCommandList *cmdlist = static_cast<GLCommandList *>(_cmdlist);
cmdlist->run_commands();

// OpenGL is fully serial
return std::make_shared<StreamSemaphoreObject>();
}

void GLStream::submit_synced(CommandList *cmdlist) {
StreamSemaphore GLStream::submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores) {
submit(cmdlist);
glFinish();

// OpenGL is fully serial
return std::make_shared<StreamSemaphoreObject>();
}
void GLStream::command_sync() {
glFinish();
Expand Down Expand Up @@ -559,6 +569,9 @@ Stream *GLDevice::get_graphics_stream() {
return nullptr;
}

// Intentionally a no-op: GLStream::submit() runs commands synchronously and
// command_sync() already drains the pipeline with glFinish(), so there is no
// asynchronous device work to wait on.
void GLDevice::wait_idle() {
}

std::unique_ptr<Surface> GLDevice::create_surface(const SurfaceConfig &config) {
TI_NOT_IMPLEMENTED;
return nullptr;
Expand Down Expand Up @@ -634,12 +647,13 @@ GLSurface::~GLSurface() {
TI_NOT_IMPLEMENTED;
}

DeviceAllocation GLSurface::get_target_image() {
std::pair<DeviceAllocation, StreamSemaphore> GLSurface::get_target_image() {
TI_NOT_IMPLEMENTED;
return kDeviceNullAllocation;
return std::make_pair(kDeviceNullAllocation, nullptr);
}

void GLSurface::present_image() {
void GLSurface::present_image(
const std::vector<StreamSemaphore> &wait_semaphores) {
TI_NOT_IMPLEMENTED;
}

Expand Down
15 changes: 11 additions & 4 deletions taichi/backends/opengl/opengl_device.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,8 +198,12 @@ class GLStream : public Stream {
~GLStream() override;

std::unique_ptr<CommandList> new_command_list() override;
void submit(CommandList *cmdlist) override;
void submit_synced(CommandList *cmdlist) override;
StreamSemaphore submit(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
StreamSemaphore submit_synced(
CommandList *cmdlist,
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;

void command_sync() override;
};
Expand Down Expand Up @@ -237,6 +241,8 @@ class GLDevice : public GraphicsDevice {

Stream *get_graphics_stream() override;

void wait_idle() override;

std::unique_ptr<Surface> create_surface(const SurfaceConfig &config) override;
DeviceAllocation create_image(const ImageParams &params) override;
void destroy_image(DeviceAllocation handle) override;
Expand Down Expand Up @@ -272,8 +278,9 @@ class GLSurface : public Surface {
public:
~GLSurface() override;

DeviceAllocation get_target_image() override;
void present_image() override;
std::pair<DeviceAllocation, StreamSemaphore> get_target_image() override;
void present_image(
const std::vector<StreamSemaphore> &wait_semaphores = {}) override;
std::pair<uint32_t, uint32_t> get_size() override;
BufferFormat image_format() override;
void resize(uint32_t width, uint32_t height) override;
Expand Down
25 changes: 19 additions & 6 deletions taichi/backends/vulkan/runtime.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ class HostDeviceContextBlitter {
bool device_to_host(
CommandList *cmdlist,
const std::unordered_map<int, DeviceAllocation> &ext_arrays,
const std::unordered_map<int, size_t> &ext_arr_size) {
const std::unordered_map<int, size_t> &ext_arr_size,
const std::vector<StreamSemaphore> &wait_semaphore) {
if (ctx_attribs_->empty()) {
return false;
}
Expand All @@ -157,7 +158,7 @@ class HostDeviceContextBlitter {
}

if (require_sync) {
device_->get_compute_stream()->submit_synced(cmdlist);
device_->get_compute_stream()->submit_synced(cmdlist, wait_semaphore);
bobcao3 marked this conversation as resolved.
Show resolved Hide resolved
} else {
return false;
}
Expand Down Expand Up @@ -520,9 +521,14 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
}

// If we need to host sync, sync and remove in-flight references
std::vector<StreamSemaphore> wait_semaphore;
if (last_semaphore_) {
wait_semaphore.push_back(last_semaphore_);
}

if (ctx_blitter) {
if (ctx_blitter->device_to_host(current_cmdlist_.get(), any_arrays,
ext_array_size)) {
ext_array_size, wait_semaphore)) {
current_cmdlist_ = nullptr;
ctx_buffers_.clear();
}
Expand All @@ -536,7 +542,8 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {
auto duration = high_res_clock::now() - current_cmdlist_pending_since_;
if (std::chrono::duration_cast<std::chrono::microseconds>(duration)
.count() > max_pending_time) {
device_->get_compute_stream()->submit(current_cmdlist_.get());
last_semaphore_ = device_->get_compute_stream()->submit(
current_cmdlist_.get(), wait_semaphore);
current_cmdlist_ = nullptr;
}
}
Expand All @@ -553,10 +560,16 @@ void VkRuntime::launch_kernel(KernelHandle handle, RuntimeContext *host_ctx) {

// Flushes any pending command list (chained after the previous submission's
// semaphore), blocks the host until the device is fully idle, then releases
// the transient context buffers that backed completed launches.
void VkRuntime::synchronize() {
  if (current_cmdlist_) {
    // Order the final flush after the last in-flight submission, if any.
    std::vector<StreamSemaphore> wait_semaphore;
    if (last_semaphore_) {
      wait_semaphore.push_back(last_semaphore_);
    }
    device_->get_compute_stream()->submit(current_cmdlist_.get(),
                                          wait_semaphore);
    current_cmdlist_ = nullptr;
    last_semaphore_ = nullptr;
  }
  // Wait across all queues/streams, not just the compute stream.
  device_->wait_idle();
  ctx_buffers_.clear();
}

Expand Down
1 change: 1 addition & 0 deletions taichi/backends/vulkan/runtime.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ class TI_DLL_EXPORT VkRuntime {
std::vector<std::unique_ptr<DeviceAllocationGuard>> ctx_buffers_;

std::unique_ptr<CommandList> current_cmdlist_{nullptr};
StreamSemaphore last_semaphore_{nullptr};
high_res_clock::time_point current_cmdlist_pending_since_;

std::vector<std::unique_ptr<CompiledTaichiKernel>> ti_kernels_;
Expand Down
Loading