[vulkan] Codegen & runtime improvements #5213

Merged · 9 commits · Jun 23, 2022
Changes from 7 commits
15 changes: 8 additions & 7 deletions taichi/backends/vulkan/vulkan_device.cpp
@@ -1310,7 +1310,6 @@ DeviceAllocation VulkanDevice::allocate_memory(const AllocParams &params) {
   if (params.usage & AllocUsage::Index) {
     buffer_info.usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
   }
-  buffer_info.sharingMode = VK_SHARING_MODE_CONCURRENT;
 
   uint32_t queue_family_indices[] = {compute_queue_family_index_,
                                      graphics_queue_family_index_};
@@ -1351,20 +1350,22 @@ DeviceAllocation VulkanDevice::allocate_memory(const AllocParams &params) {
   if (params.host_read && params.host_write) {
 #endif //__APPLE__
     // This should be the unified memory on integrated GPUs
-    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
-    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
-                                VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                               VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
 #ifdef __APPLE__
     // weird behavior on apple: if coherent bit is not set, then the memory
     // writes between map() and unmap() cannot be seen by gpu
     alloc_info.preferredFlags |= VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
 #endif //__APPLE__
   } else if (params.host_read) {
-    alloc_info.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
   } else if (params.host_write) {
-    alloc_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
   } else {
-    alloc_info.usage = VMA_MEMORY_USAGE_GPU_ONLY;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
   }
 
   if (get_cap(DeviceCapability::spirv_has_physical_storage_buffer)) {
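A caller-side sketch of what the new flag selection means in practice (editor's illustration, not part of the PR): a host_read allocation is now guaranteed HOST_VISIBLE and prefers HOST_CACHED memory, which makes CPU read-back fast. The snippet assumes the taichi::lang Device API used in this file (AllocParams, allocate_memory, map/unmap, dealloc_memory) and omits error handling; AllocUsage::Storage is assumed to be the storage-buffer usage bit.

// Allocate a buffer the CPU will read back after a kernel runs.
Device::AllocParams params;
params.size = 1024;                  // bytes
params.host_read = true;             // CPU maps and reads this buffer
params.host_write = false;
params.usage = AllocUsage::Storage;  // assumed storage-buffer usage bit
DeviceAllocation alloc = device->allocate_memory(params);
void *mapped = device->map(alloc);   // HOST_VISIBLE is guaranteed here
// ... read results ...
device->unmap(alloc);
device->dealloc_memory(alloc);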
15 changes: 10 additions & 5 deletions taichi/codegen/spirv/kernel_utils.cpp
@@ -48,10 +48,12 @@ std::string TaskAttributes::BufferBind::debug_string() const {
                      TaskAttributes::buffers_name(buffer), binding);
 }
 
-KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
+KernelContextAttributes::KernelContextAttributes(const Kernel &kernel,
+                                                 Device *device)
     : args_bytes_(0),
       rets_bytes_(0),
       extra_args_bytes_(RuntimeContext::extra_args_size) {
+  arr_access.resize(kernel.args.size(), irpass::ExternalPtrAccess(0));
   arg_attribs_vec_.reserve(kernel.args.size());
   // TODO: We should be able to limit Kernel args and rets to be primitive
   // types as well, but let's leave that as a follow-up PR.
@@ -90,12 +92,13 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
     ret_attribs_vec_.push_back(ra);
   }
 
-  auto arange_args = [](auto *vec, size_t offset, bool is_ret) -> size_t {
+  auto arange_args = [](auto *vec, size_t offset, bool is_ret,
+                        bool has_buffer_ptr) -> size_t {
     size_t bytes = offset;
     for (int i = 0; i < vec->size(); ++i) {
       auto &attribs = (*vec)[i];
       const size_t dt_bytes =
-          (attribs.is_array && !is_ret)
+          (attribs.is_array && !is_ret && has_buffer_ptr)
               ? sizeof(uint64_t)
               : data_type_size(PrimitiveType::get(attribs.dtype));
       // Align bytes to the nearest multiple of dt_bytes
@@ -111,12 +114,14 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
   };
 
   TI_TRACE("args:");
-  args_bytes_ = arange_args(&arg_attribs_vec_, 0, false);
+  args_bytes_ = arange_args(
+      &arg_attribs_vec_, 0, false,
+      device->get_cap(DeviceCapability::spirv_has_physical_storage_buffer));
   // Align to extra args
   args_bytes_ = (args_bytes_ + 4 - 1) / 4 * 4;
 
   TI_TRACE("rets:");
-  rets_bytes_ = arange_args(&ret_attribs_vec_, 0, true);
+  rets_bytes_ = arange_args(&ret_attribs_vec_, 0, true, false);
 
   TI_TRACE("sizes: args={} rets={}", args_bytes(), rets_bytes());
   TI_ASSERT(has_rets() == (rets_bytes_ > 0));
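To make the packing rule in arange_args concrete, here is a small standalone sketch (editor's illustration in plain C++, not taichi code) of the layout for three arguments on a device with spirv_has_physical_storage_buffer: every field is aligned to its own dt_bytes, and an external array argument collapses to an 8-byte device address.

#include <cstddef>
#include <cstdio>

// Align bytes up to the nearest multiple of dt_bytes, as arange_args does.
static size_t align_up(size_t bytes, size_t dt_bytes) {
  return (bytes + dt_bytes - 1) / dt_bytes * dt_bytes;
}

int main() {
  size_t bytes = 0;
  bytes = align_up(bytes, 4) + 4;  // arg0: i32 scalar at offset 0
  bytes = align_up(bytes, 8) + 8;  // arg1: external array as u64 pointer at offset 8
  bytes = align_up(bytes, 2) + 2;  // arg2: i16 scalar at offset 16
  std::printf("raw args bytes: %zu\n", bytes);                    // 18
  std::printf("aligned args_bytes_: %zu\n", (bytes + 3) / 4 * 4); // 20
  return 0;
}

Without the capability (has_buffer_ptr == false), arg1 would instead occupy data_type_size of its element type, e.g. 4 bytes for an f32 array, since the array is then bound as a descriptor rather than passed by address.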
8 changes: 6 additions & 2 deletions taichi/codegen/spirv/kernel_utils.h
@@ -6,6 +6,7 @@

#include "taichi/ir/offloaded_task_type.h"
#include "taichi/ir/type.h"
#include "taichi/ir/transforms.h"
#include "taichi/backends/device.h"

namespace taichi {
@@ -172,7 +173,7 @@ class KernelContextAttributes {
   struct RetAttributes : public AttribsBase {};
 
   KernelContextAttributes() = default;
-  explicit KernelContextAttributes(const Kernel &kernel);
+  explicit KernelContextAttributes(const Kernel &kernel, Device *device);
 
   /**
    * Whether this kernel has any argument
@@ -234,11 +235,14 @@ class KernelContextAttributes {
     return args_bytes();
   }
 
+  std::vector<irpass::ExternalPtrAccess> arr_access;
+
   TI_IO_DEF(arg_attribs_vec_,
             ret_attribs_vec_,
             args_bytes_,
             rets_bytes_,
-            extra_args_bytes_);
+            extra_args_bytes_,
+            arr_access);
 
  private:
   std::vector<ArgAttributes> arg_attribs_vec_;
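A hedged sketch of how a runtime might consume the new arr_access metadata (editor's illustration): ExternalPtrAccess comes from an IR analysis pass declared in taichi/ir/transforms.h, and is assumed here to be a READ/WRITE bit-flag enum, consistent with the irpass::ExternalPtrAccess(0) default used in the constructor above; needs_copy_back is a hypothetical helper.

// Skip the device-to-host copy-back for external arrays the kernel
// never writes (assumes WRITE is a bit flag in ExternalPtrAccess).
bool needs_copy_back(const KernelContextAttributes &ctx_attribs, int arg) {
  const auto mask = static_cast<uint32_t>(ctx_attribs.arr_access[arg]);
  const auto write = static_cast<uint32_t>(irpass::ExternalPtrAccess::WRITE);
  return (mask & write) != 0;
}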
262 changes: 262 additions & 0 deletions taichi/codegen/spirv/lib_tiny_ir.h
@@ -0,0 +1,262 @@
#pragma once

#include "taichi/common/core.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace taichi {
namespace tinyir {

// Integer ceiling division, e.g. ceil_div(10, 4) == 3.
template <typename T>
T ceil_div(T v, T div) {
  return (v / div) + (v % div ? 1 : 0);
}

// Forward decl
class Polymorphic;
class Node;
class Type;
class LayoutContext;
class MemRefElementTypeInterface;
class MemRefAggregateTypeInterface;
class ShapedTypeInterface;
class AggregateTypeInterface;
class PointerTypeInterface;
class Block;
class Visitor;

class Polymorphic {
public:
virtual ~Polymorphic() {
}

  // is<T>() checks the runtime type via dynamic_cast; as<T>() is an
  // unchecked static downcast intended for use after is<T>(); cast<T>()
  // is a checked downcast that returns nullptr on a type mismatch.
  template <typename T>
  bool is() const {
    return dynamic_cast<const T *>(this) != nullptr;
  }

template <typename T>
T *as() {
return static_cast<T *>(this);
}

template <typename T>
const T *as() const {
return static_cast<const T *>(this);
}

template <typename T>
T *cast() {
return dynamic_cast<T *>(this);
}

template <typename T>
const T *cast() const {
return dynamic_cast<const T *>(this);
}

bool operator==(const Polymorphic &other) const {
return typeid(*this) == typeid(other) && is_equal(other);
}

  bool equals(const Polymorphic *other) const {
    return (*this) == (*other);
  }

private:
virtual bool is_equal(const Polymorphic &other) const = 0;
};

class Node : public Polymorphic {
public:
using NodeRefs = const std::vector<const Node *>;

Node() {
}

virtual ~Node() {
}

const std::string &debug_name() const {
return debug_name_;
}

void set_debug_name(const std::string &s) {
debug_name_ = s;
}

virtual NodeRefs incoming() const {
return {};
}

virtual NodeRefs outgoing() const {
return {};
}

virtual bool is_leaf() const {
return false;
}

virtual bool is_tree_node() const {
return false;
}

private:
virtual bool is_equal(const Polymorphic &other) const {
return false;
}

std::string debug_name_;
};

class Type : public Node {
public:
Type() {
}

private:
virtual bool is_equal(const Polymorphic &other) const {
return false;
}
};

// The default LayoutContext is the standard C layout
class LayoutContext : public Polymorphic {
private:
std::unordered_map<const MemRefElementTypeInterface *, size_t> size_cache_;
std::unordered_map<const MemRefElementTypeInterface *, size_t>
alignment_cache_;
std::unordered_map<const MemRefAggregateTypeInterface *, std::vector<size_t>>
elem_offset_cache_;

public:
void register_size(const MemRefElementTypeInterface *t, size_t size) {
TI_ASSERT(size != 0);
size_cache_[t] = size;
}

void register_alignment(const MemRefElementTypeInterface *t, size_t size) {
TI_ASSERT(size != 0);
alignment_cache_[t] = size;
}

void register_aggregate(const MemRefAggregateTypeInterface *t, int num_elem) {
elem_offset_cache_[t] = {};
elem_offset_cache_[t].resize(num_elem, 0);
}

void register_elem_offset(const MemRefAggregateTypeInterface *t,
int n,
size_t offset) {
TI_ASSERT(elem_offset_cache_.find(t) != elem_offset_cache_.end());
elem_offset_cache_[t][n] = offset;
}

  // Size or alignment cannot be zero, so a return value of 0 means the
  // type has not been registered yet.
size_t query_size(const MemRefElementTypeInterface *t) {
if (size_cache_.find(t) != size_cache_.end()) {
return size_cache_[t];
} else {
return 0;
}
}

size_t query_alignment(const MemRefElementTypeInterface *t) {
if (alignment_cache_.find(t) != alignment_cache_.end()) {
return alignment_cache_[t];
} else {
return 0;
}
}

size_t query_elem_offset(const MemRefAggregateTypeInterface *t, int n) {
if (elem_offset_cache_.find(t) != elem_offset_cache_.end()) {
return elem_offset_cache_[t][n];
} else {
return 0;
}
}

private:
virtual bool is_equal(const Polymorphic &other) const {
// This is only called when `other` has the same typeid
return true;
}
};
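
// Illustrative contract (editor's sketch, not part of the PR): a MemRef
// type computes its layout once and memoizes it through the context:
//
//   size_t IntType::memory_size(LayoutContext &ctx) const {
//     if (size_t cached = ctx.query_size(this)) {
//       return cached;  // cache hit
//     }
//     ctx.register_size(this, 4);  // first query: compute and register
//     return 4;
//   }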

class MemRefElementTypeInterface {
public:
virtual size_t memory_size(LayoutContext &ctx) const = 0;
virtual size_t memory_alignment_size(LayoutContext &ctx) const = 0;
};

class MemRefAggregateTypeInterface : public MemRefElementTypeInterface {
public:
virtual size_t nth_element_offset(int n, LayoutContext &ctx) const = 0;
};

class AggregateTypeInterface {
public:
virtual const Type *nth_element_type(int n) const = 0;
virtual int get_num_elements() const = 0;
};

class ShapedTypeInterface {
public:
virtual const Type *element_type() const = 0;
virtual bool is_constant_shape() const = 0;
virtual std::vector<size_t> get_constant_shape() const = 0;
};

class PointerTypeInterface {
public:
virtual const Type *get_pointed_type() const = 0;
};

class Block {
public:
  template <typename T, class... E>
  T *emplace_back(E &&...args) {
    // Perfectly forward constructor arguments to avoid extra copies.
    nodes_.push_back(std::make_unique<T>(std::forward<E>(args)...));
return static_cast<T *>(nodes_.back().get());
}

template <typename T>
T *push_back(std::unique_ptr<T> &&val) {
T *ptr = val.get();
nodes_.push_back(std::move(val));
return ptr;
}

const std::vector<std::unique_ptr<Node>> &nodes() const {
return nodes_;
}

private:
std::vector<std::unique_ptr<Node>> nodes_;
};

class Visitor {
public:
virtual ~Visitor() {
}

virtual void visit(const Node *node) {
if (node->is<Type>()) {
visit_type(node->as<Type>());
}
}

virtual void visit_type(const Type *type) {
}

virtual void visit(const Block *block) {
for (auto &n : block->nodes()) {
visit(n.get());
}
}
};

} // namespace tinyir
} // namespace taichi
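
For orientation, here is a minimal usage sketch of this helper IR (editor's illustration, not part of the PR): a concrete leaf type implements MemRefElementTypeInterface, nodes are created inside a Block, and a Visitor walks them.

#include <cstdio>

#include "taichi/codegen/spirv/lib_tiny_ir.h"

namespace demo {
using namespace taichi::tinyir;

// A 32-bit scalar type with natural alignment.
class IntType : public Type, public MemRefElementTypeInterface {
 public:
  size_t memory_size(LayoutContext &ctx) const override {
    return 4;
  }
  size_t memory_alignment_size(LayoutContext &ctx) const override {
    return 4;
  }
};

class TypePrinter : public Visitor {
 public:
  void visit_type(const Type *type) override {
    std::printf("type node: %s\n", type->debug_name().c_str());
  }
};

void run() {
  Block block;
  auto *i32 = block.emplace_back<IntType>();
  i32->set_debug_name("i32");
  TypePrinter printer;
  printer.visit(&block);  // dispatches to visit_type for each Type node
}
}  // namespace demo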