[vulkan] Codegen & runtime improvements #5213

Merged · 9 commits · Jun 23, 2022
Changes from 7 commits
15 changes: 8 additions & 7 deletions taichi/backends/vulkan/vulkan_device.cpp
@@ -1310,7 +1310,6 @@ DeviceAllocation VulkanDevice::allocate_memory(const AllocParams &params) {
   if (params.usage & AllocUsage::Index) {
     buffer_info.usage |= VK_BUFFER_USAGE_INDEX_BUFFER_BIT;
   }
-  buffer_info.sharingMode = VK_SHARING_MODE_CONCURRENT;
 
   uint32_t queue_family_indices[] = {compute_queue_family_index_,
                                      graphics_queue_family_index_};
@@ -1351,20 +1350,22 @@ DeviceAllocation VulkanDevice::allocate_memory(const AllocParams &params) {
   if (params.host_read && params.host_write) {
 #endif //__APPLE__
     // This should be the unified memory on integrated GPUs
-    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
-    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT |
-                                VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                               VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
+    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
 #ifdef __APPLE__
     // weird behavior on apple: if coherent bit is not set, then the memory
     // writes between map() and unmap() cannot be seen by gpu
     alloc_info.preferredFlags |= VK_MEMORY_PROPERTY_HOST_COHERENT_BIT;
 #endif //__APPLE__
   } else if (params.host_read) {
-    alloc_info.usage = VMA_MEMORY_USAGE_GPU_TO_CPU;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
   } else if (params.host_write) {
-    alloc_info.usage = VMA_MEMORY_USAGE_CPU_TO_GPU;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT;
+    alloc_info.preferredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
   } else {
-    alloc_info.usage = VMA_MEMORY_USAGE_GPU_ONLY;
+    alloc_info.requiredFlags = VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
   }
 
   if (get_cap(DeviceCapability::spirv_has_physical_storage_buffer)) {
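A caller-side sketch of what the new flag selection means in practice (editor's illustration, not part of the PR): a host_read allocation is now guaranteed HOST_VISIBLE and prefers HOST_CACHED memory, which makes CPU read-back fast. The snippet assumes the taichi::lang Device API used in this file (AllocParams, allocate_memory, map/unmap, dealloc_memory) and omits error handling; AllocUsage::Storage is assumed to be the storage-buffer usage bit.

// Allocate a buffer the CPU will read back after a kernel runs.
Device::AllocParams params;
params.size = 1024;                  // bytes
params.host_read = true;             // CPU maps and reads this buffer
params.host_write = false;
params.usage = AllocUsage::Storage;  // assumed storage-buffer usage bit
DeviceAllocation alloc = device->allocate_memory(params);
void *mapped = device->map(alloc);   // HOST_VISIBLE is guaranteed here
// ... read results ...
device->unmap(alloc);
device->dealloc_memory(alloc);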
15 changes: 10 additions & 5 deletions taichi/codegen/spirv/kernel_utils.cpp
@@ -48,10 +48,12 @@ std::string TaskAttributes::BufferBind::debug_string() const {
                      TaskAttributes::buffers_name(buffer), binding);
 }
 
-KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
+KernelContextAttributes::KernelContextAttributes(const Kernel &kernel,
+                                                 Device *device)
     : args_bytes_(0),
       rets_bytes_(0),
       extra_args_bytes_(RuntimeContext::extra_args_size) {
+  arr_access.resize(kernel.args.size(), irpass::ExternalPtrAccess(0));
   arg_attribs_vec_.reserve(kernel.args.size());
   // TODO: We should be able to limit Kernel args and rets to be primitive
   // types as well, but let's leave that as a follow-up PR.
@@ -90,12 +92,13 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
     ret_attribs_vec_.push_back(ra);
   }
 
-  auto arange_args = [](auto *vec, size_t offset, bool is_ret) -> size_t {
+  auto arange_args = [](auto *vec, size_t offset, bool is_ret,
+                        bool has_buffer_ptr) -> size_t {
     size_t bytes = offset;
     for (int i = 0; i < vec->size(); ++i) {
       auto &attribs = (*vec)[i];
       const size_t dt_bytes =
-          (attribs.is_array && !is_ret)
+          (attribs.is_array && !is_ret && has_buffer_ptr)
               ? sizeof(uint64_t)
               : data_type_size(PrimitiveType::get(attribs.dtype));
       // Align bytes to the nearest multiple of dt_bytes
@@ -111,12 +114,14 @@ KernelContextAttributes::KernelContextAttributes(const Kernel &kernel)
   };
 
   TI_TRACE("args:");
-  args_bytes_ = arange_args(&arg_attribs_vec_, 0, false);
+  args_bytes_ = arange_args(
+      &arg_attribs_vec_, 0, false,
+      device->get_cap(DeviceCapability::spirv_has_physical_storage_buffer));
   // Align to extra args
   args_bytes_ = (args_bytes_ + 4 - 1) / 4 * 4;
 
   TI_TRACE("rets:");
-  rets_bytes_ = arange_args(&ret_attribs_vec_, 0, true);
+  rets_bytes_ = arange_args(&ret_attribs_vec_, 0, true, false);
 
   TI_TRACE("sizes: args={} rets={}", args_bytes(), rets_bytes());
   TI_ASSERT(has_rets() == (rets_bytes_ > 0));
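To make the packing rule in arange_args concrete, here is a small standalone sketch (editor's illustration in plain C++, not taichi code) of the layout for three arguments on a device with spirv_has_physical_storage_buffer: every field is aligned to its own dt_bytes, and an external array argument collapses to an 8-byte device address.

#include <cstddef>
#include <cstdio>

// Align bytes up to the nearest multiple of dt_bytes, as arange_args does.
static size_t align_up(size_t bytes, size_t dt_bytes) {
  return (bytes + dt_bytes - 1) / dt_bytes * dt_bytes;
}

int main() {
  size_t bytes = 0;
  bytes = align_up(bytes, 4) + 4;  // arg0: i32 scalar at offset 0
  bytes = align_up(bytes, 8) + 8;  // arg1: external array as u64 pointer at offset 8
  bytes = align_up(bytes, 2) + 2;  // arg2: i16 scalar at offset 16
  std::printf("raw args bytes: %zu\n", bytes);                    // 18
  std::printf("aligned args_bytes_: %zu\n", (bytes + 3) / 4 * 4); // 20
  return 0;
}

Without the capability (has_buffer_ptr == false), arg1 would instead occupy data_type_size of its element type, e.g. 4 bytes for an f32 array, since the array is then bound as a descriptor rather than passed by address.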
8 changes: 6 additions & 2 deletions taichi/codegen/spirv/kernel_utils.h
@@ -6,6 +6,7 @@

#include "taichi/ir/offloaded_task_type.h"
#include "taichi/ir/type.h"
#include "taichi/ir/transforms.h"
#include "taichi/backends/device.h"

namespace taichi {
@@ -172,7 +173,7 @@ class KernelContextAttributes {
   struct RetAttributes : public AttribsBase {};
 
   KernelContextAttributes() = default;
-  explicit KernelContextAttributes(const Kernel &kernel);
+  explicit KernelContextAttributes(const Kernel &kernel, Device *device);
 
   /**
    * Whether this kernel has any argument
@@ -234,11 +235,14 @@ class KernelContextAttributes {
     return args_bytes();
   }
 
+  std::vector<irpass::ExternalPtrAccess> arr_access;
+
   TI_IO_DEF(arg_attribs_vec_,
             ret_attribs_vec_,
             args_bytes_,
             rets_bytes_,
-            extra_args_bytes_);
+            extra_args_bytes_,
+            arr_access);
 
  private:
   std::vector<ArgAttributes> arg_attribs_vec_;
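A hedged sketch of how a runtime might consume the new arr_access metadata (editor's illustration): ExternalPtrAccess comes from an IR analysis pass declared in taichi/ir/transforms.h, and is assumed here to be a READ/WRITE bit-flag enum, consistent with the irpass::ExternalPtrAccess(0) default used in the constructor above; needs_copy_back is a hypothetical helper.

// Skip the device-to-host copy-back for external arrays the kernel
// never writes (assumes WRITE is a bit flag in ExternalPtrAccess).
bool needs_copy_back(const KernelContextAttributes &ctx_attribs, int arg) {
  const auto mask = static_cast<uint32_t>(ctx_attribs.arr_access[arg]);
  const auto write = static_cast<uint32_t>(irpass::ExternalPtrAccess::WRITE);
  return (mask & write) != 0;
}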
262 changes: 262 additions & 0 deletions taichi/codegen/spirv/lib_tiny_ir.h
@@ -0,0 +1,262 @@
#pragma once

#include "taichi/common/core.h"

#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

namespace taichi {
namespace tinyir {

// Integer ceiling division, e.g. ceil_div(10, 4) == 3.
template <typename T>
T ceil_div(T v, T div) {
  return (v / div) + (v % div ? 1 : 0);
}

// Forward decl
class Polymorphic;
class Node;
class Type;
class LayoutContext;
class MemRefElementTypeInterface;
class MemRefAggregateTypeInterface;
class ShapedTypeInterface;
class AggregateTypeInterface;
class PointerTypeInterface;
class Block;
class Visitor;

class Polymorphic {
public:
virtual ~Polymorphic() {
}

  // is<T>() checks the runtime type via dynamic_cast; as<T>() is an
  // unchecked static downcast intended for use after is<T>(); cast<T>()
  // is a checked downcast that returns nullptr on a type mismatch.
  template <typename T>
  bool is() const {
    return dynamic_cast<const T *>(this) != nullptr;
  }

template <typename T>
T *as() {
return static_cast<T *>(this);
}

template <typename T>
const T *as() const {
return static_cast<const T *>(this);
}

template <typename T>
T *cast() {
return dynamic_cast<T *>(this);
}

template <typename T>
const T *cast() const {
return dynamic_cast<const T *>(this);
}

bool operator==(const Polymorphic &other) const {
return typeid(*this) == typeid(other) && is_equal(other);
}

  bool equals(const Polymorphic *other) const {
    return (*this) == (*other);
  }

private:
virtual bool is_equal(const Polymorphic &other) const = 0;
};

class Node : public Polymorphic {
public:
using NodeRefs = const std::vector<const Node *>;

Node() {
}

virtual ~Node() {
}

const std::string &debug_name() const {
return debug_name_;
}

void set_debug_name(const std::string &s) {
debug_name_ = s;
}

virtual NodeRefs incoming() const {
return {};
}

virtual NodeRefs outgoing() const {
return {};
}

virtual bool is_leaf() const {
return false;
}

virtual bool is_tree_node() const {
return false;
}

private:
virtual bool is_equal(const Polymorphic &other) const {
return false;
}

std::string debug_name_;
};

class Type : public Node {
public:
Type() {
}

private:
virtual bool is_equal(const Polymorphic &other) const {
return false;
}
};

// The default LayoutContext is the standard C layout
class LayoutContext : public Polymorphic {
private:
std::unordered_map<const MemRefElementTypeInterface *, size_t> size_cache_;
std::unordered_map<const MemRefElementTypeInterface *, size_t>
alignment_cache_;
std::unordered_map<const MemRefAggregateTypeInterface *, std::vector<size_t>>
elem_offset_cache_;

public:
void register_size(const MemRefElementTypeInterface *t, size_t size) {
TI_ASSERT(size != 0);
size_cache_[t] = size;
}

void register_alignment(const MemRefElementTypeInterface *t, size_t size) {
TI_ASSERT(size != 0);
alignment_cache_[t] = size;
}

void register_aggregate(const MemRefAggregateTypeInterface *t, int num_elem) {
elem_offset_cache_[t] = {};
elem_offset_cache_[t].resize(num_elem, 0);
}

void register_elem_offset(const MemRefAggregateTypeInterface *t,
int n,
size_t offset) {
TI_ASSERT(elem_offset_cache_.find(t) != elem_offset_cache_.end());
elem_offset_cache_[t][n] = offset;
}

  // Size or alignment cannot be zero, so a return value of 0 means the
  // type has not been registered yet.
size_t query_size(const MemRefElementTypeInterface *t) {
if (size_cache_.find(t) != size_cache_.end()) {
return size_cache_[t];
} else {
return 0;
}
}

size_t query_alignment(const MemRefElementTypeInterface *t) {
if (alignment_cache_.find(t) != alignment_cache_.end()) {
return alignment_cache_[t];
} else {
return 0;
}
}

size_t query_elem_offset(const MemRefAggregateTypeInterface *t, int n) {
if (elem_offset_cache_.find(t) != elem_offset_cache_.end()) {
return elem_offset_cache_[t][n];
} else {
return 0;
}
}

private:
virtual bool is_equal(const Polymorphic &other) const {
// This is only called when `other` has the same typeid
return true;
}
};
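
// Illustrative contract (editor's sketch, not part of the PR): a MemRef
// type computes its layout once and memoizes it through the context:
//
//   size_t IntType::memory_size(LayoutContext &ctx) const {
//     if (size_t cached = ctx.query_size(this)) {
//       return cached;  // cache hit
//     }
//     ctx.register_size(this, 4);  // first query: compute and register
//     return 4;
//   }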

class MemRefElementTypeInterface {
public:
virtual size_t memory_size(LayoutContext &ctx) const = 0;
virtual size_t memory_alignment_size(LayoutContext &ctx) const = 0;
};

class MemRefAggregateTypeInterface : public MemRefElementTypeInterface {
public:
virtual size_t nth_element_offset(int n, LayoutContext &ctx) const = 0;
};

class AggregateTypeInterface {
public:
virtual const Type *nth_element_type(int n) const = 0;
virtual int get_num_elements() const = 0;
};

class ShapedTypeInterface {
public:
virtual const Type *element_type() const = 0;
virtual bool is_constant_shape() const = 0;
virtual std::vector<size_t> get_constant_shape() const = 0;
};

class PointerTypeInterface {
public:
virtual const Type *get_pointed_type() const = 0;
};

class Block {
public:
  template <typename T, class... E>
  T *emplace_back(E &&...args) {
    // Perfectly forward constructor arguments to avoid extra copies.
    nodes_.push_back(std::make_unique<T>(std::forward<E>(args)...));
return static_cast<T *>(nodes_.back().get());
}

template <typename T>
T *push_back(std::unique_ptr<T> &&val) {
T *ptr = val.get();
nodes_.push_back(std::move(val));
return ptr;
}

const std::vector<std::unique_ptr<Node>> &nodes() const {
return nodes_;
}

private:
std::vector<std::unique_ptr<Node>> nodes_;
};

class Visitor {
public:
virtual ~Visitor() {
}

virtual void visit(const Node *node) {
if (node->is<Type>()) {
visit_type(node->as<Type>());
}
}

virtual void visit_type(const Type *type) {
}

virtual void visit(const Block *block) {
for (auto &n : block->nodes()) {
visit(n.get());
}
}
};

} // namespace tinyir
} // namespace taichi
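
For orientation, here is a minimal usage sketch of this helper IR (editor's illustration, not part of the PR): a concrete leaf type implements MemRefElementTypeInterface, nodes are created inside a Block, and a Visitor walks them.

#include <cstdio>

#include "taichi/codegen/spirv/lib_tiny_ir.h"

namespace demo {
using namespace taichi::tinyir;

// A 32-bit scalar type with natural alignment.
class IntType : public Type, public MemRefElementTypeInterface {
 public:
  size_t memory_size(LayoutContext &ctx) const override {
    return 4;
  }
  size_t memory_alignment_size(LayoutContext &ctx) const override {
    return 4;
  }
};

class TypePrinter : public Visitor {
 public:
  void visit_type(const Type *type) override {
    std::printf("type node: %s\n", type->debug_name().c_str());
  }
};

void run() {
  Block block;
  auto *i32 = block.emplace_back<IntType>();
  i32->set_debug_name("i32");
  TypePrinter printer;
  printer.visit(&block);  // dispatches to visit_type for each Type node
}
}  // namespace demo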