From c30f943a397da5c77e66c799a65141b004e6cf10 Mon Sep 17 00:00:00 2001
From: Ye Kuang
Date: Fri, 30 Oct 2020 17:25:15 +0900
Subject: [PATCH 1/2] [metal] Revise NodeManager/ListManager's implementation

---
 taichi/backends/metal/kernel_manager.cpp      |  2 +-
 .../metal/shaders/runtime_structs.metal.h     | 75 +++++++----------
 .../metal/shaders/runtime_utils.metal.h       | 82 +++++++++++--------
 3 files changed, 76 insertions(+), 83 deletions(-)

diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
index bbd1ec1cc9a7b..9bc8c6a08b3fc 100644
--- a/taichi/backends/metal/kernel_manager.cpp
+++ b/taichi/backends/metal/kernel_manager.cpp
@@ -790,7 +790,7 @@ class KernelManager::Impl {
     if (compiled_structs_.need_snode_lists_data) {
       auto *mem_alloc = reinterpret_cast<shaders::MemoryAllocator *>(addr);
       // Make sure the returned memory address is always greater than 1.
-      mem_alloc->next = shaders::kAlignment;
+      mem_alloc->next = shaders::MemoryAllocator::kInitOffset;
       // root list data are static
       ListgenElement root_elem;
       root_elem.mem_offset = 0;
diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h
index bfd39c7de1ed1..08cb3b69aade2 100644
--- a/taichi/backends/metal/shaders/runtime_structs.metal.h
+++ b/taichi/backends/metal/shaders/runtime_structs.metal.h
@@ -31,8 +31,18 @@ STR(
     // clang-format on
     constant constexpr int kTaichiMaxNumIndices = 8;
     constant constexpr int kTaichiNumChunks = 1024;
+    constant constexpr int kAlignment = 8;
+    using PtrOffset = int32_t;
 
-    struct MemoryAllocator { atomic_int next; };
+    struct MemoryAllocator {
+      atomic_int next;
+
+      constant constexpr static int kInitOffset = 8;
+
+      static inline bool is_valid(PtrOffset v) {
+        return v >= kInitOffset;
+      }
+    };
 
     // ListManagerData manages a list of elements with adjustable size.
     struct ListManagerData {
@@ -44,6 +54,23 @@ STR(
       atomic_int next;
 
       atomic_int chunks[kTaichiNumChunks];
+
+      struct ReservedElemPtrOffset {
+       public:
+        ReservedElemPtrOffset() = default;
+        explicit ReservedElemPtrOffset(PtrOffset v) : val_(v) {}
+
+        inline bool is_valid() const { return is_valid(val_); }
+
+        inline static bool is_valid(PtrOffset v) {
+          return MemoryAllocator::is_valid(v);
+        }
+
+        inline PtrOffset value() const { return val_; }
+
+       private:
+        PtrOffset val_{0};
+      };
     };
 
     // NodeManagerData stores the actual data needed to implement NodeManager
@@ -54,6 +81,7 @@ STR(
     // few lists (ListManagerData). In particular, |data_list| stores the actual
     // data, while |free_list| and |recycle_list| are only meant for GC.
     struct NodeManagerData {
+      using ElemIndex = ListManagerData::ReservedElemPtrOffset;
       // Stores the actual data.
       ListManagerData data_list;
       // For GC
@@ -62,51 +90,6 @@ STR(
       atomic_int free_list_used;
       // Need this field to bookkeep some data during GC
       int recycled_list_size_backup;
-
-      // Use this type instead of the raw index type (int32_t), because the
-      // raw value needs to be shifted by |kIndexOffset| in order for the
-      // spinning memory allocation algorithm to work.
-      struct ElemIndex {
-        // The first 8 index values are reserved to encode special status:
-        // * 0  : nullptr
-        // * 1  : spinning for allocation
-        // * 2-7: unused for now
-        //
-        /// For each allocated index, it is added by |index_offset| to skip over
-        /// these reserved values.
-        constant static constexpr int32_t kIndexOffset = 8;
-
-        ElemIndex() = default;
-
-        static ElemIndex from_index(int i) {
-          return ElemIndex(i + kIndexOffset);
-        }
-
-        static ElemIndex from_raw(int r) {
-          return ElemIndex(r);
-        }
-
-        inline int32_t index() const {
-          return raw_ - kIndexOffset;
-        }
-
-        inline int32_t raw() const {
-          return raw_;
-        }
-
-        inline bool is_valid() const {
-          return raw_ >= kIndexOffset;
-        }
-
-        inline static bool is_valid(int raw) {
-          return ElemIndex::from_raw(raw).is_valid();
-        }
-
-       private:
-        explicit ElemIndex(int r) : raw_(r) {
-        }
-        int32_t raw_ = 0;
-      };
     };
 
     // This class is very similar to metal::SNodeDescriptor
diff --git a/taichi/backends/metal/shaders/runtime_utils.metal.h b/taichi/backends/metal/shaders/runtime_utils.metal.h
index 3be9e6a0411f1..3c4304ce7a38d 100644
--- a/taichi/backends/metal/shaders/runtime_utils.metal.h
+++ b/taichi/backends/metal/shaders/runtime_utils.metal.h
@@ -41,9 +41,6 @@ struct Runtime {
 // clang-format off
 METAL_BEGIN_RUNTIME_UTILS_DEF
 STR(
-    using PtrOffset = int32_t;
-    constant constexpr int kAlignment = 8;
-
     [[maybe_unused]] PtrOffset mtl_memalloc_alloc(device MemoryAllocator *ma,
                                                   int32_t size) {
       size = ((size + kAlignment - 1) / kAlignment) * kAlignment;
@@ -57,6 +54,7 @@ STR(
     }
 
     struct ListManager {
+      using ReservedElemPtrOffset = ListManagerData::ReservedElemPtrOffset;
       device ListManagerData *lm_data;
       device MemoryAllocator *mem_alloc;
 
@@ -74,22 +72,19 @@ STR(
       resize(0);
     }
 
-    struct ReserveElemResult {
-      int elem_idx;
-      PtrOffset chunk_ptr_offs;
-    };
-
-    ReserveElemResult reserve_new_elem() {
+    ReservedElemPtrOffset reserve_new_elem() {
       const int elem_idx = atomic_fetch_add_explicit(
          &lm_data->next, 1, metal::memory_order_relaxed);
-      const int chunk_idx = elem_idx >> lm_data->log2_num_elems_per_chunk;
+      const int chunk_idx = get_chunk_index(elem_idx);
       const PtrOffset chunk_ptr_offs = ensure_chunk(chunk_idx);
-      return {elem_idx, chunk_ptr_offs};
+      const auto offset =
+          get_elem_ptr_offs_from_chunk(elem_idx, chunk_ptr_offs);
+      return ReservedElemPtrOffset{offset};
     }
 
     device char *append() {
       auto reserved = reserve_new_elem();
-      return get_elem_from_chunk(reserved.elem_idx, reserved.chunk_ptr_offs);
+      return get_ptr(reserved);
     }
 
     template <typename T>
@@ -104,8 +99,12 @@ STR(
       }
     }
 
+    device char *get_ptr(ReservedElemPtrOffset offs) {
+      return mtl_memalloc_to_ptr(mem_alloc, offs.value());
+    }
+
     device char *get_ptr(int i) {
-      const int chunk_idx = i >> lm_data->log2_num_elems_per_chunk;
+      const int chunk_idx = get_chunk_index(i);
       const PtrOffset chunk_ptr_offs = atomic_load_explicit(
           lm_data->chunks + chunk_idx, metal::memory_order_relaxed);
       return get_elem_from_chunk(i, chunk_ptr_offs);
@@ -117,7 +116,11 @@ STR(
     }
 
    private:
-    PtrOffset ensure_chunk(int i) {
+    inline int get_chunk_index(int elem_idx) const {
+      return elem_idx >> lm_data->log2_num_elems_per_chunk;
+    }
+
+    PtrOffset ensure_chunk(int chunk_idx) {
      PtrOffset offs = 0;
      const int chunk_bytes =
          (lm_data->element_stride << lm_data->log2_num_elems_per_chunk);
@@ -128,11 +131,11 @@ STR(
        // from requesting memory again. Once allocated, set chunks[i] to the
        // actual address offset, which is guaranteed to be greater than 1.
        const bool is_me = atomic_compare_exchange_weak_explicit(
-           lm_data->chunks + i, &stored, 1, metal::memory_order_relaxed,
-           metal::memory_order_relaxed);
+           lm_data->chunks + chunk_idx, &stored, 1,
+           metal::memory_order_relaxed, metal::memory_order_relaxed);
        if (is_me) {
          offs = mtl_memalloc_alloc(mem_alloc, chunk_bytes);
-         atomic_store_explicit(lm_data->chunks + i, offs,
+         atomic_store_explicit(lm_data->chunks + chunk_idx, offs,
                                metal::memory_order_relaxed);
          break;
        } else if (stored > 1) {
@@ -144,11 +147,16 @@ STR(
      return offs;
    }
 
-    device char *get_elem_from_chunk(int i, PtrOffset chunk_ptr_offs) {
-      device char *chunk_ptr = reinterpret_cast<device char *>(
-          mtl_memalloc_to_ptr(mem_alloc, chunk_ptr_offs));
+    PtrOffset get_elem_ptr_offs_from_chunk(int elem_idx,
+                                           PtrOffset chunk_ptr_offs) {
       const uint32_t mask = ((1 << lm_data->log2_num_elems_per_chunk) - 1);
-      return chunk_ptr + ((i & mask) * lm_data->element_stride);
+      return chunk_ptr_offs + ((elem_idx & mask) * lm_data->element_stride);
+    }
+
+    device char *get_elem_from_chunk(int elem_idx, PtrOffset chunk_ptr_offs) {
+      const auto offs =
+          get_elem_ptr_offs_from_chunk(elem_idx, chunk_ptr_offs);
+      return mtl_memalloc_to_ptr(mem_alloc, offs);
     }
   };
 
@@ -172,7 +180,7 @@ STR(
        return free_list.get(cur_used);
      }
 
-     return ElemIndex::from_index(data_list.reserve_new_elem().elem_idx);
+     return data_list.reserve_new_elem();
   }
 
   device byte *get(ElemIndex i) {
@@ -180,7 +188,7 @@ STR(
      data_list.lm_data = &(nm_data->data_list);
      data_list.mem_alloc = mem_alloc;
 
-     return data_list.get_ptr(i.index());
+     return data_list.get_ptr(i);
   }
 
   void recycle(ElemIndex i) {
@@ -328,33 +336,35 @@ STR(
    void activate(int i) {
      device auto *nm_idx_ptr = to_nodemgr_idx_ptr(addr_, i);
-     auto nm_idx_raw =
+     auto nm_idx_val =
          atomic_load_explicit(nm_idx_ptr, metal::memory_order_relaxed);
-     while (!ElemIndex::is_valid(nm_idx_raw)) {
-       nm_idx_raw = 0;
+     while (!ElemIndex::is_valid(nm_idx_val)) {
+       nm_idx_val = 0;
        // See ListManager::ensure_chunk() for the allocation algorithm.
        // See also https://github.com/taichi-dev/taichi/issues/1174.
       const bool is_me = atomic_compare_exchange_weak_explicit(
-          nm_idx_ptr, &nm_idx_raw, 1, metal::memory_order_relaxed,
+          nm_idx_ptr, &nm_idx_val, 1, metal::memory_order_relaxed,
           metal::memory_order_relaxed);
       if (is_me) {
-        nm_idx_raw = nm_.allocate().raw();
-        atomic_store_explicit(nm_idx_ptr, nm_idx_raw,
+        nm_idx_val = nm_.allocate().value();
+        atomic_store_explicit(nm_idx_ptr, nm_idx_val,
                               metal::memory_order_relaxed);
         break;
-      } else if (ElemIndex::is_valid(nm_idx_raw)) {
+      } else if (ElemIndex::is_valid(nm_idx_val)) {
         break;
       }
-      // |nm_idx_raw| == 1, just spin
+      // |nm_idx_val| == 1, just spin
     }
   }
 
   void deactivate(int i) {
     device auto *nm_idx_ptr = to_nodemgr_idx_ptr(addr_, i);
-    const auto old_nm_idx_raw = atomic_exchange_explicit(
+    const auto old_nm_idx_val = atomic_exchange_explicit(
         nm_idx_ptr, 0, metal::memory_order_relaxed);
-    const auto old_nm_idx = ElemIndex::from_raw(old_nm_idx_raw);
-    if (!old_nm_idx.is_valid()) return;
+    const auto old_nm_idx = ElemIndex(old_nm_idx_val);
+    if (!old_nm_idx.is_valid()) {
+      return;
+    }
     nm_.recycle(old_nm_idx);
   }
@@ -366,8 +376,8 @@ STR(
 
   static inline ElemIndex to_nodemgr_idx(device byte * addr, int ch_i) {
     device auto *ptr = to_nodemgr_idx_ptr(addr, ch_i);
-    const auto r = atomic_load_explicit(ptr, metal::memory_order_relaxed);
-    return ElemIndex::from_raw(r);
+    const auto v = atomic_load_explicit(ptr, metal::memory_order_relaxed);
+    return ElemIndex(v);
   }
 
   static bool is_active(device byte * addr, int ch_i) {

From afb66ee4d5273cc0237ade4cfa4a8ff60447a3e3 Mon Sep 17 00:00:00 2001
From: Taichi Gardener
Date: Fri, 30 Oct 2020 10:02:54 -0400
Subject: [PATCH 2/2] [skip ci] enforce code format

---
 .../metal/shaders/runtime_structs.metal.h | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h
index 08cb3b69aade2..15f7a62a72092 100644
--- a/taichi/backends/metal/shaders/runtime_structs.metal.h
+++ b/taichi/backends/metal/shaders/runtime_structs.metal.h
@@ -32,9 +32,9 @@ STR(
     constant constexpr int kTaichiMaxNumIndices = 8;
     constant constexpr int kTaichiNumChunks = 1024;
     constant constexpr int kAlignment = 8;
-    using PtrOffset = int32_t;
+    using PtrOffset = int32_t;
 
-    struct MemoryAllocator {
+    struct MemoryAllocator {
       atomic_int next;
 
       constant constexpr static int kInitOffset = 8;
@@ -58,15 +58,20 @@ STR(
       struct ReservedElemPtrOffset {
        public:
         ReservedElemPtrOffset() = default;
-        explicit ReservedElemPtrOffset(PtrOffset v) : val_(v) {}
+        explicit ReservedElemPtrOffset(PtrOffset v) : val_(v) {
+        }
 
-        inline bool is_valid() const { return is_valid(val_); }
+        inline bool is_valid() const {
+          return is_valid(val_);
+        }
 
-        inline static bool is_valid(PtrOffset v) {
-          return MemoryAllocator::is_valid(v);
+        inline static bool is_valid(PtrOffset v) {
+          return MemoryAllocator::is_valid(v);
         }
 
-        inline PtrOffset value() const { return val_; }
+        inline PtrOffset value() const {
+          return val_;
+        }
 
        private:
         PtrOffset val_{0};
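
Reviewer note, not part of the patch series: both ListManager::ensure_chunk() and SNodeRep_dynamic::activate() above rely on the same two-phase CAS protocol, where a slot value of 0 means unallocated, 1 means another thread is mid-allocation, and anything at or above MemoryAllocator::kInitOffset (8) is a usable offset or index. The following is a minimal host-side C++ sketch of that protocol; ensure_allocated and alloc_fn are illustrative names that do not exist in the Taichi codebase, and the Metal code itself uses atomic_compare_exchange_weak_explicit with relaxed ordering on device memory.

#include <atomic>
#include <cstdint>

// Mirrors MemoryAllocator::kInitOffset: slot values below this are
// reserved (0 = unallocated, 1 = allocation in flight).
constexpr int32_t kInitOffset = 8;

// Returns a valid offset for |slot|, allocating at most once across all
// racing threads. |alloc_fn| stands in for mtl_memalloc_alloc() or
// NodeManager::allocate() and must return a value >= kInitOffset.
template <typename AllocFn>
int32_t ensure_allocated(std::atomic<int32_t> &slot, AllocFn alloc_fn) {
  while (true) {
    int32_t stored = 0;
    // The winner flips 0 -> 1 and performs the allocation exactly once.
    if (slot.compare_exchange_weak(stored, 1, std::memory_order_relaxed)) {
      const int32_t offs = alloc_fn();
      slot.store(offs, std::memory_order_relaxed);
      return offs;
    }
    if (stored >= kInitOffset) {
      return stored;  // Another thread already finished the allocation.
    }
    // stored == 1, or a spurious CAS failure: spin and retry.
  }
}

The sketch uses the kInitOffset validity check that activate() performs via ElemIndex::is_valid(); ensure_chunk() instead accepts any stored value greater than 1, since chunk offsets are only guaranteed to exceed 1.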