
Lazy tls #64

Merged: 11 commits, Jul 8, 2019
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -105,7 +105,7 @@ if(NOT DEFINED SNMALLOC_ONLY_HEADER_LIBRARY)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /Zi")
set(CMAKE_EXE_LINKER_FLAGS_RELEASE "${CMAKE_EXE_LINKER_FLAGS_RELEASE} /DEBUG")
else()
add_compile_options(-march=native -fno-exceptions -fno-rtti -g -ftls-model=initial-exec)
add_compile_options(-march=native -fno-exceptions -fno-rtti -g -ftls-model=initial-exec -fomit-frame-pointer)
endif()

macro(subdirlist result curdir)
113 changes: 96 additions & 17 deletions src/mem/alloc.h
@@ -234,6 +234,11 @@ namespace snmalloc
FastFreeLists() : small_fast_free_lists() {}
};

SNMALLOC_FAST_PATH void* no_replacement(void*)
{
return nullptr;
}

/**
* Allocator. This class is parameterised on three template parameters. The
* `MemoryProvider` defines the source of memory for this allocator.
@@ -245,18 +250,27 @@
* to associate metadata with large (16MiB, by default) regions, allowing an
* allocator to find the allocator responsible for that region.
*
* The final template parameter, `IsQueueInline`, defines whether the
* The next template parameter, `IsQueueInline`, defines whether the
* message queue for this allocator should be stored as a field of the
* allocator (`true`) or provided externally, allowing it to be anywhere else
* in the address space (`false`).
*
* The final template parameter provides a hook to allow the allocator in use
* to be dynamically modified. This is used to implement a trick from
* mimalloc that avoids a conditional branch on the fast path. We initialise
* the thread-local allocator pointer with the address of a global allocator,
* which never owns any memory. When an allocation reaches a slow path, we
* call the replacement function, which can construct a real allocator and
* redirect the call to it.
*/
template<
class MemoryProvider = GlobalVirtual,
class PageMap = SNMALLOC_DEFAULT_PAGEMAP,
bool IsQueueInline = true>
bool IsQueueInline = true,
void* (*Replacement)(void*) = no_replacement>
class Allocator
: public FastFreeLists,
public Pooled<Allocator<MemoryProvider, PageMap, IsQueueInline>>
public Pooled<
Allocator<MemoryProvider, PageMap, IsQueueInline, Replacement>>
{
LargeAlloc<MemoryProvider> large_allocator;
PageMap page_map;
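
The trick described in the comment above can be illustrated with a small stand-alone sketch. Everything below uses made-up names (ToyAlloc, thread_alloc, real_allocs) rather than the snmalloc API; it only shows the shape of the mechanism: the thread-local pointer starts out aimed at a global fake allocator that owns no memory, so the allocation entry point needs no initialisation branch, and the fake allocator's slow path uses the replacement hook to build a real allocator and repoint the thread-local pointer.

```cpp
// Minimal sketch of the lazy-initialisation trick (hypothetical names, not
// the snmalloc API).
#include <cstddef>
#include <cstdlib>

struct ToyAlloc
{
  void* (*replacement)(void*); // plays the role of the Replacement parameter

  void* alloc(std::size_t size)
  {
    // Fast path elided. On the slow path, ask the hook whether a different
    // allocator should handle this request.
    if (void* real = replacement(this))
      return static_cast<ToyAlloc*>(real)->alloc(size);
    return std::malloc(size); // stand-in for the real slow path
  }
};

void* no_replacement(void*) { return nullptr; } // hook for real allocators
void* lazy_replacement(void* self);             // hook for the fake one

ToyAlloc global_fake{lazy_replacement};         // never owns any memory
thread_local ToyAlloc* thread_alloc = &global_fake;

ToyAlloc real_allocs[64]; // stand-in allocator pool
int next_alloc = 0;       // illustration only: not thread-safe

void* lazy_replacement(void* self)
{
  if (self != &global_fake)
    return nullptr; // already a real allocator
  ToyAlloc* real = &real_allocs[next_alloc++];
  real->replacement = no_replacement;
  thread_alloc = real; // later calls skip the fake allocator
  return real;
}

void* my_malloc(std::size_t size)
{
  return thread_alloc->alloc(size); // no "is this thread initialised?" branch
}
```

Under this scheme, `no_replacement` (added at the top of this file) is the default hook, so allocators instantiated directly behave exactly as before.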
@@ -274,7 +288,7 @@ namespace snmalloc
size_t size,
ZeroMem zero_mem = NoZero,
AllowReserve allow_reserve = YesReserve>
ALLOCATOR void* alloc()
SNMALLOC_FAST_PATH ALLOCATOR void* alloc()
{
static_assert(size != 0, "Size must not be zero.");
#ifdef USE_MALLOC
@@ -310,7 +324,7 @@
}

template<ZeroMem zero_mem = NoZero, AllowReserve allow_reserve = YesReserve>
inline ALLOCATOR void* alloc(size_t size)
SNMALLOC_FAST_PATH ALLOCATOR void* alloc(size_t size)
{
#ifdef USE_MALLOC
static_assert(
@@ -637,25 +651,33 @@ namespace snmalloc

struct RemoteCache
{
size_t size = 0;
/**
* The total amount of memory stored awaiting dispatch to other
* allocators. This is initialised to the maximum size that we use
* before caching so that, when we hit the slow path and need to dispatch
* everything, we can check if we are a real allocator and lazily provide
* a real allocator.
*/
size_t size = REMOTE_CACHE;
RemoteList list[REMOTE_SLOTS];

/// Used to find the index into the array of queues for remote
/// deallocation. The parameter r indicates which round of sending
/// this is.
inline size_t get_slot(size_t id, size_t r)
{
constexpr size_t allocator_size =
sizeof(Allocator<MemoryProvider, PageMap, IsQueueInline>);
constexpr size_t allocator_size = sizeof(
Allocator<MemoryProvider, PageMap, IsQueueInline, Replacement>);
constexpr size_t initial_shift =
bits::next_pow2_bits_const(allocator_size);
assert((initial_shift - (r * REMOTE_SLOT_BITS)) < 64);
return (id >> (initial_shift + (r * REMOTE_SLOT_BITS))) & REMOTE_MASK;
}
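
As a concrete illustration of how `get_slot` spreads remote deallocations over the queue array, the sketch below plugs example numbers into the same shift-and-mask computation. The constant values (REMOTE_SLOT_BITS, the allocator size, and hence initial_shift) are assumptions chosen for the example, not the values from the snmalloc headers; the point is that the low bits of the id below the allocator's size granularity carry no information (which is presumably why initial_shift is derived from sizeof(Allocator)), so each sending round r consumes the next REMOTE_SLOT_BITS bits.

```cpp
// Worked example of the slot calculation, using assumed constants.
#include <cstddef>
#include <cstdio>

int main()
{
  constexpr std::size_t REMOTE_SLOT_BITS = 6; // assumed
  constexpr std::size_t REMOTE_MASK = (std::size_t(1) << REMOTE_SLOT_BITS) - 1;
  constexpr std::size_t allocator_size = 8192; // assumed
  constexpr std::size_t initial_shift = 13;    // next_pow2_bits(8192)

  std::size_t id = 0x9c402000; // example allocator id
  for (std::size_t r = 0; r < 3; r++)
  {
    std::size_t slot =
      (id >> (initial_shift + (r * REMOTE_SLOT_BITS))) & REMOTE_MASK;
    std::printf("round %zu -> slot %zu\n", r, slot);
  }
}
```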

SNMALLOC_FAST_PATH void
dealloc(alloc_id_t target_id, void* p, sizeclass_t sizeclass)
dealloc_sized(alloc_id_t target_id, void* p, size_t objectsize)
{
this->size += sizeclass_to_size(sizeclass);
this->size += objectsize;

Remote* r = static_cast<Remote*>(p);
r->set_target_id(target_id);
Expand All @@ -666,6 +688,12 @@ namespace snmalloc
l->last = r;
}

SNMALLOC_FAST_PATH void
dealloc(alloc_id_t target_id, void* p, sizeclass_t sizeclass)
{
dealloc_sized(target_id, p, sizeclass_to_size(sizeclass));
}

void post(alloc_id_t id)
{
// When the cache gets big, post lists to their target allocators.
@@ -780,7 +808,10 @@

public:
Allocator(
MemoryProvider& m, PageMap&& p = PageMap(), RemoteAllocator* r = nullptr)
MemoryProvider& m,
PageMap&& p = PageMap(),
RemoteAllocator* r = nullptr,
bool isFake = false)
: large_allocator(m), page_map(p)
{
if constexpr (IsQueueInline)
@@ -796,6 +827,11 @@
if (id() >= static_cast<alloc_id_t>(-1))
error("Id should not be -1");

// If this is fake, don't do any of the bits of initialisation that may
// allocate memory.
if (isFake)
return;

init_message_queue();
message_queue().invariant();

@@ -1036,7 +1072,7 @@
assert(sizeclass < NUM_SMALL_CLASSES);
auto& fl = small_fast_free_lists[sizeclass];
auto head = fl.value;
if (likely((reinterpret_cast<size_t>(head) & 1) == 0))
if (likely(head != nullptr))
{
void* p = head;
// Read the next slot from the memory that's about to be allocated.
@@ -1055,6 +1091,11 @@
template<ZeroMem zero_mem, AllowReserve allow_reserve>
SNMALLOC_SLOW_PATH void* small_alloc_slow(sizeclass_t sizeclass)
{
if (void* replacement = Replacement(this))
{
return reinterpret_cast<Allocator*>(replacement)
->template small_alloc_slow<zero_mem, allow_reserve>(sizeclass);
}
handle_message_queue();
size_t rsize = sizeclass_to_size(sizeclass);
auto& sl = small_classes[sizeclass];
@@ -1205,6 +1246,12 @@
}
else
{
if (void* replacement = Replacement(this))
{
return reinterpret_cast<Allocator*>(replacement)
->template medium_alloc<zero_mem, allow_reserve>(
sizeclass, rsize, size);
}
slab = reinterpret_cast<Mediumslab*>(
large_allocator.template alloc<NoZero, allow_reserve>(
0, SUPERSLAB_SIZE));
@@ -1277,6 +1324,12 @@
zero_mem == YesZero ? "zeromem" : "nozeromem",
allow_reserve == NoReserve ? "noreserve" : "reserve"));

if (void* replacement = Replacement(this))
{
return reinterpret_cast<Allocator*>(replacement)
->template large_alloc<zero_mem, allow_reserve>(size);
}

size_t size_bits = bits::next_pow2_bits(size);
size_t large_class = size_bits - SUPERSLAB_BITS;
assert(large_class < NUM_LARGE_CLASSES);
@@ -1313,21 +1366,47 @@
large_allocator.dealloc(slab, large_class);
}

SNMALLOC_FAST_PATH void
remote_dealloc(RemoteAllocator* target, void* p, sizeclass_t sizeclass)
#if defined(__GNUC__) && !defined(__clang__) && !defined(__OPTIMIZE__)
// Don't force this to be always inlined in debug builds with GCC, because
// the inlining can fail there and GCC then raises an error for the
// always_inline attribute.
inline
#else
SNMALLOC_FAST_PATH
#endif
void
remote_dealloc(RemoteAllocator* target, void* p, sizeclass_t sizeclass)
{
MEASURE_TIME(remote_dealloc, 4, 16);
assert(target->id() != id());

handle_message_queue();

void* offseted = apply_cache_friendly_offset(p, sizeclass);

// Check whether this will overflow the cache first. If we are a fake
// allocator, then our cache will always be full and so we will never hit
// this path.
size_t sz = sizeclass_to_size(sizeclass);
if ((remote.size + sz) < REMOTE_CACHE)
{
stats().remote_free(sizeclass);
Review comment (Member): Add assert that we are not fake at this point!

Review comment (Member): Perhaps put something similar in other places where we don't expect to see the fake allocator.

remote.dealloc_sized(target->id(), offseted, sz);
return;
}
// Now that we've established that we're in the slow path (if we're a
// real allocator, we will have to empty our cache now), check if we are
// a real allocator and construct one if we aren't.
if (void* replacement = Replacement(this))
{
// We have to do a dealloc, not a remote_dealloc here because this may
// have been allocated with the allocator that we've just had returned.
reinterpret_cast<Allocator*>(replacement)->dealloc(p);
return;
}

stats().remote_free(sizeclass);
remote.dealloc(target->id(), offseted, sizeclass);

if (remote.size < REMOTE_CACHE)
return;

stats().remote_post();
remote.post(id());
}
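
The branch structure of remote_dealloc above is where the RemoteCache::size initialisation earlier in this file pays off. Below is a minimal sketch of the capacity check, with an assumed value for REMOTE_CACHE: a real allocator starts from an empty cache and stays on the fast path, while the fake allocator is constructed with size == REMOTE_CACHE, so its very first remote deallocation fails the check and drops into the slow path where Replacement can supply a real allocator.

```cpp
#include <cstddef>

constexpr std::size_t REMOTE_CACHE = 1 << 20; // assumed value, for illustration

// The test remote_dealloc performs before caching a remote deallocation.
bool stays_on_fast_path(std::size_t cached, std::size_t object_size)
{
  return (cached + object_size) < REMOTE_CACHE;
}

// A real allocator starts with an empty cache:
//   stays_on_fast_path(0, 64)            -> true  (cache locally)
// The fake allocator starts with size == REMOTE_CACHE:
//   stays_on_fast_path(REMOTE_CACHE, 64) -> false (slow path: post the cache,
//                                                  or lazily build a real allocator)
```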
21 changes: 17 additions & 4 deletions src/mem/globalalloc.h
@@ -6,11 +6,25 @@

namespace snmalloc
{
inline void* lazy_replacement(void*);
using Alloc =
Allocator<GlobalVirtual, SNMALLOC_DEFAULT_PAGEMAP, true, lazy_replacement>;

template<class MemoryProvider>
class AllocPool : Pool<Allocator<MemoryProvider>, MemoryProvider>
class AllocPool : Pool<
Allocator<
MemoryProvider,
SNMALLOC_DEFAULT_PAGEMAP,
true,
lazy_replacement>,
MemoryProvider>
{
using Alloc = Allocator<MemoryProvider>;
using Parent = Pool<Allocator<MemoryProvider>, MemoryProvider>;
using Alloc = Allocator<
MemoryProvider,
SNMALLOC_DEFAULT_PAGEMAP,
true,
lazy_replacement>;
using Parent = Pool<Alloc, MemoryProvider>;

public:
static AllocPool* make(MemoryProvider& mp)
@@ -175,5 +189,4 @@ namespace snmalloc
return AllocPool<MemoryProvider>::make(mp);
}

using Alloc = Allocator<GlobalVirtual>;
} // namespace snmalloc
10 changes: 5 additions & 5 deletions src/mem/metaslab.h
@@ -146,10 +146,10 @@ namespace snmalloc
{
#ifndef NDEBUG
size_t length = 0;
void* curr = pointer_offset(slab, head);
void* curr_slow = pointer_offset(slab, head);
void* curr = (head == 1) ? nullptr : pointer_offset(slab, head);
void* curr_slow = (head == 1) ? nullptr : pointer_offset(slab, head);
bool both = false;
while ((reinterpret_cast<size_t>(curr) & 1) == 0)
while (curr != nullptr)
{
curr = follow_next(curr);
if (both)
@@ -200,8 +200,8 @@ namespace snmalloc
UNUSED(length);

// Walk bump-free-list-segment accounting for unused space
void* curr = pointer_offset(slab, head);
while ((address_cast(curr) & 1) == 0)
void* curr = (head == 1) ? nullptr : pointer_offset(slab, head);
while (curr != nullptr)
{
// Check we are looking at a correctly aligned block
void* start = curr;
10 changes: 6 additions & 4 deletions src/mem/slab.h
@@ -7,7 +7,7 @@ namespace snmalloc
struct FreeListHead
{
// Use a value with bottom bit set for empty list.
void* value = pointer_offset<void*>(nullptr, 1);
void* value = nullptr;
};

class Slab
@@ -98,15 +98,16 @@ namespace snmalloc
}
else
{
Metaslab::store_next(curr, pointer_offset(this, bumpptr));
Metaslab::store_next(
curr, (bumpptr == 1) ? nullptr : pointer_offset(this, bumpptr));
}
curr = pointer_offset(this, bumpptr);
bumpptr = newbumpptr;
meta.allocated = meta.allocated + 1;
}

assert(curr != nullptr);
Metaslab::store_next(curr, pointer_offset<void*>(nullptr, 1));
Metaslab::store_next(curr, nullptr);
}
}

@@ -177,7 +178,8 @@
assert(meta.valid_head(is_short()));

// Set the next pointer to the previous head.
Metaslab::store_next(p, pointer_offset(this, head));
Metaslab::store_next(
p, (head == 1) ? nullptr : pointer_offset(this, head));
meta.debug_slab_invariant(is_short(), this);
return true;
}
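
One pattern recurs throughout the metaslab.h and slab.h hunks above: the free-list terminator changes from a pointer with the bottom bit set to plain nullptr, while head and bumpptr remain slab-relative offsets that still use 1 as the empty sentinel. The conversion sprinkled through the diff amounts to the following sketch (the helper name is invented for illustration):

```cpp
#include <cstdint>

// Map a slab-relative offset to a free-list pointer: an offset of 1 marks the
// empty list and becomes the new nullptr terminator; any other offset is added
// to the slab base, as pointer_offset(slab, head) does in the diff.
void* offset_to_free_list_entry(void* slab, std::uintptr_t head)
{
  if (head == 1)
    return nullptr;
  return reinterpret_cast<char*>(slab) + head;
}
```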