From 20476eabfd8c1ab96d1a48dd37be5adcae989cf3 Mon Sep 17 00:00:00 2001 From: waddlesplash Date: Sat, 17 Aug 2019 12:28:41 -0400 Subject: [PATCH 01/69] Add missing NULL checks to the allocation path. --- rpmalloc/rpmalloc.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 451d03de..c5e2a08c 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1273,6 +1273,9 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { //Find a span in one of the cache levels active_span = _memory_heap_extract_new_span(heap, 1, class_idx); + + if (!active_span) + return active_span; //Mark span as owned by this heap and set base data, return first block return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); @@ -1315,6 +1318,9 @@ _memory_allocate_large(heap_t* heap, size_t size) { //Find a span in one of the cache levels span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + + if (!span) + return span; //Mark span as owned by this heap and set base data assert(span->span_count == span_count); From 07552bca2e7d3c2df0df917573ece23f8f44ce67 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 25 Sep 2019 12:15:06 +0200 Subject: [PATCH 02/69] allow up to 4GiB pages --- rpmalloc/rpmalloc.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index c5e2a08c..72790f7d 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -714,7 +714,7 @@ _memory_unmap_span(span_t* span) { assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : (pointer_offset(span, -(int32_t)(span->total_spans_or_distance * _memory_span_size))); + span_t* master = is_master ? 
span : (pointer_offset(span, -(intptr_t)((uintptr_t)span->total_spans_or_distance * _memory_span_size))); assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); assert(master->flags & SPAN_FLAG_MASTER); @@ -1501,8 +1501,8 @@ _memory_deallocate_large(span_t* span) { if (span->flags & SPAN_FLAG_MASTER) { heap->span_reserve_master = span; } else { //SPAN_FLAG_SUBSPAN - uint32_t distance = span->total_spans_or_distance; - span_t* master = pointer_offset(span, -(int32_t)(distance * _memory_span_size)); + uintptr_t distance = span->total_spans_or_distance; + span_t* master = pointer_offset(span, -(intptr_t)(distance * _memory_span_size)); heap->span_reserve_master = master; assert(master->flags & SPAN_FLAG_MASTER); assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); @@ -1839,10 +1839,15 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { } //The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF) - if (_memory_page_size < 512) - _memory_page_size = 512; - if (_memory_page_size > (64 * 1024 * 1024)) - _memory_page_size = (64 * 1024 * 1024); + size_t min_span_size = 512; + size_t max_page_size = 4 * 1024 * 1024; + const size_t ptrbits = sizeof(void*); + if (ptrbits > 4) + max_page_size = 4096ULL * 1024ULL * 1024ULL; + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; _memory_page_size_shift = 0; size_t page_size_bit = _memory_page_size; while (page_size_bit != 1) { From 616295f0113095918d505c675623a17a255a8a7d Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 25 Sep 2019 12:15:17 +0200 Subject: [PATCH 03/69] print file/line info on fail --- test/main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/test/main.c b/test/main.c index 679287f5..ce41dc3b 100644 --- a/test/main.c +++ b/test/main.c @@ -23,11 +23,13 @@ static void test_initialize(void); static int -test_fail(const char* reason) { - fprintf(stderr, "FAIL: %s\n", reason); +test_fail_cb(const char* reason, const char* file, int line) { + fprintf(stderr, "FAIL: %s @ %s:%d\n", reason, file, line); return -1; } +#define test_fail(msg) test_fail_cb(msg, __FILE__, __LINE__) + static int test_alloc(void) { unsigned int iloop = 0; @@ -754,8 +756,8 @@ test_threadspam(void) { num_alloc_threads = _hardware_threads; if (num_alloc_threads < 2) num_alloc_threads = 2; - if (num_alloc_threads > 64) - num_alloc_threads = 64; + if (num_alloc_threads > 16) + num_alloc_threads = 16; arg.loops = 500; arg.passes = 10; From 3bbaeacb80fdb563fe948d791ad59bdffec97255 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 5 Nov 2019 17:28:57 +0100 Subject: [PATCH 04/69] allow preload without override together with mac interpose --- rpmalloc/malloc.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index 426a14ae..9511cf79 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -30,8 +30,6 @@ _Static_assert(sizeof(void*) == 4, "Data type size mismatch"); #pragma GCC visibility push(default) #endif -#if ENABLE_OVERRIDE - #define USE_IMPLEMENT 1 #define USE_INTERPOSE 0 #define USE_ALIAS 0 @@ -39,6 +37,17 @@ _Static_assert(sizeof(void*) == 4, "Data type size mismatch"); #if defined(__APPLE__) && ENABLE_PRELOAD #undef USE_INTERPOSE #define USE_INTERPOSE 1 + +typedef struct interpose_t { + void* new_func; + void* orig_func; +} interpose_t; + +#define MAC_INTERPOSE_PAIR(newf, oldf) { 
(void*)newf, (void*)oldf } +#define MAC_INTERPOSE_SINGLE(newf, oldf) \ +__attribute__((used)) static const interpose_t macinterpose##newf##oldf \ +__attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf) + #endif #if !defined(_WIN32) && !USE_INTERPOSE @@ -55,6 +64,8 @@ _Static_assert(sizeof(void*) == 4, "Data type size mismatch"); #undef calloc #endif +#if ENABLE_OVERRIDE + #if USE_IMPLEMENT extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); } @@ -91,16 +102,6 @@ extern void* _Znajj(uint64_t size, uint64_t align); void* _Znajj(uint64_t size, #if USE_INTERPOSE -typedef struct interpose_t { - void* new_func; - void* orig_func; -} interpose_t; - -#define MAC_INTERPOSE_PAIR(newf, oldf) { (void*)newf, (void*)oldf } -#define MAC_INTERPOSE_SINGLE(newf, oldf) \ -__attribute__((used)) static const interpose_t macinterpose##newf##oldf \ -__attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf) - __attribute__((used)) static const interpose_t macinterpose_malloc[] __attribute__ ((section("__DATA, __interpose"))) = { //new and new[] From c4f3ed40f855d1690c4e1968448fb9791214361d Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 5 Nov 2019 21:10:12 +0100 Subject: [PATCH 05/69] update changelog --- CHANGELOG | 9 +++++++++ README.md | 2 +- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/CHANGELOG b/CHANGELOG index 94c74f21..a5777205 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,3 +1,12 @@ +1.4.1 + +Allow up to 4GiB page sizes + +Added a missing null check in the non-hot allocation code paths + +Fixed compilation issue on macOS when ENABLE_PRELOAD is set but not ENABLE_OVERRIDE + + 1.4.0 Improved cross thread deallocations by using per-span atomic free list to minimize thread diff --git a/README.md b/README.md index c8149b8d..ddf7c705 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The code should be easily portable to any platform with atomic operations and an This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license. -Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) +Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) - Support development through my [GitHub Sponsors page](https://github.com/sponsors/mjansson) # Performance We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2500 lines of C code. All allocations have a natural 16-byte alignment. 
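The interpose_t table that patch 04 moves outside the ENABLE_OVERRIDE guard relies on the standard macOS dyld interposing mechanism: dyld scans the __DATA,__interpose section of every loaded image and rebinds other images' references to orig_func so they call new_func instead, while the interposing image itself keeps resolving to the original. A minimal stand-alone sketch of the same mechanism follows (my_malloc, my_interpose and alloc_calls are hypothetical names, not part of rpmalloc); built as a dylib and loaded via DYLD_INSERT_LIBRARIES it redirects every malloc call in the process:

#include <stdlib.h>

typedef struct interpose_t {
	void* new_func;   /* replacement implementation */
	void* orig_func;  /* symbol being interposed */
} interpose_t;

static size_t alloc_calls;

static void*
my_malloc(size_t size) {
	++alloc_calls;        /* keep the hook trivial; real interposers must avoid re-entrancy */
	return malloc(size);  /* the interposing image itself still reaches the system malloc */
}

/* dyld reads this section at load time and patches every other image's
   references to malloc so they land in my_malloc instead */
__attribute__((used)) static const interpose_t my_interpose[]
	__attribute__((section("__DATA, __interpose"))) = {
	{ (void*)my_malloc, (void*)malloc }
};

rpmalloc's own macinterpose_malloc table (shown in the hunk above) lists the malloc family and the C++ new/delete entry points in exactly this way; patch 04 makes the type and macros available whenever ENABLE_PRELOAD is set so that the macOS build no longer requires ENABLE_OVERRIDE as well.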
From fb8c333019a0b477f29c4914854a878ccc400b0e Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 8 Nov 2019 23:36:02 +0100 Subject: [PATCH 06/69] avoid redefining win32 preinclude def --- rpmalloc/rpmalloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 72790f7d..6838147e 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -113,7 +113,9 @@ # define FORCEINLINE inline __attribute__((__always_inline__)) #endif #if PLATFORM_WINDOWS -# define WIN32_LEAN_AND_MEAN +# ifndef WIN32_LEAN_AND_MEAN +# define WIN32_LEAN_AND_MEAN +# endif # include # if ENABLE_VALIDATE_ARGS # include From 26147f094a43dc1eaa345a4a4e3d4122ee688d44 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 9 Nov 2019 12:39:31 +0100 Subject: [PATCH 07/69] Allow huge pages to split into multiple heap control structures (#121) --- CHANGELOG | 3 + README.md | 8 +- rpmalloc/rpmalloc.c | 192 +++++++++++++++++++++++++++++--------------- 3 files changed, 135 insertions(+), 68 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a5777205..e2eb4734 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,6 +2,9 @@ Allow up to 4GiB page sizes +Fix an issue where large page sizes in conjunction with many threads waste a lot of memory (previously +each heap occupied an entire memory page, now heaps can now share a memory page) + Added a missing null check in the non-hot allocation code paths Fixed compilation issue on macOS when ENABLE_PRELOAD is set but not ENABLE_OVERRIDE diff --git a/README.md b/README.md index ddf7c705..370c6397 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # rpmalloc - Rampant Pixels Memory Allocator This library provides a public domain cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/mjansson/rpmalloc +Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) - Support development through my [GitHub Sponsors page](https://github.com/sponsors/mjansson) + Platforms currently supported: - Windows @@ -14,8 +16,6 @@ The code should be easily portable to any platform with atomic operations and an This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license. -Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) - Support development through my [GitHub Sponsors page](https://github.com/sponsors/mjansson) - # Performance We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2500 lines of C code. All allocations have a natural 16-byte alignment. @@ -154,6 +154,10 @@ VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this gran All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__. +# Other languages + +[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) + # License This is free and unencumbered software released into the public domain. 
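The rpmalloc.c hunks that follow implement the heap sharing described in the CHANGELOG entry above: instead of dedicating a whole memory page to every heap, one mapped block is carved into several heap_t control structures. Each structure is aligned to HEAP_ORPHAN_ABA_SIZE (512 bytes) so the low 9 bits of any heap pointer remain free to carry the ABA counter of the lock-free orphan list, and the extra heaps record a master_heap pointer so that only the owner of the block is unmapped during finalization. A condensed sketch of the arithmetic, using simplified hypothetical helper names rather than the real functions:

#include <stddef.h>
#include <stdint.h>

#define HEAP_ORPHAN_ABA_SIZE 512

/* Round the heap struct size up so every carved heap starts on a
   HEAP_ORPHAN_ABA_SIZE boundary (its low 9 pointer bits are then zero) */
static size_t
aligned_heap_size(size_t heap_struct_size) {
	size_t size = heap_struct_size;
	if (size % HEAP_ORPHAN_ABA_SIZE)
		size += HEAP_ORPHAN_ABA_SIZE - (size % HEAP_ORPHAN_ABA_SIZE);
	return size;
}

/* Number of heap control structures that fit in one mapped block */
static size_t
heaps_per_block(size_t block_size, size_t heap_struct_size) {
	return block_size / aligned_heap_size(heap_struct_size);
}

/* Pack/unpack the ABA counter into the unused low bits of a heap pointer,
   mirroring _memory_heap_orphan and _memory_allocate_heap in the hunks below */
static void*
orphan_tag(void* heap, uintptr_t counter) {
	return (void*)((uintptr_t)heap | (counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)));
}

static void*
orphan_untag(void* raw_heap) {
	return (void*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1));
}

Because each carved heap is 512-byte aligned, the low 9 bits of a genuine heap pointer are always zero, which is what makes the tagging reversible.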
diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 6838147e..0253d7aa 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -232,6 +232,8 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) //! Size of a span header (must be a multiple of SMALL_GRANULARITY) #define SPAN_HEADER_SIZE 96 +//! ABA protection size in orhpan heap list (also becomes limit of smallest page size) +#define HEAP_ORPHAN_ABA_SIZE 512 #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow @@ -414,6 +416,8 @@ struct heap_t { //! Allocation stats per size class size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; #endif + //! Master heap owning the memory pages + heap_t* master_heap; }; struct size_class_t { @@ -1371,6 +1375,37 @@ _memory_allocate(heap_t* heap, size_t size) { return _memory_allocate_oversized(heap, size); } +static void +_memory_heap_initialize(heap_t* heap) { + memset(heap, 0, sizeof(heap_t)); + + //Get a new heap ID + heap->id = atomic_incr32(&_memory_heap_id); + assert(heap->id != 0); + //assert(!_memory_heap_lookup(heap->id)); + + //Link in heap in heap ID map + heap_t* next_heap; + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); +} + +static void +_memory_heap_orphan(heap_t* heap) { + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + do { + last_heap = atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (void*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); + } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); +} + //! 
Allocate a new heap static heap_t* _memory_allocate_heap(void) { @@ -1383,36 +1418,38 @@ _memory_allocate_heap(void) { atomic_thread_fence_acquire(); do { raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (void*)((uintptr_t)raw_heap & ~(uintptr_t)0x1FF); + heap = (void*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); if (!heap) break; next_heap = heap->next_orphan; orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)0x1FF)); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); if (!heap) { //Map in pages for a new heap size_t align_offset = 0; - heap = _memory_map((1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size, &align_offset); + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + heap = _memory_map(block_size, &align_offset); if (!heap) return heap; - memset(heap, 0, sizeof(heap_t)); - heap->align_offset = align_offset; - //Get a new heap ID - do { - heap->id = atomic_incr32(&_memory_heap_id); - if (_memory_heap_lookup(heap->id)) - heap->id = 0; - } while (!heap->id); + _memory_heap_initialize(heap); + heap->align_offset = align_offset; - //Link in heap in heap ID map - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits + size_t aligned_heap_size = sizeof(heap_t); + if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) + aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); + size_t num_heaps = block_size / aligned_heap_size; + heap_t* extra_heap = pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _memory_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _memory_heap_orphan(extra_heap); + extra_heap = pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } } return heap; @@ -1691,15 +1728,7 @@ _memory_heap_finalize(void* heapptr) { #endif //Orphan the heap - void* raw_heap; - uintptr_t orphan_counter; - heap_t* last_heap; - do { - last_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = (void*)((uintptr_t)last_heap & ~(uintptr_t)0x1FF); - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)0x1FF)); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + _memory_heap_orphan(heap); set_thread_heap(0); @@ -1840,8 +1869,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_huge_pages = 1; } - //The ABA counter in heap orphan list is tied to using 512 (bitmask 0x1FF) - size_t min_span_size = 512; + //The ABA counter in heap orphan list is tied to using HEAP_ORPHAN_ABA_SIZE + size_t min_span_size = HEAP_ORPHAN_ABA_SIZE; size_t max_page_size = 4 * 1024 * 1024; const size_t ptrbits = sizeof(void*); if (ptrbits > 4) @@ -1895,7 +1924,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { fls_key = FlsAlloc(&rp_thread_destructor); #endif - atomic_store32(&_memory_heap_id, 0); + atomic_store32(&_memory_heap_id, 1); atomic_store32(&_memory_orphan_counter, 0); #if ENABLE_STATISTICS atomic_store32(&_memory_active_heaps, 
0); @@ -1947,6 +1976,7 @@ rpmalloc_finalize(void) { //rpmalloc_dump_statistics(stderr); //Free all thread caches + heap_t* master_heaps = 0; for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); while (heap) { @@ -1994,12 +2024,23 @@ rpmalloc_finalize(void) { } #endif heap_t* next_heap = heap->next_heap; - size_t heap_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - _memory_unmap(heap, heap_size, heap->align_offset, heap_size); + if (!heap->master_heap) { + heap->next_heap = master_heaps; + master_heaps = heap; + } heap = next_heap; } } + //Finally free all master heaps pages + heap_t* master_heap = master_heaps; + while (master_heap) { + heap_t* next_heap = master_heap->next_heap; + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + _memory_unmap(master_heap, block_size, master_heap->align_offset, block_size); + master_heap = next_heap; + } + #if ENABLE_GLOBAL_CACHE //Free global caches for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) @@ -2435,61 +2476,80 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { #endif } +#if ENABLE_STATISTICS + +static void +_memory_heap_dump_statistics(heap_t* heap, void* file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!heap->size_class_use[iclass].alloc_total) + continue; + fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + heap->size_class_use[iclass].alloc_total, + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + heap->size_class_use[iclass].spans_current, + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), + heap->size_class_use[iclass].spans_map_calls); + } + fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + continue; + fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), + atomic_load32(&heap->span_use[iclass].current), + heap->span_use[iclass].high, + ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, + ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + heap->span_use[iclass].spans_map_calls); + } + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); +} + +#endif + void rpmalloc_dump_statistics(void* file) { #if ENABLE_STATISTICS //If you hit this assert, you still have active threads or forgot to finalize some thread(s) assert(atomic_load32(&_memory_active_heaps) == 0); - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); while (heap) { - fprintf(file, "Heap %d stats:\n", heap->id); - fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) { if (!heap->size_class_use[iclass].alloc_total) { assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); assert(!heap->size_class_use[iclass].spans_map_calls); continue; } - fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, - atomic_load32(&heap->size_class_use[iclass].alloc_current), - heap->size_class_use[iclass].alloc_peak, - heap->size_class_use[iclass].alloc_total, - atomic_load32(&heap->size_class_use[iclass].free_total), - _memory_size_class[iclass].block_size, - _memory_size_class[iclass].block_count, - heap->size_class_use[iclass].spans_current, - heap->size_class_use[iclass].spans_peak, - ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), - heap->size_class_use[iclass].spans_map_calls); + need_dump = 1; } - fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) { if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) continue; - fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), - atomic_load32(&heap->span_use[iclass].current), - heap->span_use[iclass].high, - ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / 
(size_t)(1024 * 1024), - heap->span_cache[iclass] ? heap->span_cache[iclass]->list_size : 0, - ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - heap->span_use[iclass].spans_map_calls); + need_dump = 1; } - fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); + if (need_dump) + _memory_heap_dump_statistics(heap, file); heap = heap->next_heap; } } - fprintf(file, "Global stats:\n"); size_t huge_current = (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; From 106dbfc7b810fff3f27d057fd5d644c8c01c9d34 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 13 Nov 2019 09:51:53 +0100 Subject: [PATCH 08/69] Add funding config --- .github/FUNDING.yml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .github/FUNDING.yml diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..f4801977 --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: [mjansson] From 5aa754c14645303f12cd147c78da07af95886af6 Mon Sep 17 00:00:00 2001 From: Johann Neuhauser <39044176+jneuhauser@users.noreply.github.com> Date: Fri, 29 Nov 2019 08:50:42 +0100 Subject: [PATCH 09/69] Add cross compile setup from environment (#126) --- build/ninja/clang.py | 25 +++++++++++++++---------- build/ninja/gcc.py | 22 +++++++++++++--------- 2 files changed, 28 insertions(+), 19 deletions(-) diff --git a/build/ninja/clang.py b/build/ninja/clang.py index e9ddc508..024b00ad 100644 --- a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -15,13 +15,14 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.sdkpath = '' self.includepaths = [] self.libpaths = libpaths - self.ccompiler = 'clang' - self.cxxcompiler = 'clang++' - self.archiver = 'ar' - self.linker = 'clang' - self.cxxlinker = 'clang++' + self.ccompiler = os.environ.get('CC') or 'clang' + self.cxxcompiler = os.environ.get('CXX') or 'clang++' if self.target.is_windows(): - self.archiver = 'llvm-ar' + self.archiver = os.environ.get('AR') or 'llvm-ar' + else: + self.archiver = os.environ.get('AR') or 'ar' + self.linker = os.environ.get('CC') or 'clang' + self.cxxlinker = os.environ.get('CXX') or 'clang++' #Default variables self.sysroot = '' @@ -31,12 +32,12 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.deploymenttarget = '10.7' #Command definitions - self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags -c $in -o $out' - self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags -c $in -o $out' + self.cccmd 
= '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags $cenvflags -c $in -o $out' + self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags $cxxenvflags -c $in -o $out' self.ccdeps = 'gcc' self.ccdepfile = '$out.d' - self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $out $in' - self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags -o $out $in $libs $archlibs $oslibs $frameworks' + self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $arenvflags $out $in' + self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags -o $out $in $libs $archlibs $oslibs $frameworks' #Base flags self.cflags = ['-D' + project.upper() + '_COMPILE=1', @@ -155,12 +156,16 @@ def write_variables(self, writer): writer.variable('carchflags', '') writer.variable('cconfigflags', '') writer.variable('cmoreflags', self.cmoreflags) + writer.variable('cenvflags', (os.environ.get('CFLAGS') or '').split()) + writer.variable('cxxenvflags', (os.environ.get('CXXFLAGS') or '').split()) writer.variable('arflags', self.arflags) writer.variable('ararchflags', '') writer.variable('arconfigflags', '') + writer.variable('arenvflags', (os.environ.get('ARFLAGS') or '').split()) writer.variable('linkflags', self.linkflags) writer.variable('linkarchflags', '') writer.variable('linkconfigflags', '') + writer.variable('linkenvflags', (os.environ.get('LDFLAGS') or '').split()) writer.variable('libs', '') writer.variable('libpaths', self.make_libpaths(self.libpaths)) writer.variable('configlibpaths', '') diff --git a/build/ninja/gcc.py b/build/ninja/gcc.py index 21fc2e49..20646c19 100644 --- a/build/ninja/gcc.py +++ b/build/ninja/gcc.py @@ -13,19 +13,19 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.toolchain = '' self.includepaths = [] self.libpaths = libpaths - self.ccompiler = 'gcc' - self.cxxcompiler = 'g++' - self.archiver = 'ar' - self.linker = 'gcc' - self.cxxlinker = 'g++' + self.ccompiler = os.environ.get('CC') or 'gcc' + self.cxxcompiler = os.environ.get('CXX') or 'g++' + self.archiver = os.environ.get('AR') or 'ar' + self.linker = os.environ.get('CC') or 'gcc' + self.cxxlinker = os.environ.get('CXX') or 'g++' #Command definitions - self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags -c $in -o $out' - self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags -c $in -o $out' + self.cccmd = '$toolchain$cc -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cflags $carchflags $cconfigflags $cmoreflags $cenvflags -c $in -o $out' + self.cxxcmd = '$toolchain$cxx -MMD -MT $out -MF $out.d $includepaths $moreincludepaths $cxxflags $carchflags $cconfigflags $cmoreflags $cxxenvflags -c $in -o $out' self.ccdeps = 'gcc' self.ccdepfile = '$out.d' - self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $out $in' - self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags -o $out $in $libs $archlibs $oslibs' + self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $arenvflags $out $in' + self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags 
$linkarchflags $linkconfigflags $linkenvflags -o $out $in $libs $archlibs $oslibs' #Base flags self.cflags = ['-D' + project.upper() + '_COMPILE=1', @@ -118,12 +118,16 @@ def write_variables(self, writer): writer.variable('carchflags', '') writer.variable('cconfigflags', '') writer.variable('cmoreflags', self.cmoreflags) + writer.variable('cenvflags', (os.environ.get('CFLAGS') or '').split()) + writer.variable('cxxenvflags', (os.environ.get('CXXFLAGS') or '').split()) writer.variable('arflags', self.arflags) writer.variable('ararchflags', '') writer.variable('arconfigflags', '') + writer.variable('arenvflags', (os.environ.get('ARFLAGS') or '').split()) writer.variable('linkflags', self.linkflags) writer.variable('linkarchflags', '') writer.variable('linkconfigflags', '') + writer.variable('linkenvflags', (os.environ.get('LDFLAGS') or '').split()) writer.variable('libs', '') writer.variable('libpaths', self.make_libpaths(self.libpaths)) writer.variable('configlibpaths', '') From a6664c74610c42ba5d565a9caf6999c2fdce693f Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 11 Dec 2019 23:02:13 +0100 Subject: [PATCH 10/69] compatibility changes for cpp (#129) --- rpmalloc/rpmalloc.c | 74 +++++++++++++++++++++------------------------ 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 0253d7aa..34425010 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -107,10 +107,14 @@ /// Platform and arch specifics #if defined(_MSC_VER) && !defined(__clang__) -# define FORCEINLINE inline __forceinline +# ifndef FORCEINLINE +# define FORCEINLINE inline __forceinline +# endif # define _Static_assert static_assert #else -# define FORCEINLINE inline __attribute__((__always_inline__)) +# ifndef FORCEINLINE +# define FORCEINLINE inline __attribute__((__always_inline__)) +# endif #endif #if PLATFORM_WINDOWS # ifndef WIN32_LEAN_AND_MEAN @@ -565,16 +569,6 @@ _memory_map_os(size_t size, size_t* offset); static void _memory_unmap_os(void* address, size_t size, size_t offset, size_t release); -//! Lookup a memory heap from heap ID -static heap_t* -_memory_heap_lookup(int32_t id) { - uint32_t list_idx = id % HEAP_ARRAY_SIZE; - heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); - while (heap && (heap->id != id)) - heap = heap->next_heap; - return heap; -} - #if ENABLE_STATISTICS # define _memory_statistics_inc(counter, value) counter += value # define _memory_statistics_dec(counter, value) counter -= value @@ -644,7 +638,7 @@ static span_t* _memory_map_from_reserve(heap_t* heap, size_t span_count) { //Update the heap span reserve span_t* span = heap->span_reserve; - heap->span_reserve = pointer_offset(span, span_count * _memory_span_size); + heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); heap->spans_reserved -= span_count; _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); @@ -688,7 +682,7 @@ _memory_map_aligned_span_count(heap_t* heap, size_t span_count) { //full set of spans. 
Otherwise we would waste memory if page size > span size (huge pages) size_t aligned_span_count = _memory_map_align_span_count(span_count); size_t align_offset = 0; - span_t* span = _memory_map(aligned_span_count * _memory_span_size, &align_offset); + span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset); if (!span) return 0; _memory_span_initialize(span, aligned_span_count, span_count, align_offset); @@ -700,7 +694,8 @@ _memory_map_aligned_span_count(heap_t* heap, size_t span_count) { _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); _memory_heap_cache_insert(heap, heap->span_reserve); } - _memory_heap_set_reserved_spans(heap, span, pointer_offset(span, span_count * _memory_span_size), aligned_span_count - span_count); + span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); + _memory_heap_set_reserved_spans(heap, span, reserved_spans, aligned_span_count - span_count); } return span; } @@ -720,7 +715,7 @@ _memory_unmap_span(span_t* span) { assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : (pointer_offset(span, -(intptr_t)((uintptr_t)span->total_spans_or_distance * _memory_span_size))); + span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->total_spans_or_distance * _memory_span_size))); assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); assert(master->flags & SPAN_FLAG_MASTER); @@ -894,7 +889,7 @@ _memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { void* current_cache, *new_cache; do { current_cache = atomic_load_ptr(&cache->cache); - span->prev = (void*)((uintptr_t)current_cache & _memory_span_mask); + span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); } @@ -907,7 +902,7 @@ _memory_cache_extract(global_cache_t* cache) { void* global_span = atomic_load_ptr(&cache->cache); span_ptr = (uintptr_t)global_span & _memory_span_mask; if (span_ptr) { - span_t* span = (void*)span_ptr; + span_t* span = (span_t*)span_ptr; //By accessing the span ptr before it is swapped out of list we assume that a contending thread //does not manage to traverse the span to being unmapped before we access it void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); @@ -924,9 +919,9 @@ _memory_cache_extract(global_cache_t* cache) { static void _memory_cache_finalize(global_cache_t* cache) { void* current_cache = atomic_load_ptr(&cache->cache); - span_t* span = (void*)((uintptr_t)current_cache & _memory_span_mask); + span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); while (span) { - span_t* skip_span = (void*)((uintptr_t)span->prev & _memory_span_mask); + span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); atomic_add32(&cache->size, -(int32_t)span->list_size); _memory_unmap_span_list(span); span = skip_span; @@ -963,11 +958,11 @@ _memory_global_cache_extract(size_t span_count) { static void _memory_heap_cache_adopt_deferred(heap_t* heap) { atomic_thread_fence_acquire(); - span_t* span = atomic_load_ptr(&heap->span_cache_deferred); + span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); if 
(!span) return; do { - span = atomic_load_ptr(&heap->span_cache_deferred); + span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); while (span) { span_t* next_span = span->next; @@ -1345,7 +1340,7 @@ _memory_allocate_huge(size_t size) { if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; - span_t* span = _memory_map(num_pages * _memory_page_size, &align_offset); + span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); if (!span) return span; //Store page count in span_count @@ -1382,13 +1377,12 @@ _memory_heap_initialize(heap_t* heap) { //Get a new heap ID heap->id = atomic_incr32(&_memory_heap_id); assert(heap->id != 0); - //assert(!_memory_heap_lookup(heap->id)); //Link in heap in heap ID map heap_t* next_heap; size_t list_idx = heap->id % HEAP_ARRAY_SIZE; do { - next_heap = atomic_load_ptr(&_memory_heaps[list_idx]); + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); heap->next_heap = next_heap; } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); } @@ -1399,8 +1393,8 @@ _memory_heap_orphan(heap_t* heap) { uintptr_t orphan_counter; heap_t* last_heap; do { - last_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap->next_orphan = (void*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); @@ -1418,7 +1412,7 @@ _memory_allocate_heap(void) { atomic_thread_fence_acquire(); do { raw_heap = atomic_load_ptr(&_memory_orphan_heaps); - heap = (void*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); if (!heap) break; next_heap = heap->next_orphan; @@ -1430,7 +1424,7 @@ _memory_allocate_heap(void) { //Map in pages for a new heap size_t align_offset = 0; size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - heap = _memory_map(block_size, &align_offset); + heap = (heap_t*)_memory_map(block_size, &align_offset); if (!heap) return heap; @@ -1442,12 +1436,12 @@ _memory_allocate_heap(void) { if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); size_t num_heaps = block_size / aligned_heap_size; - heap_t* extra_heap = pointer_offset(heap, aligned_heap_size); + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); while (num_heaps > 1) { _memory_heap_initialize(extra_heap); extra_heap->master_heap = heap; _memory_heap_orphan(extra_heap); - extra_heap = pointer_offset(extra_heap, aligned_heap_size); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); --num_heaps; } } @@ -1484,7 +1478,7 @@ _memory_deallocate_defer(span_t* span, void* block) { span_t* last_head; heap_t* heap = span->heap; do { - last_head = atomic_load_ptr(&heap->span_cache_deferred); + last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); span->next = last_head; } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); return; @@ -1541,7 +1535,7 @@ _memory_deallocate_large(span_t* span) { heap->span_reserve_master = span; 
} else { //SPAN_FLAG_SUBSPAN uintptr_t distance = span->total_spans_or_distance; - span_t* master = pointer_offset(span, -(intptr_t)(distance * _memory_span_size)); + span_t* master = (span_t*)pointer_offset(span, -(intptr_t)(distance * _memory_span_size)); heap->span_reserve_master = master; assert(master->flags & SPAN_FLAG_MASTER); assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); @@ -1566,7 +1560,7 @@ _memory_deallocate_huge(span_t* span) { static void _memory_deallocate(void* p) { //Grab the span (always at start of span, using span alignment) - span_t* span = (void*)((uintptr_t)p & _memory_span_mask); + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); if (UNEXPECTED(!span)) return; if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) @@ -1582,7 +1576,7 @@ static void* _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { if (p) { //Grab the span using guaranteed span alignment - span_t* span = (void*)((uintptr_t)p & _memory_span_mask); + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); if (span->heap) { if (span->size_class < SIZE_CLASS_COUNT) { //Small/medium sized block @@ -1658,7 +1652,7 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { static size_t _memory_usable_size(void* p) { //Grab the span using guaranteed span alignment - span_t* span = (void*)((uintptr_t)p & _memory_span_mask); + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); if (span->heap) { //Small/medium block if (span->size_class < SIZE_CLASS_COUNT) { @@ -1699,7 +1693,7 @@ _memory_adjust_size_class(size_t iclass) { static void _memory_heap_finalize(void* heapptr) { - heap_t* heap = heapptr; + heap_t* heap = (heap_t*)heapptr; if (!heap) return; //Release thread cache spans back to global cache @@ -1978,7 +1972,7 @@ rpmalloc_finalize(void) { //Free all thread caches heap_t* master_heaps = 0; for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); while (heap) { if (heap->spans_reserved) { span_t* span = _memory_map_spans(heap, heap->spans_reserved); @@ -2345,7 +2339,7 @@ rpaligned_alloc(size_t alignment, size_t size) { align_offset = 0; mapped_size = num_pages * _memory_page_size; - span = _memory_map(mapped_size, &align_offset); + span = (span_t*)_memory_map(mapped_size, &align_offset); if (!span) { errno = ENOMEM; return 0; @@ -2424,7 +2418,7 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { if (heap->span_cache[iclass]) stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; - span_t* deferred_list = !iclass ? atomic_load_ptr(&heap->span_cache_deferred) : 0; + span_t* deferred_list = !iclass ? 
(span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; //TODO: Incorrect, for deferred lists the size is NOT stored in list_size if (deferred_list) stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; From 8067c343f3dc9e68d8b4a12216cafc091e9a5edd Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 25 Dec 2019 23:37:13 +0100 Subject: [PATCH 11/69] remove old reference to guards --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index 370c6397..d443d412 100644 --- a/README.md +++ b/README.md @@ -79,8 +79,6 @@ Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is de Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. -Overwrite and underwrite guards are enabled if __ENABLE_GUARDS__ is defined to 1 (default is 0, or disabled), either on compile command line or by settings the value in `rpmalloc.c`. This will introduce up to 64 byte overhead on each allocation to store magic numbers, which will be verified when freeing the memory block. The actual overhead is dependent on the requested size compared to size class limits. - To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization of finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1. To enable the runtime configurable memory page and span sizes, define __ENABLE_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB. From cab9e2ef2eff596e92cea486035eff854125fcee Mon Sep 17 00:00:00 2001 From: Stas Denisov Date: Sat, 28 Dec 2019 16:59:52 +0500 Subject: [PATCH 12/69] Mention a C# wrapper (#130) --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index d443d412..69d8fdc8 100644 --- a/README.md +++ b/README.md @@ -156,6 +156,8 @@ All entry points assume the passed values are valid, for example passing an inva [Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) +[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp) + # License This is free and unencumbered software released into the public domain. 
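Most of patch 10 above consists of explicit pointer casts such as (span_t*)pointer_offset(...) and (heap_t*)atomic_load_ptr(...). C converts void* to any object pointer type implicitly, but C++ requires an explicit cast, so these casts are what let rpmalloc.c also be compiled by a C++ compiler. A minimal stand-alone illustration (not rpmalloc code) that builds as both C and C++:

#include <stdlib.h>

typedef struct span_t { int dummy; } span_t;

int
main(void) {
	/* In C, "span_t* span = malloc(sizeof(span_t));" is accepted because the
	   void* result converts implicitly; a C++ compiler rejects that line,
	   so the shared source spells out the cast: */
	span_t* span = (span_t*)malloc(sizeof(span_t));
	free(span);
	return 0;
}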
From 3792374b5aac7c86daaa3911179c64c56fe607fd Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 2 Jan 2020 11:04:36 +0100 Subject: [PATCH 13/69] First class heaps (#131) --- CHANGELOG | 6 + README.md | 16 +- configure.py | 3 +- rpmalloc/rpmalloc.c | 946 +++++++++++++++++++++++++++++--------------- rpmalloc/rpmalloc.h | 86 +++- test/main.c | 140 ++++++- 6 files changed, 865 insertions(+), 332 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index e2eb4734..3e24c064 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -9,6 +9,12 @@ Added a missing null check in the non-hot allocation code paths Fixed compilation issue on macOS when ENABLE_PRELOAD is set but not ENABLE_OVERRIDE +New first class heap API allowing explicit heap control and release of entire heap in a single call + +Added rpaligned_calloc function for aligned and zero intialized allocations + +Fixed natural alignment check in rpaligned_realloc to 16 bytes (check was 32, which is wrong) + 1.4.0 diff --git a/README.md b/README.md index 69d8fdc8..2f3eab34 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# rpmalloc - Rampant Pixels Memory Allocator +# rpmalloc - RP Memory Allocator This library provides a public domain cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/mjansson/rpmalloc Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) - Support development through my [GitHub Sponsors page](https://github.com/sponsors/mjansson) @@ -17,7 +17,7 @@ The code should be easily portable to any platform with atomic operations and an This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license. # Performance -We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2500 lines of C code. All allocations have a natural 16-byte alignment. +We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2800 lines of C code. All allocations have a natural 16-byte alignment. Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments. @@ -50,6 +50,8 @@ Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replaceme If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero which will include the `malloc.c` file in compilation of __rpmalloc.c__. 
The list of libc entry points replaced may not be complete, use libc replacement only as a convenience for testing the library on an existing code base, not a final solution. +For explicit first class heaps, see the __rpmalloc_heap_*__ API under [first class heaps](#first-class-heaps) section + # Building To compile as a static library run the configure python script which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides. @@ -120,11 +122,6 @@ A span that is a subspan of a larger super span can be individually decommitted If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting invididual pages and the total super span byte size for finally releasing the entire super span memory range. -# Memory guards -If you define the __ENABLE_GUARDS__ to 1, all memory allocations will be padded with extra guard areas before and after the memory block (while still honoring the requested alignment). These dead zones will be filled with a pattern and checked when the block is freed. If the patterns are not intact the callback set in initialization config is called, or if not set an assert is fired. - -Note that the end of the memory block in this case is defined by the total usable size of the block as returned by `rpmalloc_usable_size`, which can be larger than the size passed to allocation request due to size class buckets. - # Memory fragmentation There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class is split up in perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class. @@ -132,6 +129,9 @@ However, there is memory fragmentation in the meaning that a request for x bytes rpmalloc keeps an "active span" and free list for each size class. This leads to back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span. +# First class heaps +rpmalloc provides a first class heap type with explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__ and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe, the caller must make sure that each heap is only used in a single thread at any given time. + # Producer-consumer scenario Compared to the some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. 
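A usage sketch of the first class heap API described in the section above: it assumes a build with RPMALLOC_FIRST_CLASS_HEAPS=1 (the configure.py change in this patch enables it for the test binary) and the entry points patch 13 adds to rpmalloc.h, so treat the exact signatures as illustrative rather than authoritative:

#include "rpmalloc.h"

static void
scratch_work(void) {
	/* Each heap must only be used from one thread at a time */
	rpmalloc_heap_t* heap = rpmalloc_heap_acquire();

	void* small = rpmalloc_heap_alloc(heap, 128);
	void* large = rpmalloc_heap_alloc(heap, 64 * 1024);

	rpmalloc_heap_free(heap, small);  /* blocks can be freed individually... */
	(void)large;
	rpmalloc_heap_free_all(heap);     /* ...or every remaining block in one call */

	rpmalloc_heap_release(heap);      /* hand the heap back for reuse */
}

int
main(void) {
	rpmalloc_initialize();
	scratch_work();
	rpmalloc_finalize();
	return 0;
}

The single call to rpmalloc_heap_free_all is the scoped-allocation use case the section describes: ownership of individual blocks never has to be tracked as long as their lifetime ends with the heap's scope.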
In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages as blocks in a memory page will not be aliased between different owning threads. @@ -192,7 +192,7 @@ not recognized in your country The MIT License (MIT) -Copyright (c) 2017 Rampant Pixels AB +Copyright (c) 2017 Mattias Jansson Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/configure.py b/configure.py index b3e77dbc..514189ed 100755 --- a/configure.py +++ b/configure.py @@ -12,6 +12,7 @@ generator = generator.Generator(project = 'rpmalloc', variables = [('bundleidentifier', 'com.rampantpixels.rpmalloc.$(binname)')]) rpmalloc_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c']) +rpmalloc_test_lib = generator.lib(module = 'rpmalloc', libname = 'rpmalloc-test', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1', 'RPMALLOC_FIRST_CLASS_HEAPS=1', 'RPMALLOC_CONFIGURABLE=1']}) if not generator.target.is_android() and not generator.target.is_ios(): rpmalloc_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmalloc', sources = ['rpmalloc.c']) @@ -19,5 +20,5 @@ rpmallocwrap_so = generator.sharedlib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_PRELOAD=1', 'ENABLE_OVERRIDE=1']}) rpmallocwrap_lib = generator.lib(module = 'rpmalloc', libname = 'rpmallocwrap', sources = ['rpmalloc.c'], variables = {'defines': ['ENABLE_PRELOAD=1', 'ENABLE_OVERRIDE=1']}) - generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmalloc_lib], libs = ['rpmalloc'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1']}) + generator.bin(module = 'test', sources = ['thread.c', 'main.c'], binname = 'rpmalloc-test', implicit_deps = [rpmalloc_test_lib], libs = ['rpmalloc-test'], includepaths = ['rpmalloc', 'test'], variables = {'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1', 'RPMALLOC_FIRST_CLASS_HEAPS=1', 'RPMALLOC_CONFIGURABLE=1']}) generator.bin(module = 'test', sources = ['thread.c', 'main-override.cc'], binname = 'rpmallocwrap-test', implicit_deps = [rpmallocwrap_lib], libs = ['rpmallocwrap'], includepaths = ['rpmalloc', 'test'], variables = {'runtime': 'c++', 'defines': ['ENABLE_ASSERTS=1', 'ENABLE_STATISTICS=1']}) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 34425010..ce1952dd 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -171,6 +171,8 @@ static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, 1) + 1; } #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, -1) - 1; } +static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)_InterlockedExchangeAdd64(val, add) - add; } #endif static FORCEINLINE int32_t 
atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } @@ -200,6 +202,8 @@ static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } +static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } #endif static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } @@ -234,10 +238,10 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) //! Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY) -#define SPAN_HEADER_SIZE 96 //! ABA protection size in orhpan heap list (also becomes limit of smallest page size) #define HEAP_ORPHAN_ABA_SIZE 512 +//! Size of a span header (must be a multiple of SMALL_GRANULARITY) +#define SPAN_HEADER_SIZE 96 #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow @@ -278,22 +282,22 @@ struct span_use_t { //! Current number of spans used (actually used, not in cache) atomic32_t current; //! High water mark of spans used - uint32_t high; + atomic32_t high; #if ENABLE_STATISTICS //! Number of spans transitioned to global cache - uint32_t spans_to_global; + atomic32_t spans_to_global; //! Number of spans transitioned from global cache - uint32_t spans_from_global; + atomic32_t spans_from_global; //! Number of spans transitioned to thread cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from thread cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! Number of spans transitioned to reserved state - uint32_t spans_to_reserved; + atomic32_t spans_to_reserved; //! Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of raw memory map calls - uint32_t spans_map_calls; + atomic32_t spans_map_calls; #endif }; typedef struct span_use_t span_use_t; @@ -306,21 +310,21 @@ struct size_class_use_t { //! Peak number of allocations int32_t alloc_peak; //! Total number of allocations - int32_t alloc_total; + atomic32_t alloc_total; //! Total number of frees atomic32_t free_total; //! Number of spans in use - uint32_t spans_current; + atomic32_t spans_current; //! Number of spans transitioned to cache - uint32_t spans_peak; + int32_t spans_peak; //! Number of spans transitioned to cache - uint32_t spans_to_cache; + atomic32_t spans_to_cache; //! Number of spans transitioned from cache - uint32_t spans_from_cache; + atomic32_t spans_from_cache; //! 
Number of spans transitioned from reserved state - uint32_t spans_from_reserved; + atomic32_t spans_from_reserved; //! Number of spans mapped - uint32_t spans_map_calls; + atomic32_t spans_map_calls; }; typedef struct size_class_use_t size_class_use_t; #endif @@ -328,7 +332,10 @@ typedef struct size_class_use_t size_class_use_t; typedef enum span_state_t { SPAN_STATE_ACTIVE = 0, SPAN_STATE_PARTIAL, - SPAN_STATE_FULL + SPAN_STATE_FREE, + SPAN_STATE_FULL, + SPAN_STATE_LARGE, + SPAN_STATE_HUGE } span_state_t; //A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, @@ -391,12 +398,16 @@ struct heap_t { #if ENABLE_THREAD_CACHE //! List of free spans (single linked list) span_t* span_cache[LARGE_CLASS_COUNT]; - //! List of deferred free spans of class 0 (single linked list) - atomicptr_t span_cache_deferred; #endif + //! List of deferred free spans (single linked list) + atomicptr_t span_free_deferred; #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS //! Current and high water mark of spans used per span count span_use_t span_use[LARGE_CLASS_COUNT]; +#endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of large and huge spans allocated by this heap + span_t* full_span; #endif //! Mapped but unused spans span_t* span_reserve; @@ -414,9 +425,9 @@ struct heap_t { int32_t id; #if ENABLE_STATISTICS //! Number of bytes transitioned thread -> global - size_t thread_to_global; + atomic64_t thread_to_global; //! Number of bytes transitioned global -> thread - size_t global_to_thread; + atomic64_t global_to_thread; //! Allocation stats per size class size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; #endif @@ -498,6 +509,8 @@ static atomic32_t _memory_active_heaps; static atomic32_t _mapped_pages; //! Peak number of concurrently mapped memory pages static int32_t _mapped_pages_peak; +//! Number of mapped master spans +static atomic32_t _master_spans; //! Number of currently unused spans static atomic32_t _reserved_spans; //! 
Running counter of total number of mapped memory pages since start @@ -570,25 +583,27 @@ static void _memory_unmap_os(void* address, size_t size, size_t offset, size_t release); #if ENABLE_STATISTICS -# define _memory_statistics_inc(counter, value) counter += value -# define _memory_statistics_dec(counter, value) counter -= value -# define _memory_statistics_add(atomic_counter, value) atomic_add32(atomic_counter, (int32_t)(value)) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do { int32_t _cur_count = atomic_add32(atomic_counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) -# define _memory_statistics_sub(atomic_counter, value) atomic_add32(atomic_counter, -(int32_t)(value)) +# define _memory_statistics_inc(counter) atomic_incr32(counter) +# define _memory_statistics_dec(counter) atomic_decr32(counter) +# define _memory_statistics_add(counter, value) atomic_add32(counter, (int32_t)(value)) +# define _memory_statistics_add64(counter, value) atomic_add64(counter, (int64_t)(value)) +# define _memory_statistics_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _memory_statistics_sub(counter, value) atomic_add32(counter, -(int32_t)(value)) # define _memory_statistics_inc_alloc(heap, class_idx) do { \ int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - heap->size_class_use[class_idx].alloc_total++; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ } while(0) # define _memory_statistics_inc_free(heap, class_idx) do { \ atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ atomic_incr32(&heap->size_class_use[class_idx].free_total); \ } while(0) #else -# define _memory_statistics_inc(counter, value) do {} while(0) -# define _memory_statistics_dec(counter, value) do {} while(0) +# define _memory_statistics_inc(counter) do {} while(0) +# define _memory_statistics_dec(counter) do {} while(0) # define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_add64(counter, value) do {} while(0) # define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) # define _memory_statistics_sub(atomic_counter, value) do {} while(0) # define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) @@ -598,6 +613,9 @@ _memory_unmap_os(void* address, size_t size, size_t offset, size_t release); static void _memory_heap_cache_insert(heap_t* heap, span_t* span); +static void +_memory_global_cache_insert(span_t* span); + //! Map more virtual memory static void* _memory_map(size_t size, size_t* offset) { @@ -643,7 +661,7 @@ _memory_map_from_reserve(heap_t* heap, size_t span_count) { _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_from_reserved, 1); + _memory_statistics_inc(&heap->span_use[span_count - 1].spans_from_reserved); return span; } @@ -675,7 +693,7 @@ _memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count atomic_store32(&span->remaining_spans, (int32_t)total_span_count); } -//! Map a akigned set of spans, taking configured mapping granularity and the page size into account +//! 
Map an aligned set of spans, taking configured mapping granularity and the page size into account static span_t* _memory_map_aligned_span_count(heap_t* heap, size_t span_count) { //If we already have some, but not enough, reserved spans, release those to heap cache and map a new @@ -687,15 +705,17 @@ _memory_map_aligned_span_count(heap_t* heap, size_t span_count) { return 0; _memory_span_initialize(span, aligned_span_count, span_count, align_offset); _memory_statistics_add(&_reserved_spans, aligned_span_count); + _memory_statistics_inc(&_master_spans); if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(heap->span_use[span_count - 1].spans_map_calls, 1); + _memory_statistics_inc(&heap->span_use[span_count - 1].spans_map_calls); if (aligned_span_count > span_count) { + span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); + size_t reserved_count = aligned_span_count - span_count; if (heap->spans_reserved) { _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); _memory_heap_cache_insert(heap, heap->span_reserve); } - span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); - _memory_heap_set_reserved_spans(heap, span, reserved_spans, aligned_span_count - span_count); + _memory_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); } return span; } @@ -740,6 +760,7 @@ _memory_unmap_span(span_t* span) { if (_memory_span_size < _memory_page_size) unmap_count = master->total_spans_or_distance; _memory_statistics_sub(&_reserved_spans, unmap_count); + _memory_statistics_sub(&_master_spans, 1); _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); } } @@ -811,9 +832,9 @@ _memory_span_list_split(span_t* span, size_t limit) { #endif -//! Add a span to partial span double linked list at the head +//! Add a span to double linked list at the head static void -_memory_span_partial_list_add(span_t** head, span_t* span) { +_memory_span_double_link_list_add(span_t** head, span_t* span) { if (*head) { span->next = *head; //Maintain pointer to tail span @@ -826,9 +847,9 @@ _memory_span_partial_list_add(span_t** head, span_t* span) { *head = span; } -//! Add a span to partial span double linked list at the tail +//! Add a span to double linked list at the tail static void -_memory_span_partial_list_add_tail(span_t** head, span_t* span) { +_memory_span_double_link_list_add_tail(span_t** head, span_t* span) { span->next = 0; if (*head) { span_t* tail = (*head)->prev; @@ -842,22 +863,24 @@ _memory_span_partial_list_add_tail(span_t** head, span_t* span) { } } -//! Pop head span from partial span double linked list +//! Pop head span from double linked list static void -_memory_span_partial_list_pop_head(span_t** head) { +_memory_span_double_link_list_pop_head(span_t** head) { span_t* span = *head; *head = span->next; if (*head) { //Maintain pointer to tail span + assert(span->next->prev == span); (*head)->prev = span->prev; } } -//! Remove a span from partial span double linked list +//! 
Remove a span from double linked list static void -_memory_span_partial_list_remove(span_t** head, span_t* span) { +_memory_span_double_link_list_remove(span_t** head, span_t* span) { + assert(*head); if (UNEXPECTED(*head == span)) { - _memory_span_partial_list_pop_head(head); + _memory_span_double_link_list_pop_head(head); } else { span_t* next_span = span->next; span_t* prev_span = span->prev; @@ -866,6 +889,7 @@ _memory_span_partial_list_remove(span_t** head, span_t* span) { next_span->prev = prev_span; } else { //Update pointer to tail span + assert((*head)->prev == span); (*head)->prev = prev_span; } } @@ -953,29 +977,46 @@ _memory_global_cache_extract(size_t span_count) { #endif -#if ENABLE_THREAD_CACHE -//! Adopt the deferred span cache list +static void _memory_deallocate_huge(span_t*); + +//! Adopt the deferred span cache list, returning the first static void -_memory_heap_cache_adopt_deferred(heap_t* heap) { +_memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { atomic_thread_fence_acquire(); - span_t* span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); + span_t* span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); if (!span) return; do { - span = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - } while (!atomic_cas_ptr(&heap->span_cache_deferred, 0, span)); + span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, 0, span)); while (span) { - span_t* next_span = span->next; - _memory_span_list_push(&heap->span_cache[0], span); -#if ENABLE_STATISTICS - atomic_decr32(&heap->span_use[span->span_count - 1].current); - ++heap->size_class_use[span->size_class].spans_to_cache; - --heap->size_class_use[span->size_class].spans_current; + span_t* next_span = (span_t*)span->free_list; + assert(span->heap == heap); + assert(span->state >= SPAN_STATE_FULL); + if (UNEXPECTED(span->size_class == (uint32_t)-1)) { + _memory_deallocate_huge(span); + } else { + uint32_t idx = span->span_count - 1; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_remove(&heap->full_span, span); +#endif + span->state = SPAN_STATE_FREE; + if (!idx && single_span && !*single_span) { + *single_span = span; + } else { + _memory_statistics_dec(&heap->span_use[idx].current); + _memory_statistics_dec(&heap->size_class_use[span->size_class].spans_current); +#if ENABLE_THREAD_CACHE + _memory_statistics_inc(&heap->size_class_use[span->size_class].spans_to_cache); + _memory_span_list_push(&heap->span_cache[idx], span); +#else + _memory_unmap_span(span); #endif + } + } span = next_span; } } -#endif //! 
Insert a single span into thread heap cache, releasing to global cache if overflow static void @@ -983,9 +1024,7 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { #if ENABLE_THREAD_CACHE size_t span_count = span->span_count; size_t idx = span_count - 1; - _memory_statistics_inc(heap->span_use[idx].spans_to_cache, 1); - if (!idx) - _memory_heap_cache_adopt_deferred(heap); + _memory_statistics_inc(&heap->span_use[idx].spans_to_cache); #if ENABLE_UNLIMITED_THREAD_CACHE _memory_span_list_push(&heap->span_cache[idx], span); #else @@ -1007,11 +1046,9 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { } heap->span_cache[idx] = _memory_span_list_split(span, release_count); assert(span->list_size == release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_to_global += span->list_size; -#endif #if ENABLE_GLOBAL_CACHE + _memory_statistics_add64(&heap->thread_to_global, (size_t)span->list_size * span_count * _memory_span_size); + _memory_statistics_add(&heap->span_use[idx].spans_to_global, span->list_size); _memory_global_cache_insert(span); #else _memory_unmap_span_list(span); @@ -1026,18 +1063,17 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { //! Extract the given number of spans from the different cache levels static span_t* _memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) { -#if ENABLE_THREAD_CACHE + span_t* span = 0; size_t idx = span_count - 1; if (!idx) - _memory_heap_cache_adopt_deferred(heap); - if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->span_use[idx].spans_from_cache++; -#endif - return _memory_span_list_pop(&heap->span_cache[idx]); + _memory_heap_cache_adopt_deferred(heap, &span); +#if ENABLE_THREAD_CACHE + if (!span && heap->span_cache[idx]) { + _memory_statistics_inc(&heap->span_use[idx].spans_from_cache); + span = _memory_span_list_pop(&heap->span_cache[idx]); } #endif - return 0; + return span; } static span_t* @@ -1054,13 +1090,13 @@ _memory_heap_global_cache_extract(heap_t* heap, size_t span_count) { size_t idx = span_count - 1; heap->span_cache[idx] = _memory_global_cache_extract(span_count); if (heap->span_cache[idx]) { -#if ENABLE_STATISTICS - heap->global_to_thread += (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size; - heap->span_use[idx].spans_from_global += heap->span_cache[idx]->list_size; -#endif + _memory_statistics_add64(&heap->global_to_thread, (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size); + _memory_statistics_add(&heap->span_use[idx].spans_from_global, heap->span_cache[idx]->list_size); return _memory_span_list_pop(&heap->span_cache[idx]); } #endif + (void)sizeof(heap); + (void)sizeof(span_count); return 0; } @@ -1071,32 +1107,28 @@ _memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_id #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS uint32_t idx = (uint32_t)span_count - 1; uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > heap->span_use[idx].high) - heap->span_use[idx].high = current_count; -#if ENABLE_STATISTICS - uint32_t spans_current = ++heap->size_class_use[class_idx].spans_current; - if (spans_current > heap->size_class_use[class_idx].spans_peak) - heap->size_class_use[class_idx].spans_peak = spans_current; + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + 
_memory_statistics_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); #endif -#endif span_t* span = _memory_heap_thread_cache_extract(heap, span_count); if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + _memory_statistics_inc(&heap->size_class_use[class_idx].spans_from_cache); return span; } span = _memory_heap_reserved_extract(heap, span_count); if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_reserved, 1); + _memory_statistics_inc(&heap->size_class_use[class_idx].spans_from_reserved); return span; } span = _memory_heap_global_cache_extract(heap, span_count); if (EXPECTED(span != 0)) { - _memory_statistics_inc(heap->size_class_use[class_idx].spans_from_cache, 1); + _memory_statistics_inc(&heap->size_class_use[class_idx].spans_from_cache); return span; } //Final fallback, map in more virtual memory span = _memory_map_spans(heap, span_count); - _memory_statistics_inc(heap->size_class_use[class_idx].spans_map_calls, 1); + _memory_statistics_inc(&heap->size_class_use[class_idx].spans_map_calls); return span; } @@ -1105,14 +1137,20 @@ static void _memory_span_release_to_cache(heap_t* heap, span_t* span) { heap_class_t* heap_class = heap->span_class + span->size_class; assert(heap_class->partial_span != span); + assert(heap == span->heap); if (span->state == SPAN_STATE_PARTIAL) - _memory_span_partial_list_remove(&heap_class->partial_span, span); + _memory_span_double_link_list_remove(&heap_class->partial_span, span); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (UNEXPECTED(span->state >= SPAN_STATE_FULL)) + _memory_span_double_link_list_remove(&heap->full_span, span); +#endif + span->state = SPAN_STATE_FREE; #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS atomic_decr32(&heap->span_use[0].current); #endif - _memory_statistics_inc(heap->span_use[0].spans_to_cache, 1); - _memory_statistics_inc(heap->size_class_use[span->size_class].spans_to_cache, 1); - _memory_statistics_dec(heap->size_class_use[span->size_class].spans_current, 1); + _memory_statistics_inc(&heap->span_use[0].spans_to_cache); + _memory_statistics_inc(&heap->size_class_use[span->size_class].spans_to_cache); + _memory_statistics_dec(&heap->size_class_use[span->size_class].spans_current); _memory_heap_cache_insert(heap, span); } @@ -1169,7 +1207,7 @@ _memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span span->list_size = 0; atomic_thread_fence_release(); - _memory_span_partial_list_add(&heap_class->partial_span, span); + _memory_span_double_link_list_add(&heap_class->partial_span, span); return block; } @@ -1190,7 +1228,7 @@ static void _memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { assert(span->state == SPAN_STATE_ACTIVE); assert(span == heap_class->partial_span); - _memory_span_partial_list_pop_head(&heap_class->partial_span); + _memory_span_double_link_list_pop_head(&heap_class->partial_span); span->used_count = span->block_count; span->state = SPAN_STATE_FULL; span->free_list = 0; @@ -1200,9 +1238,12 @@ _memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { static void _memory_span_set_full_partial(heap_t* heap, span_t* span) { assert(span->state == SPAN_STATE_FULL); +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_remove(&heap->full_span, span); +#endif heap_class_t* heap_class = &heap->span_class[span->size_class]; span->state = SPAN_STATE_PARTIAL; - 
_memory_span_partial_list_add_tail(&heap_class->partial_span, span); + _memory_span_double_link_list_add_tail(&heap_class->partial_span, span); } static void* @@ -1260,6 +1301,9 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { assert(!heap_class->free_list); assert(active_span->free_list_limit >= active_span->block_count); _memory_span_set_active_full(heap_class, active_span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_add(&heap->full_span, active_span); +#endif } assert(!heap_class->free_list); @@ -1274,7 +1318,6 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { //Find a span in one of the cache levels active_span = _memory_heap_extract_new_span(heap, 1, class_idx); - if (!active_span) return active_span; @@ -1285,6 +1328,7 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { //! Allocate a small sized memory block from the given heap static void* _memory_allocate_small(heap_t* heap, size_t size) { + assert(heap); //Small sizes have unique size classes const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); _memory_statistics_inc_alloc(heap, class_idx); @@ -1296,6 +1340,7 @@ _memory_allocate_small(heap_t* heap, size_t size) { //! Allocate a medium sized memory block from the given heap static void* _memory_allocate_medium(heap_t* heap, size_t size) { + assert(heap); //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); const uint32_t class_idx = _memory_size_class[base_idx].class_idx; @@ -1308,6 +1353,7 @@ _memory_allocate_medium(heap_t* heap, size_t size) { //! Allocate a large sized memory block from the given heap static void* _memory_allocate_large(heap_t* heap, size_t size) { + assert(heap); //Calculate number of needed max sized spans (including header) //Since this function is never called if size > LARGE_SIZE_LIMIT //the span_count is guaranteed to be <= LARGE_CLASS_COUNT @@ -1319,7 +1365,6 @@ _memory_allocate_large(heap_t* heap, size_t size) { //Find a span in one of the cache levels span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); - if (!span) return span; @@ -1327,14 +1372,20 @@ _memory_allocate_large(heap_t* heap, size_t size) { assert(span->span_count == span_count); span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); span->heap = heap; + span->state = SPAN_STATE_LARGE; atomic_thread_fence_release(); +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_add(&heap->full_span, span); +#endif + return pointer_offset(span, SPAN_HEADER_SIZE); } //! 
Allocate a huge block by mapping memory pages directly static void* -_memory_allocate_huge(size_t size) { +_memory_allocate_huge(heap_t* heap, size_t size) { + assert(heap); size += SPAN_HEADER_SIZE; size_t num_pages = size >> _memory_page_size_shift; if (size & (_memory_page_size - 1)) @@ -1343,21 +1394,20 @@ _memory_allocate_huge(size_t size) { span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); if (!span) return span; + //Store page count in span_count span->size_class = (uint32_t)-1; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; + span->heap = heap; + span->state = SPAN_STATE_HUGE; _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - return pointer_offset(span, SPAN_HEADER_SIZE); -} +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_add(&heap->full_span, span); +#endif -//! Allocate a block larger than medium size -static void* -_memory_allocate_oversized(heap_t* heap, size_t size) { - if (size <= LARGE_SIZE_LIMIT) - return _memory_allocate_large(heap, size); - return _memory_allocate_huge(size); + return pointer_offset(span, SPAN_HEADER_SIZE); } //! Allocate a block of the given size @@ -1367,7 +1417,114 @@ _memory_allocate(heap_t* heap, size_t size) { return _memory_allocate_small(heap, size); else if (size <= _memory_medium_size_limit) return _memory_allocate_medium(heap, size); - return _memory_allocate_oversized(heap, size); + else if (size <= LARGE_SIZE_LIMIT) + return _memory_allocate_large(heap, size); + return _memory_allocate_huge(heap, size); +} + +static void* +_memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { + if (alignment <= SMALL_GRANULARITY) + return _memory_allocate(heap, size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment) < size) { + errno = EINVAL; + return 0; + } + if (alignment & (alignment - 1)) { + errno = EINVAL; + return 0; + } +#endif + + void* ptr = 0; + size_t align_mask = alignment - 1; + if (alignment <= _memory_page_size) { + ptr = _memory_allocate(heap, size + alignment); + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + return ptr; + } + + // Fallback to mapping new pages for this request. Since pointers passed + // to rpfree must be able to reach the start of the span by bitmasking of + // the address with the span size, the returned aligned pointer from this + // function must be with a span size of the start of the mapped area. + // In worst case this requires us to loop and map pages until we get a + // suitable memory address. 
It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t* span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t*)_memory_map(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _memory_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + //Store page count in span_count + span->size_class = (uint32_t)-1; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + span->state = SPAN_STATE_HUGE; + _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_add(&heap->full_span, span); +#endif + + return ptr; } static void @@ -1400,7 +1557,36 @@ _memory_heap_orphan(heap_t* heap) { } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); } -//! Allocate a new heap +//! Allocate a new heap from newly mapped memory pages +static heap_t* +_memory_allocate_heap_new(void) { + //Map in pages for a new heap + size_t align_offset = 0; + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + heap_t* heap = (heap_t*)_memory_map(block_size, &align_offset); + if (!heap) + return heap; + + _memory_heap_initialize(heap); + heap->align_offset = align_offset; + + //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits + size_t aligned_heap_size = sizeof(heap_t); + if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) + aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); + size_t num_heaps = block_size / aligned_heap_size; + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _memory_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _memory_heap_orphan(extra_heap); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + return heap; +} + +//! 
Allocate a new heap, potentially reusing a previously orphaned heap static heap_t* _memory_allocate_heap(void) { void* raw_heap; @@ -1420,71 +1606,45 @@ _memory_allocate_heap(void) { next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); - if (!heap) { - //Map in pages for a new heap - size_t align_offset = 0; - size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - heap = (heap_t*)_memory_map(block_size, &align_offset); - if (!heap) - return heap; - - _memory_heap_initialize(heap); - heap->align_offset = align_offset; - - //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits - size_t aligned_heap_size = sizeof(heap_t); - if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) - aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); - size_t num_heaps = block_size / aligned_heap_size; - heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); - while (num_heaps > 1) { - _memory_heap_initialize(extra_heap); - extra_heap->master_heap = heap; - _memory_heap_orphan(extra_heap); - extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); - --num_heaps; - } - } + if (!heap) + heap = _memory_allocate_heap_new(); return heap; } //! Deallocate the given small/medium memory block in the current thread local heap static void -_memory_deallocate_direct(span_t* span, void* block) { +_memory_deallocate_direct_small_or_medium(span_t* span, void* block) { assert(span->heap == get_thread_heap_raw()); + assert(span->state <= SPAN_STATE_FULL); uint32_t state = span->state; //Add block to free list *((void**)block) = span->free_list; span->free_list = block; if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) return; + heap_t* heap = span->heap; uint32_t used = --span->used_count; uint32_t free = span->list_size; if (UNEXPECTED(used == free)) - _memory_span_release_to_cache(span->heap, span); + _memory_span_release_to_cache(heap, span); else if (UNEXPECTED(state == SPAN_STATE_FULL)) - _memory_span_set_full_partial(span->heap, span); + _memory_span_set_full_partial(heap, span); } -//! Put the block in the deferred free list of the owning span static void -_memory_deallocate_defer(span_t* span, void* block) { - atomic_thread_fence_acquire(); - if (span->state == SPAN_STATE_FULL) { - if ((span->list_size + 1) == span->block_count) { - //Span will be completely freed by deferred deallocations, no other thread can - //currently touch it. Safe to move to owner heap deferred cache - span_t* last_head; - heap_t* heap = span->heap; - do { - last_head = (span_t*)atomic_load_ptr(&heap->span_cache_deferred); - span->next = last_head; - } while (!atomic_cas_ptr(&heap->span_cache_deferred, span, last_head)); - return; - } - } +_memory_deallocate_defer_free_span(heap_t* heap, span_t* span) { + //This list does not need ABA protection, no mutable side state + void* last_head; + do { + last_head = atomic_load_ptr(&heap->span_free_deferred); + span->free_list = last_head; + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, last_head)); +} +//! 
Put the block in the deferred free list of the owning span +static void +_memory_deallocate_defer_small_or_medium(span_t* span, void* block) { void* free_list; do { atomic_thread_fence_acquire(); @@ -1492,6 +1652,13 @@ _memory_deallocate_defer(span_t* span, void* block) { *((void**)block) = free_list; } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); ++span->list_size; + if ((span->state == SPAN_STATE_FULL) && (span->list_size == span->block_count)) { + // Span was completely freed by this block. Due to the INVALID_POINTER spin lock + // no other thread can reach this state simultaneously on this span. + // Safe to move to owner heap deferred cache + _memory_deallocate_defer_free_span(span->heap, span); + return; + } atomic_store_ptr(&span->free_list_deferred, block); } @@ -1506,25 +1673,38 @@ _memory_deallocate_small_or_medium(span_t* span, void* p) { } //Check if block belongs to this heap or if deallocation should be deferred if (span->heap == get_thread_heap_raw()) - _memory_deallocate_direct(span, p); + _memory_deallocate_direct_small_or_medium(span, p); else - _memory_deallocate_defer(span, p); + _memory_deallocate_defer_small_or_medium(span, p); } //! Deallocate the given large memory block to the current heap static void _memory_deallocate_large(span_t* span) { - //Decrease counter assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); assert(span->size_class >= SIZE_CLASS_COUNT); assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - //Large blocks can always be deallocated and transferred between heaps - //Investigate if it is better to defer large spans as well through span_cache_deferred, - //possibly with some heuristics to pick either scheme at runtime per deallocation - heap_t* heap = get_thread_heap(); + heap_t* heap = get_thread_heap_raw(); +#if RPMALLOC_FIRST_CLASS_HEAPS + //If using first class heaps and tracking spans in heap double linked list we must + //always defer if from another heap since we cannot touch the list of another heap + int defer = (heap != span->heap); +#else + //Otherwise defer if different heap and span count is 1 + int defer = ((heap != span->heap) && (span->span_count == 1)); +#endif + if (defer) { + _memory_deallocate_defer_free_span(span->heap, span); + return; + } +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_remove(&heap->full_span, span); +#endif + span->state = SPAN_STATE_FREE; #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //Decrease counter size_t idx = span->span_count - 1; atomic_decr32(&span->heap->span_use[idx].current); #endif @@ -1540,7 +1720,7 @@ _memory_deallocate_large(span_t* span) { assert(master->flags & SPAN_FLAG_MASTER); assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); } - _memory_statistics_inc(heap->span_use[idx].spans_to_reserved, 1); + _memory_statistics_inc(&heap->span_use[idx].spans_to_reserved); } else { //Insert into cache list _memory_heap_cache_insert(heap, span); @@ -1550,6 +1730,19 @@ _memory_deallocate_large(span_t* span) { //! 
Deallocate the given huge span static void _memory_deallocate_huge(span_t* span) { +#if RPMALLOC_FIRST_CLASS_HEAPS + //If using first class heaps and tracking spans in heap double linked list we must + //always defer if from another heap since we cannot touch the list of another heap + assert(span->heap); + if (span->heap != get_thread_heap_raw()) { + _memory_deallocate_defer_free_span(span->heap, span); + return; + } + + _memory_span_double_link_list_remove(&span->heap->full_span, span); +#endif + span->state = SPAN_STATE_FREE; + //Oversized allocation, page count is stored in span_count size_t num_pages = span->span_count; _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); @@ -1571,13 +1764,35 @@ _memory_deallocate(void* p) { _memory_deallocate_huge(span); } +//! Get the usable size of the given block +static size_t +_memory_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->size_class != (uint32_t)-1) { + //Small/medium block + if (span->size_class < SIZE_CLASS_COUNT) { + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + + //Large block + size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + //! Reallocate the given block to the given size static void* -_memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { +_memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) { if (p) { //Grab the span using guaranteed span alignment span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->heap) { + if (EXPECTED(span->size_class != (uint32_t)-1)) { if (span->size_class < SIZE_CLASS_COUNT) { //Small/medium sized block assert(span->span_count == 1); @@ -1634,7 +1849,6 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { } //Size is greater than block size, need to allocate a new block and deallocate the old - heap_t* heap = get_thread_heap(); //Avoid hysteresis by overallocating if increase is small (below 37%) size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); @@ -1648,26 +1862,30 @@ _memory_reallocate(void* p, size_t size, size_t oldsize, unsigned int flags) { return block; } -//! 
Get the usable size of the given block -static size_t -_memory_usable_size(void* p) { - //Grab the span using guaranteed span alignment - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->heap) { - //Small/medium block - if (span->size_class < SIZE_CLASS_COUNT) { - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } +static void* +_memory_aligned_reallocate(heap_t* heap, void* p, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) { + if (alignment <= SMALL_GRANULARITY) + return _memory_reallocate(heap, p, size, oldsize, flags); - //Large block - size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; - return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); - } + size_t usablesize = _memory_usable_size(p); + if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)p & (alignment - 1))) + return p; - //Oversized block, page count is stored in span_count - size_t current_pages = span->span_count; - return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); + void* block = _memory_aligned_allocate(heap, alignment, size); + if (p) { + if (!oldsize) + oldsize = usablesize; + if (!(flags & RPMALLOC_NO_PRESERVE)) + memcpy(block, p, oldsize < size ? oldsize : size); + _memory_deallocate(p); + } + if (block) { + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } + return block; } //! Adjust and optimize the size class properties for the given class @@ -1697,8 +1915,8 @@ _memory_heap_finalize(void* heapptr) { if (!heap) return; //Release thread cache spans back to global cache + _memory_heap_cache_adopt_deferred(heap, 0); #if ENABLE_THREAD_CACHE - _memory_heap_cache_adopt_deferred(heap); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span_t* span = heap->span_cache[iclass]; #if ENABLE_GLOBAL_CACHE @@ -1706,10 +1924,8 @@ _memory_heap_finalize(void* heapptr) { assert(span->span_count == (iclass + 1)); size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); span_t* next = _memory_span_list_split(span, (uint32_t)release_count); -#if ENABLE_STATISTICS - heap->thread_to_global += (size_t)span->list_size * span->span_count * _memory_span_size; - heap->span_use[iclass].spans_to_global += span->list_size; -#endif + _memory_statistics_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); + _memory_statistics_add(&heap->span_use[iclass].spans_to_global, span->list_size); _memory_global_cache_insert(span); span = next; } @@ -1725,7 +1941,6 @@ _memory_heap_finalize(void* heapptr) { _memory_heap_orphan(heap); set_thread_heap(0); - #if ENABLE_STATISTICS atomic_decr32(&_memory_active_heaps); assert(atomic_load32(&_memory_active_heaps) >= 0); @@ -1923,6 +2138,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { #if ENABLE_STATISTICS atomic_store32(&_memory_active_heaps, 0); atomic_store32(&_reserved_spans, 0); + atomic_store32(&_master_spans, 0); atomic_store32(&_mapped_pages, 0); _mapped_pages_peak = 0; atomic_store32(&_mapped_total, 0); @@ -1999,22 +2215,30 @@ rpmalloc_finalize(void) { ++free_blocks; block = *((void**)block); } - if (used_blocks == (free_blocks + span->list_size)) + if (used_blocks == (free_blocks + span->list_size)) { _memory_heap_cache_insert(heap, span); + _memory_statistics_dec(&heap->span_use[0].current); + _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); + } } else { - if (span->used_count == span->list_size) + if (span->used_count == span->list_size) { _memory_heap_cache_insert(heap, span); + _memory_statistics_dec(&heap->span_use[0].current); + _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); + } } span = next; } } -#if ENABLE_THREAD_CACHE //Free span caches (other thread might have deferred after the thread using this heap finalized) - _memory_heap_cache_adopt_deferred(heap); + _memory_heap_cache_adopt_deferred(heap, 0); +#if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) + if (heap->span_cache[iclass]) { _memory_unmap_span_list(heap->span_cache[iclass]); + heap->span_cache[iclass] = 0; + } } #endif heap_t* next_heap = heap->next_heap; @@ -2026,15 +2250,6 @@ rpmalloc_finalize(void) { } } - //Finally free all master heaps pages - heap_t* master_heap = master_heaps; - while (master_heap) { - heap_t* next_heap = master_heap->next_heap; - size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - _memory_unmap(master_heap, block_size, master_heap->align_offset, block_size); - master_heap = next_heap; - } - #if ENABLE_GLOBAL_CACHE //Free global caches for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) @@ -2051,6 +2266,15 @@ rpmalloc_finalize(void) { FlsFree(fls_key); #endif + //Finally free all master heaps pages + heap_t* master_heap = master_heaps; + while (master_heap) { + heap_t* next_heap = master_heap->next_heap; + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + _memory_unmap(master_heap, block_size, master_heap->align_offset, block_size); + master_heap = next_heap; + } + #if ENABLE_STATISTICS //If you hit these asserts you probably have memory leaks or double frees in your code assert(!atomic_load32(&_mapped_pages)); @@ -2068,9 +2292,7 @@ rpmalloc_thread_initialize(void) { heap_t* heap = _memory_allocate_heap(); if (heap) { atomic_thread_fence_acquire(); -#if ENABLE_STATISTICS - 
atomic_incr32(&_memory_active_heaps); -#endif + _memory_statistics_inc(&_memory_active_heaps); set_thread_heap(heap); #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsSetValue(fls_key, heap); @@ -2127,9 +2349,7 @@ _memory_map_os(size_t size, size_t* offset) { return 0; } #endif -#if ENABLE_STATISTICS - atomic_add32(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); -#endif + _memory_statistics_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); if (padding) { size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); assert(final_padding <= _memory_span_size); @@ -2177,10 +2397,8 @@ _memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { } #endif #endif -#if ENABLE_STATISTICS if (release) - atomic_add32(&_mapped_pages_os, -(int32_t)(release >> _memory_page_size_shift)); -#endif + _memory_statistics_sub(&_mapped_pages_os, release >> _memory_page_size_shift); } // Extern interface @@ -2224,7 +2442,8 @@ rpcalloc(size_t num, size_t size) { #endif heap_t* heap = get_thread_heap(); void* block = _memory_allocate(heap, total); - memset(block, 0, total); + if (block) + memset(block, 0, total); return block; } @@ -2236,7 +2455,8 @@ rprealloc(void* ptr, size_t size) { return ptr; } #endif - return _memory_reallocate(ptr, size, 0, 0); + heap_t* heap = get_thread_heap(); + return _memory_reallocate(heap, ptr, size, 0, 0); } extern RPMALLOC_ALLOCATOR void* @@ -2248,126 +2468,40 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, return 0; } #endif - void* block; - if (alignment > 32) { - size_t usablesize = _memory_usable_size(ptr); - if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)ptr & (alignment - 1))) - return ptr; - - block = rpaligned_alloc(alignment, size); - if (ptr) { - if (!oldsize) - oldsize = usablesize; - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, ptr, oldsize < size ? 
oldsize : size); - rpfree(ptr); - } - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; - } else { - block = _memory_reallocate(ptr, size, oldsize, flags); - } - return block; + heap_t* heap = get_thread_heap(); + return _memory_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags); } extern RPMALLOC_ALLOCATOR void* rpaligned_alloc(size_t alignment, size_t size) { - if (alignment <= 16) - return rpmalloc(size); + heap_t* heap = get_thread_heap(); + return _memory_aligned_allocate(heap, alignment, size); +} +extern inline RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; #if ENABLE_VALIDATE_ARGS - if ((size + alignment) < size) { +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { errno = EINVAL; return 0; } - if (alignment & (alignment - 1)) { +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { errno = EINVAL; return 0; } #endif - - void* ptr = 0; - size_t align_mask = alignment - 1; - if (alignment < _memory_page_size) { - ptr = rpmalloc(size + alignment); - if ((uintptr_t)ptr & align_mask) - ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; - return ptr; - } - - // Fallback to mapping new pages for this request. Since pointers passed - // to rpfree must be able to reach the start of the span by bitmasking of - // the address with the span size, the returned aligned pointer from this - // function must be with a span size of the start of the mapped area. - // In worst case this requires us to loop and map pages until we get a - // suitable memory address. 
It also means we can never align to span size - // or greater, since the span header will push alignment more than one - // span size away from span start (thus causing pointer mask to give us - // an invalid span start on free) - if (alignment & align_mask) { - errno = EINVAL; - return 0; - } - if (alignment >= _memory_span_size) { - errno = EINVAL; - return 0; - } - - size_t extra_pages = alignment / _memory_page_size; - - // Since each span has a header, we will at least need one extra memory page - size_t num_pages = 1 + (size / _memory_page_size); - if (size & (_memory_page_size - 1)) - ++num_pages; - - if (extra_pages > num_pages) - num_pages = 1 + extra_pages; - - size_t original_pages = num_pages; - size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; - if (limit_pages < (original_pages * 2)) - limit_pages = original_pages * 2; - - size_t mapped_size, align_offset; - span_t* span; - -retry: - align_offset = 0; - mapped_size = num_pages * _memory_page_size; - - span = (span_t*)_memory_map(mapped_size, &align_offset); - if (!span) { - errno = ENOMEM; - return 0; - } - ptr = pointer_offset(span, SPAN_HEADER_SIZE); - - if ((uintptr_t)ptr & align_mask) - ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - - if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || - (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || - (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { - _memory_unmap(span, mapped_size, align_offset, mapped_size); - ++num_pages; - if (num_pages > limit_pages) { - errno = EINVAL; - return 0; - } - goto retry; - } - - //Store page count in span_count - span->size_class = (uint32_t)-1; - span->span_count = (uint32_t)num_pages; - span->align_offset = (uint32_t)align_offset; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - - return ptr; +#else + total = num * size; +#endif + void* block = rpaligned_alloc(alignment, total); + if (block) + memset(block, 0, total); + return block; } extern inline RPMALLOC_ALLOCATOR void* @@ -2418,7 +2552,7 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { if (heap->span_cache[iclass]) stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; - span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_cache_deferred) : 0; + span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_free_deferred) : 0; //TODO: Incorrect, for deferred lists the size is NOT stored in list_size if (deferred_list) stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; @@ -2502,9 +2636,13 @@ _memory_heap_dump_statistics(heap_t* heap, void* file) { atomic_load32(&heap->span_use[iclass].current), heap->span_use[iclass].high, ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), +#if ENABLE_THREAD_CACHE heap->span_cache[iclass] ? 
heap->span_cache[iclass]->list_size : 0, ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), +#else + 0, 0ULL, 0ULL, +#endif ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), @@ -2512,7 +2650,7 @@ _memory_heap_dump_statistics(heap_t* heap, void* file) { heap->span_use[iclass].spans_map_calls); } fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf(file, "%17zu %17zu\n", (size_t)heap->thread_to_global / (size_t)(1024 * 1024), (size_t)heap->global_to_thread / (size_t)(1024 * 1024)); + fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); } #endif @@ -2571,6 +2709,174 @@ rpmalloc_dump_statistics(void* file) { #endif } +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t* +rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory blocks + // could already be allocated from the heap which would (wrongly) be released when + // heap is cleared with rpmalloc_heap_free_all() + heap_t* heap = _memory_allocate_heap_new(); + _memory_statistics_inc(&_memory_active_heaps); + return (rpmalloc_heap_t*)heap; +} + +extern inline void +rpmalloc_heap_release(rpmalloc_heap_t* heap) { + if (heap) + _memory_heap_finalize((heap_t*)heap); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_allocate((heap_t*)heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_aligned_allocate((heap_t*)heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void* block = _memory_aligned_allocate((heap_t*)heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _memory_reallocate((heap_t*)heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void* 
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _memory_aligned_reallocate((heap_t*)heap, ptr, alignment, size, 0, flags); +} + +extern inline void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { + (void)sizeof(heap); + _memory_deallocate(ptr); +} + +extern inline void +rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { + heap_t* heap = (heap_t*)heapptr; + span_t* span; + span_t* next_span; + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->span_class[iclass].partial_span; + while (span) { + next_span = span->next; + _memory_heap_cache_insert(heap, span); + span = next_span; + } + } + memset(heap->span_class, 0, sizeof(heap->span_class)); + + span = heap->full_span; + while (span) { + next_span = span->next; + _memory_heap_cache_insert(heap, span); + span = next_span; + } + + _memory_heap_cache_adopt_deferred(heap, 0); +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span = heap->span_cache[iclass]; +#if ENABLE_GLOBAL_CACHE + while (span) { + assert(span->span_count == (iclass + 1)); + size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large); + next_span = _memory_span_list_split(span, (uint32_t)release_count); + _memory_statistics_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); + _memory_statistics_add(&heap->span_use[iclass].spans_to_global, span->list_size); + _memory_global_cache_insert(span); + span = next_span; + } +#else + if (span) + _memory_unmap_span_list(span); +#endif + heap->span_cache[iclass] = 0; + } +#endif + +#if ENABLE_STATISTICS + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); + heap->size_class_use[iclass].spans_current = 0; + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->span_use[iclass].current, 0 ); + } +#endif + + _memory_heap_orphan(heap); +} + +extern inline void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) { + rpmalloc_heap_t* prev_heap = (rpmalloc_heap_t*)get_thread_heap_raw(); + if (prev_heap != heap) { + set_thread_heap((heap_t*)heap); + if (prev_heap) + rpmalloc_heap_release(prev_heap); + } +} + +#endif + #if ENABLE_PRELOAD || ENABLE_OVERRIDE #include "malloc.c" diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 2f48bc97..77b250eb 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -45,11 +45,18 @@ extern "C" { # define RPMALLOC_CDECL #endif -//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes +//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce +// a very small overhead due to some size calculations not being compile time constants #ifndef RPMALLOC_CONFIGURABLE #define RPMALLOC_CONFIGURABLE 0 #endif +//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_* functions). +// Will introduce a very small overhead to track fully allocated spans in heaps +#ifndef RPMALLOC_FIRST_CLASS_HEAPS +#define RPMALLOC_FIRST_CLASS_HEAPS 0 +#endif + //! 
Flag to rpaligned_realloc to not preserve content in reallocation #define RPMALLOC_NO_PRESERVE 1 @@ -240,6 +247,13 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsi RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); +//! Allocate a memory block of at least the given size and alignment, and zero initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpaligned_calloc(size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + //! Allocate a memory block of at least the given size and alignment. // Alignment must be a power of two and a multiple of sizeof(void*), // and should ideally be less than memory page size. A caveat of rpmalloc @@ -252,12 +266,80 @@ rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB // and should ideally be less than memory page size. A caveat of rpmalloc // internals is that this must also be strictly less than the span size (default 64KiB) RPMALLOC_EXPORT int -rpposix_memalign(void **memptr, size_t alignment, size_t size); +rpposix_memalign(void** memptr, size_t alignment, size_t size); //! Query the usable size of the given memory block (from given pointer to the end of block) RPMALLOC_EXPORT size_t rpmalloc_usable_size(void* ptr); +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef void* rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap +// if none available. Heap API is imlemented with the strict assumption that only one single +// thread will call heap functions for a given heap at any given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t* +rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass a null pointer. +RPMALLOC_EXPORT void +rpmalloc_heap_release(rpmalloc_heap_t* heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The returned +// block will have the requested alignment. Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and zero initialize it. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_calloc(rpmalloc_heap_t* heap, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size using the given heap and zero initialize it. The returned +// block will have the requested alignment. 
Alignment must either be zero, or a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Reallocate the given block to at least the given size. The memory block MUST be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Reallocate the given block to at least the given size. The memory block MUST be allocated +// by the same heap given to this function. The returned block will have the requested alignment. +// Alignment must be either zero, or a power of two and a multiple of sizeof(void*), and should ideally be +// less than memory page size. A caveat of rpmalloc internals is that this must also be strictly less than +// the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void* +rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment, size_t size, unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Free the given memory block from the given heap. The memory block MUST be allocated +// by the same heap given to this function. +RPMALLOC_EXPORT void +rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr); + +//! Free all memory allocated by the heap +RPMALLOC_EXPORT void +rpmalloc_heap_free_all(rpmalloc_heap_t* heap); + +//! Set the given heap as the current heap for the calling thread. A heap MUST only be current heap +// for a single thread, a heap can never be shared between multiple threads. The previous +// current heap for the calling thread is released to be reused by other threads. 
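A minimal usage sketch of the first-class heap API declared above (illustrative only, not part of the patch; it assumes the library is built with RPMALLOC_FIRST_CLASS_HEAPS=1 and that rpmalloc_initialize() has already been called; the helper name process_task is hypothetical):

#include <stddef.h>
#include "rpmalloc.h"

/* Sketch: one heap per task, torn down in bulk. The heap must only be
   used from the thread that acquired it. */
static void
process_task(void) {
	rpmalloc_heap_t* heap = rpmalloc_heap_acquire();
	void* scratch = rpmalloc_heap_alloc(heap, 4096);
	void* aligned = rpmalloc_heap_aligned_alloc(heap, 64, 1024);
	/* ... use scratch and aligned ... */
	(void)scratch; (void)aligned;
	rpmalloc_heap_free_all(heap);  /* frees every block allocated from this heap */
	rpmalloc_heap_release(heap);   /* heap becomes reusable by other threads */
}
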
+RPMALLOC_EXPORT void +rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap); + +#endif + #ifdef __cplusplus } #endif diff --git a/test/main.c b/test/main.c index ce41dc3b..9b4bf3a7 100644 --- a/test/main.c +++ b/test/main.c @@ -444,6 +444,83 @@ allocator_thread(void* argp) { thread_exit((uintptr_t)ret); } +static void +heap_allocator_thread(void* argp) { + allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp; + unsigned int iloop = 0; + unsigned int ipass = 0; + unsigned int icheck = 0; + unsigned int id = 0; + void** addr; + uint32_t* data; + unsigned int cursize; + unsigned int iwait = 0; + int ret = 0; + + rpmalloc_heap_t* outer_heap = rpmalloc_heap_acquire(); + + addr = rpmalloc_heap_alloc(outer_heap, sizeof(void*) * arg.passes); + data = rpmalloc_heap_alloc(outer_heap, 512 * 1024); + for (id = 0; id < 512 * 1024 / 4; ++id) + data[id] = id; + + thread_sleep(1); + + for (iloop = 0; iloop < arg.loops; ++iloop) { + rpmalloc_heap_t* heap = rpmalloc_heap_acquire(); + + for (ipass = 0; ipass < arg.passes; ++ipass) { + cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024); + + addr[ipass] = rpmalloc_heap_alloc(heap, 4 + cursize); + if (addr[ipass] == 0) { + ret = test_fail("Allocation failed"); + goto end; + } + + *(uint32_t*)addr[ipass] = (uint32_t)cursize; + memcpy(pointer_offset(addr[ipass], 4), data, cursize); + + for (icheck = 0; icheck < ipass; ++icheck) { + if (addr[icheck] == addr[ipass]) { + ret = test_fail("Identical pointer returned from allocation"); + goto end; + } + if (addr[icheck] < addr[ipass]) { + if (pointer_offset(addr[icheck], *(uint32_t*)addr[icheck] + 4) > addr[ipass]) { + ret = test_fail("Invalid pointer inside another block returned from allocation"); + goto end; + } + } + else if (addr[icheck] > addr[ipass]) { + if (pointer_offset(addr[ipass], *(uint32_t*)addr[ipass] + 4) > addr[icheck]) { + ret = test_fail("Invalid pointer inside another block returned from allocation"); + goto end; + } + } + } + } + + for (ipass = 0; ipass < arg.passes; ++ipass) { + cursize = *(uint32_t*)addr[ipass]; + + if (memcmp(pointer_offset(addr[ipass], 4), data, cursize)) { + ret = test_fail("Data corrupted"); + goto end; + } + } + + rpmalloc_heap_free_all(heap); + rpmalloc_heap_release(heap); + } + + rpmalloc_heap_free_all(outer_heap); + rpmalloc_heap_release(outer_heap); + +end: + thread_exit((uintptr_t)ret); +} + static void crossallocator_thread(void* argp) { allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp; @@ -703,7 +780,7 @@ test_crossthread(void) { arg[ithread].datasize[6] = 3892 + iadd; arg[ithread].datasize[7] = 19 + iadd; arg[ithread].datasize[8] = 154 + iadd; - arg[ithread].datasize[9] = 39723 + iadd; + arg[ithread].datasize[9] = 9723 + iadd; arg[ithread].datasize[10] = 15 + iadd; arg[ithread].datasize[11] = 493 + iadd; arg[ithread].datasize[12] = 34 + iadd; @@ -805,11 +882,72 @@ test_threadspam(void) { return 0; } +static int +test_first_class_heaps(void) { + uintptr_t thread[32]; + uintptr_t threadres[32]; + unsigned int i; + size_t num_alloc_threads; + allocator_thread_arg_t arg; + + rpmalloc_initialize(); + + num_alloc_threads = _hardware_threads; + if (num_alloc_threads < 2) + num_alloc_threads = 2; + if (num_alloc_threads > 32) + num_alloc_threads = 32; + + arg.datasize[0] = 19; + arg.datasize[1] = 249; + arg.datasize[2] = 797; + arg.datasize[3] = 3058; + arg.datasize[4] = 47892; + arg.datasize[5] = 173902; + arg.datasize[6] = 389; + arg.datasize[7] = 19; + arg.datasize[8] = 2493; + arg.datasize[9] = 7979; + 
arg.datasize[10] = 3; + arg.datasize[11] = 79374; + arg.datasize[12] = 3432; + arg.datasize[13] = 548; + arg.datasize[14] = 38934; + arg.datasize[15] = 234; + arg.num_datasize = 16; + arg.loops = 100; + arg.passes = 4000; + + thread_arg targ; + targ.fn = heap_allocator_thread; + targ.arg = &arg; + for (i = 0; i < num_alloc_threads; ++i) + thread[i] = thread_run(&targ); + + thread_sleep(1000); + + for (i = 0; i < num_alloc_threads; ++i) + threadres[i] = thread_join(thread[i]); + + rpmalloc_finalize(); + + for (i = 0; i < num_alloc_threads; ++i) { + if (threadres[i]) + return -1; + } + + printf("Heap threaded tests passed\n"); + + return 0; +} + int test_run(int argc, char** argv) { (void)sizeof(argc); (void)sizeof(argv); test_initialize(); + if (test_first_class_heaps()) + return -1; if (test_alloc()) return -1; if (test_realloc()) From c576814c0d7ebb4c9c8303775b8565239ff25ac6 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 2 Jan 2020 11:58:54 +0100 Subject: [PATCH 14/69] Grow or fail flag for realloc (#132) --- rpmalloc/rpmalloc.c | 49 ++++++++++++++++++++++++--------------------- rpmalloc/rpmalloc.h | 4 ++++ test/main.c | 6 ++++++ 3 files changed, 36 insertions(+), 23 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index ce1952dd..0210635b 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1442,11 +1442,12 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { size_t align_mask = alignment - 1; if (alignment <= _memory_page_size) { ptr = _memory_allocate(heap, size + alignment); - if ((uintptr_t)ptr & align_mask) + if ((uintptr_t)ptr & align_mask) { ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + //Mark as having aligned blocks + span_t* span = (span_t*)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } return ptr; } @@ -1848,6 +1849,9 @@ _memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned oldsize = 0; } + if (!!(flags & RPMALLOC_GROW_OR_FAIL)) + return 0; + //Size is greater than block size, need to allocate a new block and deallocate the old //Avoid hysteresis by overallocating if increase is small (below 37%) size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); @@ -1863,27 +1867,26 @@ _memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned } static void* -_memory_aligned_reallocate(heap_t* heap, void* p, size_t alignment, size_t size, size_t oldsize, +_memory_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) { if (alignment <= SMALL_GRANULARITY) - return _memory_reallocate(heap, p, size, oldsize, flags); - - size_t usablesize = _memory_usable_size(p); - if ((usablesize >= size) && (size >= (usablesize / 2)) && !((uintptr_t)p & (alignment - 1))) - return p; - - void* block = _memory_aligned_allocate(heap, alignment, size); - if (p) { - if (!oldsize) - oldsize = usablesize; - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, p, oldsize < size ? 
oldsize : size); - _memory_deallocate(p); - } - if (block) { - //Mark as having aligned blocks - span_t* span = (span_t*)((uintptr_t)block & _memory_span_mask); - span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + return _memory_reallocate(heap, ptr, size, oldsize, flags); + + int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); + size_t usablesize = _memory_usable_size(ptr); + if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { + if (no_alloc || (size >= (usablesize / 2))) + return ptr; + } + // Aligned alloc marks span as having aligned blocks + void* block = (!no_alloc ? _memory_aligned_allocate(heap, alignment, size) : 0); + if (EXPECTED(block)) { + if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { + if (!oldsize) + oldsize = usablesize; + memcpy(block, ptr, oldsize < size ? oldsize : size); + } + rpfree(ptr); } return block; } diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 77b250eb..32e33e25 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -59,6 +59,10 @@ extern "C" { //! Flag to rpaligned_realloc to not preserve content in reallocation #define RPMALLOC_NO_PRESERVE 1 +//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be done in-place, +// in which case the original pointer is still valid (just like a call to realloc which failes to allocate +// a new block). +#define RPMALLOC_GROW_OR_FAIL 2 typedef struct rpmalloc_global_statistics_t { //! Current amount of virtual memory mapped, all of which might not have been committed (only if ENABLE_STATISTICS=1) diff --git a/test/main.c b/test/main.c index 9b4bf3a7..7732982a 100644 --- a/test/main.c +++ b/test/main.c @@ -111,6 +111,8 @@ test_alloc(void) { return test_fail("Bad usable size (aligned realloc)"); if (*((uintptr_t*)testptr) != 0x12345678) return test_fail("Data not preserved on realloc"); + if (rpaligned_realloc(testptr, 128, size * 1024 * 4, 0, RPMALLOC_GROW_OR_FAIL)) + return test_fail("Realloc with grow-or-fail did not fail as expected"); void* unaligned = rprealloc(testptr, size); if (unaligned != testptr) { ptrdiff_t diff = pointer_diff(testptr, unaligned); @@ -318,6 +320,10 @@ test_realloc(void) { while (bigsize < 3 * 1024 * 1024) { ++bigsize; bigptr = rprealloc(bigptr, bigsize); + if (rpaligned_realloc(bigptr, 0, bigsize * 32, 0, RPMALLOC_GROW_OR_FAIL)) + return test_fail("Reallocation with grow-or-fail did not fail as expected"); + if (rpaligned_realloc(bigptr, 128, bigsize * 32, 0, RPMALLOC_GROW_OR_FAIL)) + return test_fail("Reallocation with aligned grow-or-fail did not fail as expected"); } rpfree(bigptr); From 68c6ae5839ccb42b71e1c1f192d0f7603ae480d5 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 6 Jan 2020 23:10:14 +0100 Subject: [PATCH 15/69] Refactor span handling (#133) --- rpmalloc/rpmalloc.c | 631 ++++++++++++++++++++++---------------------- test/main.c | 24 +- 2 files changed, 333 insertions(+), 322 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 0210635b..7f8659ae 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -75,6 +75,10 @@ #endif #if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE +#if DISABLE_UNMAP +#undef ENABLE_UNLIMITED_GLOBAL_CACHE +#define ENABLE_UNLIMITED_GLOBAL_CACHE 1 +#endif #ifndef ENABLE_UNLIMITED_GLOBAL_CACHE //! 
Unlimited cache disables any global cache limitations #define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE @@ -128,6 +132,7 @@ # include # include # include +# include # if defined(__APPLE__) # include # include @@ -168,20 +173,16 @@ typedef volatile void* atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } -static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, 1) + 1; } +static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedIncrement(val); } #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE -static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedExchangeAdd(val, -1) - 1; } +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedDecrement(val); } static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } -static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)_InterlockedExchangeAdd64(val, add) - add; } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)_InterlockedExchangeAdd64(val, add) + add; } #endif static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } -# if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange64((volatile long long*)dst, (long long)val, (long long)ref) == (long long)ref) ? 1 : 0; } -#else -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchange((volatile long*)dst, (long)val, (long)ref) == (long)ref) ? 1 : 0; } -#endif +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchangePointer ((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } #define EXPECTED(x) (x) #define UNEXPECTED(x) (x) @@ -241,7 +242,7 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref //! ABA protection size in orhpan heap list (also becomes limit of smallest page size) #define HEAP_ORPHAN_ABA_SIZE 512 //! Size of a span header (must be a multiple of SMALL_GRANULARITY) -#define SPAN_HEADER_SIZE 96 +#define SPAN_HEADER_SIZE 128 #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow @@ -254,6 +255,9 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define INVALID_POINTER ((void*)((uintptr_t)-1)) +#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT +#define SIZE_CLASS_HUGE ((uint32_t)-1) + /// Data types //! A memory heap, per thread typedef struct heap_t heap_t; @@ -329,15 +333,6 @@ struct size_class_use_t { typedef struct size_class_use_t size_class_use_t; #endif -typedef enum span_state_t { - SPAN_STATE_ACTIVE = 0, - SPAN_STATE_PARTIAL, - SPAN_STATE_FREE, - SPAN_STATE_FULL, - SPAN_STATE_LARGE, - SPAN_STATE_HUGE -} span_state_t; - //A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, //or a set of spans in a continuous region, a super span. 
Any reference to the term "span" usually refers to both a single //span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first @@ -349,28 +344,28 @@ typedef enum span_state_t { struct span_t { //! Free list void* free_list; - //! State - uint32_t state; - //! Used count when not active (not including deferred free list) - uint32_t used_count; - //! Block count + //! Total block count of size class uint32_t block_count; //! Size class uint32_t size_class; //! Index of last block initialized in free list uint32_t free_list_limit; - //! Span list size when part of a cache list, or size of deferred free list when partial/full - uint32_t list_size; + //! Number of used blocks remaining when in partial state + uint32_t used_count; //! Deferred free list atomicptr_t free_list_deferred; + //! Size of deferred free list, or list of spans when part of a cache list + uint32_t list_size; //! Size of a block uint32_t block_size; //! Flags and counters uint32_t flags; //! Number of spans uint32_t span_count; - //! Total span counter for master spans, distance for subspans - uint32_t total_spans_or_distance; + //! Total span counter for master spans + uint32_t total_spans; + //! Offset from master span for subspans + uint32_t offset_from_master; //! Remaining span counter, for master spans atomic32_t remaining_spans; //! Alignment offset @@ -388,12 +383,17 @@ struct heap_class_t { //! Free list of active span void* free_list; //! Double linked list of partially used spans with free blocks for each size class. - // Current active span is at head of list. Previous span pointer in head points to tail span of list. + // Previous span pointer in head points to tail span of list. span_t* partial_span; +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size class. + // Previous span pointer in head points to tail span of list. + span_t* full_span; +#endif }; struct heap_t { - //! Active and semi-used span data per size class + //! Partial span data per size class heap_class_t span_class[SIZE_CLASS_COUNT]; #if ENABLE_THREAD_CACHE //! List of free spans (single linked list) @@ -407,7 +407,7 @@ struct heap_t { #endif #if RPMALLOC_FIRST_CLASS_HEAPS //! Double linked list of large and huge spans allocated by this heap - span_t* full_span; + span_t* large_huge_span; #endif //! Mapped but unused spans span_t* span_reserve; @@ -423,6 +423,8 @@ struct heap_t { size_t align_offset; //! Heap ID int32_t id; + //! Master heap owning the memory pages + heap_t* master_heap; #if ENABLE_STATISTICS //! Number of bytes transitioned thread -> global atomic64_t thread_to_global; @@ -431,8 +433,6 @@ struct heap_t { //! Allocation stats per size class size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; #endif - //! 
Master heap owning the memory pages - heap_t* master_heap; }; struct size_class_t { @@ -645,7 +645,7 @@ _memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); if (subspan != master) { subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->total_spans_or_distance = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); subspan->align_offset = 0; } subspan->span_count = (uint32_t)span_count; @@ -686,7 +686,7 @@ _memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, s //! Setup a newly mapped span static void _memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { - span->total_spans_or_distance = (uint32_t)total_span_count; + span->total_spans = (uint32_t)total_span_count; span->span_count = (uint32_t)span_count; span->align_offset = (uint32_t)align_offset; span->flags = SPAN_FLAG_MASTER; @@ -735,7 +735,7 @@ _memory_unmap_span(span_t* span) { assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->total_spans_or_distance * _memory_span_size))); + span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size))); assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); assert(master->flags & SPAN_FLAG_MASTER); @@ -758,10 +758,10 @@ _memory_unmap_span(span_t* span) { assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); size_t unmap_count = master->span_count; if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans_or_distance; + unmap_count = master->total_spans; _memory_statistics_sub(&_reserved_spans, unmap_count); _memory_statistics_sub(&_master_spans, 1); - _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans_or_distance * _memory_span_size); + _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans * _memory_span_size); } } @@ -865,13 +865,17 @@ _memory_span_double_link_list_add_tail(span_t** head, span_t* span) { //! Pop head span from double linked list static void -_memory_span_double_link_list_pop_head(span_t** head) { - span_t* span = *head; +_memory_span_double_link_list_pop_head(span_t** head, span_t* span) { + assert(*head == span); + span = *head; *head = span->next; if (*head) { //Maintain pointer to tail span + assert(span->prev != span); assert(span->next->prev == span); (*head)->prev = span->prev; + } else { + assert(span->prev == span); } } @@ -880,7 +884,7 @@ static void _memory_span_double_link_list_remove(span_t** head, span_t* span) { assert(*head); if (UNEXPECTED(*head == span)) { - _memory_span_double_link_list_pop_head(head); + _memory_span_double_link_list_pop_head(head, span); } else { span_t* next_span = span->next; span_t* prev_span = span->prev; @@ -979,7 +983,7 @@ _memory_global_cache_extract(size_t span_count) { static void _memory_deallocate_huge(span_t*); -//! Adopt the deferred span cache list, returning the first +//! 
Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use static void _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { atomic_thread_fence_acquire(); @@ -992,26 +996,33 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { while (span) { span_t* next_span = (span_t*)span->free_list; assert(span->heap == heap); - assert(span->state >= SPAN_STATE_FULL); - if (UNEXPECTED(span->size_class == (uint32_t)-1)) { - _memory_deallocate_huge(span); - } else { - uint32_t idx = span->span_count - 1; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap->full_span, span); + heap_class_t* heap_class = heap->span_class + span->size_class; + _memory_span_double_link_list_remove(&heap_class->full_span, span); #endif - span->state = SPAN_STATE_FREE; - if (!idx && single_span && !*single_span) { + if (single_span && !*single_span) { *single_span = span; } else { - _memory_statistics_dec(&heap->span_use[idx].current); + _memory_statistics_dec(&heap->span_use[0].current); _memory_statistics_dec(&heap->size_class_use[span->size_class].spans_current); -#if ENABLE_THREAD_CACHE - _memory_statistics_inc(&heap->size_class_use[span->size_class].spans_to_cache); - _memory_span_list_push(&heap->span_cache[idx], span); -#else - _memory_unmap_span(span); + _memory_heap_cache_insert(heap, span); + } + } else { +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_remove(&heap->large_huge_span, span); #endif + if (span->size_class == SIZE_CLASS_HUGE) { + _memory_deallocate_huge(span); + } else { + assert(span->size_class == SIZE_CLASS_LARGE); + uint32_t idx = span->span_count - 1; + if (!idx && single_span && !*single_span) { + *single_span = span; + } else { + _memory_statistics_dec(&heap->span_use[idx].current); + _memory_heap_cache_insert(heap, span); + } } } span = next_span; @@ -1135,16 +1146,8 @@ _memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_id //! Move the span (used for small or medium allocations) to the heap thread cache static void _memory_span_release_to_cache(heap_t* heap, span_t* span) { - heap_class_t* heap_class = heap->span_class + span->size_class; - assert(heap_class->partial_span != span); assert(heap == span->heap); - if (span->state == SPAN_STATE_PARTIAL) - _memory_span_double_link_list_remove(&heap_class->partial_span, span); -#if RPMALLOC_FIRST_CLASS_HEAPS - if (UNEXPECTED(span->state >= SPAN_STATE_FULL)) - _memory_span_double_link_list_remove(&heap->full_span, span); -#endif - span->state = SPAN_STATE_FREE; + assert(span->size_class < SIZE_CLASS_COUNT); #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS atomic_decr32(&heap->span_use[0].current); #endif @@ -1186,76 +1189,49 @@ free_list_partial_init(void** list, void** first_block, void* page_start, void* return block_count; } -//! Initialize an unused span (from cache or mapped) to be new active span +//! 
Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list static void* -_memory_span_set_new_active(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { +_memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { assert(span->span_count == 1); size_class_t* size_class = _memory_size_class + class_idx; span->size_class = class_idx; span->heap = heap; span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_count = size_class->block_count; span->block_size = size_class->block_size; - span->state = SPAN_STATE_ACTIVE; + span->block_count = size_class->block_count; + span->used_count = size_class->block_count; span->free_list = 0; + span->list_size = 0; + atomic_store_ptr(&span->free_list_deferred, 0); + atomic_thread_fence_release(); //Setup free list. Only initialize one system page worth of free blocks in list void* block; span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); - atomic_store_ptr(&span->free_list_deferred, 0); - span->list_size = 0; - atomic_thread_fence_release(); - - _memory_span_double_link_list_add(&heap_class->partial_span, span); - return block; -} - -//! Promote a partially used span (from heap used list) to be new active span -static void -_memory_span_set_partial_active(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_PARTIAL); - assert(span->block_count == _memory_size_class[span->size_class].block_count); - //Move data to heap size class and set span as active - heap_class->free_list = span->free_list; - span->state = SPAN_STATE_ACTIVE; - span->free_list = 0; - assert(heap_class->free_list); -} - -//! Mark span as full (from active) -static void -_memory_span_set_active_full(heap_class_t* heap_class, span_t* span) { - assert(span->state == SPAN_STATE_ACTIVE); - assert(span == heap_class->partial_span); - _memory_span_double_link_list_pop_head(&heap_class->partial_span); - span->used_count = span->block_count; - span->state = SPAN_STATE_FULL; - span->free_list = 0; -} - -//! 
Move span from full to partial state -static void -_memory_span_set_full_partial(heap_t* heap, span_t* span) { - assert(span->state == SPAN_STATE_FULL); + //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized + if (span->free_list_limit < span->block_count) + _memory_span_double_link_list_add(&heap_class->partial_span, span); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap->full_span, span); + else + _memory_span_double_link_list_add(&heap_class->full_span, span); #endif - heap_class_t* heap_class = &heap->span_class[span->size_class]; - span->state = SPAN_STATE_PARTIAL; - _memory_span_double_link_list_add_tail(&heap_class->partial_span, span); + return block; } -static void* -_memory_span_extract_deferred(span_t* span) { - void* free_list; +static void +_memory_span_extract_free_list_deferred(span_t* span) { do { - free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + span->free_list = atomic_load_ptr(&span->free_list_deferred); + } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, span->free_list)); span->list_size = 0; atomic_store_ptr(&span->free_list_deferred, 0); atomic_thread_fence_release(); - return free_list; +} + +static int +_memory_span_is_fully_utilized(span_t* span) { + return !span->free_list && (span->free_list_limit >= span->block_count); } //! Pop first block from a free list @@ -1270,59 +1246,50 @@ free_list_pop(void** list) { static void* _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { heap_class_t* heap_class = &heap->span_class[class_idx]; - void* block; - - span_t* active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - assert(active_span->state == SPAN_STATE_ACTIVE); - assert(active_span->block_count == _memory_size_class[active_span->size_class].block_count); - //Swap in free list if not empty - if (active_span->free_list) { - heap_class->free_list = active_span->free_list; - active_span->free_list = 0; - return free_list_pop(&heap_class->free_list); - } - //If the span did not fully initialize free list, link up another page worth of blocks - if (active_span->free_list_limit < active_span->block_count) { - void* block_start = pointer_offset(active_span, SPAN_HEADER_SIZE + (active_span->free_list_limit * active_span->block_size)); - active_span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, + span_t* span = heap_class->partial_span; + if (EXPECTED(span != 0)) { + assert(span->block_count == _memory_size_class[span->size_class].block_count); + assert(!_memory_span_is_fully_utilized(span)); + void* block; + if (span->free_list) { + //Swap in free list if not empty + heap_class->free_list = span->free_list; + span->free_list = 0; + block = free_list_pop(&heap_class->free_list); + } else { + //If the span did not fully initialize free list, link up another page worth of blocks + void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + (span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, - active_span->block_count - active_span->free_list_limit, active_span->block_size); - return block; + span->block_count - span->free_list_limit, span->block_size); } - //Swap in deferred free list + 
span->used_count = span->block_count; + + //Swap in deferred free list if present atomic_thread_fence_acquire(); - if (atomic_load_ptr(&active_span->free_list_deferred)) { - heap_class->free_list = _memory_span_extract_deferred(active_span); - return free_list_pop(&heap_class->free_list); - } + if (atomic_load_ptr(&span->free_list_deferred)) + _memory_span_extract_free_list_deferred(span); + + //If span is still not fully utilized keep it in partial list and early return block + if (!_memory_span_is_fully_utilized(span)) + return block; - //If the active span is fully allocated, mark span as free floating (fully allocated and not part of any list) - assert(!heap_class->free_list); - assert(active_span->free_list_limit >= active_span->block_count); - _memory_span_set_active_full(heap_class, active_span); + //The span is fully utilized, unlink from partial list and add to fully utilized list + _memory_span_double_link_list_pop_head(&heap_class->partial_span, span); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->full_span, active_span); + _memory_span_double_link_list_add(&heap_class->full_span, span); #endif + return block; } - assert(!heap_class->free_list); - - //Try promoting a semi-used span to active - active_span = heap_class->partial_span; - if (EXPECTED(active_span != 0)) { - _memory_span_set_partial_active(heap_class, active_span); - return free_list_pop(&heap_class->free_list); - } - assert(!heap_class->free_list); - assert(!heap_class->partial_span); //Find a span in one of the cache levels - active_span = _memory_heap_extract_new_span(heap, 1, class_idx); - if (!active_span) - return active_span; + span = _memory_heap_extract_new_span(heap, 1, class_idx); + if (EXPECTED(span != 0)) { + //Mark span as owned by this heap and set base data, return first block + return _memory_span_initialize_new(heap, heap_class, span, class_idx); + } - //Mark span as owned by this heap and set base data, return first block - return _memory_span_set_new_active(heap, heap_class, active_span, class_idx); + return 0; } //! 
Allocate a small sized memory block from the given heap @@ -1361,22 +1328,20 @@ _memory_allocate_large(heap_t* heap, size_t size) { size_t span_count = size >> _memory_span_size_shift; if (size & (_memory_span_size - 1)) ++span_count; - size_t idx = span_count - 1; //Find a span in one of the cache levels - span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_COUNT); + span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_LARGE); if (!span) return span; //Mark span as owned by this heap and set base data assert(span->span_count == span_count); - span->size_class = (uint32_t)(SIZE_CLASS_COUNT + idx); + span->size_class = SIZE_CLASS_LARGE; span->heap = heap; - span->state = SPAN_STATE_LARGE; atomic_thread_fence_release(); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->full_span, span); + _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif return pointer_offset(span, SPAN_HEADER_SIZE); @@ -1396,15 +1361,14 @@ _memory_allocate_huge(heap_t* heap, size_t size) { return span; //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; span->heap = heap; - span->state = SPAN_STATE_HUGE; _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->full_span, span); + _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif return pointer_offset(span, SPAN_HEADER_SIZE); @@ -1514,15 +1478,14 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { } //Store page count in span_count - span->size_class = (uint32_t)-1; + span->size_class = SIZE_CLASS_HUGE; span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; span->heap = heap; - span->state = SPAN_STATE_HUGE; _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->full_span, span); + _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif return ptr; @@ -1616,31 +1579,34 @@ _memory_allocate_heap(void) { //! 
Deallocate the given small/medium memory block in the current thread local heap static void _memory_deallocate_direct_small_or_medium(span_t* span, void* block) { + heap_t* heap = span->heap; assert(span->heap == get_thread_heap_raw()); - assert(span->state <= SPAN_STATE_FULL); - uint32_t state = span->state; //Add block to free list + if (UNEXPECTED(_memory_span_is_fully_utilized(span))) { + span->used_count = span->block_count; + heap_class_t* heap_class = &heap->span_class[span->size_class]; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_remove(&heap_class->full_span, span); +#endif + _memory_span_double_link_list_add_tail(&heap_class->partial_span, span); + } *((void**)block) = span->free_list; span->free_list = block; - if (UNEXPECTED(state == SPAN_STATE_ACTIVE)) - return; - heap_t* heap = span->heap; uint32_t used = --span->used_count; uint32_t free = span->list_size; - if (UNEXPECTED(used == free)) + if (UNEXPECTED(used == free)) { + heap_class_t* heap_class = &heap->span_class[span->size_class]; + _memory_span_double_link_list_remove(&heap_class->partial_span, span); _memory_span_release_to_cache(heap, span); - else if (UNEXPECTED(state == SPAN_STATE_FULL)) - _memory_span_set_full_partial(heap, span); + } } static void _memory_deallocate_defer_free_span(heap_t* heap, span_t* span) { //This list does not need ABA protection, no mutable side state - void* last_head; do { - last_head = atomic_load_ptr(&heap->span_free_deferred); - span->free_list = last_head; - } while (!atomic_cas_ptr(&heap->span_free_deferred, span, last_head)); + span->free_list = atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); } //! Put the block in the deferred free list of the owning span @@ -1652,15 +1618,14 @@ _memory_deallocate_defer_small_or_medium(span_t* span, void* block) { free_list = atomic_load_ptr(&span->free_list_deferred); *((void**)block) = free_list; } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); - ++span->list_size; - if ((span->state == SPAN_STATE_FULL) && (span->list_size == span->block_count)) { + uint32_t free_count = ++span->list_size; + atomic_store_ptr(&span->free_list_deferred, block); + if (free_count == span->block_count) { // Span was completely freed by this block. Due to the INVALID_POINTER spin lock // no other thread can reach this state simultaneously on this span. // Safe to move to owner heap deferred cache _memory_deallocate_defer_free_span(span->heap, span); - return; } - atomic_store_ptr(&span->free_list_deferred, block); } static void @@ -1682,9 +1647,7 @@ _memory_deallocate_small_or_medium(span_t* span, void* p) { //! 
Deallocate the given large memory block to the current heap static void _memory_deallocate_large(span_t* span) { - assert(span->span_count == ((size_t)span->size_class - SIZE_CLASS_COUNT + 1)); - assert(span->size_class >= SIZE_CLASS_COUNT); - assert(span->size_class - SIZE_CLASS_COUNT < LARGE_CLASS_COUNT); + assert(span->size_class == SIZE_CLASS_LARGE); assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); heap_t* heap = get_thread_heap_raw(); @@ -1693,7 +1656,7 @@ _memory_deallocate_large(span_t* span) { //always defer if from another heap since we cannot touch the list of another heap int defer = (heap != span->heap); #else - //Otherwise defer if different heap and span count is 1 + //Otherwise defer if different heap and span count is 1 to avoide too many span transitions int defer = ((heap != span->heap) && (span->span_count == 1)); #endif if (defer) { @@ -1701,22 +1664,21 @@ _memory_deallocate_large(span_t* span) { return; } #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap->full_span, span); + _memory_span_double_link_list_remove(&heap->large_huge_span, span); #endif - span->state = SPAN_STATE_FREE; #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS //Decrease counter size_t idx = span->span_count - 1; atomic_decr32(&span->heap->span_use[idx].current); #endif + span->heap = heap; if ((span->span_count > 1) && !heap->spans_reserved) { heap->span_reserve = span; heap->spans_reserved = span->span_count; if (span->flags & SPAN_FLAG_MASTER) { heap->span_reserve_master = span; } else { //SPAN_FLAG_SUBSPAN - uintptr_t distance = span->total_spans_or_distance; - span_t* master = (span_t*)pointer_offset(span, -(intptr_t)(distance * _memory_span_size)); + span_t* master = (span_t*)pointer_offset(span, -(intptr_t)(span->offset_from_master * _memory_span_size)); heap->span_reserve_master = master; assert(master->flags & SPAN_FLAG_MASTER); assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); @@ -1740,9 +1702,8 @@ _memory_deallocate_huge(span_t* span) { return; } - _memory_span_double_link_list_remove(&span->heap->full_span, span); + _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif - span->state = SPAN_STATE_FREE; //Oversized allocation, page count is stored in span_count size_t num_pages = span->span_count; @@ -1759,7 +1720,7 @@ _memory_deallocate(void* p) { return; if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) _memory_deallocate_small_or_medium(span, p); - else if (span->size_class != (uint32_t)-1) + else if (span->size_class == SIZE_CLASS_LARGE) _memory_deallocate_large(span); else _memory_deallocate_huge(span); @@ -1770,18 +1731,16 @@ static size_t _memory_usable_size(void* p) { //Grab the span using guaranteed span alignment span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->size_class != (uint32_t)-1) { + if (span->size_class < SIZE_CLASS_COUNT) { //Small/medium block - if (span->size_class < SIZE_CLASS_COUNT) { - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } - + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + if (span->size_class == SIZE_CLASS_LARGE) { //Large block - size_t current_spans = (span->size_class - SIZE_CLASS_COUNT) + 1; + size_t current_spans = 
span->span_count; return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); } - //Oversized block, page count is stored in span_count size_t current_pages = span->span_count; return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); @@ -1793,39 +1752,36 @@ _memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned if (p) { //Grab the span using guaranteed span alignment span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (EXPECTED(span->size_class != (uint32_t)-1)) { - if (span->size_class < SIZE_CLASS_COUNT) { - //Small/medium sized block - assert(span->span_count == 1); - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - uint32_t block_idx = block_offset / span->block_size; - void* block = pointer_offset(blocks_start, block_idx * span->block_size); - if (!oldsize) - oldsize = span->block_size - (uint32_t)pointer_diff(p, block); - if ((size_t)span->block_size >= size) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else { - //Large block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size >> _memory_span_size_shift; - if (total_size & (_memory_span_mask - 1)) - ++num_spans; - size_t current_spans = span->span_count; - assert(current_spans == ((span->size_class - SIZE_CLASS_COUNT) + 1)); - void* block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { - //Still fits in block, never mind trying to save memory, but preserve data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + //Small/medium sized block + assert(span->span_count == 1); + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void* block = pointer_offset(blocks_start, block_idx * span->block_size); + if (!oldsize) + oldsize = span->block_size - (uint32_t)pointer_diff(p, block); + if ((size_t)span->block_size >= size) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else if (span->size_class == SIZE_CLASS_LARGE) { + //Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + void* block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { + //Still fits in block, never mind trying to save memory, but preserve data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; } } else { //Oversized block @@ -1880,7 +1836,7 @@ _memory_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, 
size_t siz } // Aligned alloc marks span as having aligned blocks void* block = (!no_alloc ? _memory_aligned_allocate(heap, alignment, size) : 0); - if (EXPECTED(block)) { + if (EXPECTED(block != 0)) { if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { if (!oldsize) oldsize = usablesize; @@ -2180,6 +2136,34 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { return 0; } +static span_t* +_memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t* class_span, uint32_t class_free_blocks) { + (void)sizeof(heap); + (void)sizeof(iclass); + uint32_t free_blocks = span->list_size; + if (span == class_span) { + free_blocks += class_free_blocks; + class_span = 0; + class_free_blocks = 0; + } + uint32_t block_count = span->block_count; + if (span->free_list_limit < span->block_count) + block_count = span->free_list_limit; + void* block = span->free_list; + while (block) { + ++free_blocks; + block = *((void**)block); + } + //If this assert triggers you have memory leaks + assert(free_blocks == block_count); + if (free_blocks == block_count) { + _memory_statistics_dec(&heap->span_use[0].current); + _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); + _memory_unmap_span(span); + } + return class_span; +} + //! Finalize the allocator void rpmalloc_finalize(void) { @@ -2198,44 +2182,51 @@ rpmalloc_finalize(void) { _memory_unmap_span(span); } + _memory_heap_cache_adopt_deferred(heap, 0); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { heap_class_t* heap_class = heap->span_class + iclass; + + span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); + uint32_t class_free_blocks = 0; + void* block = heap_class->free_list; + while (block) { + ++class_free_blocks; + block = *((void**)block); + } + span_t* span = heap_class->partial_span; while (span) { span_t* next = span->next; - if (span->state == SPAN_STATE_ACTIVE) { - uint32_t used_blocks = span->block_count; - if (span->free_list_limit < span->block_count) - used_blocks = span->free_list_limit; - uint32_t free_blocks = 0; - void* block = heap_class->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); - } - block = span->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); - } - if (used_blocks == (free_blocks + span->list_size)) { - _memory_heap_cache_insert(heap, span); - _memory_statistics_dec(&heap->span_use[0].current); - _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); - } - } else { - if (span->used_count == span->list_size) { - _memory_heap_cache_insert(heap, span); - _memory_statistics_dec(&heap->span_use[0].current); - _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); - } - } + class_span = _memory_span_finalize(heap, iclass, span, class_span, class_free_blocks); span = next; } +#if RPMALLOC_FIRST_CLASS_HEAPS + span = heap_class->full_span; + while (span) { + span_t* next = span->next; + class_span = _memory_span_finalize(heap, iclass, span, class_span, class_free_blocks); + span = next; + } +#endif + if (class_span) + class_span = _memory_span_finalize(heap, iclass, class_span, class_span, class_free_blocks); } - //Free span caches (other thread might have deferred after the thread using this heap finalized) - _memory_heap_cache_adopt_deferred(heap, 0); +#if RPMALLOC_FIRST_CLASS_HEAPS + span_t* span = heap->large_huge_span; + while (span) { + span_t* next = span->next; + if (span->size_class == SIZE_CLASS_HUGE) { + _memory_deallocate_huge(span); + } else { + 
_memory_statistics_dec(&heap->span_use[span->span_count - 1].current); + _memory_unmap_span(span); + } + span = next; + } +#endif + #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { if (heap->span_cache[iclass]) { @@ -2244,6 +2235,8 @@ rpmalloc_finalize(void) { } } #endif + assert(!atomic_load_ptr(&heap->span_free_deferred)); + heap_t* next_heap = heap->next_heap; if (!heap->master_heap) { heap->next_heap = master_heaps; @@ -2544,8 +2537,10 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { while (span) { atomic_thread_fence_acquire(); size_t free_count = span->list_size; - if (span->state == SPAN_STATE_PARTIAL) - free_count += (size_class->block_count - span->used_count); + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); stats->sizecache = free_count * size_class->block_size; span = span->next; } @@ -2555,36 +2550,40 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { if (heap->span_cache[iclass]) stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; - span_t* deferred_list = !iclass ? (span_t*)atomic_load_ptr(&heap->span_free_deferred) : 0; - //TODO: Incorrect, for deferred lists the size is NOT stored in list_size - if (deferred_list) - stats->spancache = (size_t)deferred_list->list_size * (iclass + 1) * _memory_span_size; } #endif + + span_t* deferred = (span_t*)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache = (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t*)deferred->free_list; + } + #if ENABLE_STATISTICS - stats->thread_to_global = heap->thread_to_global; - stats->global_to_thread = heap->global_to_thread; + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { stats->span_use[iclass].current = (size_t)atomic_load32(&heap->span_use[iclass].current); - stats->span_use[iclass].peak = (size_t)heap->span_use[iclass].high; - stats->span_use[iclass].to_global = (size_t)heap->span_use[iclass].spans_to_global; - stats->span_use[iclass].from_global = (size_t)heap->span_use[iclass].spans_from_global; - stats->span_use[iclass].to_cache = (size_t)heap->span_use[iclass].spans_to_cache; - stats->span_use[iclass].from_cache = (size_t)heap->span_use[iclass].spans_from_cache; - stats->span_use[iclass].to_reserved = (size_t)heap->span_use[iclass].spans_to_reserved; - stats->span_use[iclass].from_reserved = (size_t)heap->span_use[iclass].spans_from_reserved; - stats->span_use[iclass].map_calls = (size_t)heap->span_use[iclass].spans_map_calls; + stats->span_use[iclass].peak = (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); 
+ stats->span_use[iclass].from_reserved = (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); } for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { stats->size_use[iclass].alloc_current = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); stats->size_use[iclass].alloc_peak = (size_t)heap->size_class_use[iclass].alloc_peak; - stats->size_use[iclass].alloc_total = (size_t)heap->size_class_use[iclass].alloc_total; + stats->size_use[iclass].alloc_total = (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); stats->size_use[iclass].free_total = (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); - stats->size_use[iclass].spans_to_cache = (size_t)heap->size_class_use[iclass].spans_to_cache; - stats->size_use[iclass].spans_from_cache = (size_t)heap->size_class_use[iclass].spans_from_cache; - stats->size_use[iclass].spans_from_reserved = (size_t)heap->size_class_use[iclass].spans_from_reserved; - stats->size_use[iclass].map_calls = (size_t)heap->size_class_use[iclass].spans_map_calls; + stats->size_use[iclass].spans_to_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); } #endif } @@ -2614,43 +2613,43 @@ _memory_heap_dump_statistics(heap_t* heap, void* file) { fprintf(file, "Heap %d stats:\n", heap->id); fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB FromCacheMiB FromReserveMiB MmapCalls\n"); for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (!heap->size_class_use[iclass].alloc_total) + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) continue; fprintf(file, "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu %9u\n", (uint32_t)iclass, atomic_load32(&heap->size_class_use[iclass].alloc_current), heap->size_class_use[iclass].alloc_peak, - heap->size_class_use[iclass].alloc_total, + atomic_load32(&heap->size_class_use[iclass].alloc_total), atomic_load32(&heap->size_class_use[iclass].free_total), _memory_size_class[iclass].block_size, _memory_size_class[iclass].block_count, - heap->size_class_use[iclass].spans_current, + atomic_load32(&heap->size_class_use[iclass].spans_current), heap->size_class_use[iclass].spans_peak, ((size_t)heap->size_class_use[iclass].alloc_peak * (size_t)_memory_size_class[iclass].block_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_to_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_cache * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->size_class_use[iclass].spans_from_reserved * _memory_span_size) / (size_t)(1024 * 1024), - heap->size_class_use[iclass].spans_map_calls); + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_reserved) * _memory_span_size) / (size_t)(1024 * 1024), + 
atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); } fprintf(file, "Spans Current Peak PeakMiB Cached ToCacheMiB FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB FromGlobalMiB MmapCalls\n"); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + if (!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls)) continue; fprintf(file, "%4u: %8d %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current), - heap->span_use[iclass].high, - ((size_t)heap->span_use[iclass].high * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].high), + ((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), #if ENABLE_THREAD_CACHE heap->span_cache[iclass] ? heap->span_cache[iclass]->list_size : 0, - ((size_t)heap->span_use[iclass].spans_to_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_cache * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), #else 0, 0ULL, 0ULL, #endif - ((size_t)heap->span_use[iclass].spans_to_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_reserved * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_to_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - ((size_t)heap->span_use[iclass].spans_from_global * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), - heap->span_use[iclass].spans_map_calls); + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].spans_map_calls)); } fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); fprintf(file, "%17zu %17zu\n", (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); @@ -2668,15 +2667,15 @@ rpmalloc_dump_statistics(void* file) { while (heap) { int need_dump = 0; for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) { - if (!heap->size_class_use[iclass].alloc_total) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { assert(!atomic_load32(&heap->size_class_use[iclass].free_total)); - assert(!heap->size_class_use[iclass].spans_map_calls); + assert(!atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); continue; } need_dump = 1; } for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); ++iclass) { - if (!heap->span_use[iclass].high && !heap->span_use[iclass].spans_map_calls) + if 
(!atomic_load32(&heap->span_use[iclass].high) && !atomic_load32(&heap->span_use[iclass].spans_map_calls)) continue; need_dump = 1; } @@ -2816,6 +2815,9 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { heap_t* heap = (heap_t*)heapptr; span_t* span; span_t* next_span; + + _memory_heap_cache_adopt_deferred(heap, 0); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { span = heap->span_class[iclass].partial_span; while (span) { @@ -2823,17 +2825,26 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { _memory_heap_cache_insert(heap, span); span = next_span; } + span = heap->span_class[iclass].full_span; + while (span) { + next_span = span->next; + _memory_heap_cache_insert(heap, span); + span = next_span; + } } memset(heap->span_class, 0, sizeof(heap->span_class)); - span = heap->full_span; + span = heap->large_huge_span; while (span) { next_span = span->next; - _memory_heap_cache_insert(heap, span); + if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) + _memory_deallocate_huge(span); + else + _memory_heap_cache_insert(heap, span); span = next_span; } + heap->large_huge_span = 0; - _memory_heap_cache_adopt_deferred(heap, 0); #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span = heap->span_cache[iclass]; @@ -2858,7 +2869,7 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { #if ENABLE_STATISTICS for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); - heap->size_class_use[iclass].spans_current = 0; + atomic_store32(&heap->size_class_use[iclass].spans_current, 0); } for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { atomic_store32(&heap->span_use[iclass].current, 0 ); diff --git a/test/main.c b/test/main.c index 7732982a..049e0736 100644 --- a/test/main.c +++ b/test/main.c @@ -548,7 +548,7 @@ crossallocator_thread(void* argp) { for (iloop = 0; iloop < arg.loops; ++iloop) { for (ipass = 0; ipass < arg.passes; ++ipass) { size_t iarg = (iloop + ipass + iextra++) % arg.num_datasize; - cursize = arg.datasize[iarg] + ((iloop + ipass) % 21); + cursize = arg.datasize[iarg] + ((iloop + ipass) % 439); void* first_addr = rpmalloc(cursize); if (first_addr == 0) { ret = test_fail("Allocation failed"); @@ -564,7 +564,7 @@ crossallocator_thread(void* argp) { } iarg = (iloop + ipass + iextra++) % arg.num_datasize; - cursize = arg.datasize[iarg] + ((iloop + ipass) % 17); + cursize = arg.datasize[iarg] + ((iloop + ipass) % 751); void* third_addr = rpmalloc(cursize); if (third_addr == 0) { ret = test_fail("Allocation failed"); @@ -759,17 +759,17 @@ test_threaded(void) { static int test_crossthread(void) { - uintptr_t thread[8]; - allocator_thread_arg_t arg[8]; - thread_arg targ[8]; + uintptr_t thread[32]; + allocator_thread_arg_t arg[32]; + thread_arg targ[32]; rpmalloc_initialize(); size_t num_alloc_threads = _hardware_threads; if (num_alloc_threads < 2) num_alloc_threads = 2; - if (num_alloc_threads > 4) - num_alloc_threads = 4; + if (num_alloc_threads > 16) + num_alloc_threads = 16; for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) { unsigned int iadd = (ithread * (16 + ithread) + ithread) % 128; @@ -787,10 +787,10 @@ test_crossthread(void) { arg[ithread].datasize[7] = 19 + iadd; arg[ithread].datasize[8] = 154 + iadd; arg[ithread].datasize[9] = 9723 + iadd; - arg[ithread].datasize[10] = 15 + iadd; - arg[ithread].datasize[11] = 493 + iadd; + arg[ithread].datasize[10] = 15543 + iadd; + arg[ithread].datasize[11] = 32493 + iadd; 
arg[ithread].datasize[12] = 34 + iadd; - arg[ithread].datasize[13] = 894 + iadd; + arg[ithread].datasize[13] = 1894 + iadd; arg[ithread].datasize[14] = 193 + iadd; arg[ithread].datasize[15] = 2893 + iadd; arg[ithread].num_datasize = 16; @@ -952,8 +952,6 @@ test_run(int argc, char** argv) { (void)sizeof(argc); (void)sizeof(argv); test_initialize(); - if (test_first_class_heaps()) - return -1; if (test_alloc()) return -1; if (test_realloc()) @@ -966,6 +964,8 @@ test_run(int argc, char** argv) { return -1; if (test_threaded()) return -1; + if (test_first_class_heaps()) + return -1; printf("All tests passed\n"); return 0; } From 80cdb3bd4c600ed98b98a0ba4bc5f9e843712a51 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 7 Jan 2020 10:55:18 +0100 Subject: [PATCH 16/69] improve first class heap orphan handling (#134) --- rpmalloc/rpmalloc.c | 100 +++++++++++++++++++++----------------------- test/main.c | 92 ++++++++++++++++++++++++++++------------ 2 files changed, 112 insertions(+), 80 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 7f8659ae..9c203d8a 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -500,6 +500,10 @@ static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; //! Orphaned heaps static atomicptr_t _memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static atomicptr_t _memory_first_class_orphan_heaps; +#endif //! Running orphan counter to avoid ABA issues in linked list static atomic32_t _memory_orphan_counter; #if ENABLE_STATISTICS @@ -837,64 +841,33 @@ static void _memory_span_double_link_list_add(span_t** head, span_t* span) { if (*head) { span->next = *head; - //Maintain pointer to tail span - span->prev = (*head)->prev; (*head)->prev = span; } else { span->next = 0; - span->prev = span; } *head = span; } -//! Add a span to double linked list at the tail -static void -_memory_span_double_link_list_add_tail(span_t** head, span_t* span) { - span->next = 0; - if (*head) { - span_t* tail = (*head)->prev; - tail->next = span; - span->prev = tail; - //Maintain pointer to tail span - (*head)->prev = span; - } else { - span->prev = span; - *head = span; - } -} - //! Pop head span from double linked list static void _memory_span_double_link_list_pop_head(span_t** head, span_t* span) { assert(*head == span); span = *head; *head = span->next; - if (*head) { - //Maintain pointer to tail span - assert(span->prev != span); - assert(span->next->prev == span); - (*head)->prev = span->prev; - } else { - assert(span->prev == span); - } } //! Remove a span from double linked list static void _memory_span_double_link_list_remove(span_t** head, span_t* span) { assert(*head); - if (UNEXPECTED(*head == span)) { - _memory_span_double_link_list_pop_head(head, span); + if (*head == span) { + *head = span->next; } else { span_t* next_span = span->next; span_t* prev_span = span->prev; prev_span->next = next_span; if (EXPECTED(next_span != 0)) { next_span->prev = prev_span; - } else { - //Update pointer to tail span - assert((*head)->prev == span); - (*head)->prev = prev_span; } } } @@ -1509,16 +1482,22 @@ _memory_heap_initialize(heap_t* heap) { } static void -_memory_heap_orphan(heap_t* heap) { +_memory_heap_orphan(heap_t* heap, int first_class) { void* raw_heap; uintptr_t orphan_counter; heap_t* last_heap; +#if RPMALLOC_FIRST_CLASS_HEAPS + atomicptr_t* heap_list = (first_class ? 
&_memory_first_class_orphan_heaps : &_memory_orphan_heaps); +#else + (void)sizeof(first_class); + atomicptr_t* heap_list = &_memory_orphan_heaps; +#endif do { - last_heap = (heap_t*)atomic_load_ptr(&_memory_orphan_heaps); + last_heap = (heap_t*)atomic_load_ptr(heap_list); heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, raw_heap, last_heap)); + } while (!atomic_cas_ptr(heap_list, raw_heap, last_heap)); } //! Allocate a new heap from newly mapped memory pages @@ -1543,36 +1522,45 @@ _memory_allocate_heap_new(void) { while (num_heaps > 1) { _memory_heap_initialize(extra_heap); extra_heap->master_heap = heap; - _memory_heap_orphan(extra_heap); + _memory_heap_orphan(extra_heap, 1); extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); --num_heaps; } return heap; } -//! Allocate a new heap, potentially reusing a previously orphaned heap static heap_t* -_memory_allocate_heap(void) { +_memory_heap_extract_orphan(atomicptr_t* heap_list) { void* raw_heap; void* next_raw_heap; uintptr_t orphan_counter; heap_t* heap; heap_t* next_heap; - //Try getting an orphaned heap atomic_thread_fence_acquire(); do { - raw_heap = atomic_load_ptr(&_memory_orphan_heaps); + raw_heap = atomic_load_ptr(heap_list); heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); if (!heap) break; next_heap = heap->next_orphan; orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); - } while (!atomic_cas_ptr(&_memory_orphan_heaps, next_raw_heap, raw_heap)); + } while (!atomic_cas_ptr(heap_list, next_raw_heap, raw_heap)); + return heap; +} +//! 
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t* +_memory_allocate_heap(int first_class) { + heap_t* heap = 0; + if (first_class == 0) + heap = _memory_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _memory_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif if (!heap) heap = _memory_allocate_heap_new(); - return heap; } @@ -1588,7 +1576,7 @@ _memory_deallocate_direct_small_or_medium(span_t* span, void* block) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&heap_class->full_span, span); #endif - _memory_span_double_link_list_add_tail(&heap_class->partial_span, span); + _memory_span_double_link_list_add(&heap_class->partial_span, span); } *((void**)block) = span->free_list; span->free_list = block; @@ -1869,7 +1857,7 @@ _memory_adjust_size_class(size_t iclass) { } static void -_memory_heap_finalize(void* heapptr) { +_memory_heap_finalize(void* heapptr, int first_class) { heap_t* heap = (heap_t*)heapptr; if (!heap) return; @@ -1897,7 +1885,7 @@ _memory_heap_finalize(void* heapptr) { #endif //Orphan the heap - _memory_heap_orphan(heap); + _memory_heap_orphan(heap, first_class); set_thread_heap(0); #if ENABLE_STATISTICS @@ -1906,6 +1894,11 @@ _memory_heap_finalize(void* heapptr) { #endif } +static void +_memory_heap_finalize_raw(void* heapptr) { + _memory_heap_finalize(heapptr, 0); +} + #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) #include static DWORD fls_key; @@ -2085,7 +2078,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_span_release_count_large = (_memory_span_release_count > 8 ? (_memory_span_release_count / 4) : 2); #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize)) + if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize_raw)) return -1; #endif #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) @@ -2253,6 +2246,9 @@ rpmalloc_finalize(void) { #endif atomic_store_ptr(&_memory_orphan_heaps, 0); +#if RPMALLOC_FIRST_CLASS_HEAPS + atomic_store_ptr(&_memory_first_class_orphan_heaps, 0); +#endif atomic_thread_fence_release(); #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD @@ -2285,7 +2281,7 @@ rpmalloc_finalize(void) { extern inline void rpmalloc_thread_initialize(void) { if (!get_thread_heap_raw()) { - heap_t* heap = _memory_allocate_heap(); + heap_t* heap = _memory_allocate_heap(0); if (heap) { atomic_thread_fence_acquire(); _memory_statistics_inc(&_memory_active_heaps); @@ -2302,7 +2298,7 @@ void rpmalloc_thread_finalize(void) { heap_t* heap = get_thread_heap_raw(); if (heap) - _memory_heap_finalize(heap); + _memory_heap_finalize_raw(heap); } int @@ -2718,7 +2714,7 @@ rpmalloc_heap_acquire(void) { // Must be a pristine heap from newly mapped memory pages, or else memory blocks // could already be allocated from the heap which would (wrongly) be released when // heap is cleared with rpmalloc_heap_free_all() - heap_t* heap = _memory_allocate_heap_new(); + heap_t* heap = _memory_allocate_heap(1); _memory_statistics_inc(&_memory_active_heaps); return (rpmalloc_heap_t*)heap; } @@ -2726,7 +2722,7 @@ rpmalloc_heap_acquire(void) { extern inline void rpmalloc_heap_release(rpmalloc_heap_t* heap) { if (heap) - _memory_heap_finalize((heap_t*)heap); + _memory_heap_finalize((heap_t*)heap, 1); } extern inline RPMALLOC_ALLOCATOR void* @@ 
-2875,8 +2871,6 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { atomic_store32(&heap->span_use[iclass].current, 0 ); } #endif - - _memory_heap_orphan(heap); } extern inline void diff --git a/test/main.c b/test/main.c index 049e0736..877c5d30 100644 --- a/test/main.c +++ b/test/main.c @@ -30,6 +30,11 @@ test_fail_cb(const char* reason, const char* file, int line) { #define test_fail(msg) test_fail_cb(msg, __FILE__, __LINE__) +static void +defer_free_thread(void *arg) { + rpfree(arg); +} + static int test_alloc(void) { unsigned int iloop = 0; @@ -286,6 +291,20 @@ test_alloc(void) { } rpmalloc_finalize(); + // Test that a full span with deferred block is finalized properly + rpmalloc_initialize(); + { + addr[0] = rpmalloc(23457); + + thread_arg targ; + targ.fn = defer_free_thread; + targ.arg = addr[0]; + uintptr_t thread = thread_run(&targ); + thread_sleep(100); + thread_join(thread); + } + rpmalloc_finalize(); + printf("Memory allocation tests passed\n"); return 0; @@ -372,6 +391,7 @@ typedef struct _allocator_thread_arg { unsigned int num_datasize; //max 32 void** pointers; void** crossthread_pointers; + int init_fini_each_loop; } allocator_thread_arg_t; static void @@ -396,7 +416,13 @@ allocator_thread(void* argp) { thread_sleep(1); + if (arg.init_fini_each_loop) + rpmalloc_thread_finalize(); + for (iloop = 0; iloop < arg.loops; ++iloop) { + if (arg.init_fini_each_loop) + rpmalloc_thread_initialize(); + for (ipass = 0; ipass < arg.passes; ++ipass) { cursize = 4 + arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024); @@ -439,8 +465,14 @@ allocator_thread(void* argp) { rpfree(addr[ipass]); } + + if (arg.init_fini_each_loop) + rpmalloc_thread_finalize(); } + if (arg.init_fini_each_loop) + rpmalloc_thread_initialize(); + rpfree(data); rpfree(addr); @@ -733,6 +765,7 @@ test_threaded(void) { arg.num_datasize = 16; arg.loops = 100; arg.passes = 4000; + arg.init_fini_each_loop = 0; thread_arg targ; targ.fn = allocator_thread; @@ -894,41 +927,46 @@ test_first_class_heaps(void) { uintptr_t threadres[32]; unsigned int i; size_t num_alloc_threads; - allocator_thread_arg_t arg; + allocator_thread_arg_t arg[32]; rpmalloc_initialize(); - num_alloc_threads = _hardware_threads; + num_alloc_threads = _hardware_threads * 2; if (num_alloc_threads < 2) num_alloc_threads = 2; - if (num_alloc_threads > 32) - num_alloc_threads = 32; + if (num_alloc_threads > 16) + num_alloc_threads = 16; - arg.datasize[0] = 19; - arg.datasize[1] = 249; - arg.datasize[2] = 797; - arg.datasize[3] = 3058; - arg.datasize[4] = 47892; - arg.datasize[5] = 173902; - arg.datasize[6] = 389; - arg.datasize[7] = 19; - arg.datasize[8] = 2493; - arg.datasize[9] = 7979; - arg.datasize[10] = 3; - arg.datasize[11] = 79374; - arg.datasize[12] = 3432; - arg.datasize[13] = 548; - arg.datasize[14] = 38934; - arg.datasize[15] = 234; - arg.num_datasize = 16; - arg.loops = 100; - arg.passes = 4000; + for (i = 0; i < num_alloc_threads; ++i) { + arg[i].datasize[0] = 19; + arg[i].datasize[1] = 249; + arg[i].datasize[2] = 797; + arg[i].datasize[3] = 3058; + arg[i].datasize[4] = 47892; + arg[i].datasize[5] = 173902; + arg[i].datasize[6] = 389; + arg[i].datasize[7] = 19; + arg[i].datasize[8] = 2493; + arg[i].datasize[9] = 7979; + arg[i].datasize[10] = 3; + arg[i].datasize[11] = 79374; + arg[i].datasize[12] = 3432; + arg[i].datasize[13] = 548; + arg[i].datasize[14] = 38934; + arg[i].datasize[15] = 234; + arg[i].num_datasize = 16; + arg[i].loops = 100; + arg[i].passes = 4000; + arg[i].init_fini_each_loop = 1; + + thread_arg 
targ; + targ.fn = heap_allocator_thread; + if ((i % 2) != 0) + targ.fn = allocator_thread; + targ.arg = &arg[i]; - thread_arg targ; - targ.fn = heap_allocator_thread; - targ.arg = &arg; - for (i = 0; i < num_alloc_threads; ++i) thread[i] = thread_run(&targ); + } thread_sleep(1000); From a8db569ffb5a9965f1a218051b8282613c4d785b Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 7 Jan 2020 13:55:32 +0100 Subject: [PATCH 17/69] Improve aligned allocs up to 128 byte alignments (#135) --- CHANGELOG | 3 +++ rpmalloc/rpmalloc.c | 17 +++++++++++++++-- test/main.c | 19 ++++++++++++++++--- 3 files changed, 34 insertions(+), 5 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 3e24c064..a26b81d0 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -15,6 +15,9 @@ Added rpaligned_calloc function for aligned and zero intialized allocations Fixed natural alignment check in rpaligned_realloc to 16 bytes (check was 32, which is wrong) +Minor performance improvements for all code paths by simplified span handling, and for aligned allocations +with alignment less or equal to 128 bytes by utilizing natural block alignments + 1.4.0 diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 9c203d8a..f2ea0b14 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -217,7 +217,7 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #endif /// Preconfigured limits and sizes -//! Granularity of a small allocation block +//! Granularity of a small allocation block (must be power of two) #define SMALL_GRANULARITY 16 //! Small granularity shift count #define SMALL_GRANULARITY_SHIFT 4 @@ -241,9 +241,12 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) //! ABA protection size in orhpan heap list (also becomes limit of smallest page size) #define HEAP_ORPHAN_ABA_SIZE 512 -//! Size of a span header (must be a multiple of SMALL_GRANULARITY) +//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two) #define SPAN_HEADER_SIZE 128 +_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two"); +_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two"); + #if ENABLE_VALIDATE_ARGS //! Maximum allocation size to avoid integer overflow #undef MAX_ALLOC_SIZE @@ -1375,6 +1378,16 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { } #endif + if ((alignment <= SPAN_HEADER_SIZE) && (size < _memory_medium_size_limit)) { + // If alignment is less or equal to span header size (which is power of two), + // and size aligned to span header size multiples is less than size + alignment, + // then use natural alignment of blocks to provide alignment + size_t multiple_size = size ? 
(size + (SPAN_HEADER_SIZE - 1)) & ~(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE; + assert(!(multiple_size % SPAN_HEADER_SIZE)); + if (multiple_size <= (size + alignment)) + return _memory_allocate(heap, multiple_size); + } + void* ptr = 0; size_t align_mask = alignment - 1; if (alignment <= _memory_page_size) { diff --git a/test/main.c b/test/main.c index 877c5d30..5545f751 100644 --- a/test/main.c +++ b/test/main.c @@ -129,11 +129,23 @@ test_alloc(void) { rpfree(testptr); } - static size_t alignment[3] = { 0, 64, 256 }; + static size_t alignment[5] = { 0, 32, 64, 128, 256 }; + for (iloop = 0; iloop < 5; ++iloop) { + for (ipass = 0; ipass < 128 * 1024; ++ipass) { + size_t this_alignment = alignment[iloop]; + char* baseptr = rpaligned_alloc(this_alignment, ipass); + if (this_alignment && ((uintptr_t)baseptr & (this_alignment - 1))) + return test_fail("Alignment failed"); + rpfree(baseptr); + } + } for (iloop = 0; iloop < 64; ++iloop) { for (ipass = 0; ipass < 8142; ++ipass) { + size_t this_alignment = alignment[ipass % 5]; size_t size = iloop + ipass + datasize[(iloop + ipass) % 7]; - char* baseptr = rpaligned_alloc(alignment[ipass % 3], size); + char* baseptr = rpaligned_alloc(this_alignment, size); + if (this_alignment && ((uintptr_t)baseptr & (this_alignment - 1))) + return test_fail("Alignment failed"); for (size_t ibyte = 0; ibyte < size; ++ibyte) baseptr[ibyte] = (char)(ibyte & 0xFF); @@ -146,8 +158,9 @@ test_alloc(void) { } size_t alignsize = (iloop * ipass + datasize[(iloop + ipass * 3) % 7]) & 0x2FF; + this_alignment = alignment[(ipass + 1) % 5]; capsize = (capsize > alignsize ? alignsize : capsize); - baseptr = rpaligned_realloc(baseptr, 128, alignsize, resize, 0); + baseptr = rpaligned_realloc(baseptr, this_alignment, alignsize, resize, 0); for (size_t ibyte = 0; ibyte < capsize; ++ibyte) { if (baseptr[ibyte] != (char)(ibyte & 0xFF)) return test_fail("Data not preserved on realloc"); From 47b4ba06b795708f35aa2edff2eff9d99342789f Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 7 Jan 2020 21:49:09 +0100 Subject: [PATCH 18/69] clang compilation compatibility --- rpmalloc/rpmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index f2ea0b14..ab69a6f3 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1382,7 +1382,7 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { // If alignment is less or equal to span header size (which is power of two), // and size aligned to span header size multiples is less than size + alignment, // then use natural alignment of blocks to provide alignment - size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) & ~(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE; + size_t multiple_size = size ? 
(size + (SPAN_HEADER_SIZE - 1)) & ~(uintptr_t)(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE; assert(!(multiple_size % SPAN_HEADER_SIZE)); if (multiple_size <= (size + alignment)) return _memory_allocate(heap, multiple_size); From 6cfd116d7952aafb415ab9ea9b7f0f9b298d2485 Mon Sep 17 00:00:00 2001 From: John Regan Date: Wed, 8 Jan 2020 02:15:13 -0500 Subject: [PATCH 19/69] fix compilation when targeting OSX < 10.15 (#136) --- rpmalloc/malloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index 9511cf79..e5e2f010 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -114,7 +114,9 @@ __attribute__ ((section("__DATA, __interpose"))) = { MAC_INTERPOSE_PAIR(rpmalloc, calloc), MAC_INTERPOSE_PAIR(rprealloc, realloc), MAC_INTERPOSE_PAIR(rprealloc, reallocf), +#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15 MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), +#endif MAC_INTERPOSE_PAIR(rpmemalign, memalign), MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), MAC_INTERPOSE_PAIR(rpfree, free), From 439e3a449c739712c9a43bb77cd4a5664c737697 Mon Sep 17 00:00:00 2001 From: John Regan Date: Wed, 8 Jan 2020 02:16:19 -0500 Subject: [PATCH 20/69] Fix compilation with mingw (#137) * set define to allow "zu" format specifier * determine size of void* at compile-time with UINTPTR_MAX, prevents warning about overflow --- rpmalloc/rpmalloc.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index ab69a6f3..22fae379 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -124,6 +124,9 @@ # ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN # endif +# ifndef __USE_MINGW_ANSI_STDIO +# define __USE_MINGW_ANSI_STDIO 1 +# endif # include # if ENABLE_VALIDATE_ARGS # include @@ -132,7 +135,6 @@ # include # include # include -# include # if defined(__APPLE__) # include # include @@ -146,6 +148,7 @@ #include #include +#include #if ENABLE_ASSERTS # undef NDEBUG @@ -2045,10 +2048,12 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { //The ABA counter in heap orphan list is tied to using HEAP_ORPHAN_ABA_SIZE size_t min_span_size = HEAP_ORPHAN_ABA_SIZE; - size_t max_page_size = 4 * 1024 * 1024; - const size_t ptrbits = sizeof(void*); - if (ptrbits > 4) - max_page_size = 4096ULL * 1024ULL * 1024ULL; + size_t max_page_size; +#if UINTPTR_MAX > 0xFFFFFFFF + max_page_size = 4096ULL * 1024ULL * 1024ULL; +#else + max_page_size = 4 * 1024 * 1024; +#endif if (_memory_page_size < min_span_size) _memory_page_size = min_span_size; if (_memory_page_size > max_page_size) From 797132c5130ab35a52f99b4948f3bbccefd24730 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 8 Jan 2020 11:24:43 +0100 Subject: [PATCH 21/69] Sanitize size calculations --- rpmalloc/rpmalloc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 22fae379..1a91701c 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -771,7 +771,7 @@ _memory_unmap_span(span_t* span) { unmap_count = master->total_spans; _memory_statistics_sub(&_reserved_spans, unmap_count); _memory_statistics_sub(&_master_spans, 1); - _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, master->total_spans * _memory_span_size); + _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size); } } @@ -1145,7 +1145,7 @@ free_list_partial_init(void** list, void** 
first_block, void* page_start, void* *first_block = block_start; if (block_count > 1) { void* free_block = pointer_offset(block_start, block_size); - void* block_end = pointer_offset(block_start, block_size * block_count); + void* block_end = pointer_offset(block_start, (size_t)block_size * block_count); //If block size is less than half a memory page, bound init to next memory page boundary if (block_size < (_memory_page_size >> 1)) { void* page_end = pointer_offset(page_start, _memory_page_size); @@ -1237,7 +1237,7 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { block = free_list_pop(&heap_class->free_list); } else { //If the span did not fully initialize free list, link up another page worth of blocks - void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + (span->free_list_limit * span->block_size)); + void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size)); span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block, (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, span->block_count - span->free_list_limit, span->block_size); @@ -1682,7 +1682,7 @@ _memory_deallocate_large(span_t* span) { if (span->flags & SPAN_FLAG_MASTER) { heap->span_reserve_master = span; } else { //SPAN_FLAG_SUBSPAN - span_t* master = (span_t*)pointer_offset(span, -(intptr_t)(span->offset_from_master * _memory_span_size)); + span_t* master = (span_t*)pointer_offset(span, -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); heap->span_reserve_master = master; assert(master->flags & SPAN_FLAG_MASTER); assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); @@ -1762,9 +1762,9 @@ _memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); uint32_t block_idx = block_offset / span->block_size; - void* block = pointer_offset(blocks_start, block_idx * span->block_size); + void* block = pointer_offset(blocks_start, (size_t)block_idx * span->block_size); if (!oldsize) - oldsize = span->block_size - (uint32_t)pointer_diff(p, block); + oldsize = (size_t)span->block_size - pointer_diff(p, block); if ((size_t)span->block_size >= size) { //Still fits in block, never mind trying to save memory, but preserve data if alignment changed if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) From 8d3070ffc9fabc0aaeb1dcb38794a0c6fd723fb3 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 31 Jan 2020 13:54:01 +0100 Subject: [PATCH 22/69] Update atomics to avoid explicit barriers (#143) --- rpmalloc/rpmalloc.c | 79 +++++++++++++++++++++------------------------ rpmalloc/rpmalloc.h | 7 ++-- test/main.c | 1 - test/thread.c | 19 +---------- test/thread.h | 3 -- 5 files changed, 41 insertions(+), 68 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 1a91701c..3a6ef3fd 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -171,9 +171,6 @@ typedef volatile long atomic32_t; typedef volatile long long atomic64_t; typedef volatile void* atomicptr_t; -#define atomic_thread_fence_acquire() -#define atomic_thread_fence_release() - static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedIncrement(val); } @@ -185,7 
+182,9 @@ static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return ( static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchangePointer ((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } +static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_cas_ptr(dst, val, ref); } #define EXPECTED(x) (x) #define UNEXPECTED(x) (x) @@ -198,9 +197,6 @@ typedef volatile _Atomic(int32_t) atomic32_t; typedef volatile _Atomic(int64_t) atomic64_t; typedef volatile _Atomic(void*) atomicptr_t; -#define atomic_thread_fence_acquire() atomic_thread_fence(memory_order_acquire) -#define atomic_thread_fence_release() atomic_thread_fence(memory_order_release) - static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } @@ -212,7 +208,9 @@ static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return a static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); } -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_release, memory_order_acquire); } +static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_relaxed, memory_order_relaxed); } +static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); } #define EXPECTED(x) __builtin_expect((x), 1) #define UNEXPECTED(x) __builtin_expect((x), 0) @@ -612,10 +610,10 @@ _memory_unmap_os(void* address, size_t size, size_t offset, size_t release); #else # define _memory_statistics_inc(counter) do {} while(0) # define _memory_statistics_dec(counter) do {} while(0) -# define _memory_statistics_add(atomic_counter, value) do {} while(0) +# define _memory_statistics_add(counter, value) do {} while(0) # define _memory_statistics_add64(counter, value) do {} while(0) -# define _memory_statistics_add_peak(atomic_counter, value, peak) do {} while (0) -# define 
_memory_statistics_sub(atomic_counter, value) do {} while(0) +# define _memory_statistics_add_peak(counter, value, peak) do {} while (0) +# define _memory_statistics_sub(counter, value) do {} while(0) # define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) # define _memory_statistics_inc_free(heap, class_idx) do {} while(0) #endif @@ -700,7 +698,7 @@ _memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count span->span_count = (uint32_t)span_count; span->align_offset = (uint32_t)align_offset; span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); } //! Map an aligned set of spans, taking configured mapping granularity and the page size into account @@ -885,7 +883,9 @@ static void _memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { assert((span->list_size == 1) || (span->next != 0)); int32_t list_size = (int32_t)span->list_size; - //Unmap if cache has reached the limit + //Unmap if cache has reached the limit. Does not need stronger synchronization, the worst + //case is that the span list is unmapped when it could have been cached (no real dependency + //between the two variables) if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { #if !ENABLE_UNLIMITED_GLOBAL_CACHE _memory_unmap_span_list(span); @@ -965,13 +965,11 @@ static void _memory_deallocate_huge(span_t*); //! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use static void _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { - atomic_thread_fence_acquire(); span_t* span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); if (!span) return; - do { + while (!atomic_cas_ptr(&heap->span_free_deferred, 0, span)) span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); - } while (!atomic_cas_ptr(&heap->span_free_deferred, 0, span)); while (span) { span_t* next_span = (span_t*)span->free_list; assert(span->heap == heap); @@ -1181,8 +1179,7 @@ _memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span span->used_count = size_class->block_count; span->free_list = 0; span->list_size = 0; - atomic_store_ptr(&span->free_list_deferred, 0); - atomic_thread_fence_release(); + atomic_store_ptr_release(&span->free_list_deferred, 0); //Setup free list. Only initialize one system page worth of free blocks in list void* block; @@ -1200,12 +1197,14 @@ _memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span static void _memory_span_extract_free_list_deferred(span_t* span) { + // Here we do not need any acquire semantics on the CAS operation since we are not + // interested in the list size, we simply reset it to zero with release semantics on store. 
+ // Refer to _memory_deallocate_defer_small_or_medium for further comments on this dependency do { span->free_list = atomic_load_ptr(&span->free_list_deferred); } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, span->free_list)); span->list_size = 0; - atomic_store_ptr(&span->free_list_deferred, 0); - atomic_thread_fence_release(); + atomic_store_ptr_release(&span->free_list_deferred, 0); } static int @@ -1245,7 +1244,6 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { span->used_count = span->block_count; //Swap in deferred free list if present - atomic_thread_fence_acquire(); if (atomic_load_ptr(&span->free_list_deferred)) _memory_span_extract_free_list_deferred(span); @@ -1317,7 +1315,6 @@ _memory_allocate_large(heap_t* heap, size_t size) { assert(span->span_count == span_count); span->size_class = SIZE_CLASS_LARGE; span->heap = heap; - atomic_thread_fence_release(); #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap->large_huge_span, span); @@ -1552,7 +1549,6 @@ _memory_heap_extract_orphan(atomicptr_t* heap_list) { uintptr_t orphan_counter; heap_t* heap; heap_t* next_heap; - atomic_thread_fence_acquire(); do { raw_heap = atomic_load_ptr(heap_list); heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); @@ -1616,14 +1612,17 @@ _memory_deallocate_defer_free_span(heap_t* heap, span_t* span) { //! Put the block in the deferred free list of the owning span static void _memory_deallocate_defer_small_or_medium(span_t* span, void* block) { + // The memory ordering here is a bit tricky, to avoid having to ABA protect + // the deferred free list to avoid desynchronization of list and list size + // we need to have acquire semantics on successful CAS of the pointer to + // guarantee the list_size variable validity + release semantics on pointer store void* free_list; do { - atomic_thread_fence_acquire(); free_list = atomic_load_ptr(&span->free_list_deferred); *((void**)block) = free_list; - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, free_list)); + } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, free_list)); uint32_t free_count = ++span->list_size; - atomic_store_ptr(&span->free_list_deferred, block); + atomic_store_ptr_release(&span->free_list_deferred, block); if (free_count == span->block_count) { // Span was completely freed by this block. Due to the INVALID_POINTER spin lock // no other thread can reach this state simultaneously on this span. @@ -1764,7 +1763,7 @@ _memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned uint32_t block_idx = block_offset / span->block_size; void* block = pointer_offset(blocks_start, (size_t)block_idx * span->block_size); if (!oldsize) - oldsize = (size_t)span->block_size - pointer_diff(p, block); + oldsize = (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block)); if ((size_t)span->block_size >= size) { //Still fits in block, never mind trying to save memory, but preserve data if alignment changed if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) @@ -2178,8 +2177,6 @@ _memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t* class_s //! 
Finalize the allocator void rpmalloc_finalize(void) { - atomic_thread_fence_acquire(); - rpmalloc_thread_finalize(); //rpmalloc_dump_statistics(stderr); @@ -2267,7 +2264,6 @@ rpmalloc_finalize(void) { #if RPMALLOC_FIRST_CLASS_HEAPS atomic_store_ptr(&_memory_first_class_orphan_heaps, 0); #endif - atomic_thread_fence_release(); #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD pthread_key_delete(_memory_thread_heap); @@ -2301,7 +2297,6 @@ rpmalloc_thread_initialize(void) { if (!get_thread_heap_raw()) { heap_t* heap = _memory_allocate_heap(0); if (heap) { - atomic_thread_fence_acquire(); _memory_statistics_inc(&_memory_active_heaps); set_thread_heap(heap); #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) @@ -2549,7 +2544,6 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { heap_class_t* heap_class = heap->span_class + iclass; span_t* span = heap_class->partial_span; while (span) { - atomic_thread_fence_acquire(); size_t free_count = span->list_size; size_t block_count = size_class->block_count; if (span->free_list_limit < block_count) @@ -2734,13 +2728,13 @@ rpmalloc_heap_acquire(void) { // heap is cleared with rpmalloc_heap_free_all() heap_t* heap = _memory_allocate_heap(1); _memory_statistics_inc(&_memory_active_heaps); - return (rpmalloc_heap_t*)heap; + return heap; } extern inline void rpmalloc_heap_release(rpmalloc_heap_t* heap) { if (heap) - _memory_heap_finalize((heap_t*)heap, 1); + _memory_heap_finalize(heap, 1); } extern inline RPMALLOC_ALLOCATOR void* @@ -2751,7 +2745,7 @@ rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { return ptr; } #endif - return _memory_allocate((heap_t*)heap, size); + return _memory_allocate(heap, size); } extern inline RPMALLOC_ALLOCATOR void* @@ -2762,7 +2756,7 @@ rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size return ptr; } #endif - return _memory_aligned_allocate((heap_t*)heap, alignment, size); + return _memory_aligned_allocate(heap, alignment, size); } extern inline RPMALLOC_ALLOCATOR void* @@ -2790,7 +2784,7 @@ rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num #else total = num * size; #endif - void* block = _memory_aligned_allocate((heap_t*)heap, alignment, total); + void* block = _memory_aligned_allocate(heap, alignment, total); if (block) memset(block, 0, total); return block; @@ -2804,7 +2798,7 @@ rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned in return ptr; } #endif - return _memory_reallocate((heap_t*)heap, ptr, size, 0, flags); + return _memory_reallocate(heap, ptr, size, 0, flags); } extern inline RPMALLOC_ALLOCATOR void* @@ -2815,7 +2809,7 @@ rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment return 0; } #endif - return _memory_aligned_reallocate((heap_t*)heap, ptr, alignment, size, 0, flags); + return _memory_aligned_reallocate(heap, ptr, alignment, size, 0, flags); } extern inline void @@ -2825,8 +2819,7 @@ rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { } extern inline void -rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { - heap_t* heap = (heap_t*)heapptr; +rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { span_t* span; span_t* next_span; @@ -2893,9 +2886,9 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heapptr) { extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t* heap) { - rpmalloc_heap_t* prev_heap = (rpmalloc_heap_t*)get_thread_heap_raw(); + heap_t* prev_heap = get_thread_heap_raw(); if (prev_heap != 
heap) { - set_thread_heap((heap_t*)heap); + set_thread_heap(heap); if (prev_heap) rpmalloc_heap_release(prev_heap); } diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 32e33e25..bfbdbdea 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -20,11 +20,12 @@ extern "C" { #if defined(__clang__) || defined(__GNUC__) # define RPMALLOC_EXPORT __attribute__((visibility("default"))) # define RPMALLOC_ALLOCATOR -# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) -# if defined(__clang_major__) && (__clang_major__ < 4) +# if (defined(__clang_major__) && (__clang_major__ < 4)) || (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD) +# define RPMALLOC_ATTRIB_MALLOC # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) # else +# define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__)) # define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size))) # define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) __attribute__((alloc_size(count, size))) # endif @@ -279,7 +280,7 @@ rpmalloc_usable_size(void* ptr); #if RPMALLOC_FIRST_CLASS_HEAPS //! Heap type -typedef void* rpmalloc_heap_t; +typedef struct heap_t rpmalloc_heap_t; //! Acquire a new heap. Will reuse existing released heaps or allocate memory for a new heap // if none available. Heap API is imlemented with the strict assumption that only one single diff --git a/test/main.c b/test/main.c index 5545f751..c6338c8e 100644 --- a/test/main.c +++ b/test/main.c @@ -907,7 +907,6 @@ test_threadspam(void) { for (j = 0; j < num_passes; ++j) { thread_sleep(10); - thread_fence(); for (i = 0; i < num_alloc_threads; ++i) { threadres[i] = thread_join(thread[i]); diff --git a/test/thread.c b/test/thread.c index 15b8d1d2..5e3ad8da 100644 --- a/test/thread.c +++ b/test/thread.c @@ -23,12 +23,6 @@ thread_entry(void* argptr) { # include # include -#if !defined(__x86_64__) && !defined(_AMD64_) && !defined(_M_AMD64) && !defined(__i386__) -# define MEMORY_BARRIER __sync_synchronize() -#else -# define MEMORY_BARRIER __asm__ __volatile__("":::"memory") -#endif - static void* thread_entry(void* argptr) { thread_arg* arg = argptr; @@ -94,19 +88,8 @@ thread_sleep(int milliseconds) { void thread_yield(void) { #ifdef _WIN32 - Sleep(0); - _ReadWriteBarrier(); + SleepEx(0, 1); #else sched_yield(); - MEMORY_BARRIER; -#endif -} - -void -thread_fence(void) { -#ifdef _WIN32 - _ReadWriteBarrier(); -#else - MEMORY_BARRIER; #endif } diff --git a/test/thread.h b/test/thread.h index 5c0d873c..062970eb 100644 --- a/test/thread.h +++ b/test/thread.h @@ -26,9 +26,6 @@ thread_sleep(int milliseconds); extern void thread_yield(void); -extern void -thread_fence(void); - #ifdef __cplusplus } #endif From 313785648b57f24d0a9e847b7bae48025d91c674 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 31 Jan 2020 23:51:56 +0100 Subject: [PATCH 23/69] Refactor finalization to support global scope dynamic deallocation (#145) --- CHANGELOG | 3 + README.md | 12 +- rpmalloc/rpmalloc.c | 378 +++++++++++++++++++++++++------------------- 3 files changed, 221 insertions(+), 172 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index a26b81d0..2e1a037b 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -18,6 +18,9 @@ Fixed natural alignment check in rpaligned_realloc to 16 bytes (check was 32, wh Minor performance improvements for all code paths by simplified span handling, and for aligned allocations with alignment less or equal to 128 bytes by utilizing natural block alignments +Refactor finalization to be compatible with global scope 
data causing dynamic allocations and frees, like +C++ objects with custom ctors/dtors. + 1.4.0 diff --git a/README.md b/README.md index 2f3eab34..5543ddbc 100644 --- a/README.md +++ b/README.md @@ -17,7 +17,7 @@ The code should be easily portable to any platform with atomic operations and an This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license. # Performance -We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~2800 lines of C code. All allocations have a natural 16-byte alignment. +We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment. Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The setup of number of thread, cross-thread deallocation rate and allocation size limits is configured by command line arguments. @@ -50,7 +50,7 @@ Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replaceme If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero which will include the `malloc.c` file in compilation of __rpmalloc.c__. The list of libc entry points replaced may not be complete, use libc replacement only as a convenience for testing the library on an existing code base, not a final solution. -For explicit first class heaps, see the __rpmalloc_heap_*__ API under [first class heaps](#first-class-heaps) section +For explicit first class heaps, see the __rpmalloc_heap_*__ API under [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ tp be defined to 1. # Building To compile as a static library run the configure python script which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides. @@ -83,7 +83,9 @@ Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disa To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization of finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1. -To enable the runtime configurable memory page and span sizes, define __ENABLE_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB. 
+To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB. + +To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled. # Huge pages The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done. @@ -146,12 +148,12 @@ Since each thread cache maps spans of memory pages per size class, a thread that Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead). # Caveats -Cross-thread deallocations could leave dangling spans in the owning thread heap partially used list if the deallocation is the last used block in the span and the span is previously marked as partial (at least one block deallocated by the owning thread). However, an optimization for GC like use cases is that if all the blocks in the span are freed by other threads, the span can immediately be inserted in the owning thread span cache. - VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__. +To support global scope data doing dynamic allocation/deallocation such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on main thread. Any spans or heaps becoming free during this phase will be immediately unmapped to allow correct teardown of the process or dynamic library without any leaks. 
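A minimal sketch of the call sequence described above (illustration only, not part of this patch); in real use the late call to __rpfree__ would come from a global C++ destructor rather than from main:

    #include "rpmalloc.h"

    static void* global_block; /* stands in for global-scope data owning an allocation */

    int main(void) {
        rpmalloc_initialize();
        global_block = rpmalloc(128);
        rpmalloc_finalize();   /* caches emptied, allocator enters finalization mode */
        rpfree(global_block);  /* late free on the main thread: the span becomes free
                                  and is unmapped immediately, so the process exits
                                  without leaking the mapping */
        return 0;
    }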
+ # Other languages [Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 3a6ef3fd..7ad25af8 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -174,8 +174,8 @@ typedef volatile void* atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedIncrement(val); } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedDecrement(val); } +#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)_InterlockedExchangeAdd64(val, add) + add; } #endif @@ -200,8 +200,8 @@ typedef volatile _Atomic(void*) atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } +#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); } static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } #endif @@ -389,11 +389,9 @@ struct heap_class_t { //! Double linked list of partially used spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. span_t* partial_span; -#if RPMALLOC_FIRST_CLASS_HEAPS //! Double linked list of fully utilized spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. span_t* full_span; -#endif }; struct heap_t { @@ -413,6 +411,8 @@ struct heap_t { //! Double linked list of large and huge spans allocated by this heap span_t* large_huge_span; #endif + //! Number of full spans + size_t large_huge_span_count; //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans @@ -427,8 +427,12 @@ struct heap_t { size_t align_offset; //! Heap ID int32_t id; + //! Finalization state flag + int finalize; //! Master heap owning the memory pages heap_t* master_heap; + //! Child count + atomic32_t child_count; #if ENABLE_STATISTICS //! Number of bytes transitioned thread -> global atomic64_t thread_to_global; @@ -624,6 +628,9 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span); static void _memory_global_cache_insert(span_t* span); +static void +_memory_heap_finalize(heap_t* heap); + //! 
Map more virtual memory static void* _memory_map(size_t size, size_t* offset) { @@ -974,10 +981,8 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { span_t* next_span = (span_t*)span->free_list; assert(span->heap == heap); if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { -#if RPMALLOC_FIRST_CLASS_HEAPS heap_class_t* heap_class = heap->span_class + span->size_class; _memory_span_double_link_list_remove(&heap_class->full_span, span); -#endif if (single_span && !*single_span) { *single_span = span; } else { @@ -989,6 +994,7 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&heap->large_huge_span, span); #endif + --heap->large_huge_span_count; if (span->size_class == SIZE_CLASS_HUGE) { _memory_deallocate_huge(span); } else { @@ -1006,9 +1012,92 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { } } +static void +_memory_heap_global_finalize(heap_t* heap); + +static void +_memory_unlink_orphan_heap(atomicptr_t* list, heap_t* heap) { + heap_t* orphan = (heap_t*)((uintptr_t)atomic_load_ptr(list) & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + if (orphan == heap) { + //We're now in single-threaded finalization phase, no need to ABA protect or CAS + atomic_store_ptr(list, heap->next_orphan); + } else if (orphan) { + heap_t* last = orphan; + while (orphan && (orphan != heap)) { + last = orphan; + orphan = orphan->next_orphan; + } + if (orphan == heap) + last->next_orphan = heap->next_orphan; + } +} + +static void +_memory_unmap_heap(heap_t* heap) { + if (!heap->master_heap) { + if (!atomic_load32(&heap->child_count)) { + _memory_unlink_orphan_heap(&_memory_orphan_heaps, heap); +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_unlink_orphan_heap(&_memory_first_class_orphan_heaps, heap); +#endif + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + _memory_unmap(heap, block_size, heap->align_offset, block_size); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _memory_heap_global_finalize(heap->master_heap); + } + } +} + +static void +_memory_heap_global_finalize(heap_t* heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _memory_heap_finalize(heap); + if (heap->large_huge_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; + heap->span_cache[iclass] = 0; + if (span) + _memory_unmap_span_list(span); + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + heap_class_t* heap_class = heap->span_class + iclass; + if (heap_class->free_list || heap_class->partial_span || heap_class->full_span) { + --heap->finalize; + return; + } + } + //Heap is now completely free, unmap and remove from heap list + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + heap_t* list_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + if (list_heap == heap) { + atomic_store_ptr(&_memory_heaps[list_idx], heap->next_heap); + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + list_heap->next_heap = heap->next_heap; + } + + _memory_unmap_heap( heap ); +} + //! 
Insert a single span into thread heap cache, releasing to global cache if overflow static void _memory_heap_cache_insert(heap_t* heap, span_t* span) { + if (UNEXPECTED(heap->finalize != 0)) { + _memory_unmap_span(span); + _memory_heap_global_finalize(heap); + return; + } #if ENABLE_THREAD_CACHE size_t span_count = span->span_count; size_t idx = span_count - 1; @@ -1176,7 +1265,6 @@ _memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; span->block_size = size_class->block_size; span->block_count = size_class->block_count; - span->used_count = size_class->block_count; span->free_list = 0; span->list_size = 0; atomic_store_ptr_release(&span->free_list_deferred, 0); @@ -1186,29 +1274,31 @@ _memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized - if (span->free_list_limit < span->block_count) + if (span->free_list_limit < span->block_count) { _memory_span_double_link_list_add(&heap_class->partial_span, span); -#if RPMALLOC_FIRST_CLASS_HEAPS - else + span->used_count = span->free_list_limit; + } else { _memory_span_double_link_list_add(&heap_class->full_span, span); -#endif + span->used_count = span->block_count; + } return block; } static void _memory_span_extract_free_list_deferred(span_t* span) { - // Here we do not need any acquire semantics on the CAS operation since we are not - // interested in the list size, we simply reset it to zero with release semantics on store. + // We need acquire semantics on the CAS operation since we are interested in the list size // Refer to _memory_deallocate_defer_small_or_medium for further comments on this dependency do { span->free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr(&span->free_list_deferred, INVALID_POINTER, span->free_list)); + } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, span->free_list)); + span->used_count -= span->list_size; span->list_size = 0; atomic_store_ptr_release(&span->free_list_deferred, 0); } static int _memory_span_is_fully_utilized(span_t* span) { + assert(span->free_list_limit <= span->block_count); return !span->free_list && (span->free_list_limit >= span->block_count); } @@ -1241,7 +1331,8 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, span->block_count - span->free_list_limit, span->block_size); } - span->used_count = span->block_count; + assert(span->free_list_limit <= span->block_count); + span->used_count = span->free_list_limit; //Swap in deferred free list if present if (atomic_load_ptr(&span->free_list_deferred)) @@ -1253,9 +1344,7 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { //The span is fully utilized, unlink from partial list and add to fully utilized list _memory_span_double_link_list_pop_head(&heap_class->partial_span, span); -#if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap_class->full_span, span); -#endif return block; } @@ -1319,6 +1408,7 @@ _memory_allocate_large(heap_t* heap, size_t size) { #if RPMALLOC_FIRST_CLASS_HEAPS 
_memory_span_double_link_list_add(&heap->large_huge_span, span); #endif + ++heap->large_huge_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1346,6 +1436,7 @@ _memory_allocate_huge(heap_t* heap, size_t size) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif + ++heap->large_huge_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1473,6 +1564,7 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif + ++heap->large_huge_span_count; return ptr; } @@ -1482,8 +1574,7 @@ _memory_heap_initialize(heap_t* heap) { memset(heap, 0, sizeof(heap_t)); //Get a new heap ID - heap->id = atomic_incr32(&_memory_heap_id); - assert(heap->id != 0); + heap->id = 1 + atomic_incr32(&_memory_heap_id); //Link in heap in heap ID map heap_t* next_heap; @@ -1531,6 +1622,7 @@ _memory_allocate_heap_new(void) { if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); size_t num_heaps = block_size / aligned_heap_size; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); while (num_heaps > 1) { _memory_heap_initialize(extra_heap); @@ -1580,25 +1672,22 @@ _memory_allocate_heap(int first_class) { static void _memory_deallocate_direct_small_or_medium(span_t* span, void* block) { heap_t* heap = span->heap; - assert(span->heap == get_thread_heap_raw()); + assert(heap == get_thread_heap_raw() || heap->finalize); //Add block to free list if (UNEXPECTED(_memory_span_is_fully_utilized(span))) { span->used_count = span->block_count; heap_class_t* heap_class = &heap->span_class[span->size_class]; -#if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&heap_class->full_span, span); -#endif _memory_span_double_link_list_add(&heap_class->partial_span, span); } + --span->used_count; *((void**)block) = span->free_list; span->free_list = block; - uint32_t used = --span->used_count; - uint32_t free = span->list_size; - if (UNEXPECTED(used == free)) { + if (UNEXPECTED(span->used_count == span->list_size)) { heap_class_t* heap_class = &heap->span_class[span->size_class]; _memory_span_double_link_list_remove(&heap_class->partial_span, span); _memory_span_release_to_cache(heap, span); - } + } } static void @@ -1641,7 +1730,7 @@ _memory_deallocate_small_or_medium(span_t* span, void* p) { p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); } //Check if block belongs to this heap or if deallocation should be deferred - if (span->heap == get_thread_heap_raw()) + if ((span->heap == get_thread_heap_raw()) || span->heap->finalize) _memory_deallocate_direct_small_or_medium(span, p); else _memory_deallocate_defer_small_or_medium(span, p); @@ -1654,20 +1743,15 @@ _memory_deallocate_large(span_t* span) { assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); heap_t* heap = get_thread_heap_raw(); -#if RPMALLOC_FIRST_CLASS_HEAPS - //If using first class heaps and tracking spans in heap double linked list we must - //always defer if from another heap since we cannot touch the list of another heap - int defer = (heap != span->heap); -#else - //Otherwise defer if different heap and span count is 1 to avoide too many span transitions - int defer = ((heap != span->heap) && 
(span->span_count == 1)); -#endif + //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap + int defer = (heap != span->heap) && !span->heap->finalize; if (defer) { _memory_deallocate_defer_free_span(span->heap, span); return; } + --span->heap->large_huge_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap->large_huge_span, span); + _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS //Decrease counter @@ -1675,7 +1759,7 @@ _memory_deallocate_large(span_t* span) { atomic_decr32(&span->heap->span_use[idx].current); #endif span->heap = heap; - if ((span->span_count > 1) && !heap->spans_reserved) { + if ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved) { heap->span_reserve = span; heap->spans_reserved = span->span_count; if (span->flags & SPAN_FLAG_MASTER) { @@ -1696,15 +1780,14 @@ _memory_deallocate_large(span_t* span) { //! Deallocate the given huge span static void _memory_deallocate_huge(span_t* span) { -#if RPMALLOC_FIRST_CLASS_HEAPS - //If using first class heaps and tracking spans in heap double linked list we must - //always defer if from another heap since we cannot touch the list of another heap assert(span->heap); - if (span->heap != get_thread_heap_raw()) { + if ((span->heap != get_thread_heap_raw()) && !span->heap->finalize) { _memory_deallocate_defer_free_span(span->heap, span); return; } + --span->heap->large_huge_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif @@ -1872,7 +1955,7 @@ _memory_adjust_size_class(size_t iclass) { } static void -_memory_heap_finalize(void* heapptr, int first_class) { +_memory_heap_release(void* heapptr, int first_class) { heap_t* heap = (heap_t*)heapptr; if (!heap) return; @@ -1881,6 +1964,11 @@ _memory_heap_finalize(void* heapptr, int first_class) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span_t* span = heap->span_cache[iclass]; + heap->span_cache[iclass] = 0; + if (span && heap->finalize) { + _memory_unmap_span_list(span); + continue; + } #if ENABLE_GLOBAL_CACHE while (span) { assert(span->span_count == (iclass + 1)); @@ -1895,7 +1983,6 @@ _memory_heap_finalize(void* heapptr, int first_class) { if (span) _memory_unmap_span_list(span); #endif - heap->span_cache[iclass] = 0; } #endif @@ -1910,8 +1997,8 @@ _memory_heap_finalize(void* heapptr, int first_class) { } static void -_memory_heap_finalize_raw(void* heapptr) { - _memory_heap_finalize(heapptr, 0); +_memory_heap_release_raw(void* heapptr) { + _memory_heap_release(heapptr, 0); } #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) @@ -2095,28 +2182,13 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - if (pthread_key_create(&_memory_thread_heap, _memory_heap_finalize_raw)) + if (pthread_key_create(&_memory_thread_heap, _memory_heap_release_raw)) return -1; #endif #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) fls_key = FlsAlloc(&rp_thread_destructor); #endif - atomic_store32(&_memory_heap_id, 1); - atomic_store32(&_memory_orphan_counter, 0); -#if ENABLE_STATISTICS - atomic_store32(&_memory_active_heaps, 0); - atomic_store32(&_reserved_spans, 0); - atomic_store32(&_master_spans, 0); - atomic_store32(&_mapped_pages, 0); - _mapped_pages_peak = 0; - atomic_store32(&_mapped_total, 0); - atomic_store32(&_unmapped_total, 0); - atomic_store32(&_mapped_pages_os, 0); - atomic_store32(&_huge_pages_current, 0); - _huge_pages_peak = 0; -#endif - //Setup all small and medium size classes size_t iclass = 0; _memory_size_class[iclass].block_size = SMALL_GRANULARITY; @@ -2138,40 +2210,85 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); } - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) - atomic_store_ptr(&_memory_heaps[list_idx], 0); - //Initialize this thread rpmalloc_thread_initialize(); return 0; } static span_t* -_memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t* class_span, uint32_t class_free_blocks) { - (void)sizeof(heap); - (void)sizeof(iclass); - uint32_t free_blocks = span->list_size; +_memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { + heap_class_t* heap_class = heap->span_class + iclass; + span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); if (span == class_span) { - free_blocks += class_free_blocks; - class_span = 0; - class_free_blocks = 0; - } - uint32_t block_count = span->block_count; - if (span->free_list_limit < span->block_count) - block_count = span->free_list_limit; - void* block = span->free_list; - while (block) { - ++free_blocks; - block = *((void**)block); + // Adopt the heap class free list back into the span free list + void* block = span->free_list; + void* last_block = 0; + while (block) { + last_block = block; + block = *((void**)block); + } + uint32_t free_count = 0; + block = heap_class->free_list; + while (block) { + ++free_count; + block = *((void**)block); + } + if (last_block) { + *((void**)last_block) = heap_class->free_list; + } else { + span->free_list = heap_class->free_list; + } + heap_class->free_list = 0; + span->used_count -= free_count; } //If this assert triggers you have memory leaks - assert(free_blocks == block_count); - if (free_blocks == block_count) { + assert(span->list_size == span->used_count); + if (span->list_size == span->used_count) { _memory_statistics_dec(&heap->span_use[0].current); _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); + // This function only used for spans in double linked lists + if (list_head) + _memory_span_double_link_list_remove(list_head, span); + _memory_unmap_span(span); + } + return (span == class_span) ? 
0 : class_span; +} + +static void +_memory_heap_finalize(heap_t* heap) { + if (heap->spans_reserved) { + span_t* span = _memory_map_spans(heap, heap->spans_reserved); _memory_unmap_span(span); + heap->spans_reserved = 0; + } + + _memory_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + heap_class_t* heap_class = heap->span_class + iclass; + span_t* span = heap_class->partial_span; + while (span) { + span_t* next = span->next; + _memory_span_finalize(heap, iclass, span, &heap_class->partial_span); + span = next; + } + span = heap_class->full_span; + while (span) { + span_t* next = span->next; + _memory_span_finalize(heap, iclass, span, &heap_class->full_span); + span = next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) { + _memory_unmap_span_list(heap->span_cache[iclass]); + heap->span_cache[iclass] = 0; + } } - return class_span; +#endif + assert(!atomic_load_ptr(&heap->span_free_deferred)); } //! Finalize the allocator @@ -2180,76 +2297,13 @@ rpmalloc_finalize(void) { rpmalloc_thread_finalize(); //rpmalloc_dump_statistics(stderr); - //Free all thread caches - heap_t* master_heaps = 0; + //Free all thread caches and fully free spans for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); while (heap) { - if (heap->spans_reserved) { - span_t* span = _memory_map_spans(heap, heap->spans_reserved); - _memory_unmap_span(span); - } - - _memory_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - heap_class_t* heap_class = heap->span_class + iclass; - - span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); - uint32_t class_free_blocks = 0; - void* block = heap_class->free_list; - while (block) { - ++class_free_blocks; - block = *((void**)block); - } - - span_t* span = heap_class->partial_span; - while (span) { - span_t* next = span->next; - class_span = _memory_span_finalize(heap, iclass, span, class_span, class_free_blocks); - span = next; - } -#if RPMALLOC_FIRST_CLASS_HEAPS - span = heap_class->full_span; - while (span) { - span_t* next = span->next; - class_span = _memory_span_finalize(heap, iclass, span, class_span, class_free_blocks); - span = next; - } -#endif - if (class_span) - class_span = _memory_span_finalize(heap, iclass, class_span, class_span, class_free_blocks); - } - -#if RPMALLOC_FIRST_CLASS_HEAPS - span_t* span = heap->large_huge_span; - while (span) { - span_t* next = span->next; - if (span->size_class == SIZE_CLASS_HUGE) { - _memory_deallocate_huge(span); - } else { - _memory_statistics_dec(&heap->span_use[span->span_count - 1].current); - _memory_unmap_span(span); - } - span = next; - } -#endif - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) { - _memory_unmap_span_list(heap->span_cache[iclass]); - heap->span_cache[iclass] = 0; - } - } -#endif - assert(!atomic_load_ptr(&heap->span_free_deferred)); - heap_t* next_heap = heap->next_heap; - if (!heap->master_heap) { - heap->next_heap = master_heaps; - master_heaps = heap; - } + heap->finalize = 1; + _memory_heap_global_finalize(heap); heap = next_heap; } } @@ -2260,29 +2314,15 @@ rpmalloc_finalize(void) { _memory_cache_finalize(&_memory_span_cache[iclass]); #endif - atomic_store_ptr(&_memory_orphan_heaps, 0); -#if RPMALLOC_FIRST_CLASS_HEAPS - 
atomic_store_ptr(&_memory_first_class_orphan_heaps, 0); -#endif - #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD pthread_key_delete(_memory_thread_heap); #endif #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsFree(fls_key); + FlsFree(fls_key); + fls_key = 0; #endif - - //Finally free all master heaps pages - heap_t* master_heap = master_heaps; - while (master_heap) { - heap_t* next_heap = master_heap->next_heap; - size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - _memory_unmap(master_heap, block_size, master_heap->align_offset, block_size); - master_heap = next_heap; - } - #if ENABLE_STATISTICS - //If you hit these asserts you probably have memory leaks or double frees in your code + //If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code assert(!atomic_load32(&_mapped_pages)); assert(!atomic_load32(&_reserved_spans)); assert(!atomic_load32(&_mapped_pages_os)); @@ -2311,7 +2351,10 @@ void rpmalloc_thread_finalize(void) { heap_t* heap = get_thread_heap_raw(); if (heap) - _memory_heap_finalize_raw(heap); + _memory_heap_release_raw(heap); +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, 0); +#endif } int @@ -2734,7 +2777,7 @@ rpmalloc_heap_acquire(void) { extern inline void rpmalloc_heap_release(rpmalloc_heap_t* heap) { if (heap) - _memory_heap_finalize(heap, 1); + _memory_heap_release(heap, 1); } extern inline RPMALLOC_ALLOCATOR void* @@ -2851,6 +2894,7 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { span = next_span; } heap->large_huge_span = 0; + heap->large_huge_span_count = 0; #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { From 8b6e42b2e3bd5130b83d2c103f452a921261b5c6 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 1 Feb 2020 22:34:35 +0100 Subject: [PATCH 24/69] allow preconfigured huge page sizes while still set token privileges (#148) --- rpmalloc/rpmalloc.c | 62 +++++++++++++++++++++++++-------------------- 1 file changed, 34 insertions(+), 28 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 7ad25af8..d116ce8e 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -2031,7 +2031,6 @@ rpmalloc_initialize(void) { rpmalloc_thread_initialize(); return 0; } - memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); return rpmalloc_initialize_config(0); } @@ -2045,6 +2044,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { if (config) memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + else + memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); if (!_memory_config.memory_map || !_memory_config.memory_unmap) { _memory_config.memory_map = _memory_map_os; @@ -2065,35 +2066,10 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { GetSystemInfo(&system_info); _memory_page_size = system_info.dwPageSize; _memory_map_granularity = system_info.dwAllocationGranularity; - if (config && config->enable_huge_pages) { - HANDLE token = 0; - size_t large_page_minimum = GetLargePageMinimum(); - if (large_page_minimum) - OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (token) { - LUID luid; - if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { - TOKEN_PRIVILEGES token_privileges; - memset(&token_privileges, 0, sizeof(token_privileges)); - 
token_privileges.PrivilegeCount = 1; - token_privileges.Privileges[0].Luid = luid; - token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { - DWORD err = GetLastError(); - if (err == ERROR_SUCCESS) { - _memory_huge_pages = 1; - _memory_page_size = large_page_minimum; - _memory_map_granularity = large_page_minimum; - } - } - } - CloseHandle(token); - } - } #else _memory_page_size = (size_t)sysconf(_SC_PAGESIZE); _memory_map_granularity = _memory_page_size; - if (config && config->enable_huge_pages) { + if (_memory_config.enable_huge_pages) { #if defined(__linux__) size_t huge_page_size = 0; FILE* meminfo = fopen("/proc/meminfo", "r"); @@ -2128,10 +2104,40 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { } #endif } else { - if (config && config->enable_huge_pages) + if (_memory_config.enable_huge_pages) _memory_huge_pages = 1; } +#if PLATFORM_WINDOWS + if (_memory_config.enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + DWORD err = GetLastError(); + if (err == ERROR_SUCCESS) { + _memory_huge_pages = 1; + if (large_page_minimum > _memory_page_size) + _memory_page_size = large_page_minimum; + if (large_page_minimum > _memory_map_granularity) + _memory_map_granularity = large_page_minimum; + } + } + } + CloseHandle(token); + } + } +#endif + //The ABA counter in heap orphan list is tied to using HEAP_ORPHAN_ABA_SIZE size_t min_span_size = HEAP_ORPHAN_ABA_SIZE; size_t max_page_size; From 00f21ead1bfaadd2980b9d265c9ffa9dcf646b57 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 3 Feb 2020 12:11:44 +0100 Subject: [PATCH 25/69] faster heap ownership tests, avoid full span lists by default (#149) --- rpmalloc/rpmalloc.c | 110 +++++++++++++++++++++++++++++++++----------- 1 file changed, 84 insertions(+), 26 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index d116ce8e..a80dcb84 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -389,12 +389,16 @@ struct heap_class_t { //! Double linked list of partially used spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. span_t* partial_span; +#if RPMALLOC_FIRST_CLASS_HEAPS //! Double linked list of fully utilized spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. span_t* full_span; +#endif }; struct heap_t { + //! Owning thread ID + uintptr_t owner_thread; //! Partial span data per size class heap_class_t span_class[SIZE_CLASS_COUNT]; #if ENABLE_THREAD_CACHE @@ -412,7 +416,7 @@ struct heap_t { span_t* large_huge_span; #endif //! Number of full spans - size_t large_huge_span_count; + size_t full_span_count; //! Mapped but unused spans span_t* span_reserve; //! Master span for mapped but unused spans @@ -576,6 +580,32 @@ get_thread_heap(void) { #endif } +//! 
Fast thread ID +static inline uintptr_t +get_thread_id(void) { +#if defined(_WIN32) + return (uintptr_t)NtCurrentTeb(); +#elif defined(__GNUC__) || defined(__clang__) + uintptr_t tid; +# if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); +# elif defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); +# elif defined(__x86_64__) + __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); +# elif defined(__arm__) + asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); +# elif defined(__aarch64__) + asm volatile ("mrs %0, tpidr_el0" : "=r" (tid)); +# else + tid = (uintptr_t)get_thread_heap_raw(); +# endif + return tid; +#else + return (uintptr_t)get_thread_heap_raw(); +#endif +} + //! Set the current thread heap static void set_thread_heap(heap_t* heap) { @@ -584,6 +614,8 @@ set_thread_heap(heap_t* heap) { #else _memory_thread_heap = heap; #endif + if (heap) + heap->owner_thread = get_thread_id(); } //! Default implementation to map more virtual memory @@ -980,9 +1012,13 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { while (span) { span_t* next_span = (span_t*)span->free_list; assert(span->heap == heap); + assert(heap->full_span_count); + --heap->full_span_count; if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { +#if RPMALLOC_FIRST_CLASS_HEAPS heap_class_t* heap_class = heap->span_class + span->size_class; _memory_span_double_link_list_remove(&heap_class->full_span, span); +#endif if (single_span && !*single_span) { *single_span = span; } else { @@ -994,7 +1030,6 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&heap->large_huge_span, span); #endif - --heap->large_huge_span_count; if (span->size_class == SIZE_CLASS_HUGE) { _memory_deallocate_huge(span); } else { @@ -1058,10 +1093,6 @@ _memory_heap_global_finalize(heap_t* heap) { } _memory_heap_finalize(heap); - if (heap->large_huge_span_count) { - --heap->finalize; - return; - } for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span_t* span = heap->span_cache[iclass]; @@ -1069,9 +1100,15 @@ _memory_heap_global_finalize(heap_t* heap) { if (span) _memory_unmap_span_list(span); } + + if (heap->full_span_count) { + --heap->finalize; + return; + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { heap_class_t* heap_class = heap->span_class + iclass; - if (heap_class->free_list || heap_class->partial_span || heap_class->full_span) { + if (heap_class->free_list || heap_class->partial_span) { --heap->finalize; return; } @@ -1278,7 +1315,10 @@ _memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span _memory_span_double_link_list_add(&heap_class->partial_span, span); span->used_count = span->free_list_limit; } else { +#if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap_class->full_span, span); +#endif + ++heap->full_span_count; span->used_count = span->block_count; } return block; @@ -1344,7 +1384,10 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { //The span is fully utilized, unlink from partial list and add to fully utilized list _memory_span_double_link_list_pop_head(&heap_class->partial_span, span); +#if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap_class->full_span, span); +#endif + ++heap->full_span_count; return block; } @@ -1408,7 +1451,7 @@ _memory_allocate_large(heap_t* heap, size_t size) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap->large_huge_span, 
span); #endif - ++heap->large_huge_span_count; + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1436,7 +1479,7 @@ _memory_allocate_huge(heap_t* heap, size_t size) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif - ++heap->large_huge_span_count; + ++heap->full_span_count; return pointer_offset(span, SPAN_HEADER_SIZE); } @@ -1564,7 +1607,7 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_add(&heap->large_huge_span, span); #endif - ++heap->large_huge_span_count; + ++heap->full_span_count; return ptr; } @@ -1590,6 +1633,7 @@ _memory_heap_orphan(heap_t* heap, int first_class) { void* raw_heap; uintptr_t orphan_counter; heap_t* last_heap; + heap->owner_thread = (uintptr_t)-1; #if RPMALLOC_FIRST_CLASS_HEAPS atomicptr_t* heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); #else @@ -1672,13 +1716,16 @@ _memory_allocate_heap(int first_class) { static void _memory_deallocate_direct_small_or_medium(span_t* span, void* block) { heap_t* heap = span->heap; - assert(heap == get_thread_heap_raw() || heap->finalize); + assert(heap->owner_thread == get_thread_id() || heap->finalize); //Add block to free list if (UNEXPECTED(_memory_span_is_fully_utilized(span))) { span->used_count = span->block_count; heap_class_t* heap_class = &heap->span_class[span->size_class]; +#if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&heap_class->full_span, span); +#endif _memory_span_double_link_list_add(&heap_class->partial_span, span); + --heap->full_span_count; } --span->used_count; *((void**)block) = span->free_list; @@ -1730,7 +1777,7 @@ _memory_deallocate_small_or_medium(span_t* span, void* p) { p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); } //Check if block belongs to this heap or if deallocation should be deferred - if ((span->heap == get_thread_heap_raw()) || span->heap->finalize) + if ((span->heap->owner_thread == get_thread_id()) || span->heap->finalize) _memory_deallocate_direct_small_or_medium(span, p); else _memory_deallocate_defer_small_or_medium(span, p); @@ -1742,14 +1789,14 @@ _memory_deallocate_large(span_t* span) { assert(span->size_class == SIZE_CLASS_LARGE); assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - heap_t* heap = get_thread_heap_raw(); //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap - int defer = (heap != span->heap) && !span->heap->finalize; + int defer = (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize; if (defer) { _memory_deallocate_defer_free_span(span->heap, span); return; } - --span->heap->large_huge_span_count; + assert(span->heap->full_span_count); + --span->heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif @@ -1758,6 +1805,8 @@ _memory_deallocate_large(span_t* span) { size_t idx = span->span_count - 1; atomic_decr32(&span->heap->span_use[idx].current); #endif + heap_t* heap = get_thread_heap(); + assert(heap); span->heap = heap; if ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved) { heap->span_reserve = span; @@ -1781,12 +1830,12 @@ _memory_deallocate_large(span_t* span) { static void _memory_deallocate_huge(span_t* span) { 
assert(span->heap); - if ((span->heap != get_thread_heap_raw()) && !span->heap->finalize) { + if ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize) { _memory_deallocate_defer_free_span(span->heap, span); return; } - - --span->heap->large_huge_span_count; + assert(span->heap->full_span_count); + --span->heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif @@ -2221,7 +2270,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { return 0; } -static span_t* +static int _memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { heap_class_t* heap_class = heap->span_class + iclass; span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); @@ -2256,8 +2305,9 @@ _memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_h if (list_head) _memory_span_double_link_list_remove(list_head, span); _memory_unmap_span(span); + return 1; } - return (span == class_span) ? 0 : class_span; + return 0; } static void @@ -2278,11 +2328,19 @@ _memory_heap_finalize(heap_t* heap) { _memory_span_finalize(heap, iclass, span, &heap_class->partial_span); span = next; } - span = heap_class->full_span; - while (span) { - span_t* next = span->next; - _memory_span_finalize(heap, iclass, span, &heap_class->full_span); - span = next; + // If class still has a free list it must be a full span + if (heap_class->free_list) { + span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); + span_t** list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap_class->full_span; +#endif + --heap->full_span_count; + if (!_memory_span_finalize(heap, iclass, class_span, list)) { + if (list) + _memory_span_double_link_list_remove(list, span); + _memory_span_double_link_list_add(&heap_class->partial_span, span); + } } } @@ -2900,7 +2958,7 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { span = next_span; } heap->large_huge_span = 0; - heap->large_huge_span_count = 0; + heap->full_span_count = 0; #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { From 7b6f0b1b8f5f996326c9e4236a23462adb42120d Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 3 Feb 2020 20:22:05 +0100 Subject: [PATCH 26/69] clang compatibility fix --- rpmalloc/rpmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index a80dcb84..a555ab0e 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1052,7 +1052,8 @@ _memory_heap_global_finalize(heap_t* heap); static void _memory_unlink_orphan_heap(atomicptr_t* list, heap_t* heap) { - heap_t* orphan = (heap_t*)((uintptr_t)atomic_load_ptr(list) & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + void* raworphan = atomic_load_ptr(list); + heap_t* orphan = (heap_t*)((uintptr_t)raworphan & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); if (orphan == heap) { //We're now in single-threaded finalization phase, no need to ABA protect or CAS atomic_store_ptr(list, heap->next_orphan); From c5416a871fa2fad05a65a786dfef2a44f36ef8c4 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 3 Feb 2020 21:23:49 +0100 Subject: [PATCH 27/69] fixed finalization with remaining partial allocated spans --- rpmalloc/rpmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index a555ab0e..8558d182 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -2339,8 +2339,8 
@@ _memory_heap_finalize(heap_t* heap) { --heap->full_span_count; if (!_memory_span_finalize(heap, iclass, class_span, list)) { if (list) - _memory_span_double_link_list_remove(list, span); - _memory_span_double_link_list_add(&heap_class->partial_span, span); + _memory_span_double_link_list_remove(list, class_span); + _memory_span_double_link_list_add(&heap_class->partial_span, class_span); } } } From f17c52a97cc5e52b1b4ba0ac95ebe7be9f87e312 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 5 Feb 2020 08:50:50 +0100 Subject: [PATCH 28/69] bugfix for deferred free of huge spans decrementing span count twice --- rpmalloc/rpmalloc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 8558d182..c951efdd 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1012,9 +1012,9 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { while (span) { span_t* next_span = (span_t*)span->free_list; assert(span->heap == heap); - assert(heap->full_span_count); - --heap->full_span_count; if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + assert(heap->full_span_count); + --heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS heap_class_t* heap_class = heap->span_class + span->size_class; _memory_span_double_link_list_remove(&heap_class->full_span, span); @@ -1027,13 +1027,15 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { _memory_heap_cache_insert(heap, span); } } else { -#if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap->large_huge_span, span); -#endif if (span->size_class == SIZE_CLASS_HUGE) { _memory_deallocate_huge(span); } else { assert(span->size_class == SIZE_CLASS_LARGE); + assert(heap->full_span_count); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_span_double_link_list_remove(&heap->large_huge_span, span); +#endif uint32_t idx = span->span_count - 1; if (!idx && single_span && !*single_span) { *single_span = span; From 01cdb956be7a000ee402ff33766774f034010643 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sun, 15 Mar 2020 11:24:20 +0000 Subject: [PATCH 29/69] Build fix on ARM for the process id register assembly. 
(#155) --- rpmalloc/rpmalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index c951efdd..f6231afd 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -594,9 +594,9 @@ get_thread_id(void) { # elif defined(__x86_64__) __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); # elif defined(__arm__) - asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid)); # elif defined(__aarch64__) - asm volatile ("mrs %0, tpidr_el0" : "=r" (tid)); + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tid)); # else tid = (uintptr_t)get_thread_heap_raw(); # endif From 29d7959bb2164cf445dd54c864d6154e6ae5ee00 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 15 Mar 2020 13:53:39 +0100 Subject: [PATCH 30/69] reduce tests on x86 arch (#156) --- rpmalloc/malloc.c | 4 ++-- rpmalloc/rpmalloc.c | 10 +++++----- test/main.c | 17 ++++++++++++++++- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index e5e2f010..e9b08c01 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -94,8 +94,8 @@ extern void* _Znamm(uint64_t size, uint64_t align); void* _Znamm(uint64_t size, // 32-bit operators new and new[], normal and aligned extern void* _Znwj(uint32_t size); void* _Znwj(uint32_t size) { return rpmalloc(size); } extern void* _Znaj(uint32_t size); void* _Znaj(uint32_t size) { return rpmalloc(size); } -extern void* _Znwjj(uint64_t size, uint64_t align); void* _Znwjj(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } -extern void* _Znajj(uint64_t size, uint64_t align); void* _Znajj(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern void* _Znwjj(uint32_t size, uint32_t align); void* _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } +extern void* _Znajj(uint32_t size, uint32_t align); void* _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } #endif #endif diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index f6231afd..119fe230 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -173,17 +173,17 @@ typedef volatile void* atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } -static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)_InterlockedIncrement(val); } -static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)_InterlockedDecrement(val); } +static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)InterlockedIncrement(val); } +static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)InterlockedDecrement(val); } #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } -static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)_InterlockedExchangeAdd64(val, add) + add; } +static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; } #endif -static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)_InterlockedExchangeAdd(val, add) + add; } +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { 
return (void*)*src; } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; } -static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (_InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } +static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_cas_ptr(dst, val, ref); } #define EXPECTED(x) (x) diff --git a/test/main.c b/test/main.c index c6338c8e..5b553a62 100644 --- a/test/main.c +++ b/test/main.c @@ -495,6 +495,8 @@ allocator_thread(void* argp) { thread_exit((uintptr_t)ret); } +#if RPMALLOC_FIRST_CLASS_HEAPS + static void heap_allocator_thread(void* argp) { allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp; @@ -572,6 +574,8 @@ heap_allocator_thread(void* argp) { thread_exit((uintptr_t)ret); } +#endif + static void crossallocator_thread(void* argp) { allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp; @@ -819,8 +823,13 @@ test_crossthread(void) { for (unsigned int ithread = 0; ithread < num_alloc_threads; ++ithread) { unsigned int iadd = (ithread * (16 + ithread) + ithread) % 128; +#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) arg[ithread].loops = 50; arg[ithread].passes = 1024; +#else + arg[ithread].loops = 20; + arg[ithread].passes = 256; +#endif arg[ithread].pointers = rpmalloc(sizeof(void*) * arg[ithread].loops * arg[ithread].passes); memset(arg[ithread].pointers, 0, sizeof(void*) * arg[ithread].loops * arg[ithread].passes); arg[ithread].datasize[0] = 19 + iadd; @@ -935,6 +944,7 @@ test_threadspam(void) { static int test_first_class_heaps(void) { +#if RPMALLOC_FIRST_CLASS_HEAPS uintptr_t thread[32]; uintptr_t threadres[32]; unsigned int i; @@ -967,8 +977,13 @@ test_first_class_heaps(void) { arg[i].datasize[14] = 38934; arg[i].datasize[15] = 234; arg[i].num_datasize = 16; +#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) arg[i].loops = 100; arg[i].passes = 4000; +#else + arg[i].loops = 50; + arg[i].passes = 1000; +#endif arg[i].init_fini_each_loop = 1; thread_arg targ; @@ -993,7 +1008,7 @@ test_first_class_heaps(void) { } printf("Heap threaded tests passed\n"); - +#endif return 0; } From 886245b464e70f3339deaa8dfbbe1d9e383b3ad3 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sun, 15 Mar 2020 18:20:16 +0100 Subject: [PATCH 31/69] additional x86 test reduction --- test/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/main.c b/test/main.c index 5b553a62..a6f54b17 100644 --- a/test/main.c +++ b/test/main.c @@ -827,8 +827,8 @@ test_crossthread(void) { arg[ithread].loops = 50; arg[ithread].passes = 1024; #else - arg[ithread].loops = 20; - arg[ithread].passes = 256; + arg[ithread].loops = 10; + arg[ithread].passes = 128; #endif arg[ithread].pointers = rpmalloc(sizeof(void*) * arg[ithread].loops * arg[ithread].passes); memset(arg[ithread].pointers, 0, sizeof(void*) * arg[ithread].loops * arg[ithread].passes); From 2b519192ff726fc90e55a8eaa364b0209e85c964 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 24 Mar 2020 18:12:37 +0100 Subject: [PATCH 32/69] Refactor for readability (#162) - Rename internal methods to have rpmalloc prefix to make callstacks easier to read - Improve grouping of functions 
based on context --- README.md | 6 + rpmalloc/rpmalloc.c | 1808 +++++++++++++++++++++++-------------------- test/main.c | 9 + 3 files changed, 965 insertions(+), 858 deletions(-) diff --git a/README.md b/README.md index 5543ddbc..dd1ed8e7 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,12 @@ The benchmark producing these numbers were run on an Ubuntu 16.10 machine with 8 Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines. +# Required functions + +Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling any other rpmalloc entry point. + +Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload depending on your use case. + # Using The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compile them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks to process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads: diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 119fe230..f1f09d20 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -11,7 +11,12 @@ #include "rpmalloc.h" +//////////// +/// /// Build time configurable limits +/// +////// + #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 47 @@ -150,6 +155,29 @@ #include #include +#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#include +static DWORD fls_key; +static void NTAPI +_rpmalloc_thread_destructor(void* value) { + if (value) + rpmalloc_thread_finalize(); +} +#endif + +#if PLATFORM_POSIX +# include +# include +# ifdef __FreeBSD__ +# include +# define MAP_HUGETLB MAP_ALIGNED_SUPER +# endif +# ifndef MAP_UNINITIALIZED +# define MAP_UNINITIALIZED 0 +# endif +#endif +#include + #if ENABLE_ASSERTS # undef NDEBUG # if defined(_MSC_VER) && !defined(_DEBUG) @@ -164,7 +192,12 @@ # include #endif -/// Atomic access abstraction +////// +/// +/// Atomic access abstraction (since MSVC does not do C11 yet) +/// +////// + #if defined(_MSC_VER) && !defined(__clang__) typedef volatile long atomic32_t; @@ -217,7 +250,44 @@ static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, v #endif +//////////// +/// +/// Statistics related functions (evaluate to nothing when statistics not enabled) +/// +////// + +#if ENABLE_STATISTICS +# define _rpmalloc_stat_inc(counter) atomic_incr32(counter) +# define _rpmalloc_stat_dec(counter) atomic_decr32(counter) +# define _rpmalloc_stat_add(counter, value) atomic_add32(counter, (int32_t)(value)) +# define _rpmalloc_stat_add64(counter, value) atomic_add64(counter, (int64_t)(value)) +# define _rpmalloc_stat_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) +# define _rpmalloc_stat_sub(counter, value) atomic_add32(counter, -(int32_t)(value)) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do { \ + int32_t alloc_current = 
atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ + if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ + heap->size_class_use[class_idx].alloc_peak = alloc_current; \ + atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ +} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do { \ + atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ + atomic_incr32(&heap->size_class_use[class_idx].free_total); \ +} while(0) +#else +# define _rpmalloc_stat_inc(counter) do {} while(0) +# define _rpmalloc_stat_dec(counter) do {} while(0) +# define _rpmalloc_stat_add(counter, value) do {} while(0) +# define _rpmalloc_stat_add64(counter, value) do {} while(0) +# define _rpmalloc_stat_add_peak(counter, value, peak) do {} while (0) +# define _rpmalloc_stat_sub(counter, value) do {} while(0) +# define _rpmalloc_stat_inc_alloc(heap, class_idx) do {} while(0) +# define _rpmalloc_stat_inc_free(heap, class_idx) do {} while(0) +#endif + +/// /// Preconfigured limits and sizes +/// + //! Granularity of a small allocation block (must be power of two) #define SMALL_GRANULARITY 16 //! Small granularity shift count @@ -262,11 +332,14 @@ _Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header si #define SIZE_CLASS_LARGE SIZE_CLASS_COUNT #define SIZE_CLASS_HUGE ((uint32_t)-1) +//////////// +/// /// Data types +/// +////// + //! A memory heap, per thread typedef struct heap_t heap_t; -//! Heap spans per size class -typedef struct heap_class_t heap_class_t; //! Span of memory pages typedef struct span_t span_t; //! Span list @@ -337,14 +410,14 @@ struct size_class_use_t { typedef struct size_class_use_t size_class_use_t; #endif -//A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, -//or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single -//span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first -//(super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans -//that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire -//superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released -//in the same call to release the virtual memory range, but individual subranges can be decommitted individually -//to reduce physical memory use). +// A span can either represent a single span of memory pages with size declared by span_map_count configuration variable, +// or a set of spans in a continuous region, a super span. Any reference to the term "span" usually refers to both a single +// span or a super span. A super span can further be divided into multiple spans (or this, super spans), where the first +// (super)span is the master and subsequent (super)spans are subspans. The master span keeps track of how many subspans +// that are still alive and mapped in virtual memory, and once all subspans and master have been unmapped the entire +// superspan region is released and unmapped (on Windows for example, the entire superspan range has to be released +// in the same call to release the virtual memory range, but individual subranges can be decommitted individually +// to reduce physical memory use). struct span_t { //! 
Free list void* free_list; @@ -383,24 +456,20 @@ struct span_t { }; _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); -struct heap_class_t { +// Control structure for a heap, either a thread heap or a first class heap if enabled +struct heap_t { + //! Owning thread ID + uintptr_t owner_thread; //! Free list of active span - void* free_list; + void* free_list[SIZE_CLASS_COUNT]; //! Double linked list of partially used spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. - span_t* partial_span; + span_t* partial_span[SIZE_CLASS_COUNT]; #if RPMALLOC_FIRST_CLASS_HEAPS //! Double linked list of fully utilized spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. - span_t* full_span; + span_t* full_span[SIZE_CLASS_COUNT]; #endif -}; - -struct heap_t { - //! Owning thread ID - uintptr_t owner_thread; - //! Partial span data per size class - heap_class_t span_class[SIZE_CLASS_COUNT]; #if ENABLE_THREAD_CACHE //! List of free spans (single linked list) span_t* span_cache[LARGE_CLASS_COUNT]; @@ -447,6 +516,7 @@ struct heap_t { #endif }; +// Size class for defining a block size bucket struct size_class_t { //! Size of blocks in this class uint32_t block_size; @@ -466,7 +536,12 @@ struct global_cache_t { atomic32_t counter; }; +//////////// +/// /// Global data +/// +////// + //! Initialized flag static int _rpmalloc_initialized; //! Configuration @@ -541,6 +616,12 @@ static atomic32_t _huge_pages_current; static int32_t _huge_pages_peak; #endif +//////////// +/// +/// Thread local heap and ID +/// +////// + //! Current thread heap #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD static pthread_key_t _memory_thread_heap; @@ -618,209 +699,143 @@ set_thread_heap(heap_t* heap) { heap->owner_thread = get_thread_id(); } -//! Default implementation to map more virtual memory -static void* -_memory_map_os(size_t size, size_t* offset); - -//! 
Default implementation to unmap virtual memory -static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release); - -#if ENABLE_STATISTICS -# define _memory_statistics_inc(counter) atomic_incr32(counter) -# define _memory_statistics_dec(counter) atomic_decr32(counter) -# define _memory_statistics_add(counter, value) atomic_add32(counter, (int32_t)(value)) -# define _memory_statistics_add64(counter, value) atomic_add64(counter, (int64_t)(value)) -# define _memory_statistics_add_peak(counter, value, peak) do { int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); if (_cur_count > (peak)) peak = _cur_count; } while (0) -# define _memory_statistics_sub(counter, value) atomic_add32(counter, -(int32_t)(value)) -# define _memory_statistics_inc_alloc(heap, class_idx) do { \ - int32_t alloc_current = atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ -} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ -} while(0) -#else -# define _memory_statistics_inc(counter) do {} while(0) -# define _memory_statistics_dec(counter) do {} while(0) -# define _memory_statistics_add(counter, value) do {} while(0) -# define _memory_statistics_add64(counter, value) do {} while(0) -# define _memory_statistics_add_peak(counter, value, peak) do {} while (0) -# define _memory_statistics_sub(counter, value) do {} while(0) -# define _memory_statistics_inc_alloc(heap, class_idx) do {} while(0) -# define _memory_statistics_inc_free(heap, class_idx) do {} while(0) -#endif - -static void -_memory_heap_cache_insert(heap_t* heap, span_t* span); - -static void -_memory_global_cache_insert(span_t* span); - -static void -_memory_heap_finalize(heap_t* heap); +//////////// +/// +/// Low level memory map/unmap +/// +////// //! Map more virtual memory +// size is number of bytes to map +// offset receives the offset in bytes from start of mapped region +// returns address to start of mapped region to use static void* -_memory_map(size_t size, size_t* offset) { +_rpmalloc_mmap(size_t size, size_t* offset) { assert(!(size % _memory_page_size)); assert(size >= _memory_page_size); - _memory_statistics_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); - _memory_statistics_add(&_mapped_total, (size >> _memory_page_size_shift)); + _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), _mapped_pages_peak); + _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); return _memory_config.memory_map(size, offset); } //! 
Unmap virtual memory +// address is the memory address to unmap, as returned from _memory_map +// size is the number of bytes to unmap, which might be less than full region for a partial unmap +// offset is the offset in bytes to the actual mapped region, as set by _memory_map +// release is set to 0 for partial unmap, or size of entire range for a full unmap static void -_memory_unmap(void* address, size_t size, size_t offset, size_t release) { +_rpmalloc_unmap(void* address, size_t size, size_t offset, size_t release) { assert(!release || (release >= size)); assert(!release || (release >= _memory_page_size)); if (release) { assert(!(release % _memory_page_size)); - _memory_statistics_sub(&_mapped_pages, (release >> _memory_page_size_shift)); - _memory_statistics_add(&_unmapped_total, (release >> _memory_page_size_shift)); + _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); } _memory_config.memory_unmap(address, size, offset, release); } -//! Declare the span to be a subspan and store distance from master span and span count -static void -_memory_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { - assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); - if (subspan != master) { - subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); - subspan->align_offset = 0; +//! Default implementation to map new pages to virtual memory +static void* +_rpmalloc_mmap_os(size_t size, size_t* offset) { + //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; + assert(size >= _memory_page_size); +#if PLATFORM_WINDOWS + //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" + void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); + if (!ptr) { + assert(!"Failed to map virtual memory block"); + return 0; } - subspan->span_count = (uint32_t)span_count; -} - -//! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller) -static span_t* -_memory_map_from_reserve(heap_t* heap, size_t span_count) { - //Update the heap span reserve - span_t* span = heap->span_reserve; - heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= span_count; - - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); - if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(&heap->span_use[span_count - 1].spans_from_reserved); - - return span; -} - -//! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size -static size_t -_memory_map_align_span_count(size_t span_count) { - size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count; - if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) - request_count += _memory_span_map_count - (request_count % _memory_span_map_count); - return request_count; -} - -//! 
Store the given spans as reserve in the given heap -static void -_memory_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = reserve_span_count; -} - -//! Setup a newly mapped span -static void -_memory_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { - span->total_spans = (uint32_t)total_span_count; - span->span_count = (uint32_t)span_count; - span->align_offset = (uint32_t)align_offset; - span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); -} - -//! Map an aligned set of spans, taking configured mapping granularity and the page size into account -static span_t* -_memory_map_aligned_span_count(heap_t* heap, size_t span_count) { - //If we already have some, but not enough, reserved spans, release those to heap cache and map a new - //full set of spans. Otherwise we would waste memory if page size > span size (huge pages) - size_t aligned_span_count = _memory_map_align_span_count(span_count); - size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(aligned_span_count * _memory_span_size, &align_offset); - if (!span) +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +# if defined(__APPLE__) + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +# elif defined(MAP_HUGETLB) + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# else + void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +# endif + if ((ptr == MAP_FAILED) || !ptr) { + assert("Failed to map virtual memory block" == 0); return 0; - _memory_span_initialize(span, aligned_span_count, span_count, align_offset); - _memory_statistics_add(&_reserved_spans, aligned_span_count); - _memory_statistics_inc(&_master_spans); - if (span_count <= LARGE_CLASS_COUNT) - _memory_statistics_inc(&heap->span_use[span_count - 1].spans_map_calls); - if (aligned_span_count > span_count) { - span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); - size_t reserved_count = aligned_span_count - span_count; - if (heap->spans_reserved) { - _memory_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); - _memory_heap_cache_insert(heap, heap->span_reserve); - } - _memory_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); } - return span; -} - -//! Map in memory pages for the given number of spans (or use previously reserved pages) -static span_t* -_memory_map_spans(heap_t* heap, size_t span_count) { - if (span_count <= heap->spans_reserved) - return _memory_map_from_reserve(heap, span_count); - return _memory_map_aligned_span_count(heap, span_count); +#endif + _rpmalloc_stat_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + assert(final_padding <= _memory_span_size); + assert(final_padding <= padding); + assert(!(final_padding % 8)); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); + return ptr; } -//! 
Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +//! Default implementation to unmap pages from virtual memory static void -_memory_unmap_span(span_t* span) { - assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); - - int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size))); - assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); - assert(master->flags & SPAN_FLAG_MASTER); - - size_t span_count = span->span_count; - if (!is_master) { - //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) - assert(span->align_offset == 0); - if (_memory_span_size >= _memory_page_size) { - _memory_unmap(span, span_count * _memory_span_size, 0, 0); - _memory_statistics_sub(&_reserved_spans, span_count); +_rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { + assert(release || (offset == 0)); + assert(!release || (release >= _memory_page_size)); + assert(size >= _memory_page_size); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); +#if PLATFORM_POSIX + //Padding is always one span size + release += _memory_span_size; +#endif + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { + assert(!"Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + assert("Failed to unmap virtual memory block" == 0); } - } else { - //Special double flag to denote an unmapped master - //It must be kept in memory since span header must be used - span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; } - - if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { - //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span - assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); - size_t unmap_count = master->span_count; - if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans; - _memory_statistics_sub(&_reserved_spans, unmap_count); - _memory_statistics_sub(&_master_spans, 1); - _memory_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size); + else { +#if defined(POSIX_MADV_FREE) + if (posix_madvise(address, size, POSIX_MADV_FREE)) +#endif + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { + assert("Failed to madvise virtual memory block as free" == 0); + } } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); } + +//////////// +/// +/// Span linked list management +/// +////// + #if ENABLE_THREAD_CACHE +static void +_rpmalloc_span_unmap(span_t* span); + //! Unmap a single linked list of spans static void -_memory_unmap_span_list(span_t* span) { +_rpmalloc_span_list_unmap_all(span_t* span) { size_t list_size = span->list_size; for (size_t ispan = 0; ispan < list_size; ++ispan) { span_t* next_span = span->next; - _memory_unmap_span(span); + _rpmalloc_span_unmap(span); span = next_span; } assert(!span); @@ -828,7 +843,7 @@ _memory_unmap_span_list(span_t* span) { //! 
Add span to head of single linked span list static size_t -_memory_span_list_push(span_t** head, span_t* span) { +_rpmalloc_span_list_push(span_t** head, span_t* span) { span->next = *head; if (*head) span->list_size = (*head)->list_size + 1; @@ -840,7 +855,7 @@ _memory_span_list_push(span_t** head, span_t* span) { //! Remove span from head of single linked span list, returns the new list head static span_t* -_memory_span_list_pop(span_t** head) { +_rpmalloc_span_list_pop(span_t** head) { span_t* span = *head; span_t* next_span = 0; if (span->list_size > 1) { @@ -855,7 +870,7 @@ _memory_span_list_pop(span_t** head) { //! Split a single linked span list static span_t* -_memory_span_list_split(span_t* span, size_t limit) { +_rpmalloc_span_list_split(span_t* span, size_t limit) { span_t* next = 0; if (limit < 2) limit = 2; @@ -881,7 +896,7 @@ _memory_span_list_split(span_t* span, size_t limit) { //! Add a span to double linked list at the head static void -_memory_span_double_link_list_add(span_t** head, span_t* span) { +_rpmalloc_span_double_link_list_add(span_t** head, span_t* span) { if (*head) { span->next = *head; (*head)->prev = span; @@ -893,7 +908,7 @@ _memory_span_double_link_list_add(span_t** head, span_t* span) { //! Pop head span from double linked list static void -_memory_span_double_link_list_pop_head(span_t** head, span_t* span) { +_rpmalloc_span_double_link_list_pop_head(span_t** head, span_t* span) { assert(*head == span); span = *head; *head = span->next; @@ -901,7 +916,7 @@ _memory_span_double_link_list_pop_head(span_t** head, span_t* span) { //! Remove a span from double linked list static void -_memory_span_double_link_list_remove(span_t** head, span_t* span) { +_rpmalloc_span_double_link_list_remove(span_t** head, span_t* span) { assert(*head); if (*head == span) { *head = span->next; @@ -915,39 +930,315 @@ _memory_span_double_link_list_remove(span_t** head, span_t* span) { } } -#if ENABLE_GLOBAL_CACHE -//! Insert the given list of memory page spans in the global cache +//////////// +/// +/// Span control +/// +////// + static void -_memory_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { - assert((span->list_size == 1) || (span->next != 0)); - int32_t list_size = (int32_t)span->list_size; - //Unmap if cache has reached the limit. Does not need stronger synchronization, the worst - //case is that the span list is unmapped when it could have been cached (no real dependency - //between the two variables) - if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { -#if !ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_unmap_span_list(span); - atomic_add32(&cache->size, -list_size); - return; -#endif - } - void* current_cache, *new_cache; - do { - current_cache = atomic_load_ptr(&cache->cache); - span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); -} +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span); -//! 
Extract a number of memory page spans from the global cache -static span_t* -_memory_cache_extract(global_cache_t* cache) { - uintptr_t span_ptr; - do { - void* global_span = atomic_load_ptr(&cache->cache); - span_ptr = (uintptr_t)global_span & _memory_span_mask; - if (span_ptr) { +static void +_rpmalloc_heap_finalize(heap_t* heap); + +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count); + +//! Declare the span to be a subspan and store distance from master span and span count +static void +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count) { + assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER)); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->offset_from_master = (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! Use reserved spans to fulfill a memory map request (reserve size must be checked by caller) +static span_t* +_rpmalloc_span_map_from_reserve(heap_t* heap, size_t span_count) { + //Update the heap span reserve + span_t* span = heap->span_reserve; + heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= span_count; + + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); + if (span_count <= LARGE_CLASS_COUNT) + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); + + return span; +} + +//! Get the aligned number of spans to map in based on wanted count, configured mapping granularity and the page size +static size_t +_rpmalloc_span_align_count(size_t span_count) { + size_t request_count = (span_count > _memory_span_map_count) ? span_count : _memory_span_map_count; + if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) + request_count += _memory_span_map_count - (request_count % _memory_span_map_count); + return request_count; +} + +//! Setup a newly mapped span +static void +_rpmalloc_span_initialize(span_t* span, size_t total_span_count, size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; + span->span_count = (uint32_t)span_count; + span->align_offset = (uint32_t)align_offset; + span->flags = SPAN_FLAG_MASTER; + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); +} + +//! Map an aligned set of spans, taking configured mapping granularity and the page size into account +static span_t* +_rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) { + //If we already have some, but not enough, reserved spans, release those to heap cache and map a new + //full set of spans. 
Otherwise we would waste memory if page size > span size (huge pages) + size_t aligned_span_count = _rpmalloc_span_align_count(span_count); + size_t align_offset = 0; + span_t* span = (span_t*)_rpmalloc_mmap(aligned_span_count * _memory_span_size, &align_offset); + if (!span) + return 0; + _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset); + _rpmalloc_stat_add(&_reserved_spans, aligned_span_count); + _rpmalloc_stat_inc(&_master_spans); + if (span_count <= LARGE_CLASS_COUNT) + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls); + if (aligned_span_count > span_count) { + span_t* reserved_spans = (span_t*)pointer_offset(span, span_count * _memory_span_size); + size_t reserved_count = aligned_span_count - span_count; + if (heap->spans_reserved) { + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); + _rpmalloc_heap_cache_insert(heap, heap->span_reserve); + } + _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); + } + return span; +} + +//! Map in memory pages for the given number of spans (or use previously reserved pages) +static span_t* +_rpmalloc_span_map(heap_t* heap, size_t span_count) { + if (span_count <= heap->spans_reserved) + return _rpmalloc_span_map_from_reserve(heap, span_count); + return _rpmalloc_span_map_aligned_count(heap, span_count); +} + +//! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) +static void +_rpmalloc_span_unmap(span_t* span) { + assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); + + int is_master = !!(span->flags & SPAN_FLAG_MASTER); + span_t* master = is_master ? span : ((span_t*)pointer_offset(span, -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size))); + assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN)); + assert(master->flags & SPAN_FLAG_MASTER); + + size_t span_count = span->span_count; + if (!is_master) { + //Directly unmap subspans (unless huge pages, in which case we defer and unmap entire page range with master) + assert(span->align_offset == 0); + if (_memory_span_size >= _memory_page_size) { + _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); + _rpmalloc_stat_sub(&_reserved_spans, span_count); + } + } else { + //Special double flag to denote an unmapped master + //It must be kept in memory since span header must be used + span->flags |= SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN; + } + + if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { + //Everything unmapped, unmap the master span with release flag to unmap the entire range of the super span + assert(!!(master->flags & SPAN_FLAG_MASTER) && !!(master->flags & SPAN_FLAG_SUBSPAN)); + size_t unmap_count = master->span_count; + if (_memory_span_size < _memory_page_size) + unmap_count = master->total_spans; + _rpmalloc_stat_sub(&_reserved_spans, unmap_count); + _rpmalloc_stat_sub(&_master_spans, 1); + _rpmalloc_unmap(master, unmap_count * _memory_span_size, master->align_offset, (size_t)master->total_spans * _memory_span_size); + } +} + +//! 
Move the span (used for small or medium allocations) to the heap thread cache +static void +_rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { + assert(heap == span->heap); + assert(span->size_class < SIZE_CLASS_COUNT); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + atomic_decr32(&heap->span_use[0].current); +#endif + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + _rpmalloc_heap_cache_insert(heap, span); +} + +//! Initialize a (partial) free list up to next system memory page, while reserving the first block +//! as allocated, returning number of blocks in list +static uint32_t +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, + uint32_t block_count, uint32_t block_size) { + assert(block_count); + *first_block = block_start; + if (block_count > 1) { + void* free_block = pointer_offset(block_start, block_size); + void* block_end = pointer_offset(block_start, (size_t)block_size * block_count); + //If block size is less than half a memory page, bound init to next memory page boundary + if (block_size < (_memory_page_size >> 1)) { + void* page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void* next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void**)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void**)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list +static void* +_rpmalloc_span_initialize_new(heap_t* heap, span_t* span, uint32_t class_idx) { + assert(span->span_count == 1); + size_class_t* size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_size = size_class->block_size; + span->block_count = size_class->block_count; + span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); + + //Setup free list. 
Only initialize one system page worth of free blocks in list + void* block; + span->free_list_limit = free_list_partial_init(&heap->free_list[class_idx], &block, + span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); + //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized + if (span->free_list_limit < span->block_count) { + _rpmalloc_span_double_link_list_add(&heap->partial_span[class_idx], span); + span->used_count = span->free_list_limit; + } else { +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + span->used_count = span->block_count; + } + return block; +} + +static void +_rpmalloc_span_extract_free_list_deferred(span_t* span) { + // We need acquire semantics on the CAS operation since we are interested in the list size + // Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency + do { + span->free_list = atomic_load_ptr(&span->free_list_deferred); + } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, span->free_list)); + span->used_count -= span->list_size; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); +} + +static int +_rpmalloc_span_is_fully_utilized(span_t* span) { + assert(span->free_list_limit <= span->block_count); + return !span->free_list && (span->free_list_limit >= span->block_count); +} + +static int +_rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { + span_t* class_span = (span_t*)((uintptr_t)heap->free_list[iclass] & _memory_span_mask); + if (span == class_span) { + // Adopt the heap class free list back into the span free list + void* block = span->free_list; + void* last_block = 0; + while (block) { + last_block = block; + block = *((void**)block); + } + uint32_t free_count = 0; + block = heap->free_list[iclass]; + while (block) { + ++free_count; + block = *((void**)block); + } + if (last_block) { + *((void**)last_block) = heap->free_list[iclass]; + } else { + span->free_list = heap->free_list[iclass]; + } + heap->free_list[iclass] = 0; + span->used_count -= free_count; + } + //If this assert triggers you have memory leaks + assert(span->list_size == span->used_count); + if (span->list_size == span->used_count) { + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current); + // This function only used for spans in double linked lists + if (list_head) + _rpmalloc_span_double_link_list_remove(list_head, span); + _rpmalloc_span_unmap(span); + return 1; + } + return 0; +} + + +//////////// +/// +/// Global cache +/// +////// + +#if ENABLE_GLOBAL_CACHE + +//! Insert the given list of memory page spans in the global cache +static void +_rpmalloc_global_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { + assert((span->list_size == 1) || (span->next != 0)); + int32_t list_size = (int32_t)span->list_size; + //Unmap if cache has reached the limit. 
Does not need stronger synchronization, the worst + //case is that the span list is unmapped when it could have been cached (no real dependency + //between the two variables) + if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { +#if !ENABLE_UNLIMITED_GLOBAL_CACHE + _rpmalloc_span_list_unmap_all(span); + atomic_add32(&cache->size, -list_size); + return; +#endif + } + void* current_cache, *new_cache; + do { + current_cache = atomic_load_ptr(&cache->cache); + span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); + new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); + } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); +} + +//! Extract a number of memory page spans from the global cache +static span_t* +_rpmalloc_global_cache_extract(global_cache_t* cache) { + uintptr_t span_ptr; + do { + void* global_span = atomic_load_ptr(&cache->cache); + span_ptr = (uintptr_t)global_span & _memory_span_mask; + if (span_ptr) { span_t* span = (span_t*)span_ptr; //By accessing the span ptr before it is swapped out of list we assume that a contending thread //does not manage to traverse the span to being unmapped before we access it @@ -963,13 +1254,13 @@ _memory_cache_extract(global_cache_t* cache) { //! Finalize a global cache, only valid from allocator finalization (not thread safe) static void -_memory_cache_finalize(global_cache_t* cache) { +_rpmalloc_global_cache_finalize(global_cache_t* cache) { void* current_cache = atomic_load_ptr(&cache->cache); span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); while (span) { span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); atomic_add32(&cache->size, -(int32_t)span->list_size); - _memory_unmap_span_list(span); + _rpmalloc_span_list_unmap_all(span); span = skip_span; } assert(!atomic_load32(&cache->size)); @@ -979,31 +1270,46 @@ _memory_cache_finalize(global_cache_t* cache) { //! Insert the given list of memory page spans in the global cache static void -_memory_global_cache_insert(span_t* span) { +_rpmalloc_global_cache_insert_span_list(span_t* span) { size_t span_count = span->span_count; #if ENABLE_UNLIMITED_GLOBAL_CACHE - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, 0); + _rpmalloc_global_cache_insert(&_memory_span_cache[span_count - 1], span, 0); #else const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); - _memory_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); + _rpmalloc_global_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); #endif } //! Extract a number of memory page spans from the global cache for large blocks static span_t* -_memory_global_cache_extract(size_t span_count) { - span_t* span = _memory_cache_extract(&_memory_span_cache[span_count - 1]); +_rpmalloc_global_cache_extract_span_list(size_t span_count) { + span_t* span = _rpmalloc_global_cache_extract(&_memory_span_cache[span_count - 1]); assert(!span || (span->span_count == span_count)); return span; } #endif -static void _memory_deallocate_huge(span_t*); + +//////////// +/// +/// Heap control +/// +////// + +static void _rpmalloc_deallocate_huge(span_t*); + +//! 
Store the given spans as reserve in the given heap +static void +_rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = reserve_span_count; +} //! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use static void -_memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { +_rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { span_t* span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); if (!span) return; @@ -1016,32 +1322,31 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { assert(heap->full_span_count); --heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS - heap_class_t* heap_class = heap->span_class + span->size_class; - _memory_span_double_link_list_remove(&heap_class->full_span, span); + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); #endif if (single_span && !*single_span) { *single_span = span; } else { - _memory_statistics_dec(&heap->span_use[0].current); - _memory_statistics_dec(&heap->size_class_use[span->size_class].spans_current); - _memory_heap_cache_insert(heap, span); + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + _rpmalloc_heap_cache_insert(heap, span); } } else { if (span->size_class == SIZE_CLASS_HUGE) { - _memory_deallocate_huge(span); + _rpmalloc_deallocate_huge(span); } else { assert(span->size_class == SIZE_CLASS_LARGE); assert(heap->full_span_count); --heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap->large_huge_span, span); + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); #endif uint32_t idx = span->span_count - 1; if (!idx && single_span && !*single_span) { *single_span = span; } else { - _memory_statistics_dec(&heap->span_use[idx].current); - _memory_heap_cache_insert(heap, span); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + _rpmalloc_heap_cache_insert(heap, span); } } } @@ -1050,10 +1355,10 @@ _memory_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { } static void -_memory_heap_global_finalize(heap_t* heap); +_rpmalloc_heap_global_finalize(heap_t* heap); static void -_memory_unlink_orphan_heap(atomicptr_t* list, heap_t* heap) { +_rpmalloc_heap_unlink_orphan(atomicptr_t* list, heap_t* heap) { void* raworphan = atomic_load_ptr(list); heap_t* orphan = (heap_t*)((uintptr_t)raworphan & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); if (orphan == heap) { @@ -1071,37 +1376,37 @@ _memory_unlink_orphan_heap(atomicptr_t* list, heap_t* heap) { } static void -_memory_unmap_heap(heap_t* heap) { +_rpmalloc_heap_unmap(heap_t* heap) { if (!heap->master_heap) { if (!atomic_load32(&heap->child_count)) { - _memory_unlink_orphan_heap(&_memory_orphan_heaps, heap); + _rpmalloc_heap_unlink_orphan(&_memory_orphan_heaps, heap); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_unlink_orphan_heap(&_memory_first_class_orphan_heaps, heap); + _rpmalloc_heap_unlink_orphan(&_memory_first_class_orphan_heaps, heap); #endif size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - _memory_unmap(heap, block_size, heap->align_offset, block_size); + _rpmalloc_unmap(heap, block_size, heap->align_offset, block_size); } } else { if (atomic_decr32(&heap->master_heap->child_count) == 0) { - 
_memory_heap_global_finalize(heap->master_heap); + _rpmalloc_heap_global_finalize(heap->master_heap); } } } static void -_memory_heap_global_finalize(heap_t* heap) { +_rpmalloc_heap_global_finalize(heap_t* heap) { if (heap->finalize++ > 1) { --heap->finalize; return; } - _memory_heap_finalize(heap); + _rpmalloc_heap_finalize(heap); for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span_t* span = heap->span_cache[iclass]; heap->span_cache[iclass] = 0; if (span) - _memory_unmap_span_list(span); + _rpmalloc_span_list_unmap_all(span); } if (heap->full_span_count) { @@ -1110,8 +1415,7 @@ _memory_heap_global_finalize(heap_t* heap) { } for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - heap_class_t* heap_class = heap->span_class + iclass; - if (heap_class->free_list || heap_class->partial_span) { + if (heap->free_list[iclass] || heap->partial_span[iclass]) { --heap->finalize; return; } @@ -1127,26 +1431,26 @@ _memory_heap_global_finalize(heap_t* heap) { list_heap->next_heap = heap->next_heap; } - _memory_unmap_heap( heap ); + _rpmalloc_heap_unmap( heap ); } //! Insert a single span into thread heap cache, releasing to global cache if overflow static void -_memory_heap_cache_insert(heap_t* heap, span_t* span) { +_rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) { if (UNEXPECTED(heap->finalize != 0)) { - _memory_unmap_span(span); - _memory_heap_global_finalize(heap); + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); return; } #if ENABLE_THREAD_CACHE size_t span_count = span->span_count; size_t idx = span_count - 1; - _memory_statistics_inc(&heap->span_use[idx].spans_to_cache); + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_cache); #if ENABLE_UNLIMITED_THREAD_CACHE - _memory_span_list_push(&heap->span_cache[idx], span); + _rpmalloc_span_list_push(&heap->span_cache[idx], span); #else const size_t release_count = (!idx ? _memory_span_release_count : _memory_span_release_count_large); - size_t current_cache_size = _memory_span_list_push(&heap->span_cache[idx], span); + size_t current_cache_size = _rpmalloc_span_list_push(&heap->span_cache[idx], span); if (current_cache_size <= release_count) return; const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; @@ -1161,55 +1465,55 @@ _memory_heap_cache_insert(heap_t* heap, span_t* span) { return; #endif } - heap->span_cache[idx] = _memory_span_list_split(span, release_count); + heap->span_cache[idx] = _rpmalloc_span_list_split(span, release_count); assert(span->list_size == release_count); #if ENABLE_GLOBAL_CACHE - _memory_statistics_add64(&heap->thread_to_global, (size_t)span->list_size * span_count * _memory_span_size); - _memory_statistics_add(&heap->span_use[idx].spans_to_global, span->list_size); - _memory_global_cache_insert(span); + _rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[idx].spans_to_global, span->list_size); + _rpmalloc_global_cache_insert_span_list(span); #else - _memory_unmap_span_list(span); + _rpmalloc_span_list_unmap_all(span); #endif #endif #else (void)sizeof(heap); - _memory_unmap_span(span); + _rpmalloc_span_unmap(span); #endif } //! 
Extract the given number of spans from the different cache levels
 static span_t*
-_memory_heap_thread_cache_extract(heap_t* heap, size_t span_count) {
+_rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) {
 	span_t* span = 0;
 	size_t idx = span_count - 1;
 	if (!idx)
-		_memory_heap_cache_adopt_deferred(heap, &span);
+		_rpmalloc_heap_cache_adopt_deferred(heap, &span);
 #if ENABLE_THREAD_CACHE
 	if (!span && heap->span_cache[idx]) {
-		_memory_statistics_inc(&heap->span_use[idx].spans_from_cache);
-		span = _memory_span_list_pop(&heap->span_cache[idx]);
+		_rpmalloc_stat_inc(&heap->span_use[idx].spans_from_cache);
+		span = _rpmalloc_span_list_pop(&heap->span_cache[idx]);
 	}
 #endif
 	return span;
 }

 static span_t*
-_memory_heap_reserved_extract(heap_t* heap, size_t span_count) {
+_rpmalloc_heap_reserved_extract(heap_t* heap, size_t span_count) {
 	if (heap->spans_reserved >= span_count)
-		return _memory_map_spans(heap, span_count);
+		return _rpmalloc_span_map(heap, span_count);
 	return 0;
 }

 //! Extract a span from the global cache
 static span_t*
-_memory_heap_global_cache_extract(heap_t* heap, size_t span_count) {
+_rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) {
 #if ENABLE_GLOBAL_CACHE
 	size_t idx = span_count - 1;
-	heap->span_cache[idx] = _memory_global_cache_extract(span_count);
+	heap->span_cache[idx] = _rpmalloc_global_cache_extract_span_list(span_count);
 	if (heap->span_cache[idx]) {
-		_memory_statistics_add64(&heap->global_to_thread, (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size);
-		_memory_statistics_add(&heap->span_use[idx].spans_from_global, heap->span_cache[idx]->list_size);
-		return _memory_span_list_pop(&heap->span_cache[idx]);
+		_rpmalloc_stat_add64(&heap->global_to_thread, (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size);
+		_rpmalloc_stat_add(&heap->span_use[idx].spans_from_global, heap->span_cache[idx]->list_size);
+		return _rpmalloc_span_list_pop(&heap->span_cache[idx]);
 	}
 #endif
 	(void)sizeof(heap);
@@ -1219,132 +1523,234 @@ _memory_heap_global_cache_extract(heap_t* heap, size_t span_count) {
 //!
Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory static span_t* -_memory_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { +_rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { (void)sizeof(class_idx); #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS uint32_t idx = (uint32_t)span_count - 1; uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); - _memory_statistics_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); #endif - span_t* span = _memory_heap_thread_cache_extract(heap, span_count); + span_t* span = _rpmalloc_heap_thread_cache_extract(heap, span_count); if (EXPECTED(span != 0)) { - _memory_statistics_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); return span; } - span = _memory_heap_reserved_extract(heap, span_count); + span = _rpmalloc_heap_reserved_extract(heap, span_count); if (EXPECTED(span != 0)) { - _memory_statistics_inc(&heap->size_class_use[class_idx].spans_from_reserved); + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved); return span; } - span = _memory_heap_global_cache_extract(heap, span_count); + span = _rpmalloc_heap_global_cache_extract(heap, span_count); if (EXPECTED(span != 0)) { - _memory_statistics_inc(&heap->size_class_use[class_idx].spans_from_cache); + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); return span; } //Final fallback, map in more virtual memory - span = _memory_map_spans(heap, span_count); - _memory_statistics_inc(&heap->size_class_use[class_idx].spans_map_calls); + span = _rpmalloc_span_map(heap, span_count); + _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls); return span; } -//! Move the span (used for small or medium allocations) to the heap thread cache static void -_memory_span_release_to_cache(heap_t* heap, span_t* span) { - assert(heap == span->heap); - assert(span->size_class < SIZE_CLASS_COUNT); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - atomic_decr32(&heap->span_use[0].current); +_rpmalloc_heap_initialize(heap_t* heap) { + memset(heap, 0, sizeof(heap_t)); + + //Get a new heap ID + heap->id = 1 + atomic_incr32(&_memory_heap_id); + + //Link in heap in heap ID map + heap_t* next_heap; + size_t list_idx = heap->id % HEAP_ARRAY_SIZE; + do { + next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap->next_heap = next_heap; + } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); +} + +static void +_rpmalloc_heap_orphan(heap_t* heap, int first_class) { + void* raw_heap; + uintptr_t orphan_counter; + heap_t* last_heap; + heap->owner_thread = (uintptr_t)-1; +#if RPMALLOC_FIRST_CLASS_HEAPS + atomicptr_t* heap_list = (first_class ? 
&_memory_first_class_orphan_heaps : &_memory_orphan_heaps); +#else + (void)sizeof(first_class); + atomicptr_t* heap_list = &_memory_orphan_heaps; #endif - _memory_statistics_inc(&heap->span_use[0].spans_to_cache); - _memory_statistics_inc(&heap->size_class_use[span->size_class].spans_to_cache); - _memory_statistics_dec(&heap->size_class_use[span->size_class].spans_current); - _memory_heap_cache_insert(heap, span); + do { + last_heap = (heap_t*)atomic_load_ptr(heap_list); + heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); + } while (!atomic_cas_ptr(heap_list, raw_heap, last_heap)); } -//! Initialize a (partial) free list up to next system memory page, while reserving the first block -//! as allocated, returning number of blocks in list -static uint32_t -free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, - uint32_t block_count, uint32_t block_size) { - assert(block_count); - *first_block = block_start; - if (block_count > 1) { - void* free_block = pointer_offset(block_start, block_size); - void* block_end = pointer_offset(block_start, (size_t)block_size * block_count); - //If block size is less than half a memory page, bound init to next memory page boundary - if (block_size < (_memory_page_size >> 1)) { - void* page_end = pointer_offset(page_start, _memory_page_size); - if (page_end < block_end) - block_end = page_end; - } - *list = free_block; - block_count = 2; - void* next_block = pointer_offset(free_block, block_size); - while (next_block < block_end) { - *((void**)free_block) = next_block; - free_block = next_block; - ++block_count; - next_block = pointer_offset(next_block, block_size); - } - *((void**)free_block) = 0; - } else { - *list = 0; +//! Allocate a new heap from newly mapped memory pages +static heap_t* +_rpmalloc_heap_allocate_new(void) { + //Map in pages for a new heap + size_t align_offset = 0; + size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + heap_t* heap = (heap_t*)_rpmalloc_mmap(block_size, &align_offset); + if (!heap) + return heap; + + _rpmalloc_heap_initialize(heap); + heap->align_offset = align_offset; + + //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits + size_t aligned_heap_size = sizeof(heap_t); + if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) + aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); + size_t num_heaps = block_size / aligned_heap_size; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; } - return block_count; + return heap; } -//! 
Initialize an unused span (from cache or mapped) to be new active span, putting the initial free list in heap class free list -static void* -_memory_span_initialize_new(heap_t* heap, heap_class_t* heap_class, span_t* span, uint32_t class_idx) { - assert(span->span_count == 1); - size_class_t* size_class = _memory_size_class + class_idx; - span->size_class = class_idx; - span->heap = heap; - span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_size = size_class->block_size; - span->block_count = size_class->block_count; - span->free_list = 0; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); +static heap_t* +_rpmalloc_heap_extract_orphan(atomicptr_t* heap_list) { + void* raw_heap; + void* next_raw_heap; + uintptr_t orphan_counter; + heap_t* heap; + heap_t* next_heap; + do { + raw_heap = atomic_load_ptr(heap_list); + heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); + if (!heap) + break; + next_heap = heap->next_orphan; + orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); + next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); + } while (!atomic_cas_ptr(heap_list, next_raw_heap, raw_heap)); + return heap; +} - //Setup free list. Only initialize one system page worth of free blocks in list - void* block; - span->free_list_limit = free_list_partial_init(&heap_class->free_list, &block, - span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); - //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized - if (span->free_list_limit < span->block_count) { - _memory_span_double_link_list_add(&heap_class->partial_span, span); - span->used_count = span->free_list_limit; - } else { +//! Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t* +_rpmalloc_heap_allocate(int first_class) { + heap_t* heap = 0; + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap_class->full_span, span); + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + return heap; +} + +static void +_rpmalloc_heap_release(void* heapptr, int first_class) { + heap_t* heap = (heap_t*)heapptr; + if (!heap) + return; + //Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_t* span = heap->span_cache[iclass]; + heap->span_cache[iclass] = 0; + if (span && heap->finalize) { + _rpmalloc_span_list_unmap_all(span); + continue; + } +#if ENABLE_GLOBAL_CACHE + while (span) { + assert(span->span_count == (iclass + 1)); + size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); + span_t* next = _rpmalloc_span_list_split(span, (uint32_t)release_count); + _rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span->list_size); + _rpmalloc_global_cache_insert_span_list(span); + span = next; + } +#else + if (span) + _rpmalloc_span_list_unmap_all(span); #endif - ++heap->full_span_count; - span->used_count = span->block_count; } - return block; +#endif + + //Orphan the heap + _rpmalloc_heap_orphan(heap, first_class); + + set_thread_heap(0); +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + assert(atomic_load32(&_memory_active_heaps) >= 0); +#endif } static void -_memory_span_extract_free_list_deferred(span_t* span) { - // We need acquire semantics on the CAS operation since we are interested in the list size - // Refer to _memory_deallocate_defer_small_or_medium for further comments on this dependency - do { - span->free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, span->free_list)); - span->used_count -= span->list_size; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); +_rpmalloc_heap_release_raw(void* heapptr) { + _rpmalloc_heap_release(heapptr, 0); } -static int -_memory_span_is_fully_utilized(span_t* span) { - assert(span->free_list_limit <= span->block_count); - return !span->free_list && (span->free_list_limit >= span->block_count); +static void +_rpmalloc_heap_finalize(heap_t* heap) { + if (heap->spans_reserved) { + span_t* span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span_t* span = heap->partial_span[iclass]; + while (span) { + span_t* next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, &heap->partial_span[iclass]); + span = next; + } + // If class still has a free list it must be a full span + if (heap->free_list[iclass]) { + span_t* class_span = (span_t*)((uintptr_t)heap->free_list[iclass] & _memory_span_mask); + span_t** list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add(&heap->partial_span[iclass], class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (heap->span_cache[iclass]) { + _rpmalloc_span_list_unmap_all(heap->span_cache[iclass]); + heap->span_cache[iclass] = 0; + } + } +#endif + assert(!atomic_load_ptr(&heap->span_free_deferred)); } + +//////////// +/// +/// Allocation entry points +/// +////// + //! Pop first block from a free list static void* free_list_pop(void** list) { @@ -1355,22 +1761,21 @@ free_list_pop(void** list) { //! 
Allocate a small/medium sized memory block from the given heap
 static void*
-_memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
-	heap_class_t* heap_class = &heap->span_class[class_idx];
-	span_t* span = heap_class->partial_span;
+_rpmalloc_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
+	span_t* span = heap->partial_span[class_idx];
 	if (EXPECTED(span != 0)) {
 		assert(span->block_count == _memory_size_class[span->size_class].block_count);
-		assert(!_memory_span_is_fully_utilized(span));
+		assert(!_rpmalloc_span_is_fully_utilized(span));
 		void* block;
 		if (span->free_list) {
 			//Swap in free list if not empty
-			heap_class->free_list = span->free_list;
+			heap->free_list[class_idx] = span->free_list;
 			span->free_list = 0;
-			block = free_list_pop(&heap_class->free_list);
+			block = free_list_pop(&heap->free_list[class_idx]);
 		} else {
 			//If the span did not fully initialize free list, link up another page worth of blocks
 			void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size));
-			span->free_list_limit += free_list_partial_init(&heap_class->free_list, &block,
+			span->free_list_limit += free_list_partial_init(&heap->free_list[class_idx], &block,
 				(void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, span->block_count - span->free_list_limit, span->block_size);
 		}
@@ -1379,26 +1784,26 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
 		//Swap in deferred free list if present
 		if (atomic_load_ptr(&span->free_list_deferred))
-			_memory_span_extract_free_list_deferred(span);
+			_rpmalloc_span_extract_free_list_deferred(span);
 		//If span is still not fully utilized keep it in partial list and early return block
-		if (!_memory_span_is_fully_utilized(span))
+		if (!_rpmalloc_span_is_fully_utilized(span))
 			return block;
 		//The span is fully utilized, unlink from partial list and add to fully utilized list
-		_memory_span_double_link_list_pop_head(&heap_class->partial_span, span);
+		_rpmalloc_span_double_link_list_pop_head(&heap->partial_span[class_idx], span);
 #if RPMALLOC_FIRST_CLASS_HEAPS
-		_memory_span_double_link_list_add(&heap_class->full_span, span);
+		_rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
 #endif
 		++heap->full_span_count;
 		return block;
 	}
 	//Find a span in one of the cache levels
-	span = _memory_heap_extract_new_span(heap, 1, class_idx);
+	span = _rpmalloc_heap_extract_new_span(heap, 1, class_idx);
 	if (EXPECTED(span != 0)) {
 		//Mark span as owned by this heap and set base data, return first block
-		return _memory_span_initialize_new(heap, heap_class, span, class_idx);
+		return _rpmalloc_span_initialize_new(heap, span, class_idx);
 	}
 	return 0;
 }
@@ -1406,32 +1811,32 @@ _memory_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) {
 //!
Allocate a small sized memory block from the given heap static void* -_memory_allocate_small(heap_t* heap, size_t size) { +_rpmalloc_allocate_small(heap_t* heap, size_t size) { assert(heap); //Small sizes have unique size classes const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap->free_list[class_idx] != 0)) + return free_list_pop(&heap->free_list[class_idx]); + return _rpmalloc_allocate_from_heap_fallback(heap, class_idx); } //! Allocate a medium sized memory block from the given heap static void* -_memory_allocate_medium(heap_t* heap, size_t size) { +_rpmalloc_allocate_medium(heap_t* heap, size_t size) { assert(heap); //Calculate the size class index and do a dependent lookup of the final class index (in case of merged classes) const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); const uint32_t class_idx = _memory_size_class[base_idx].class_idx; - _memory_statistics_inc_alloc(heap, class_idx); - if (EXPECTED(heap->span_class[class_idx].free_list != 0)) - return free_list_pop(&heap->span_class[class_idx].free_list); - return _memory_allocate_from_heap_fallback(heap, class_idx); + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap->free_list[class_idx] != 0)) + return free_list_pop(&heap->free_list[class_idx]); + return _rpmalloc_allocate_from_heap_fallback(heap, class_idx); } //! Allocate a large sized memory block from the given heap static void* -_memory_allocate_large(heap_t* heap, size_t size) { +_rpmalloc_allocate_large(heap_t* heap, size_t size) { assert(heap); //Calculate number of needed max sized spans (including header) //Since this function is never called if size > LARGE_SIZE_LIMIT @@ -1442,7 +1847,7 @@ _memory_allocate_large(heap_t* heap, size_t size) { ++span_count; //Find a span in one of the cache levels - span_t* span = _memory_heap_extract_new_span(heap, span_count, SIZE_CLASS_LARGE); + span_t* span = _rpmalloc_heap_extract_new_span(heap, span_count, SIZE_CLASS_LARGE); if (!span) return span; @@ -1452,7 +1857,7 @@ _memory_allocate_large(heap_t* heap, size_t size) { span->heap = heap; #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->large_huge_span, span); + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); #endif ++heap->full_span_count; @@ -1461,14 +1866,14 @@ _memory_allocate_large(heap_t* heap, size_t size) { //! 
Allocate a huge block by mapping memory pages directly static void* -_memory_allocate_huge(heap_t* heap, size_t size) { +_rpmalloc_allocate_huge(heap_t* heap, size_t size) { assert(heap); size += SPAN_HEADER_SIZE; size_t num_pages = size >> _memory_page_size_shift; if (size & (_memory_page_size - 1)) ++num_pages; size_t align_offset = 0; - span_t* span = (span_t*)_memory_map(num_pages * _memory_page_size, &align_offset); + span_t* span = (span_t*)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); if (!span) return span; @@ -1477,10 +1882,10 @@ _memory_allocate_huge(heap_t* heap, size_t size) { span->span_count = (uint32_t)num_pages; span->align_offset = (uint32_t)align_offset; span->heap = heap; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->large_huge_span, span); + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); #endif ++heap->full_span_count; @@ -1489,20 +1894,20 @@ _memory_allocate_huge(heap_t* heap, size_t size) { //! Allocate a block of the given size static void* -_memory_allocate(heap_t* heap, size_t size) { +_rpmalloc_allocate(heap_t* heap, size_t size) { if (EXPECTED(size <= SMALL_SIZE_LIMIT)) - return _memory_allocate_small(heap, size); + return _rpmalloc_allocate_small(heap, size); else if (size <= _memory_medium_size_limit) - return _memory_allocate_medium(heap, size); + return _rpmalloc_allocate_medium(heap, size); else if (size <= LARGE_SIZE_LIMIT) - return _memory_allocate_large(heap, size); - return _memory_allocate_huge(heap, size); + return _rpmalloc_allocate_large(heap, size); + return _rpmalloc_allocate_huge(heap, size); } static void* -_memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { +_rpmalloc_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { if (alignment <= SMALL_GRANULARITY) - return _memory_allocate(heap, size); + return _rpmalloc_allocate(heap, size); #if ENABLE_VALIDATE_ARGS if ((size + alignment) < size) { @@ -1522,13 +1927,13 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { size_t multiple_size = size ? 
(size + (SPAN_HEADER_SIZE - 1)) & ~(uintptr_t)(SPAN_HEADER_SIZE - 1) : SPAN_HEADER_SIZE; assert(!(multiple_size % SPAN_HEADER_SIZE)); if (multiple_size <= (size + alignment)) - return _memory_allocate(heap, multiple_size); + return _rpmalloc_allocate(heap, multiple_size); } void* ptr = 0; size_t align_mask = alignment - 1; if (alignment <= _memory_page_size) { - ptr = _memory_allocate(heap, size + alignment); + ptr = _rpmalloc_allocate(heap, size + alignment); if ((uintptr_t)ptr & align_mask) { ptr = (void*)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); //Mark as having aligned blocks @@ -1578,7 +1983,7 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { align_offset = 0; mapped_size = num_pages * _memory_page_size; - span = (span_t*)_memory_map(mapped_size, &align_offset); + span = (span_t*)_rpmalloc_mmap(mapped_size, &align_offset); if (!span) { errno = ENOMEM; return 0; @@ -1591,157 +1996,62 @@ _memory_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { - _memory_unmap(span, mapped_size, align_offset, mapped_size); + _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); ++num_pages; if (num_pages > limit_pages) { errno = EINVAL; - return 0; - } - goto retry; - } - - //Store page count in span_count - span->size_class = SIZE_CLASS_HUGE; - span->span_count = (uint32_t)num_pages; - span->align_offset = (uint32_t)align_offset; - span->heap = heap; - _memory_statistics_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - -#if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - return ptr; -} - -static void -_memory_heap_initialize(heap_t* heap) { - memset(heap, 0, sizeof(heap_t)); - - //Get a new heap ID - heap->id = 1 + atomic_incr32(&_memory_heap_id); - - //Link in heap in heap ID map - heap_t* next_heap; - size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); -} - -static void -_memory_heap_orphan(heap_t* heap, int first_class) { - void* raw_heap; - uintptr_t orphan_counter; - heap_t* last_heap; - heap->owner_thread = (uintptr_t)-1; -#if RPMALLOC_FIRST_CLASS_HEAPS - atomicptr_t* heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); -#else - (void)sizeof(first_class); - atomicptr_t* heap_list = &_memory_orphan_heaps; -#endif - do { - last_heap = (heap_t*)atomic_load_ptr(heap_list); - heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); - } while (!atomic_cas_ptr(heap_list, raw_heap, last_heap)); -} - -//! 
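/* Illustrative sketch, not part of the rpmalloc sources: the over-allocate-and-bump
   scheme used above for alignments up to the page size. The request is padded by the
   alignment and the returned pointer is rounded up to the next aligned address; in
   rpmalloc any interior pointer still maps back to its span, which is then flagged as
   holding aligned blocks so the block start can be recovered on free. malloc() stands
   in for the heap allocation here, so this toy version cannot be freed correctly. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static void*
example_bump_aligned_alloc(size_t alignment, size_t size) {
	uintptr_t align_mask = alignment - 1;        /* alignment is assumed a power of two */
	void* ptr = malloc(size + alignment);        /* pad the request by the alignment    */
	if (!ptr)
		return 0;
	if ((uintptr_t)ptr & align_mask)             /* bump to the next aligned address    */
		ptr = (void*)(((uintptr_t)ptr & ~align_mask) + alignment);
	return ptr;
}

int
main(void) {
	void* ptr = example_bump_aligned_alloc(64, 100);
	printf("64-byte aligned: %s\n", (((uintptr_t)ptr & 63) == 0) ? "yes" : "no");
	return 0;
}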
Allocate a new heap from newly mapped memory pages -static heap_t* -_memory_allocate_heap_new(void) { - //Map in pages for a new heap - size_t align_offset = 0; - size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; - heap_t* heap = (heap_t*)_memory_map(block_size, &align_offset); - if (!heap) - return heap; - - _memory_heap_initialize(heap); - heap->align_offset = align_offset; - - //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits - size_t aligned_heap_size = sizeof(heap_t); - if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) - aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); - size_t num_heaps = block_size / aligned_heap_size; - atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); - heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); - while (num_heaps > 1) { - _memory_heap_initialize(extra_heap); - extra_heap->master_heap = heap; - _memory_heap_orphan(extra_heap, 1); - extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); - --num_heaps; - } - return heap; -} - -static heap_t* -_memory_heap_extract_orphan(atomicptr_t* heap_list) { - void* raw_heap; - void* next_raw_heap; - uintptr_t orphan_counter; - heap_t* heap; - heap_t* next_heap; - do { - raw_heap = atomic_load_ptr(heap_list); - heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); - if (!heap) - break; - next_heap = heap->next_orphan; - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); - } while (!atomic_cas_ptr(heap_list, next_raw_heap, raw_heap)); - return heap; -} + return 0; + } + goto retry; + } + + //Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); -//! Allocate a new heap, potentially reusing a previously orphaned heap -static heap_t* -_memory_allocate_heap(int first_class) { - heap_t* heap = 0; - if (first_class == 0) - heap = _memory_heap_extract_orphan(&_memory_orphan_heaps); #if RPMALLOC_FIRST_CLASS_HEAPS - if (!heap) - heap = _memory_heap_extract_orphan(&_memory_first_class_orphan_heaps); + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); #endif - if (!heap) - heap = _memory_allocate_heap_new(); - return heap; + ++heap->full_span_count; + + return ptr; } + +//////////// +/// +/// Deallocation entry points +/// +////// + //! 
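/* Illustrative sketch, not part of the rpmalloc sources: the ABA-tagged pointer that the
   orphan heap list above packs into a single atomic word. Heaps are laid out on
   HEAP_ORPHAN_ABA_SIZE aligned boundaries (assumed 512 here), leaving the low bits of the
   list head free for a running counter, so re-pushing the same heap never reproduces a
   previously observed word and a stale compare-and-swap fails as intended. */
#include <stdint.h>
#include <stdio.h>

#define EX_ORPHAN_ABA_SIZE 512u   /* assumed heap alignment / ABA window */

static void*
example_tag(void* heap, uint32_t counter) {
	return (void*)((uintptr_t)heap | ((uintptr_t)counter & (EX_ORPHAN_ABA_SIZE - 1)));
}

static void*
example_untag(void* raw) {
	return (void*)((uintptr_t)raw & ~(uintptr_t)(EX_ORPHAN_ABA_SIZE - 1));
}

int
main(void) {
	static _Alignas(512) char fake_heap[512];
	void* tagged = example_tag(fake_heap, 1234);   /* only the low 9 bits of 1234 are kept */
	printf("counter bits:   %u\n", (unsigned)((uintptr_t)tagged & (EX_ORPHAN_ABA_SIZE - 1)));
	printf("heap recovered: %s\n", (example_untag(tagged) == (void*)fake_heap) ? "yes" : "no");
	return 0;
}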
Deallocate the given small/medium memory block in the current thread local heap static void -_memory_deallocate_direct_small_or_medium(span_t* span, void* block) { +_rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) { heap_t* heap = span->heap; assert(heap->owner_thread == get_thread_id() || heap->finalize); //Add block to free list - if (UNEXPECTED(_memory_span_is_fully_utilized(span))) { + if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { span->used_count = span->block_count; - heap_class_t* heap_class = &heap->span_class[span->size_class]; #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&heap_class->full_span, span); + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); #endif - _memory_span_double_link_list_add(&heap_class->partial_span, span); + _rpmalloc_span_double_link_list_add(&heap->partial_span[span->size_class], span); --heap->full_span_count; } --span->used_count; *((void**)block) = span->free_list; span->free_list = block; if (UNEXPECTED(span->used_count == span->list_size)) { - heap_class_t* heap_class = &heap->span_class[span->size_class]; - _memory_span_double_link_list_remove(&heap_class->partial_span, span); - _memory_span_release_to_cache(heap, span); + _rpmalloc_span_double_link_list_remove(&heap->partial_span[span->size_class], span); + _rpmalloc_span_release_to_cache(heap, span); } } static void -_memory_deallocate_defer_free_span(heap_t* heap, span_t* span) { +_rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) { //This list does not need ABA protection, no mutable side state do { span->free_list = atomic_load_ptr(&heap->span_free_deferred); @@ -1750,7 +2060,7 @@ _memory_deallocate_defer_free_span(heap_t* heap, span_t* span) { //! Put the block in the deferred free list of the owning span static void -_memory_deallocate_defer_small_or_medium(span_t* span, void* block) { +_rpmalloc_deallocate_defer_small_or_medium(span_t* span, void* block) { // The memory ordering here is a bit tricky, to avoid having to ABA protect // the deferred free list to avoid desynchronization of list and list size // we need to have acquire semantics on successful CAS of the pointer to @@ -1766,13 +2076,13 @@ _memory_deallocate_defer_small_or_medium(span_t* span, void* block) { // Span was completely freed by this block. Due to the INVALID_POINTER spin lock // no other thread can reach this state simultaneously on this span. // Safe to move to owner heap deferred cache - _memory_deallocate_defer_free_span(span->heap, span); + _rpmalloc_deallocate_defer_free_span(span->heap, span); } } static void -_memory_deallocate_small_or_medium(span_t* span, void* p) { - _memory_statistics_inc_free(span->heap, span->size_class); +_rpmalloc_deallocate_small_or_medium(span_t* span, void* p) { + _rpmalloc_stat_inc_free(span->heap, span->size_class); if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { //Realign pointer to block start void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); @@ -1781,27 +2091,27 @@ _memory_deallocate_small_or_medium(span_t* span, void* p) { } //Check if block belongs to this heap or if deallocation should be deferred if ((span->heap->owner_thread == get_thread_id()) || span->heap->finalize) - _memory_deallocate_direct_small_or_medium(span, p); + _rpmalloc_deallocate_direct_small_or_medium(span, p); else - _memory_deallocate_defer_small_or_medium(span, p); + _rpmalloc_deallocate_defer_small_or_medium(span, p); } //! 
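/* Illustrative C11 sketch, not part of the rpmalloc sources: the lock-free push behind the
   deferred free lists above. The freed node is linked to the currently observed head and a
   compare-and-swap publishes it, retrying if another thread raced in between. The real
   deferred block list additionally parks an INVALID_POINTER sentinel in the head to keep
   the list and its size counter consistent; that refinement is left out here. */
#include <stdatomic.h>
#include <stdio.h>

typedef struct ex_node {
	struct ex_node* next;
} ex_node;

static _Atomic(ex_node*) ex_deferred_head;

static void
example_deferred_push(ex_node* node) {
	ex_node* head = atomic_load_explicit(&ex_deferred_head, memory_order_relaxed);
	do {
		node->next = head;   /* link to the head we last observed */
	} while (!atomic_compare_exchange_weak_explicit(&ex_deferred_head, &head, node,
	                                                memory_order_release, memory_order_relaxed));
}

int
main(void) {
	ex_node first = {0}, second = {0};
	example_deferred_push(&first);
	example_deferred_push(&second);
	printf("head is the last pushed node: %s\n",
	       (atomic_load(&ex_deferred_head) == &second) ? "yes" : "no");
	return 0;
}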
Deallocate the given large memory block to the current heap static void -_memory_deallocate_large(span_t* span) { +_rpmalloc_deallocate_large(span_t* span) { assert(span->size_class == SIZE_CLASS_LARGE); assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap int defer = (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize; if (defer) { - _memory_deallocate_defer_free_span(span->heap, span); + _rpmalloc_deallocate_defer_free_span(span->heap, span); return; } assert(span->heap->full_span_count); --span->heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS //Decrease counter @@ -1822,71 +2132,61 @@ _memory_deallocate_large(span_t* span) { assert(master->flags & SPAN_FLAG_MASTER); assert(atomic_load32(&master->remaining_spans) >= (int32_t)span->span_count); } - _memory_statistics_inc(&heap->span_use[idx].spans_to_reserved); + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); } else { //Insert into cache list - _memory_heap_cache_insert(heap, span); + _rpmalloc_heap_cache_insert(heap, span); } } //! Deallocate the given huge span static void -_memory_deallocate_huge(span_t* span) { +_rpmalloc_deallocate_huge(span_t* span) { assert(span->heap); if ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize) { - _memory_deallocate_defer_free_span(span->heap, span); + _rpmalloc_deallocate_defer_free_span(span->heap, span); return; } assert(span->heap->full_span_count); --span->heap->full_span_count; #if RPMALLOC_FIRST_CLASS_HEAPS - _memory_span_double_link_list_remove(&span->heap->large_huge_span, span); + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); #endif //Oversized allocation, page count is stored in span_count size_t num_pages = span->span_count; - _memory_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); - _memory_statistics_sub(&_huge_pages_current, num_pages); + _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, num_pages * _memory_page_size); + _rpmalloc_stat_sub(&_huge_pages_current, num_pages); } //! Deallocate the given block static void -_memory_deallocate(void* p) { +_rpmalloc_deallocate(void* p) { //Grab the span (always at start of span, using span alignment) span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); if (UNEXPECTED(!span)) return; if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) - _memory_deallocate_small_or_medium(span, p); + _rpmalloc_deallocate_small_or_medium(span, p); else if (span->size_class == SIZE_CLASS_LARGE) - _memory_deallocate_large(span); + _rpmalloc_deallocate_large(span); else - _memory_deallocate_huge(span); + _rpmalloc_deallocate_huge(span); } -//! 
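/* Illustrative sketch, not part of the rpmalloc sources: the constant-time span lookup that
   the deallocation dispatch above relies on. Spans are mapped on _memory_span_size aligned
   addresses (64KiB is assumed here), so masking away the low bits of any block pointer lands
   on the span header, whose size_class then routes the free to the small/medium, large or
   huge path. */
#include <stdint.h>
#include <stdio.h>

#define EX_SPAN_SIZE ((uintptr_t)65536)             /* assumed span size            */
#define EX_SPAN_MASK (~(EX_SPAN_SIZE - 1))          /* clears the in-span offset    */

static uintptr_t
example_span_of(uintptr_t block_address) {
	return block_address & EX_SPAN_MASK;            /* round down to the span start */
}

int
main(void) {
	uintptr_t span_base = (uintptr_t)0x7f120000u;   /* hypothetical aligned mapping */
	uintptr_t block = span_base + 0x1230;           /* some block inside the span   */
	printf("owning span recovered: %s\n", (example_span_of(block) == span_base) ? "yes" : "no");
	return 0;
}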
Get the usable size of the given block + +//////////// +/// +/// Reallocation entry points +/// +////// + static size_t -_memory_usable_size(void* p) { - //Grab the span using guaranteed span alignment - span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); - if (span->size_class < SIZE_CLASS_COUNT) { - //Small/medium block - void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } - if (span->size_class == SIZE_CLASS_LARGE) { - //Large block - size_t current_spans = span->span_count; - return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); - } - //Oversized block, page count is stored in span_count - size_t current_pages = span->span_count; - return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); -} +_rpmalloc_usable_size(void* p); //! Reallocate the given block to the given size static void* -_memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) { +_rpmalloc_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned int flags) { if (p) { //Grab the span using guaranteed span alignment span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); @@ -1950,44 +2250,71 @@ _memory_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigned //Avoid hysteresis by overallocating if increase is small (below 37%) size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); size_t new_size = (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); - void* block = _memory_allocate(heap, new_size); + void* block = _rpmalloc_allocate(heap, new_size); if (p && block) { if (!(flags & RPMALLOC_NO_PRESERVE)) memcpy(block, p, oldsize < new_size ? oldsize : new_size); - _memory_deallocate(p); + _rpmalloc_deallocate(p); } return block; } static void* -_memory_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize, +_rpmalloc_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags) { if (alignment <= SMALL_GRANULARITY) - return _memory_reallocate(heap, ptr, size, oldsize, flags); + return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags); int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); - size_t usablesize = _memory_usable_size(ptr); + size_t usablesize = _rpmalloc_usable_size(ptr); if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { if (no_alloc || (size >= (usablesize / 2))) return ptr; } // Aligned alloc marks span as having aligned blocks - void* block = (!no_alloc ? _memory_aligned_allocate(heap, alignment, size) : 0); + void* block = (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0); if (EXPECTED(block != 0)) { if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { if (!oldsize) oldsize = usablesize; memcpy(block, ptr, oldsize < size ? oldsize : size); } - rpfree(ptr); + _rpmalloc_deallocate(ptr); } return block; } + +//////////// +/// +/// Initialization, finalization and utility +/// +////// + +//! 
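/* Illustrative sketch, not part of the rpmalloc sources: the "below 37%" hysteresis rule in
   the reallocation path above. A growing request smaller than oldsize * 1.375 (oldsize plus
   a quarter plus an eighth) is rounded up to that bound, so repeated small grow calls reuse
   one oversized block instead of copying on every step; shrinking requests pass through. */
#include <stddef.h>
#include <stdio.h>

static size_t
example_grow_size(size_t oldsize, size_t size) {
	size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
	return (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
}

int
main(void) {
	printf("1000 -> 1010 allocates %zu bytes\n", example_grow_size(1000, 1010)); /* 1375 */
	printf("1000 -> 2000 allocates %zu bytes\n", example_grow_size(1000, 2000)); /* 2000 */
	printf("1000 ->  600 allocates %zu bytes\n", example_grow_size(1000, 600));  /*  600 */
	return 0;
}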
Get the usable size of the given block +static size_t +_rpmalloc_usable_size(void* p) { + //Grab the span using guaranteed span alignment + span_t* span = (span_t*)((uintptr_t)p & _memory_span_mask); + if (span->size_class < SIZE_CLASS_COUNT) { + //Small/medium block + void* blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + return span->block_size - ((size_t)pointer_diff(p, blocks_start) % span->block_size); + } + if (span->size_class == SIZE_CLASS_LARGE) { + //Large block + size_t current_spans = span->span_count; + return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); + } + //Oversized block, page count is stored in span_count + size_t current_pages = span->span_count; + return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); +} + //! Adjust and optimize the size class properties for the given class static void -_memory_adjust_size_class(size_t iclass) { +_rpmalloc_adjust_size_class(size_t iclass) { size_t block_size = _memory_size_class[iclass].block_size; size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; @@ -2006,76 +2333,6 @@ _memory_adjust_size_class(size_t iclass) { } } -static void -_memory_heap_release(void* heapptr, int first_class) { - heap_t* heap = (heap_t*)heapptr; - if (!heap) - return; - //Release thread cache spans back to global cache - _memory_heap_cache_adopt_deferred(heap, 0); -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_t* span = heap->span_cache[iclass]; - heap->span_cache[iclass] = 0; - if (span && heap->finalize) { - _memory_unmap_span_list(span); - continue; - } -#if ENABLE_GLOBAL_CACHE - while (span) { - assert(span->span_count == (iclass + 1)); - size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large); - span_t* next = _memory_span_list_split(span, (uint32_t)release_count); - _memory_statistics_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); - _memory_statistics_add(&heap->span_use[iclass].spans_to_global, span->list_size); - _memory_global_cache_insert(span); - span = next; - } -#else - if (span) - _memory_unmap_span_list(span); -#endif - } -#endif - - //Orphan the heap - _memory_heap_orphan(heap, first_class); - - set_thread_heap(0); -#if ENABLE_STATISTICS - atomic_decr32(&_memory_active_heaps); - assert(atomic_load32(&_memory_active_heaps) >= 0); -#endif -} - -static void -_memory_heap_release_raw(void* heapptr) { - _memory_heap_release(heapptr, 0); -} - -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -#include -static DWORD fls_key; -static void NTAPI -rp_thread_destructor(void* value) { - if (value) - rpmalloc_thread_finalize(); -} -#endif - -#if PLATFORM_POSIX -# include -# include -# ifdef __FreeBSD__ -# include -# define MAP_HUGETLB MAP_ALIGNED_SUPER -# endif -# ifndef MAP_UNINITIALIZED -# define MAP_UNINITIALIZED 0 -# endif -#endif -#include - //! 
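/* Illustrative sketch, not part of the rpmalloc sources: the small/medium branch of the
   usable-size query above. The offset of the pointer from the span's first block, taken
   modulo the block size, gives its position inside the block; the rest of the block is the
   usable tail. The 48-byte block size is only an example value. */
#include <stddef.h>
#include <stdio.h>

static size_t
example_usable_size(size_t block_size, size_t offset_from_first_block) {
	return block_size - (offset_from_first_block % block_size);
}

int
main(void) {
	printf("block start: %zu usable\n", example_usable_size(48, 96));  /* whole 48-byte block */
	printf("16 bytes in: %zu usable\n", example_usable_size(48, 112)); /* 32 bytes remain     */
	return 0;
}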
Initialize the allocator and setup global data extern inline int rpmalloc_initialize(void) { @@ -2100,8 +2357,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { memset(&_memory_config, 0, sizeof(rpmalloc_config_t)); if (!_memory_config.memory_map || !_memory_config.memory_unmap) { - _memory_config.memory_map = _memory_map_os; - _memory_config.memory_unmap = _memory_unmap_os; + _memory_config.memory_map = _rpmalloc_mmap_os; + _memory_config.memory_unmap = _rpmalloc_unmap_os; } #if RPMALLOC_CONFIGURABLE @@ -2244,17 +2501,17 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { return -1; #endif #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - fls_key = FlsAlloc(&rp_thread_destructor); + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); #endif //Setup all small and medium size classes size_t iclass = 0; _memory_size_class[iclass].block_size = SMALL_GRANULARITY; - _memory_adjust_size_class(iclass); + _rpmalloc_adjust_size_class(iclass); for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { size_t size = iclass * SMALL_GRANULARITY; _memory_size_class[iclass].block_size = (uint32_t)size; - _memory_adjust_size_class(iclass); + _rpmalloc_adjust_size_class(iclass); } //At least two blocks per span, then fall back to large allocations _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; @@ -2265,7 +2522,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { if (size > _memory_medium_size_limit) break; _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; - _memory_adjust_size_class(SMALL_CLASS_COUNT + iclass); + _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); } //Initialize this thread @@ -2273,91 +2530,6 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { return 0; } -static int -_memory_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { - heap_class_t* heap_class = heap->span_class + iclass; - span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); - if (span == class_span) { - // Adopt the heap class free list back into the span free list - void* block = span->free_list; - void* last_block = 0; - while (block) { - last_block = block; - block = *((void**)block); - } - uint32_t free_count = 0; - block = heap_class->free_list; - while (block) { - ++free_count; - block = *((void**)block); - } - if (last_block) { - *((void**)last_block) = heap_class->free_list; - } else { - span->free_list = heap_class->free_list; - } - heap_class->free_list = 0; - span->used_count -= free_count; - } - //If this assert triggers you have memory leaks - assert(span->list_size == span->used_count); - if (span->list_size == span->used_count) { - _memory_statistics_dec(&heap->span_use[0].current); - _memory_statistics_dec(&heap->size_class_use[iclass].spans_current); - // This function only used for spans in double linked lists - if (list_head) - _memory_span_double_link_list_remove(list_head, span); - _memory_unmap_span(span); - return 1; - } - return 0; -} - -static void -_memory_heap_finalize(heap_t* heap) { - if (heap->spans_reserved) { - span_t* span = _memory_map_spans(heap, heap->spans_reserved); - _memory_unmap_span(span); - heap->spans_reserved = 0; - } - - _memory_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; - while (span) { - span_t* next = 
span->next; - _memory_span_finalize(heap, iclass, span, &heap_class->partial_span); - span = next; - } - // If class still has a free list it must be a full span - if (heap_class->free_list) { - span_t* class_span = (span_t*)((uintptr_t)heap_class->free_list & _memory_span_mask); - span_t** list = 0; -#if RPMALLOC_FIRST_CLASS_HEAPS - list = &heap_class->full_span; -#endif - --heap->full_span_count; - if (!_memory_span_finalize(heap, iclass, class_span, list)) { - if (list) - _memory_span_double_link_list_remove(list, class_span); - _memory_span_double_link_list_add(&heap_class->partial_span, class_span); - } - } - } - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) { - _memory_unmap_span_list(heap->span_cache[iclass]); - heap->span_cache[iclass] = 0; - } - } -#endif - assert(!atomic_load_ptr(&heap->span_free_deferred)); -} - //! Finalize the allocator void rpmalloc_finalize(void) { @@ -2370,7 +2542,7 @@ rpmalloc_finalize(void) { while (heap) { heap_t* next_heap = heap->next_heap; heap->finalize = 1; - _memory_heap_global_finalize(heap); + _rpmalloc_heap_global_finalize(heap); heap = next_heap; } } @@ -2378,7 +2550,7 @@ rpmalloc_finalize(void) { #if ENABLE_GLOBAL_CACHE //Free global caches for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _memory_cache_finalize(&_memory_span_cache[iclass]); + _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); #endif #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD @@ -2402,9 +2574,9 @@ rpmalloc_finalize(void) { extern inline void rpmalloc_thread_initialize(void) { if (!get_thread_heap_raw()) { - heap_t* heap = _memory_allocate_heap(0); + heap_t* heap = _rpmalloc_heap_allocate(0); if (heap) { - _memory_statistics_inc(&_memory_active_heaps); + _rpmalloc_stat_inc(&_memory_active_heaps); set_thread_heap(heap); #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsSetValue(fls_key, heap); @@ -2418,7 +2590,7 @@ void rpmalloc_thread_finalize(void) { heap_t* heap = get_thread_heap_raw(); if (heap) - _memory_heap_release_raw(heap); + _rpmalloc_heap_release_raw(heap); #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsSetValue(fls_key, 0); #endif @@ -2434,88 +2606,6 @@ rpmalloc_config(void) { return &_memory_config; } -//! Map new pages to virtual memory -static void* -_memory_map_os(size_t size, size_t* offset) { - //Either size is a heap (a single page) or a (multiple) span - we only need to align spans, and only if larger than map granularity - size_t padding = ((size >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) ? _memory_span_size : 0; - assert(size >= _memory_page_size); -#if PLATFORM_WINDOWS - //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" - void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); - if (!ptr) { - assert(!"Failed to map virtual memory block"); - return 0; - } -#else - int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; -# if defined(__APPLE__) - int fd = (int)VM_MAKE_TAG(240U); - if (_memory_huge_pages) - fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); -# elif defined(MAP_HUGETLB) - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? 
MAP_HUGETLB : 0) | flags, -1, 0); -# else - void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); -# endif - if ((ptr == MAP_FAILED) || !ptr) { - assert("Failed to map virtual memory block" == 0); - return 0; - } -#endif - _memory_statistics_add(&_mapped_pages_os, (int32_t)((size + padding) >> _memory_page_size_shift)); - if (padding) { - size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); - assert(final_padding <= _memory_span_size); - assert(final_padding <= padding); - assert(!(final_padding % 8)); - ptr = pointer_offset(ptr, final_padding); - *offset = final_padding >> 3; - } - assert((size < _memory_span_size) || !((uintptr_t)ptr & ~_memory_span_mask)); - return ptr; -} - -//! Unmap pages from virtual memory -static void -_memory_unmap_os(void* address, size_t size, size_t offset, size_t release) { - assert(release || (offset == 0)); - assert(!release || (release >= _memory_page_size)); - assert(size >= _memory_page_size); - if (release && offset) { - offset <<= 3; - address = pointer_offset(address, -(int32_t)offset); -#if PLATFORM_POSIX - //Padding is always one span size - release += _memory_span_size; -#endif - } -#if !DISABLE_UNMAP -#if PLATFORM_WINDOWS - if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { - assert(!"Failed to unmap virtual memory block"); - } -#else - if (release) { - if (munmap(address, release)) { - assert("Failed to unmap virtual memory block" == 0); - } - } - else { -#if defined(POSIX_MADV_FREE) - if (posix_madvise(address, size, POSIX_MADV_FREE)) -#endif - if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { - assert("Failed to madvise virtual memory block as free" == 0); - } - } -#endif -#endif - if (release) - _memory_statistics_sub(&_mapped_pages_os, release >> _memory_page_size_shift); -} - // Extern interface extern inline RPMALLOC_ALLOCATOR void* @@ -2527,12 +2617,12 @@ rpmalloc(size_t size) { } #endif heap_t* heap = get_thread_heap(); - return _memory_allocate(heap, size); + return _rpmalloc_allocate(heap, size); } extern inline void rpfree(void* ptr) { - _memory_deallocate(ptr); + _rpmalloc_deallocate(ptr); } extern inline RPMALLOC_ALLOCATOR void* @@ -2556,7 +2646,7 @@ rpcalloc(size_t num, size_t size) { total = num * size; #endif heap_t* heap = get_thread_heap(); - void* block = _memory_allocate(heap, total); + void* block = _rpmalloc_allocate(heap, total); if (block) memset(block, 0, total); return block; @@ -2571,7 +2661,7 @@ rprealloc(void* ptr, size_t size) { } #endif heap_t* heap = get_thread_heap(); - return _memory_reallocate(heap, ptr, size, 0, 0); + return _rpmalloc_reallocate(heap, ptr, size, 0, 0); } extern RPMALLOC_ALLOCATOR void* @@ -2584,13 +2674,13 @@ rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, } #endif heap_t* heap = get_thread_heap(); - return _memory_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, flags); } extern RPMALLOC_ALLOCATOR void* rpaligned_alloc(size_t alignment, size_t size) { heap_t* heap = get_thread_heap(); - return _memory_aligned_allocate(heap, alignment, size); + return _rpmalloc_aligned_allocate(heap, alignment, size); } extern inline RPMALLOC_ALLOCATOR void* @@ -2635,7 +2725,7 @@ rpposix_memalign(void **memptr, size_t alignment, size_t size) { extern inline size_t rpmalloc_usable_size(void* ptr) { - return (ptr ? _memory_usable_size(ptr) : 0); + return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); } extern inline void @@ -2651,8 +2741,7 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { size_class_t* size_class = _memory_size_class + iclass; - heap_class_t* heap_class = heap->span_class + iclass; - span_t* span = heap_class->partial_span; + span_t* span = heap->partial_span[iclass]; while (span) { size_t free_count = span->list_size; size_t block_count = size_class->block_count; @@ -2836,15 +2925,15 @@ rpmalloc_heap_acquire(void) { // Must be a pristine heap from newly mapped memory pages, or else memory blocks // could already be allocated from the heap which would (wrongly) be released when // heap is cleared with rpmalloc_heap_free_all() - heap_t* heap = _memory_allocate_heap(1); - _memory_statistics_inc(&_memory_active_heaps); + heap_t* heap = _rpmalloc_heap_allocate(1); + _rpmalloc_stat_inc(&_memory_active_heaps); return heap; } extern inline void rpmalloc_heap_release(rpmalloc_heap_t* heap) { if (heap) - _memory_heap_release(heap, 1); + _rpmalloc_heap_release(heap, 1); } extern inline RPMALLOC_ALLOCATOR void* @@ -2855,7 +2944,7 @@ rpmalloc_heap_alloc(rpmalloc_heap_t* heap, size_t size) { return ptr; } #endif - return _memory_allocate(heap, size); + return _rpmalloc_allocate(heap, size); } extern inline RPMALLOC_ALLOCATOR void* @@ -2866,7 +2955,7 @@ rpmalloc_heap_aligned_alloc(rpmalloc_heap_t* heap, size_t alignment, size_t size return ptr; } #endif - return _memory_aligned_allocate(heap, alignment, size); + return _rpmalloc_aligned_allocate(heap, alignment, size); } extern inline RPMALLOC_ALLOCATOR void* @@ -2894,7 +2983,7 @@ rpmalloc_heap_aligned_calloc(rpmalloc_heap_t* heap, size_t alignment, size_t num #else total = num * size; #endif - void* block = _memory_aligned_allocate(heap, alignment, total); + void* block = _rpmalloc_aligned_allocate(heap, alignment, total); if (block) memset(block, 0, total); return block; @@ -2908,7 +2997,7 @@ rpmalloc_heap_realloc(rpmalloc_heap_t* heap, void* ptr, size_t size, unsigned in return ptr; } #endif - return _memory_reallocate(heap, ptr, size, 0, flags); + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); } extern inline RPMALLOC_ALLOCATOR void* @@ -2919,13 +3008,13 @@ rpmalloc_heap_aligned_realloc(rpmalloc_heap_t* heap, void* ptr, size_t alignment return 0; } #endif - return _memory_aligned_reallocate(heap, ptr, alignment, size, 0, flags); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); } extern inline void rpmalloc_heap_free(rpmalloc_heap_t* heap, void* ptr) { (void)sizeof(heap); - _memory_deallocate(ptr); + _rpmalloc_deallocate(ptr); } extern inline void @@ -2933,31 +3022,34 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { span_t* span; span_t* next_span; - _memory_heap_cache_adopt_deferred(heap, 0); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - span = heap->span_class[iclass].partial_span; + span = heap->partial_span[iclass]; while (span) { next_span = span->next; - _memory_heap_cache_insert(heap, span); + _rpmalloc_heap_cache_insert(heap, span); span = next_span; } - span = heap->span_class[iclass].full_span; + heap->partial_span[iclass] = 0; + span = heap->full_span[iclass]; while (span) { next_span = span->next; - _memory_heap_cache_insert(heap, span); + _rpmalloc_heap_cache_insert(heap, span); span = next_span; } } - memset(heap->span_class, 0, sizeof(heap->span_class)); + memset(heap->free_list, 0, 
sizeof(heap->free_list)); + memset(heap->partial_span, 0, sizeof(heap->partial_span)); + memset(heap->full_span, 0, sizeof(heap->full_span)); span = heap->large_huge_span; while (span) { next_span = span->next; if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) - _memory_deallocate_huge(span); + _rpmalloc_deallocate_huge(span); else - _memory_heap_cache_insert(heap, span); + _rpmalloc_heap_cache_insert(heap, span); span = next_span; } heap->large_huge_span = 0; @@ -2970,15 +3062,15 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { while (span) { assert(span->span_count == (iclass + 1)); size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large); - next_span = _memory_span_list_split(span, (uint32_t)release_count); - _memory_statistics_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); - _memory_statistics_add(&heap->span_use[iclass].spans_to_global, span->list_size); - _memory_global_cache_insert(span); + next_span = _rpmalloc_span_list_split(span, (uint32_t)release_count); + _rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span->list_size); + _rpmalloc_global_cache_insert_span_list(span); span = next_span; } #else if (span) - _memory_unmap_span_list(span); + _rpmalloc_span_list_unmap_all(span); #endif heap->span_cache[iclass] = 0; } diff --git a/test/main.c b/test/main.c index a6f54b17..5f8da29f 100644 --- a/test/main.c +++ b/test/main.c @@ -305,6 +305,7 @@ test_alloc(void) { rpmalloc_finalize(); // Test that a full span with deferred block is finalized properly + // Also test that a deferred huge span is finalized properly rpmalloc_initialize(); { addr[0] = rpmalloc(23457); @@ -315,6 +316,14 @@ test_alloc(void) { uintptr_t thread = thread_run(&targ); thread_sleep(100); thread_join(thread); + + addr[0] = rpmalloc(12345678); + + targ.fn = defer_free_thread; + targ.arg = addr[0]; + thread = thread_run(&targ); + thread_sleep(100); + thread_join(thread); } rpmalloc_finalize(); From 5e752e65fe2d90cf04996c7ac47b20dbdb4c0d5b Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 24 Mar 2020 18:22:48 +0100 Subject: [PATCH 33/69] more x86 test reductions --- test/main.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test/main.c b/test/main.c index 5f8da29f..3a044972 100644 --- a/test/main.c +++ b/test/main.c @@ -789,8 +789,13 @@ test_threaded(void) { arg.datasize[14] = 38934; arg.datasize[15] = 234; arg.num_datasize = 16; +#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) arg.loops = 100; arg.passes = 4000; +#else + arg.loops = 30; + arg.passes = 2000; +#endif arg.init_fini_each_loop = 0; thread_arg targ; @@ -837,7 +842,7 @@ test_crossthread(void) { arg[ithread].passes = 1024; #else arg[ithread].loops = 10; - arg[ithread].passes = 128; + arg[ithread].passes = 100; #endif arg[ithread].pointers = rpmalloc(sizeof(void*) * arg[ithread].loops * arg[ithread].passes); memset(arg[ithread].pointers, 0, sizeof(void*) * arg[ithread].loops * arg[ithread].passes); From b5f7768ea4a4af753938284c14d6153d92c23b5d Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 25 Mar 2020 11:44:45 +0100 Subject: [PATCH 34/69] fix heap cleanup when heaps share memory page (#163) --- .gitignore | 2 + build/msvs/rpmalloc-test.vcxproj | 213 +++++++++++++++++++++++++++++++ build/msvs/rpmalloc.sln | 12 +- build/msvs/test.vcxproj | 4 +- rpmalloc/rpmalloc.c | 27 ++-- test/main.c | 
11 +- 6 files changed, 250 insertions(+), 19 deletions(-) create mode 100644 build/msvs/rpmalloc-test.vcxproj diff --git a/.gitignore b/.gitignore index c291de3d..fd145ba5 100644 --- a/.gitignore +++ b/.gitignore @@ -13,6 +13,8 @@ local.properties .loadpath .ninja* build.ninja +.vs +.vscode # Generated version version.c diff --git a/build/msvs/rpmalloc-test.vcxproj b/build/msvs/rpmalloc-test.vcxproj new file mode 100644 index 00000000..52078956 --- /dev/null +++ b/build/msvs/rpmalloc-test.vcxproj @@ -0,0 +1,213 @@ + + + + + Debug + Win32 + + + Release + Win32 + + + Debug + x64 + + + Release + x64 + + + + + + + + + + {01b8c8be-038d-482f-b016-3a9496ac41b0} + Win32Proj + rpmalloc-test + 10.0 + + + + StaticLibrary + true + v142 + Unicode + + + StaticLibrary + false + v142 + true + Unicode + + + StaticLibrary + true + v142 + Unicode + + + StaticLibrary + false + v142 + true + Unicode + + + + + + + + + + + + + + + + + + + + + ..\..\lib\windows\debug\x86\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + ..\..\lib\windows\release\x86\ + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + ..\..\lib\windows\debug\x86-64\ + + + $(SolutionDir)$(Platform)\$(Configuration)\$(ProjectName)\ + ..\..\lib\windows\release\x86-64\ + + + + + + Level3 + Disabled + ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;WIN32;_DEBUG;_LIB;%(PreprocessorDefinitions) + false + false + Default + MultiThreaded + false + false + false + Fast + false + false + StreamingSIMDExtensions2 + ProgramDatabase + false + false + true + + + Windows + + + + + + + Level3 + Disabled + ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;_DEBUG;_LIB;%(PreprocessorDefinitions) + false + false + Default + MultiThreaded + false + false + false + Fast + false + false + ProgramDatabase + false + false + true + + + Windows + + + + + Level3 + + + Full + false + true + ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;WIN32;NDEBUG;_LIB;%(PreprocessorDefinitions) + false + MultiThreaded + false + false + Fast + false + false + StreamingSIMDExtensions2 + false + false + AnySuitable + Speed + true + true + true + true + + + Windows + true + true + + + + + Level3 + + + Full + false + true + ENABLE_ASSERTS=1;ENABLE_STATISTICS=1;NDEBUG;_LIB;%(PreprocessorDefinitions) + false + MultiThreaded + false + false + Fast + false + false + false + false + AnySuitable + Speed + true + true + true + true + + + Windows + true + true + + + + + + \ No newline at end of file diff --git a/build/msvs/rpmalloc.sln b/build/msvs/rpmalloc.sln index 03e6ccf2..fa2456a8 100644 --- a/build/msvs/rpmalloc.sln +++ b/build/msvs/rpmalloc.sln @@ -1,12 +1,14 @@  Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio 16 +# Visual Studio Version 16 VisualStudioVersion = 16.0.28803.202 MinimumVisualStudioVersion = 10.0.40219.1 Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rpmalloc", "rpmalloc.vcxproj", "{65DC4291-954E-4B91-8889-4F3ADCC9D2D5}" EndProject Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "test", "test.vcxproj", "{C31980DD-1241-4EF8-A351-69DAF982A7B9}" EndProject +Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "rpmalloc-test", "rpmalloc-test.vcxproj", "{01B8C8BE-038D-482F-B016-3A9496AC41B0}" +EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 @@ -31,6 +33,14 @@ Global {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x64.Build.0 = Release|x64 {C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x86.ActiveCfg = Release|Win32 
{C31980DD-1241-4EF8-A351-69DAF982A7B9}.Release|x86.Build.0 = Release|Win32 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x64.ActiveCfg = Debug|x64 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x64.Build.0 = Debug|x64 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x86.ActiveCfg = Debug|Win32 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Debug|x86.Build.0 = Debug|Win32 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x64.ActiveCfg = Release|x64 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x64.Build.0 = Release|x64 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x86.ActiveCfg = Release|Win32 + {01B8C8BE-038D-482F-B016-3A9496AC41B0}.Release|x86.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/build/msvs/test.vcxproj b/build/msvs/test.vcxproj index 2ba1fec2..ca051ef4 100644 --- a/build/msvs/test.vcxproj +++ b/build/msvs/test.vcxproj @@ -27,8 +27,8 @@ - - {65dc4291-954e-4b91-8889-4f3adcc9d2d5} + + {01b8c8be-038d-482f-b016-3a9496ac41b0} diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index f1f09d20..473f3289 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -745,7 +745,7 @@ _rpmalloc_mmap_os(size_t size, size_t* offset) { //Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not allocated unless/until the virtual addresses are actually accessed" void* ptr = VirtualAlloc(0, size + padding, (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE); if (!ptr) { - assert(!"Failed to map virtual memory block"); + assert(ptr && "Failed to map virtual memory block"); return 0; } #else @@ -1378,17 +1378,14 @@ _rpmalloc_heap_unlink_orphan(atomicptr_t* list, heap_t* heap) { static void _rpmalloc_heap_unmap(heap_t* heap) { if (!heap->master_heap) { - if (!atomic_load32(&heap->child_count)) { - _rpmalloc_heap_unlink_orphan(&_memory_orphan_heaps, heap); -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_heap_unlink_orphan(&_memory_first_class_orphan_heaps, heap); -#endif - size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + size_t heap_size = sizeof(heap_t); + size_t block_size = _memory_page_size * ((heap_size + _memory_page_size - 1) >> _memory_page_size_shift); _rpmalloc_unmap(heap, block_size, heap->align_offset, block_size); } } else { if (atomic_decr32(&heap->master_heap->child_count) == 0) { - _rpmalloc_heap_global_finalize(heap->master_heap); + _rpmalloc_heap_unmap(heap->master_heap); } } } @@ -1594,7 +1591,8 @@ static heap_t* _rpmalloc_heap_allocate_new(void) { //Map in pages for a new heap size_t align_offset = 0; - size_t block_size = (1 + (sizeof(heap_t) >> _memory_page_size_shift)) * _memory_page_size; + size_t heap_size = sizeof(heap_t); + size_t block_size = _memory_page_size* ((heap_size + _memory_page_size - 1) >> _memory_page_size_shift); heap_t* heap = (heap_t*)_rpmalloc_mmap(block_size, &align_offset); if (!heap) return heap; @@ -1603,9 +1601,7 @@ _rpmalloc_heap_allocate_new(void) { heap->align_offset = align_offset; //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits - size_t aligned_heap_size = sizeof(heap_t); - if (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE) - aligned_heap_size += HEAP_ORPHAN_ABA_SIZE - (aligned_heap_size % HEAP_ORPHAN_ABA_SIZE); + size_t aligned_heap_size = HEAP_ORPHAN_ABA_SIZE * ((heap_size + HEAP_ORPHAN_ABA_SIZE - 1) / HEAP_ORPHAN_ABA_SIZE); size_t num_heaps = block_size / 
aligned_heap_size; atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); @@ -2525,6 +2521,12 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); } + atomic_store_ptr(&_memory_orphan_heaps, 0); +#if RPMALLOC_FIRST_CLASS_HEAPS + atomic_store_ptr(&_memory_first_class_orphan_heaps, 0); +#endif + memset((void*)_memory_heaps, 0, sizeof(_memory_heaps)); + //Initialize this thread rpmalloc_thread_initialize(); return 0; @@ -2591,6 +2593,7 @@ rpmalloc_thread_finalize(void) { heap_t* heap = get_thread_heap_raw(); if (heap) _rpmalloc_heap_release_raw(heap); + set_thread_heap(0); #if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsSetValue(fls_key, 0); #endif diff --git a/test/main.c b/test/main.c index 3a044972..e242a856 100644 --- a/test/main.c +++ b/test/main.c @@ -18,6 +18,7 @@ #define pointer_diff(first, second) (ptrdiff_t)((const char*)(first) - (const char*)(second)) static size_t _hardware_threads; +static int _test_failed; static void test_initialize(void); @@ -25,6 +26,8 @@ test_initialize(void); static int test_fail_cb(const char* reason, const char* file, int line) { fprintf(stderr, "FAIL: %s @ %s:%d\n", reason, file, line); + fflush(stderr); + _test_failed = 1; return -1; } @@ -650,7 +653,7 @@ crossallocator_thread(void* argp) { rpfree(extra_pointers); - while (next_crossthread < end_crossthread) { + while ((next_crossthread < end_crossthread) && !_test_failed) { if (arg.crossthread_pointers[next_crossthread]) { rpfree(arg.crossthread_pointers[next_crossthread]); arg.crossthread_pointers[next_crossthread] = 0; @@ -794,7 +797,7 @@ test_threaded(void) { arg.passes = 4000; #else arg.loops = 30; - arg.passes = 2000; + arg.passes = 1000; #endif arg.init_fini_each_loop = 0; @@ -841,8 +844,8 @@ test_crossthread(void) { arg[ithread].loops = 50; arg[ithread].passes = 1024; #else - arg[ithread].loops = 10; - arg[ithread].passes = 100; + arg[ithread].loops = 20; + arg[ithread].passes = 200; #endif arg[ithread].pointers = rpmalloc(sizeof(void*) * arg[ithread].loops * arg[ithread].passes); memset(arg[ithread].pointers, 0, sizeof(void*) * arg[ithread].loops * arg[ithread].passes); From 64114768d01bc2cb3bd3952c7b1fd32e01c30dfa Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 25 Mar 2020 13:42:45 +0100 Subject: [PATCH 35/69] clang compatibility --- rpmalloc/malloc.c | 5 ++++- rpmalloc/rpmalloc.c | 35 ++++++----------------------------- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index e9b08c01..87fe9377 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -229,7 +229,10 @@ pvalloc(size_t size) { #if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK -__declspec(dllexport) BOOL WINAPI +extern __declspec(dllexport) BOOL WINAPI +DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved); + +extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, DWORD reason, LPVOID reserved) { (void)sizeof(reserved); (void)sizeof(instance); diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 473f3289..28d46dd0 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -129,10 +129,7 @@ # ifndef WIN32_LEAN_AND_MEAN # define WIN32_LEAN_AND_MEAN # endif -# ifndef __USE_MINGW_ANSI_STDIO -# define __USE_MINGW_ANSI_STDIO 1 -# endif -# include +# include # if ENABLE_VALIDATE_ARGS # include # endif @@ -665,7 +662,7 @@ get_thread_heap(void) 
{ static inline uintptr_t get_thread_id(void) { #if defined(_WIN32) - return (uintptr_t)NtCurrentTeb(); + return (uintptr_t)((void*)NtCurrentTeb()); #elif defined(__GNUC__) || defined(__clang__) uintptr_t tid; # if defined(__i386__) @@ -795,7 +792,7 @@ _rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { #if !DISABLE_UNMAP #if PLATFORM_WINDOWS if (!VirtualFree(address, release ? 0 : size, release ? MEM_RELEASE : MEM_DECOMMIT)) { - assert(!"Failed to unmap virtual memory block"); + assert(address && "Failed to unmap virtual memory block"); } #else if (release) { @@ -1354,27 +1351,6 @@ _rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { } } -static void -_rpmalloc_heap_global_finalize(heap_t* heap); - -static void -_rpmalloc_heap_unlink_orphan(atomicptr_t* list, heap_t* heap) { - void* raworphan = atomic_load_ptr(list); - heap_t* orphan = (heap_t*)((uintptr_t)raworphan & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); - if (orphan == heap) { - //We're now in single-threaded finalization phase, no need to ABA protect or CAS - atomic_store_ptr(list, heap->next_orphan); - } else if (orphan) { - heap_t* last = orphan; - while (orphan && (orphan != heap)) { - last = orphan; - orphan = orphan->next_orphan; - } - if (orphan == heap) - last->next_orphan = heap->next_orphan; - } -} - static void _rpmalloc_heap_unmap(heap_t* heap) { if (!heap->master_heap) { @@ -1592,7 +1568,7 @@ _rpmalloc_heap_allocate_new(void) { //Map in pages for a new heap size_t align_offset = 0; size_t heap_size = sizeof(heap_t); - size_t block_size = _memory_page_size* ((heap_size + _memory_page_size - 1) >> _memory_page_size_shift); + size_t block_size = _memory_page_size * ((heap_size + _memory_page_size - 1) >> _memory_page_size_shift); heap_t* heap = (heap_t*)_rpmalloc_mmap(block_size, &align_offset); if (!heap) return heap; @@ -2525,7 +2501,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { #if RPMALLOC_FIRST_CLASS_HEAPS atomic_store_ptr(&_memory_first_class_orphan_heaps, 0); #endif - memset((void*)_memory_heaps, 0, sizeof(_memory_heaps)); + for (size_t ilist = 0, lsize = (sizeof(_memory_heaps) / sizeof(_memory_heaps[0])); ilist < lsize; ++ilist) + atomic_store_ptr(&_memory_heaps[ilist], 0); //Initialize this thread rpmalloc_thread_initialize(); From fb90156b3ad8a1501755c08b300f0d88b41990e2 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 26 Mar 2020 07:17:18 +0100 Subject: [PATCH 36/69] fix for disabled thread cache --- rpmalloc/rpmalloc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 28d46dd0..15879bfe 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1375,12 +1375,14 @@ _rpmalloc_heap_global_finalize(heap_t* heap) { _rpmalloc_heap_finalize(heap); +#if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { span_t* span = heap->span_cache[iclass]; heap->span_cache[iclass] = 0; if (span) _rpmalloc_span_list_unmap_all(span); } +#endif if (heap->full_span_count) { --heap->finalize; From 90c722f7549e19a7749722c90a006ce6e125d579 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Fri, 27 Mar 2020 10:25:39 +0100 Subject: [PATCH 37/69] additional build compatibility for clang on windows --- build/ninja/clang.py | 74 +++++++++++++++++++++++++++++--------------- rpmalloc/rpmalloc.c | 10 +++--- test/main.c | 20 ++++++------ 3 files changed, 64 insertions(+), 40 deletions(-) diff --git a/build/ninja/clang.py b/build/ninja/clang.py index 024b00ad..2e56d10b 100644 --- 
a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -19,10 +19,12 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.cxxcompiler = os.environ.get('CXX') or 'clang++' if self.target.is_windows(): self.archiver = os.environ.get('AR') or 'llvm-ar' + self.linker = os.environ.get('CC') or 'lld-link' + self.cxxlinker = os.environ.get('CXX') or 'lld-link' else: self.archiver = os.environ.get('AR') or 'ar' - self.linker = os.environ.get('CC') or 'clang' - self.cxxlinker = os.environ.get('CXX') or 'clang++' + self.linker = os.environ.get('CC') or 'clang' + self.cxxlinker = os.environ.get('CXX') or 'clang++' #Default variables self.sysroot = '' @@ -37,7 +39,11 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.ccdeps = 'gcc' self.ccdepfile = '$out.d' self.arcmd = self.rmcmd('$out') + ' && $toolchain$ar crsD $ararchflags $arflags $arenvflags $out $in' - self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags -o $out $in $libs $archlibs $oslibs $frameworks' + if self.target.is_windows(): + self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags /debug /nologo /subsystem:console /dynamicbase /nxcompat /manifest /manifestuac:\"level=\'asInvoker\' uiAccess=\'false\'\" /tlbid:1 /pdb:$pdbpath /out:$out $in $libs $archlibs $oslibs $frameworks' + self.dllcmd = self.linkcmd + ' /dll' + else: + self.linkcmd = '$toolchain$link $libpaths $configlibpaths $linkflags $linkarchflags $linkconfigflags $linkenvflags -o $out $in $libs $archlibs $oslibs $frameworks' #Base flags self.cflags = ['-D' + project.upper() + '_COMPILE=1', @@ -45,11 +51,12 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths '-fomit-frame-pointer', '-fno-math-errno','-ffinite-math-only', '-funsafe-math-optimizations', '-fno-trapping-math', '-ffast-math'] self.cwarnflags = ['-W', '-Werror', '-pedantic', '-Wall', '-Weverything', - '-Wno-padded', '-Wno-documentation-unknown-command', '-Wno-static-in-inline'] + '-Wno-padded', '-Wno-documentation-unknown-command', + '-Wno-implicit-fallthrough', '-Wno-static-in-inline', '-Wno-reserved-id-macro'] self.cmoreflags = [] self.mflags = [] self.arflags = [] - self.linkflags = ['-fomit-frame-pointer'] + self.linkflags = [] self.oslibs = [] self.frameworks = [] @@ -66,10 +73,13 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi(): self.cflags += ['-D_GNU_SOURCE=1'] self.linkflags += ['-pthread'] + self.oslibs += ['m'] if self.target.is_linux() or self.target.is_raspberrypi(): self.oslibs += ['dl'] if self.target.is_bsd(): self.oslibs += ['execinfo'] + if not self.target.is_windows(): + self.linkflags += ['-fomit-frame-pointer'] self.includepaths = self.prefix_includepaths((includepaths or []) + ['.']) @@ -85,7 +95,7 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.cflags += ['-w'] self.cxxflags = list(self.cflags) - self.cflags += ['-std=gnu11'] + self.cflags += ['-std=c11'] if self.target.is_macos() or self.target.is_ios(): self.cxxflags += ['-std=c++14', '-stdlib=libc++'] else: @@ -172,6 +182,8 @@ def write_variables(self, writer): writer.variable('archlibs', '') writer.variable('oslibs', self.make_libs(self.oslibs)) writer.variable('frameworks', '') + if self.target.is_windows(): + writer.variable('pdbpath', 'ninja.pdb') 
writer.newline() def write_rules(self, writer): @@ -183,7 +195,10 @@ def write_rules(self, writer): writer.rule( 'lipo', command = self.lipocmd, description = 'LIPO $out' ) writer.rule('ar', command = self.arcmd, description = 'LIB $out') writer.rule('link', command = self.linkcmd, description = 'LINK $out') - writer.rule('so', command = self.linkcmd, description = 'SO $out') + if self.target.is_windows(): + writer.rule('dll', command = self.dllcmd, description = 'DLL $out') + else: + writer.rule('so', command = self.linkcmd, description = 'SO $out') writer.newline() def build_toolchain(self): @@ -269,7 +284,7 @@ def make_libpath(self, path): def make_libpaths(self, libpaths): if not libpaths is None: if self.target.is_windows(): - return ['-Xlinker /LIBPATH:' + self.path_escape(path) for path in libpaths] + return ['/libpath:' + self.path_escape(path) for path in libpaths] return ['-L' + self.make_libpath(path) for path in libpaths] return [] @@ -297,13 +312,18 @@ def make_targetarchflags(self, arch, targettype): flags += ['-gcc-toolchain', self.android.make_gcc_toolchain_path(arch)] elif self.target.is_macos() or self.target.is_ios(): if arch == 'x86': - flags += [' -arch x86'] + flags += ['-arch', 'x86'] elif arch == 'x86-64': - flags += [' -arch x86_64'] + flags += ['-arch', 'x86_64'] elif arch == 'arm7': - flags += [' -arch armv7'] + flags += ['-arch', 'armv7'] elif arch == 'arm64': - flags += [' -arch arm64'] + flags += ['-arch', 'arm64'] + elif self.target.is_windows(): + if arch == 'x86': + flags += ['-target', 'x86-pc-windows-msvc'] + elif arch == 'x64': + flags += ['-target', 'x86_64-pc-windows-msvc'] else: if arch == 'x86': flags += ['-m32'] @@ -321,15 +341,15 @@ def make_carchflags(self, arch, targettype): return flags def make_cconfigflags(self, config, targettype): - flags = [] + flags = ['-g'] if config == 'debug': - flags += ['-DBUILD_DEBUG=1', '-g'] + flags += ['-DBUILD_DEBUG=1'] elif config == 'release': - flags += ['-DBUILD_RELEASE=1', '-DNDEBUG', '-O3', '-g', '-funroll-loops'] + flags += ['-DBUILD_RELEASE=1', '-O3', '-funroll-loops', '-flto'] elif config == 'profile': - flags += ['-DBUILD_PROFILE=1', '-DNDEBUG', '-O3', '-g', '-funroll-loops'] + flags += ['-DBUILD_PROFILE=1', '-O3', '-funroll-loops', '-flto'] elif config == 'deploy': - flags += ['-DBUILD_DEPLOY=1', '-DNDEBUG', '-O3', '-g', '-funroll-loops'] + flags += ['-DBUILD_DEPLOY=1', '-O3', '-funroll-loops', '-flto'] return flags def make_ararchflags(self, arch, targettype): @@ -347,10 +367,12 @@ def make_linkarchflags(self, arch, targettype, variables): if arch == 'arm7': flags += ['-Wl,--no-warn-mismatch', '-Wl,--fix-cortex-a8'] if self.target.is_windows(): + # Ignore target arch flags from above, add link style arch instead + flags = [] if arch == 'x86': - flags += ['-Xlinker', '/MACHINE:X86'] + flags += ['/machine:x86'] elif arch == 'x86-64': - flags += ['-Xlinker', '/MACHINE:X64'] + flags += ['/machine:x64'] if self.target.is_macos() and variables != None and 'support_lua' in variables and variables['support_lua']: flags += ['-pagezero_size', '10000', '-image_base', '100000000'] return flags @@ -358,18 +380,16 @@ def make_linkarchflags(self, arch, targettype, variables): def make_linkconfigflags(self, config, targettype, variables): flags = [] if self.target.is_windows(): - if targettype == 'sharedlib': - flags += ['-Xlinker', '/DLL'] - elif targettype == 'bin': - flags += ['-Xlinker', '/SUBSYSTEM:CONSOLE'] + if config == 'debug': + flags += ['/incremental', '/defaultlib:libcmtd'] + else: + flags += 
['/incremental:no', '/opt:ref', '/opt:icf', '/defaultlib:libcmt'] elif self.target.is_macos() or self.target.is_ios(): if targettype == 'sharedlib' or targettype == 'multisharedlib': flags += ['-dynamiclib'] else: if targettype == 'sharedlib': flags += ['-shared', '-fPIC'] - if config == 'release': - flags += ['-DNDEBUG', '-O3'] return flags def make_linkarchlibs(self, arch, targettype): @@ -384,6 +404,8 @@ def make_linkarchlibs(self, arch, targettype): def make_libs(self, libs): if libs != None: + if self.target.is_windows(): + return [lib + ".lib" for lib in libs] return ['-l' + lib for lib in libs] return [] @@ -484,6 +506,8 @@ def builder_lib(self, writer, config, arch, targettype, infiles, outfile, variab return writer.build(outfile, 'ar', infiles, implicit = self.implicit_deps(config, variables), variables = self.ar_variables(config, arch, targettype, variables)) def builder_sharedlib(self, writer, config, arch, targettype, infiles, outfile, variables): + if self.target.is_windows(): + return writer.build(outfile, 'dll', infiles, implicit = self.implicit_deps(config, variables), variables = self.link_variables(config, arch, targettype, variables)) return writer.build(outfile, 'so', infiles, implicit = self.implicit_deps(config, variables), variables = self.link_variables(config, arch, targettype, variables)) def builder_bin(self, writer, config, arch, targettype, infiles, outfile, variables): diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 15879bfe..47b0e0a5 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -152,7 +152,7 @@ #include #include -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) #include static DWORD fls_key; static void NTAPI @@ -2474,7 +2474,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { if (pthread_key_create(&_memory_thread_heap, _memory_heap_release_raw)) return -1; #endif -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) fls_key = FlsAlloc(&_rpmalloc_thread_destructor); #endif @@ -2537,7 +2537,7 @@ rpmalloc_finalize(void) { #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD pthread_key_delete(_memory_thread_heap); #endif -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsFree(fls_key); fls_key = 0; #endif @@ -2559,7 +2559,7 @@ rpmalloc_thread_initialize(void) { if (heap) { _rpmalloc_stat_inc(&_memory_active_heaps); set_thread_heap(heap); -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsSetValue(fls_key, heap); #endif } @@ -2573,7 +2573,7 @@ rpmalloc_thread_finalize(void) { if (heap) _rpmalloc_heap_release_raw(heap); set_thread_heap(0); -#if defined(_MSC_VER) && !defined(__clang__) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) FlsSetValue(fls_key, 0); #endif } diff --git a/test/main.c b/test/main.c index e242a856..efff01da 100644 --- a/test/main.c +++ b/test/main.c @@ -676,13 +676,13 @@ initfini_thread(void* argp) { unsigned int ipass = 0; unsigned int icheck = 0; unsigned int id = 0; - void* 
addr[4096]; + uint32_t* addr[4096]; char data[8192]; unsigned int cursize; unsigned int iwait = 0; int ret = 0; - for (id = 0; id < 8192; ++id) + for (id = 0; id < sizeof(data); ++id) data[id] = (char)id; thread_yield(); @@ -702,12 +702,12 @@ initfini_thread(void* argp) { goto end; } - *(uint32_t*)addr[ipass] = (uint32_t)cursize; - memcpy(pointer_offset(addr[ipass], 4), data, cursize); + addr[ipass][0] = (uint32_t)cursize; + memcpy(addr[ipass] + 1, data, cursize); for (icheck = 0; icheck < ipass; ++icheck) { - size_t this_size = *(uint32_t*)addr[ipass]; - size_t check_size = *(uint32_t*)addr[icheck]; + size_t this_size = addr[ipass][0]; + size_t check_size = addr[icheck][0]; if (this_size != cursize) { ret = test_fail("Data corrupted in this block (size)"); goto end; @@ -721,13 +721,13 @@ initfini_thread(void* argp) { goto end; } if (addr[icheck] < addr[ipass]) { - if (pointer_offset(addr[icheck], check_size + 4) > addr[ipass]) { + if (pointer_offset(addr[icheck], check_size + 4) > (void*)addr[ipass]) { ret = test_fail("Invalid pointer inside another block returned from allocation"); goto end; } } else if (addr[icheck] > addr[ipass]) { - if (pointer_offset(addr[ipass], cursize + 4) > addr[icheck]) { + if (pointer_offset(addr[ipass], cursize + 4) > (void*)addr[icheck]) { ret = test_fail("Invalid pointer inside another block returned from allocation"); goto end; } @@ -736,13 +736,13 @@ initfini_thread(void* argp) { } for (ipass = 0; ipass < arg.passes; ++ipass) { - cursize = *(uint32_t*)addr[ipass]; + cursize = addr[ipass][0]; if (cursize > max_datasize) { ret = test_fail("Data corrupted (size)"); goto end; } - if (memcmp(pointer_offset(addr[ipass], 4), data, cursize)) { + if (memcmp(addr[ipass] + 1, data, cursize)) { ret = test_fail("Data corrupted"); goto end; } From b7a32468194db5009e7968d3afc8223f612493ad Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 28 Mar 2020 16:08:57 +0100 Subject: [PATCH 38/69] only reset thread local heap if first class heap matches --- rpmalloc/rpmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 47b0e0a5..76b4589d 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1662,7 +1662,8 @@ _rpmalloc_heap_release(void* heapptr, int first_class) { //Orphan the heap _rpmalloc_heap_orphan(heap, first_class); - set_thread_heap(0); + if (get_thread_heap_raw() == heap) + set_thread_heap(0); #if ENABLE_STATISTICS atomic_decr32(&_memory_active_heaps); assert(atomic_load32(&_memory_active_heaps) >= 0); From 00b9372c70afc49aa4cc401e98f66f149c8b6a4b Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 9 Apr 2020 10:57:57 +0200 Subject: [PATCH 39/69] avoid deferred deallocation in first class heaps --- rpmalloc/rpmalloc.c | 12 +++++++----- test/main.c | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 76b4589d..c6287fa9 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -2006,7 +2006,7 @@ _rpmalloc_aligned_allocate(heap_t* heap, size_t alignment, size_t size) { static void _rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) { heap_t* heap = span->heap; - assert(heap->owner_thread == get_thread_id() || heap->finalize); + assert(heap->owner_thread == get_thread_id() || !heap->owner_thread || heap->finalize); //Add block to free list if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { span->used_count = span->block_count; @@ -2065,7 +2065,7 @@ 
_rpmalloc_deallocate_small_or_medium(span_t* span, void* p) { p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); } //Check if block belongs to this heap or if deallocation should be deferred - if ((span->heap->owner_thread == get_thread_id()) || span->heap->finalize) + if ((span->heap->owner_thread == get_thread_id()) || !span->heap->owner_thread || span->heap->finalize) _rpmalloc_deallocate_direct_small_or_medium(span, p); else _rpmalloc_deallocate_defer_small_or_medium(span, p); @@ -2078,7 +2078,7 @@ _rpmalloc_deallocate_large(span_t* span) { assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap - int defer = (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize; + int defer = (span->heap->owner_thread != get_thread_id()) && span->heap->owner_thread && !span->heap->finalize; if (defer) { _rpmalloc_deallocate_defer_free_span(span->heap, span); return; @@ -2118,7 +2118,7 @@ _rpmalloc_deallocate_large(span_t* span) { static void _rpmalloc_deallocate_huge(span_t* span) { assert(span->heap); - if ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize) { + if ((span->heap->owner_thread != get_thread_id()) && span->heap->owner_thread && !span->heap->finalize) { _rpmalloc_deallocate_defer_free_span(span->heap, span); return; } @@ -2907,8 +2907,10 @@ extern inline rpmalloc_heap_t* rpmalloc_heap_acquire(void) { // Must be a pristine heap from newly mapped memory pages, or else memory blocks // could already be allocated from the heap which would (wrongly) be released when - // heap is cleared with rpmalloc_heap_free_all() + // heap is cleared with rpmalloc_heap_free_all(). Also heaps guaranteed to be + // pristine from the dedicated orphan list can be used. 
heap_t* heap = _rpmalloc_heap_allocate(1); + heap->owner_thread = 0; _rpmalloc_stat_inc(&_memory_active_heaps); return heap; } diff --git a/test/main.c b/test/main.c index efff01da..426a48a5 100644 --- a/test/main.c +++ b/test/main.c @@ -1024,7 +1024,7 @@ test_first_class_heaps(void) { return -1; } - printf("Heap threaded tests passed\n"); + printf("First class heap tests passed\n"); #endif return 0; } From 0d9be19ef04abf73b6619952a4c822870743fcf1 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 9 Apr 2020 22:32:39 +0200 Subject: [PATCH 40/69] only test for first class heaps if enabled --- rpmalloc/rpmalloc.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index c6287fa9..72905fca 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -2065,7 +2065,12 @@ _rpmalloc_deallocate_small_or_medium(span_t* span, void* p) { p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); } //Check if block belongs to this heap or if deallocation should be deferred - if ((span->heap->owner_thread == get_thread_id()) || !span->heap->owner_thread || span->heap->finalize) +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (!defer) _rpmalloc_deallocate_direct_small_or_medium(span, p); else _rpmalloc_deallocate_defer_small_or_medium(span, p); @@ -2078,7 +2083,11 @@ _rpmalloc_deallocate_large(span_t* span) { assert(!(span->flags & SPAN_FLAG_MASTER) || !(span->flags & SPAN_FLAG_SUBSPAN)); assert((span->flags & SPAN_FLAG_MASTER) || (span->flags & SPAN_FLAG_SUBSPAN)); //We must always defer (unless finalizing) if from another heap since we cannot touch the list or counters of another heap - int defer = (span->heap->owner_thread != get_thread_id()) && span->heap->owner_thread && !span->heap->finalize; +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif if (defer) { _rpmalloc_deallocate_defer_free_span(span->heap, span); return; @@ -2118,7 +2127,12 @@ _rpmalloc_deallocate_large(span_t* span) { static void _rpmalloc_deallocate_huge(span_t* span) { assert(span->heap); - if ((span->heap->owner_thread != get_thread_id()) && span->heap->owner_thread && !span->heap->finalize) { +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = (span->heap->owner_thread && (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { _rpmalloc_deallocate_defer_free_span(span->heap, span); return; } From ada045b10481fb766f1868767f4ff2f17141ccd3 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Mon, 13 Apr 2020 22:05:41 +0100 Subject: [PATCH 41/69] Android malloc_usable_size signature fix (#166) --- rpmalloc/malloc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index 87fe9377..1c95fbea 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -160,7 +160,11 @@ void* memalign(size_t alignment, size_t size) RPALIAS(rpmemalign) int posix_memalign(void** memptr, size_t alignment, size_t size) RPALIAS(rpposix_memalign) void free(void* ptr) RPALIAS(rpfree) void cfree(void* 
ptr) RPALIAS(rpfree) +#if defined(__ANDROID__) +size_t malloc_usable_size(const void* ptr) RPALIAS(rpmalloc_usable_size) +#else size_t malloc_usable_size(void* ptr) RPALIAS(rpmalloc_usable_size) +#endif size_t malloc_size(void* ptr) RPALIAS(rpmalloc_usable_size) #endif From 98accc256bc7fed66037c0cd4009edb28c1c0629 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 25 Apr 2020 16:19:20 +0200 Subject: [PATCH 42/69] fix clang build --- build/ninja/clang.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/build/ninja/clang.py b/build/ninja/clang.py index 2e56d10b..ff321c2d 100644 --- a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -345,11 +345,11 @@ def make_cconfigflags(self, config, targettype): if config == 'debug': flags += ['-DBUILD_DEBUG=1'] elif config == 'release': - flags += ['-DBUILD_RELEASE=1', '-O3', '-funroll-loops', '-flto'] + flags += ['-DBUILD_RELEASE=1', '-O3', '-funroll-loops'] elif config == 'profile': - flags += ['-DBUILD_PROFILE=1', '-O3', '-funroll-loops', '-flto'] + flags += ['-DBUILD_PROFILE=1', '-O3', '-funroll-loops'] elif config == 'deploy': - flags += ['-DBUILD_DEPLOY=1', '-O3', '-funroll-loops', '-flto'] + flags += ['-DBUILD_DEPLOY=1', '-O3', '-funroll-loops'] return flags def make_ararchflags(self, arch, targettype): From 1fd517df8e11faa6f72795b6d381e4168e487173 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 25 Apr 2020 16:33:31 +0200 Subject: [PATCH 43/69] enable clang lto --- build/ninja/clang.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/build/ninja/clang.py b/build/ninja/clang.py index ff321c2d..f3fb9ce4 100644 --- a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -390,6 +390,9 @@ def make_linkconfigflags(self, config, targettype, variables): else: if targettype == 'sharedlib': flags += ['-shared', '-fPIC'] + if config != 'debug': + if targettype == 'bin' or targettype == 'sharedlib': + flags += ['-flto'] return flags def make_linkarchlibs(self, arch, targettype): From e2b86d93dcfb6fe36fca50ba6c9144701ff8a548 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Sun, 26 Apr 2020 22:20:59 +0100 Subject: [PATCH 44/69] Darwin build fix: thread key destructor most likely refactored. (#169) --- rpmalloc/rpmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 72905fca..6c80c11e 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -2486,7 +2486,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_span_release_count_large = (_memory_span_release_count > 8 ? 
(_memory_span_release_count / 4) : 2); #if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - if (pthread_key_create(&_memory_thread_heap, _memory_heap_release_raw)) + if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw)) return -1; #endif #if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) From cd1cfac2ceac485f23528bf62388287d542f9e4e Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Thu, 7 May 2020 19:34:56 +0100 Subject: [PATCH 45/69] IOS build fix, large pages unsupported and wrong asm detected (#172) --- rpmalloc/rpmalloc.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 6c80c11e..f6f172ed 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -138,8 +138,11 @@ # include # include # if defined(__APPLE__) +# include +# if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR # include # include +# endif # include # endif # if defined(__HAIKU__) @@ -667,7 +670,7 @@ get_thread_id(void) { uintptr_t tid; # if defined(__i386__) __asm__("movl %%gs:0, %0" : "=r" (tid) : : ); -# elif defined(__MACH__) +# elif defined(__MACH__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR __asm__("movq %%gs:0, %0" : "=r" (tid) : : ); # elif defined(__x86_64__) __asm__("movq %%fs:0, %0" : "=r" (tid) : : ); @@ -747,7 +750,7 @@ _rpmalloc_mmap_os(size_t size, size_t* offset) { } #else int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; -# if defined(__APPLE__) +# if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR int fd = (int)VM_MAKE_TAG(240U); if (_memory_huge_pages) fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; From c4d2c0586b5518838c0de143fd9842274943f4b3 Mon Sep 17 00:00:00 2001 From: Tobias Hieta Date: Fri, 15 May 2020 11:18:06 +0200 Subject: [PATCH 46/69] Decode return values from subprocess.check_output (#173) --- build/ninja/android.py | 2 +- build/ninja/clang.py | 12 ++++++------ build/ninja/toolchain.py | 5 +++++ build/ninja/xcode.py | 10 +++++----- 4 files changed, 17 insertions(+), 12 deletions(-) diff --git a/build/ninja/android.py b/build/ninja/android.py index 82fba807..7d0a1fa4 100644 --- a/build/ninja/android.py +++ b/build/ninja/android.py @@ -81,7 +81,7 @@ def initialize_toolchain(self): else: self.hostarchname = 'windows-x86' elif self.host.is_linux(): - localarch = subprocess.check_output(['uname', '-m']).strip() + localarch = toolchain.check_output(['uname', '-m']) if localarch == 'x86_64': self.hostarchname = 'linux-x86_64' else: diff --git a/build/ninja/clang.py b/build/ninja/clang.py index f3fb9ce4..a6d21276 100644 --- a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -250,15 +250,15 @@ def build_xcode_toolchain(self): self.linkflags += ['-isysroot', '$sysroot'] self.cflags += ['-fembed-bitcode-marker'] - platformpath = subprocess.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path']).strip() + platformpath = toolchain.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path']) localpath = platformpath + "/Developer/usr/bin:/Applications/Xcode.app/Contents/Developer/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin" - self.sysroot = subprocess.check_output(['xcrun', '--sdk', sdk, '--show-sdk-path']).strip() + self.sysroot = toolchain.check_output(['xcrun', '--sdk', sdk, '--show-sdk-path']) - self.ccompiler = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'clang']).strip() - self.archiver = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'libtool']).strip() + self.ccompiler = 
"PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'clang']) + self.archiver = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'libtool']) self.linker = deploytarget + " " + self.ccompiler - self.lipo = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'lipo']).strip() + self.lipo = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'lipo']) self.mflags += list(self.cflags) + ['-fobjc-arc', '-fno-objc-exceptions', '-x', 'objective-c'] self.cflags += ['-x', 'c'] @@ -480,7 +480,7 @@ def link_variables(self, config, arch, targettype, variables): localframeworks += list(variables['frameworks']) if len(localframeworks) > 0: localvariables += [('frameworks', self.make_frameworks(list(localframeworks)))] - + libpaths = [] if 'libpaths' in variables: libpaths = variables['libpaths'] diff --git a/build/ninja/toolchain.py b/build/ninja/toolchain.py index 727c8586..dd5f4ac5 100644 --- a/build/ninja/toolchain.py +++ b/build/ninja/toolchain.py @@ -14,6 +14,11 @@ import android import xcode + +def check_output(args): + import subprocess + return subprocess.check_output(args).decode().strip() + def supported_toolchains(): return ['msvc', 'gcc', 'clang', 'intel'] diff --git a/build/ninja/xcode.py b/build/ninja/xcode.py index 8e158d9c..3af3761e 100644 --- a/build/ninja/xcode.py +++ b/build/ninja/xcode.py @@ -34,13 +34,13 @@ def build_toolchain(self): sdk = 'iphoneos' deploytarget = 'IPHONEOS_DEPLOYMENT_TARGET=' + self.deploymenttarget - platformpath = subprocess.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path']).strip() + platformpath = toolchain.check_output(['xcrun', '--sdk', sdk, '--show-sdk-platform-path']) localpath = platformpath + "/Developer/usr/bin:/Applications/Xcode.app/Contents/Developer/usr/bin:/usr/bin:/bin:/usr/sbin:/sbin" - self.plist = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'plutil']).strip() - self.xcassets = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'actool']).strip() - self.xib = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'ibtool']).strip() - self.dsymutil = "PATH=" + localpath + " " + subprocess.check_output(['xcrun', '--sdk', sdk, '-f', 'dsymutil']).strip() + self.plist = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'plutil']) + self.xcassets = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'actool']) + self.xib = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'ibtool']) + self.dsymutil = "PATH=" + localpath + " " + toolchain.check_output(['xcrun', '--sdk', sdk, '-f', 'dsymutil']) self.plistcmd = 'build/ninja/plist.py --exename $exename --prodname $prodname --bundle $bundleidentifier --target $target --deploymenttarget $deploymenttarget --output $outpath $in' if self.target.is_macos(): From 94994cb30136a3c4af24b83a1f241a77eb90ef72 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 2 Jul 2020 09:03:03 +0200 Subject: [PATCH 47/69] Minor optimizations and fix map padding on Windows (#174) --- rpmalloc/malloc.c | 9 ++++++--- rpmalloc/rpmalloc.c | 46 +++++++++++++++++++++++---------------------- test/main.c | 23 ++++++++++++++++------- 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index 1c95fbea..def584f7 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -62,15 
+62,18 @@ __attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf #undef malloc #undef free #undef calloc +#define RPMALLOC_RESTRICT __declspec(restrict) +#else +#define RPMALLOC_RESTRICT #endif #if ENABLE_OVERRIDE #if USE_IMPLEMENT -extern inline void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); } -extern inline void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); } -extern inline void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); } +extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); } +extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL calloc(size_t count, size_t size) { return rpcalloc(count, size); } +extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL realloc(void* ptr, size_t size) { return rprealloc(ptr, size); } extern inline void* RPMALLOC_CDECL reallocf(void* ptr, size_t size) { return rprealloc(ptr, size); } extern inline void* RPMALLOC_CDECL aligned_alloc(size_t alignment, size_t size) { return rpaligned_alloc(alignment, size); } extern inline void* RPMALLOC_CDECL memalign(size_t alignment, size_t size) { return rpmemalign(alignment, size); } diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index f6f172ed..15e666c3 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -216,8 +216,8 @@ static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return ( static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; } +static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return (void*)InterlockedExchangePointer((void* volatile*)dst, val); } static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return (InterlockedCompareExchangePointer((void* volatile*)dst, val, ref) == ref) ? 1 : 0; } -static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_cas_ptr(dst, val, ref); } #define EXPECTED(x) (x) #define UNEXPECTED(x) (x) @@ -242,8 +242,8 @@ static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return a static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); } +static FORCEINLINE void* atomic_exchange_ptr_acquire(atomicptr_t* dst, void* val) { return atomic_exchange_explicit(dst, val, memory_order_acquire); } static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_relaxed, memory_order_relaxed); } -static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, void* ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); } #define EXPECTED(x) __builtin_expect((x), 1) #define UNEXPECTED(x) __builtin_expect((x), 0) @@ -305,7 +305,7 @@ static FORCEINLINE int atomic_cas_ptr_acquire(atomicptr_t* dst, void* val, v //! 
Total number of small + medium size classes #define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) //! Number of large block size classes -#define LARGE_CLASS_COUNT 32 +#define LARGE_CLASS_COUNT 63 //! Maximum size of a medium block #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) //! Maximum size of a large block @@ -787,10 +787,10 @@ _rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { if (release && offset) { offset <<= 3; address = pointer_offset(address, -(int32_t)offset); -#if PLATFORM_POSIX - //Padding is always one span size - release += _memory_span_size; -#endif + if ((release >= _memory_span_size) && (_memory_span_size > _memory_map_granularity)) { + //Padding is always one span size + release += _memory_span_size; + } } #if !DISABLE_UNMAP #if PLATFORM_WINDOWS @@ -1147,8 +1147,8 @@ _rpmalloc_span_extract_free_list_deferred(span_t* span) { // We need acquire semantics on the CAS operation since we are interested in the list size // Refer to _rpmalloc_deallocate_defer_small_or_medium for further comments on this dependency do { - span->free_list = atomic_load_ptr(&span->free_list_deferred); - } while ((span->free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, span->free_list)); + span->free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (span->free_list == INVALID_POINTER); span->used_count -= span->list_size; span->list_size = 0; atomic_store_ptr_release(&span->free_list_deferred, 0); @@ -2045,9 +2045,9 @@ _rpmalloc_deallocate_defer_small_or_medium(span_t* span, void* block) { // guarantee the list_size variable validity + release semantics on pointer store void* free_list; do { - free_list = atomic_load_ptr(&span->free_list_deferred); - *((void**)block) = free_list; - } while ((free_list == INVALID_POINTER) || !atomic_cas_ptr_acquire(&span->free_list_deferred, INVALID_POINTER, free_list)); + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + *((void**)block) = free_list; uint32_t free_count = ++span->list_size; atomic_store_ptr_release(&span->free_list_deferred, block); if (free_count == span->block_count) { @@ -2207,7 +2207,7 @@ _rpmalloc_reallocate(heap_t* heap, void* p, size_t size, size_t oldsize, unsigne void* block = pointer_offset(span, SPAN_HEADER_SIZE); if (!oldsize) oldsize = (current_spans * _memory_span_size) - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_spans >= num_spans) && (num_spans >= (current_spans / 2))) { + if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) { //Still fits in block, never mind trying to save memory, but preserve data if alignment changed if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) memmove(block, p, oldsize); @@ -2314,14 +2314,16 @@ _rpmalloc_adjust_size_class(size_t iclass) { _memory_size_class[iclass].class_idx = (uint16_t)iclass; //Check if previous size classes can be merged - size_t prevclass = iclass; - while (prevclass > 0) { - --prevclass; - //A class can be merged if number of pages and number of blocks are equal - if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) - memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); - else - break; + if (iclass >= SMALL_CLASS_COUNT) { + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + //A class can 
be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == _memory_size_class[iclass].block_count) + memcpy(_memory_size_class + prevclass, _memory_size_class + iclass, sizeof(_memory_size_class[iclass])); + else + break; + } } } @@ -2533,7 +2535,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { void rpmalloc_finalize(void) { rpmalloc_thread_finalize(); - //rpmalloc_dump_statistics(stderr); + //rpmalloc_dump_statistics(stdout); //Free all thread caches and fully free spans for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { diff --git a/test/main.c b/test/main.c index 426a48a5..7e6fe642 100644 --- a/test/main.c +++ b/test/main.c @@ -47,9 +47,15 @@ test_alloc(void) { void* addr[8142]; char data[20000]; unsigned int datasize[7] = { 473, 39, 195, 24, 73, 376, 245 }; + size_t wanted_usable_size; rpmalloc_initialize(); + //Query the small granularity + void* zero_alloc = rpmalloc(0); + size_t small_granularity = rpmalloc_usable_size(zero_alloc); + rpfree(zero_alloc); + for (id = 0; id < 20000; ++id) data[id] = (char)(id % 139 + id % 17); @@ -68,16 +74,18 @@ test_alloc(void) { rpfree(testptr); for (iloop = 0; iloop <= 1024; ++iloop) { testptr = rpmalloc(iloop); - size_t wanted_usable_size = 16 * ((iloop / 16) + ((!iloop || (iloop % 16)) ? 1 : 0)); - if (rpmalloc_usable_size(testptr) != wanted_usable_size) + wanted_usable_size = iloop ? small_granularity * ((iloop + (small_granularity - 1)) / small_granularity) : small_granularity; + if (rpmalloc_usable_size(testptr) != wanted_usable_size) { + printf("For %u wanted %zu got %zu\n", iloop, wanted_usable_size, rpmalloc_usable_size(testptr)); return test_fail("Bad base alloc usable size"); + } rpfree(testptr); } //Verify medium block sizes (until class merging kicks in) for (iloop = 1025; iloop <= 6000; ++iloop) { testptr = rpmalloc(iloop); - size_t wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0)); + wanted_usable_size = 512 * ((iloop / 512) + ((iloop % 512) ? 1 : 0)); if (rpmalloc_usable_size(testptr) != wanted_usable_size) return test_fail("Bad medium alloc usable size"); rpfree(testptr); @@ -86,9 +94,10 @@ test_alloc(void) { //Large reallocation test testptr = rpmalloc(253000); testptr = rprealloc(testptr, 151); - if (rpmalloc_usable_size(testptr) != 160) + wanted_usable_size = (small_granularity * ((151 + (small_granularity - 1)) / small_granularity)); + if (rpmalloc_usable_size(testptr) != wanted_usable_size) return test_fail("Bad usable size"); - if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != 144) + if (rpmalloc_usable_size(pointer_offset(testptr, 16)) != (wanted_usable_size - 16)) return test_fail("Bad offset usable size"); rpfree(testptr); @@ -97,7 +106,7 @@ test_alloc(void) { size_t size = 37 * iloop; testptr = rpmalloc(size); *((uintptr_t*)testptr) = 0x12345678; - size_t wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0)); + wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 1 : 0)); if (rpmalloc_usable_size(testptr) != wanted_usable_size) return test_fail("Bad usable size (alloc)"); testptr = rprealloc(testptr, size + 16); @@ -109,7 +118,7 @@ test_alloc(void) { testptr = rpaligned_alloc(128, size); *((uintptr_t*)testptr) = 0x12345678; - wanted_usable_size = 16 * ((size / 16) + ((size % 16) ? 1 : 0)); + wanted_usable_size = small_granularity * ((size / small_granularity) + ((size % small_granularity) ? 
1 : 0)); if (rpmalloc_usable_size(testptr) < wanted_usable_size) return test_fail("Bad usable size (aligned alloc)"); if (rpmalloc_usable_size(testptr) > (wanted_usable_size + 128)) From a8b6c1acf8e07095da9ccd6522d2d29ad375b315 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 2 Jul 2020 09:17:50 +0200 Subject: [PATCH 48/69] Fix aligned reallocation of null pointer --- rpmalloc/rpmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 15e666c3..84227b39 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -2259,7 +2259,7 @@ _rpmalloc_aligned_reallocate(heap_t* heap, void* ptr, size_t alignment, size_t s return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags); int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); - size_t usablesize = _rpmalloc_usable_size(ptr); + size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0); if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { if (no_alloc || (size >= (usablesize / 2))) return ptr; From 9adf4e0aed0a60b22c9a4f10d20d674ca55a9f8c Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Thu, 2 Jul 2020 17:07:07 +0200 Subject: [PATCH 49/69] add MIT license option --- LICENSE | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/LICENSE b/LICENSE index cf1ab25d..be01deb9 100644 --- a/LICENSE +++ b/LICENSE @@ -22,3 +22,31 @@ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. For more information, please refer to + + +You can also use this software under the MIT license if public domain +is not recognized in your country + + +The MIT License (MIT) + +Copyright (c) 2017 Mattias Jansson + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. From fa87350752cbb50b7a798e3446c1b0592012119d Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Wed, 8 Jul 2020 06:37:08 +0100 Subject: [PATCH 50/69] Sun OS platforms (legacy solaris but illumos based ones too). (#176) large pages set via a hint. 
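The large page support for Solaris/illumos in this patch relies on the MAP_ALIGN flag: when it is passed, mmap() interprets the addr argument as a requested alignment rather than a placement address, so a 4 MiB "address" asks the kernel for a 4 MiB-aligned mapping that it can back with large pages where available. A minimal standalone sketch of that idea follows (the helper name is illustrative and error handling is trimmed; it is not taken from the patch itself):

/* Sketch: request a 4 MiB-aligned anonymous mapping using the
   Solaris/illumos MAP_ALIGN hint; fall back to a plain mapping
   on platforms without the flag. */
#include <sys/mman.h>
#include <stddef.h>

static void*
sketch_map_large_page_hint(size_t size) {
#if defined(MAP_ALIGN)
	/* With MAP_ALIGN, the first argument is the required alignment */
	void* ptr = mmap((void*)(4 << 20), size, PROT_READ | PROT_WRITE,
	                 MAP_PRIVATE | MAP_ANON | MAP_ALIGN, -1, 0);
#else
	void* ptr = mmap(0, size, PROT_READ | PROT_WRITE,
	                 MAP_PRIVATE | MAP_ANON, -1, 0);
#endif
	return (ptr == MAP_FAILED) ? 0 : ptr;
}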
--- build/ninja/clang.py | 4 ++-- build/ninja/gcc.py | 4 ++-- build/ninja/platform.py | 7 ++++++- build/ninja/toolchain.py | 2 +- rpmalloc/rpmalloc.c | 3 +++ 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/build/ninja/clang.py b/build/ninja/clang.py index a6d21276..bae4417d 100644 --- a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -70,7 +70,7 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.parse_default_variables(variables) self.read_build_prefs() - if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi(): + if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi() or self.target.is_sunos(): self.cflags += ['-D_GNU_SOURCE=1'] self.linkflags += ['-pthread'] self.oslibs += ['m'] @@ -335,7 +335,7 @@ def make_carchflags(self, arch, targettype): flags = [] if targettype == 'sharedlib': flags += ['-DBUILD_DYNAMIC_LINK=1'] - if self.target.is_linux() or self.target.is_bsd(): + if self.target.is_linux() or self.target.is_bsd() or self.target.is_sunos(): flags += ['-fPIC'] flags += self.make_targetarchflags(arch, targettype) return flags diff --git a/build/ninja/gcc.py b/build/ninja/gcc.py index 20646c19..299be53f 100644 --- a/build/ninja/gcc.py +++ b/build/ninja/gcc.py @@ -49,7 +49,7 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths self.parse_default_variables(variables) self.read_build_prefs() - if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi(): + if self.target.is_linux() or self.target.is_bsd() or self.target.is_raspberrypi() or self.target.is_sunos(): self.cflags += ['-D_GNU_SOURCE=1'] self.linkflags += ['-pthread'] if self.target.is_linux() or self.target.is_raspberrypi(): @@ -186,7 +186,7 @@ def make_carchflags(self, arch, targettype): flags = [] if targettype == 'sharedlib': flags += ['-DBUILD_DYNAMIC_LINK=1'] - if self.target.is_linux() or self.target.is_bsd(): + if self.target.is_linux() or self.target.is_bsd() or self.target.is_sunos(): flags += ['-fPIC'] flags += self.make_targetarchflags(arch, targettype) return flags diff --git a/build/ninja/platform.py b/build/ninja/platform.py index 68cf0ab1..cf91c14b 100644 --- a/build/ninja/platform.py +++ b/build/ninja/platform.py @@ -5,7 +5,7 @@ import sys def supported_platforms(): - return [ 'windows', 'linux', 'macos', 'bsd', 'ios', 'android', 'raspberrypi', 'tizen' ] + return [ 'windows', 'linux', 'macos', 'bsd', 'ios', 'android', 'raspberrypi', 'tizen', 'sunos' ] class Platform(object): def __init__(self, platform): @@ -30,6 +30,8 @@ def __init__(self, platform): self.platform = 'raspberrypi' elif self.platform.startswith('tizen'): self.platform = 'tizen' + elif self.platform.startswith('sunos'): + self.platform = 'sunos' def platform(self): return self.platform @@ -58,5 +60,8 @@ def is_raspberrypi(self): def is_tizen(self): return self.platform == 'tizen' + def is_sunos(self): + return self.platform == 'sunos' + def get(self): return self.platform diff --git a/build/ninja/toolchain.py b/build/ninja/toolchain.py index dd5f4ac5..d10d8407 100644 --- a/build/ninja/toolchain.py +++ b/build/ninja/toolchain.py @@ -132,7 +132,7 @@ def initialize_archs(self, archs): def initialize_default_archs(self): if self.target.is_windows(): self.archs = ['x86-64'] - elif self.target.is_linux() or self.target.is_bsd(): + elif self.target.is_linux() or self.target.is_bsd() or self.target.is_sunos(): localarch = subprocess.check_output(['uname', '-m']).decode().strip() if localarch == 
'x86_64' or localarch == 'amd64': self.archs = ['x86-64'] diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 84227b39..d011bcc4 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -757,6 +757,9 @@ _rpmalloc_mmap_os(size_t size, size_t* offset) { void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); # elif defined(MAP_HUGETLB) void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_HUGETLB : 0) | flags, -1, 0); +# elif defined(MAP_ALIGN) + caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); + void* ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); # else void* ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); # endif From ed9e13f86490276c3e8597700976ffe2fb325d24 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 8 Jul 2020 20:18:13 +0200 Subject: [PATCH 51/69] Update description --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index dd1ed8e7..626f63ac 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# rpmalloc - RP Memory Allocator +# rpmalloc - General Purpose Memory Allocator This library provides a public domain cross platform lock free thread caching 16-byte aligned memory allocator implemented in C. The latest source code is always available at https://github.com/mjansson/rpmalloc Created by Mattias Jansson ([@maniccoder](https://twitter.com/maniccoder)) - Support development through my [GitHub Sponsors page](https://github.com/sponsors/mjansson) From bde9e20ad660eb4937789201d95bfa91530ab4d9 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 11 Jul 2020 10:01:19 +0200 Subject: [PATCH 52/69] add missing c++ operator overrides --- rpmalloc/malloc.c | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index def584f7..4011e610 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -69,6 +69,8 @@ __attribute__ ((section("__DATA, __interpose"))) = MAC_INTERPOSE_PAIR(newf, oldf #if ENABLE_OVERRIDE +typedef struct rp_nothrow_t { int __dummy; } rp_nothrow_t; + #if USE_IMPLEMENT extern inline RPMALLOC_RESTRICT void* RPMALLOC_CDECL malloc(size_t size) { return rpmalloc(size); } @@ -87,18 +89,36 @@ extern inline size_t RPMALLOC_CDECL malloc_size(void* ptr) { return rpmalloc_usa // operators delete and delete[] extern void _ZdlPv(void* p); void _ZdlPv(void* p) { rpfree(p); } extern void _ZdaPv(void* p); void _ZdaPv(void* p) { rpfree(p); } +extern void _ZdlPvm(void* p, size_t n); void _ZdlPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); } +extern void _ZdaPvm(void* p, size_t n); void _ZdaPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); } +extern void _ZdlPvSt11align_val_t(void* p, size_t a); void _ZdlPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); } +extern void _ZdaPvSt11align_val_t(void* p, size_t a); void _ZdaPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); } +extern void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a); void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); } +extern void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a); void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); } #if ARCH_64BIT // 64-bit operators new and new[], normal and aligned extern void* _Znwm(uint64_t size); void* _Znwm(uint64_t size) { return 
rpmalloc(size); } extern void* _Znam(uint64_t size); void* _Znam(uint64_t size) { return rpmalloc(size); } extern void* _Znwmm(uint64_t size, uint64_t align); void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } extern void* _Znamm(uint64_t size, uint64_t align); void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnwmSt11align_val_t(size_t size, size_t align); void* _ZnwmSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnamSt11align_val_t(size_t size, size_t align); void* _ZnamSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +extern void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } #else // 32-bit operators new and new[], normal and aligned extern void* _Znwj(uint32_t size); void* _Znwj(uint32_t size) { return rpmalloc(size); } extern void* _Znaj(uint32_t size); void* _Znaj(uint32_t size) { return rpmalloc(size); } extern void* _Znwjj(uint32_t size, uint32_t align); void* _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } extern void* _Znajj(uint32_t size, uint32_t align); void* _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnwjSt11align_val_t(size_t size, size_t align); void* _ZnwjSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnajSt11align_val_t(size_t size, size_t align); void* _ZnajSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +extern void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } #endif #endif @@ -139,6 +159,12 @@ __attribute__ ((section("__DATA, __interpose"))) = { // operators delete and delete[] void _ZdlPv(void* p) RPALIAS(rpfree) void _ZdaPv(void* p) RPALIAS(rpfree) +extern inline void _ZdlPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); } +extern inline void _ZdaPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); } +extern inline void _ZdlPvSt11align_val_t(void* p, size_t a) { rpfree(p); 
(void)sizeof(a); } +extern inline void _ZdaPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); } +extern inline void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); } +extern inline void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); } #if ARCH_64BIT // 64-bit operators new and new[], normal and aligned @@ -146,12 +172,24 @@ void* _Znwm(uint64_t size) RPALIAS(rpmalloc) void* _Znam(uint64_t size) RPALIAS(rpmalloc) extern inline void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } extern inline void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern inline void* _ZnwmSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern inline void* _ZnamSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern inline void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern inline void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern inline void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +extern inline void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } #else // 32-bit operators new and new[], normal and aligned void* _Znwj(uint32_t size) RPALIAS(rpmalloc) void* _Znaj(uint32_t size) RPALIAS(rpmalloc) extern inline void* _Znwjj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } extern inline void* _Znajj(uint32_t size, uint32_t align) { return rpaligned_alloc(align, size); } +extern inline void* _ZnwjSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern inline void* _ZnajSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } +extern inline void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern inline void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern inline void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +extern inline void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } #endif void* malloc(size_t size) RPALIAS(rpmalloc) From 3ed16f6e3ccc9a65ab07c8dfbe8842bfd1151e15 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Sat, 11 Jul 2020 20:53:29 +0200 Subject: [PATCH 53/69] override header for windows c++ operators --- rpmalloc/malloc.c | 36 +++++++++----- rpmalloc/rpnew.h | 111 ++++++++++++++++++++++++++++++++++++++++++ test/main-override.cc | 12 +++++ 3 files changed, 147 insertions(+), 12 deletions(-) create mode 100644 rpmalloc/rpnew.h diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index 4011e610..b281e1f7 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -85,28 +85,32 @@ extern inline void RPMALLOC_CDECL cfree(void* ptr) { rpfree(ptr); } extern inline size_t RPMALLOC_CDECL malloc_usable_size(void* ptr) { return rpmalloc_usable_size(ptr); } extern inline size_t RPMALLOC_CDECL malloc_size(void* ptr) { return rpmalloc_usable_size(ptr); } +#ifdef _WIN32 +// For Windows, #include in one 
source file to get the C++ operator overrides implemented in your module +#else // Overload the C++ operators using the mangled names (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) // operators delete and delete[] extern void _ZdlPv(void* p); void _ZdlPv(void* p) { rpfree(p); } extern void _ZdaPv(void* p); void _ZdaPv(void* p) { rpfree(p); } -extern void _ZdlPvm(void* p, size_t n); void _ZdlPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); } -extern void _ZdaPvm(void* p, size_t n); void _ZdaPvm(void* p, size_t n) { rpfree(p); (void)sizeof(n); } -extern void _ZdlPvSt11align_val_t(void* p, size_t a); void _ZdlPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); } -extern void _ZdaPvSt11align_val_t(void* p, size_t a); void _ZdaPvSt11align_val_t(void* p, size_t a) { rpfree(p); (void)sizeof(a); } -extern void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a); void _ZdlPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); } -extern void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a); void _ZdaPvmSt11align_val_t(void* p, size_t n, size_t a) { rpfree(p); (void)sizeof(n); (void)sizeof(a); } #if ARCH_64BIT // 64-bit operators new and new[], normal and aligned extern void* _Znwm(uint64_t size); void* _Znwm(uint64_t size) { return rpmalloc(size); } extern void* _Znam(uint64_t size); void* _Znam(uint64_t size) { return rpmalloc(size); } extern void* _Znwmm(uint64_t size, uint64_t align); void* _Znwmm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } extern void* _Znamm(uint64_t size, uint64_t align); void* _Znamm(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } -extern void* _ZnwmSt11align_val_t(size_t size, size_t align); void* _ZnwmSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } -extern void* _ZnamSt11align_val_t(size_t size, size_t align); void* _ZnamSt11align_val_t(size_t size, size_t align) { return rpaligned_alloc(align, size); } -extern void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnwmRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } -extern void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnamRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } -extern void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } -extern void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnamSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +extern void* _ZnwmSt11align_val_t(uint64_t size, uint64_t align); void* _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnamSt11align_val_t(uint64_t size, uint64_t align); void* _ZnamSt11align_val_t(uint64_t size, uint64_t align) { return rpaligned_alloc(align, size); } +extern void* _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); void* _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern void* _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); void* _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } +extern void* _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t 
align, rp_nothrow_t t); void* _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +extern void* _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t); void* _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +// 64-bit operators sized delete and delete[], normal and aligned +extern void _ZdlPvm(void* p, uint64_t size); void _ZdlPvm(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); } +extern void _ZdaPvm(void* p, uint64_t size); void _ZdaPvm(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); } +extern void _ZdlPvSt11align_val_t(void* p, uint64_t align); void _ZdlPvSt11align_val_t(void* p, uint64_t align) { rpfree(p); (void)sizeof(align); } +extern void _ZdaPvSt11align_val_t(void* p, uint64_t align); void _ZdaPvSt11align_val_t(void* p, uint64_t align) { rpfree(p); (void)sizeof(align); } +extern void _ZdlPvmSt11align_val_t(void* p, uint64_t size, uint64_t align); void _ZdlPvmSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); } +extern void _ZdaPvmSt11align_val_t(void* p, uint64_t size, uint64_t align); void _ZdaPvmSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(align); } #else // 32-bit operators new and new[], normal and aligned extern void* _Znwj(uint32_t size); void* _Znwj(uint32_t size) { return rpmalloc(size); } @@ -119,6 +123,14 @@ extern void* _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnwjRKSt9n extern void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); void* _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { (void)sizeof(t); return rpmalloc(size); } extern void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } extern void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t); void* _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, rp_nothrow_t t) { (void)sizeof(t); return rpaligned_alloc(align, size); } +// 32-bit operators sized delete and delete[], normal and aligned +extern void _ZdlPvj(void* p, uint64_t size); void _ZdlPvj(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); } +extern void _ZdaPvj(void* p, uint64_t size); void _ZdaPvj(void* p, uint64_t size) { rpfree(p); (void)sizeof(size); } +extern void _ZdlPvSt11align_val_t(void* p, uint32_t align); void _ZdlPvSt11align_val_t(void* p, uint64_t a) { rpfree(p); (void)sizeof(align); } +extern void _ZdaPvSt11align_val_t(void* p, uint32_t align); void _ZdaPvSt11align_val_t(void* p, uint64_t a) { rpfree(p); (void)sizeof(align); } +extern void _ZdlPvjSt11align_val_t(void* p, uint32_t size, uint32_t align); void _ZdlPvjSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(a); } +extern void _ZdaPvjSt11align_val_t(void* p, uint32_t size, uint32_t align); void _ZdaPvjSt11align_val_t(void* p, uint64_t size, uint64_t align) { rpfree(p); (void)sizeof(size); (void)sizeof(a); } +#endif #endif #endif diff --git a/rpmalloc/rpnew.h b/rpmalloc/rpnew.h new file mode 100644 index 00000000..cdb3d24d --- /dev/null +++ b/rpmalloc/rpnew.h @@ -0,0 +1,111 @@ + +#ifdef __cplusplus + +#include +#include + +#ifdef _WIN32 + +extern void __CRTDECL +operator 
delete(void* p) noexcept { + rpfree(p); +} + +extern void __CRTDECL +operator delete[](void* p) noexcept { + rpfree(p); +} + +extern void* __CRTDECL +operator new(std::size_t size) noexcept(false) { + return rpmalloc(size); +} + +extern void* __CRTDECL +operator new[](std::size_t size) noexcept(false) { + return rpmalloc(size); +} + +extern void* __CRTDECL +operator new(std::size_t size, const std::nothrow_t& tag) noexcept { + (void)sizeof(tag); + return rpmalloc(size); +} + +extern void* __CRTDECL +operator new[](std::size_t size, const std::nothrow_t& tag) noexcept { + (void)sizeof(tag); + return rpmalloc(size); +} + +#if (__cplusplus >= 201402L || _MSC_VER >= 1916) + +extern void __CRTDECL +operator delete(void* p, std::size_t size) noexcept { + (void)sizeof(size); + rpfree(p); +} + +extern void __CRTDECL +operator delete[](void* p, std::size_t size) noexcept { + (void)sizeof(size); + rpfree(p); +} + +#endif + +#if (__cplusplus > 201402L || defined(__cpp_aligned_new)) + +extern void __CRTDECL +operator delete(void* p, std::align_val_t align) noexcept { + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL +operator delete[](void* p, std::align_val_t align) noexcept { + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL +operator delete(void* p, std::size_t size, std::align_val_t align) noexcept { + (void)sizeof(size); + (void)sizeof(align); + rpfree(p); +} + +extern void __CRTDECL +operator delete[](void* p, std::size_t size, std::align_val_t align) noexcept { + (void)sizeof(size); + (void)sizeof(align); + rpfree(p); +} + +extern void* __CRTDECL +operator new(std::size_t size, std::align_val_t align) noexcept(false) { + return rpaligned_alloc(align, size); +} + +extern void* __CRTDECL +operator new[](std::size_t size, std::align_val_t align) noexcept(false) { + return rpaligned_alloc(align, size); +} + +extern void* __CRTDECL +operator new(std::size_t size, std::align_val_t align, const std::nothrow_t& tag) noexcept { + (void)sizeof(tag); + return rpaligned_alloc(align, size); +} + +extern void* __CRTDECL +operator new[](std::size_t size, std::align_val_t align, const std::nothrow_t& tag) noexcept { + (void)sizeof(tag); + return rpaligned_alloc(align, size); +} + +#endif + +#endif + +#endif diff --git a/test/main-override.cc b/test/main-override.cc index 1134d37f..30c40787 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -4,6 +4,7 @@ #endif #include +#include #include #include @@ -47,6 +48,17 @@ test_alloc(void) { return test_fail("usable size invalid (3)"); delete[] static_cast(p); + p = new int[32]; + if (!p) + return test_fail("new[] failed"); + if (rpmalloc_usable_size(p) != 32*sizeof(int)) + return test_fail("usable size invalid (4)"); +#if (__cplusplus >= 201402L || _MSC_VER >= 1916) + ::operator delete[] (static_cast(p), sizeof(int) * 32); +#else + delete[] static_cast(p); +#endif + printf("Allocation tests passed\n"); return 0; } From e639165bf575ad8938fb9146356713e98d81e6ed Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Mon, 3 Aug 2020 18:52:13 +0200 Subject: [PATCH 54/69] Array based caches (#177) --- CHANGELOG | 7 + rpmalloc/malloc.c | 19 +- rpmalloc/rpmalloc.c | 690 +++++++++++++++++++++++--------------------- 3 files changed, 380 insertions(+), 336 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 2e1a037b..c11f6134 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,7 @@ 1.4.1 +Dual license as both released to public domain or under MIT license + Allow up to 4GiB page sizes Fix an issue where large page sizes in conjunction 
with many threads waste a lot of memory (previously @@ -21,6 +23,11 @@ with alignment less or equal to 128 bytes by utilizing natural block alignments Refactor finalization to be compatible with global scope data causing dynamic allocations and frees, like C++ objects with custom ctors/dtors. +Refactor thread and global cache to be array based instead of list based for improved performance +and cache size control. + +Added missing C++ operator overloads with ENABLE_OVERRIDE when using Microsoft C++ runtimes + 1.4.0 diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index b281e1f7..2e1f02a0 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -222,6 +222,11 @@ size_t malloc_size(void* ptr) RPALIAS(rpmalloc_usable_size) #endif +static inline size_t +_rpmalloc_page_size(void) { + return _memory_page_size; +} + extern inline void* RPMALLOC_CDECL reallocarray(void* ptr, size_t count, size_t size) { size_t total; @@ -248,9 +253,10 @@ reallocarray(void* ptr, size_t count, size_t size) { extern inline void* RPMALLOC_CDECL valloc(size_t size) { get_thread_heap(); + const size_t page_size = _rpmalloc_page_size(); if (!size) - size = _memory_page_size; - size_t total_size = size + _memory_page_size; + size = page_size; + size_t total_size = size + page_size; #if ENABLE_VALIDATE_ARGS if (total_size < size) { errno = EINVAL; @@ -258,8 +264,8 @@ valloc(size_t size) { } #endif void* buffer = rpmalloc(total_size); - if ((uintptr_t)buffer & (_memory_page_size - 1)) - return (void*)(((uintptr_t)buffer & ~(_memory_page_size - 1)) + _memory_page_size); + if ((uintptr_t)buffer & (page_size - 1)) + return (void*)(((uintptr_t)buffer & ~(page_size - 1)) + page_size); return buffer; } @@ -267,8 +273,9 @@ extern inline void* RPMALLOC_CDECL pvalloc(size_t size) { get_thread_heap(); size_t aligned_size = size; - if (aligned_size % _memory_page_size) - aligned_size = (1 + (aligned_size / _memory_page_size)) * _memory_page_size; + const size_t page_size = _rpmalloc_page_size(); + if (aligned_size % page_size) + aligned_size = (1 + (aligned_size / page_size)) * page_size; #if ENABLE_VALIDATE_ARGS if (aligned_size < size) { errno = EINVAL; diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index d011bcc4..4b83a0e1 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1,4 +1,4 @@ -/* rpmalloc.c - Memory allocator - Public Domain - 2016 Mattias Jansson +/* rpmalloc.c - Memory allocator - Public Domain - 2016-2020 Mattias Jansson * * This library provides a cross-platform lock free thread caching malloc implementation in C11. * The latest source code is always available at @@ -50,60 +50,43 @@ #define ENABLE_PRELOAD 0 #endif #ifndef DISABLE_UNMAP -//! Disable unmapping memory pages +//! Disable unmapping memory pages (also enables unlimited cache) #define DISABLE_UNMAP 0 #endif -#ifndef DEFAULT_SPAN_MAP_COUNT -//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) -#define DEFAULT_SPAN_MAP_COUNT 64 -#endif - -#if ENABLE_THREAD_CACHE #ifndef ENABLE_UNLIMITED_CACHE -//! Unlimited thread and global cache +//! Enable unlimited global cache (no unmapping until finalization) #define ENABLE_UNLIMITED_CACHE 0 #endif -#ifndef ENABLE_UNLIMITED_THREAD_CACHE -//! Unlimited cache disables any thread cache limitations -#define ENABLE_UNLIMITED_THREAD_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_THREAD_CACHE -#ifndef THREAD_CACHE_MULTIPLIER -//! 
Multiplier for thread cache (cache limit will be span release count multiplied by this value) -#define THREAD_CACHE_MULTIPLIER 16 -#endif #ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive size of per-thread cache (still bounded by THREAD_CACHE_MULTIPLIER hard limit) -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! Default number of spans to map in call to map more virtual memory (default values yield 4MiB here) +#define DEFAULT_SPAN_MAP_COUNT 64 #endif +#ifndef GLOBAL_CACHE_MULTIPLIER +//! Multiplier for global cache +#define GLOBAL_CACHE_MULTIPLIER 8 #endif + +#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE +#error Must use global cache if unmap is disabled #endif -#if ENABLE_GLOBAL_CACHE && ENABLE_THREAD_CACHE #if DISABLE_UNMAP -#undef ENABLE_UNLIMITED_GLOBAL_CACHE -#define ENABLE_UNLIMITED_GLOBAL_CACHE 1 -#endif -#ifndef ENABLE_UNLIMITED_GLOBAL_CACHE -//! Unlimited cache disables any global cache limitations -#define ENABLE_UNLIMITED_GLOBAL_CACHE ENABLE_UNLIMITED_CACHE -#endif -#if !ENABLE_UNLIMITED_GLOBAL_CACHE -//! Multiplier for global cache (cache limit will be span release count multiplied by this value) -#define GLOBAL_CACHE_MULTIPLIER (THREAD_CACHE_MULTIPLIER * 6) -#endif -#else -# undef ENABLE_GLOBAL_CACHE -# define ENABLE_GLOBAL_CACHE 0 +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 1 #endif -#if !ENABLE_THREAD_CACHE || ENABLE_UNLIMITED_THREAD_CACHE -# undef ENABLE_ADAPTIVE_THREAD_CACHE -# define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#if !ENABLE_GLOBAL_CACHE +#undef ENABLE_UNLIMITED_CACHE +#define ENABLE_UNLIMITED_CACHE 0 #endif -#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -# error Must use global cache if unmap is disabled +#if !ENABLE_THREAD_CACHE +#undef ENABLE_ADAPTIVE_THREAD_CACHE +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 #endif #if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) @@ -115,7 +98,7 @@ #endif /// Platform and arch specifics -#if defined(_MSC_VER) && !defined(__clang__) +#if defined(_MSC_VER) # ifndef FORCEINLINE # define FORCEINLINE inline __forceinline # endif @@ -206,13 +189,15 @@ typedef volatile void* atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { *dst = val; } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)InterlockedIncrement(val); } static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)InterlockedDecrement(val); } +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; } +static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return (InterlockedCompareExchange(dst, val, ref) == ref) ? 
1 : 0; } #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; } #endif -static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; } @@ -232,13 +217,15 @@ typedef volatile _Atomic(void*) atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); } +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_release); } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } +static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } +static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); } #if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); } static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } #endif -static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); } @@ -314,6 +301,14 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define HEAP_ORPHAN_ABA_SIZE 512 //! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two) #define SPAN_HEADER_SIZE 128 +//! Number of spans in thread cache +#define MAX_THREAD_SPAN_CACHE 256 +//! Number of spans to transfer between thread and global cache +#define THREAD_SPAN_CACHE_TRANSFER 64 +//! Number of spans in thread cache for large spans (must be greater than LARGE_CLASS_COUNT / 2) +#define MAX_THREAD_SPAN_LARGE_CACHE 64 +//! 
Number of spans to transfer between thread and global cache for large spans +#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6 _Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0, "Small granularity must be power of two"); _Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0, "Span header size must be power of two"); @@ -456,30 +451,42 @@ struct span_t { }; _Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch"); +struct span_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_CACHE]; +}; +typedef struct span_cache_t span_cache_t; + +struct span_large_cache_t { + size_t count; + span_t* span[MAX_THREAD_SPAN_LARGE_CACHE]; +}; +typedef struct span_large_cache_t span_large_cache_t; + +struct heap_size_class_t { + //! Free list of active span + void* free_list; + //! Double linked list of partially used spans with free blocks. + // Previous span pointer in head points to tail span of list. + span_t* partial_span; + //! Early level cache of fully free spans + span_t* cache[2]; +}; +typedef struct heap_size_class_t heap_size_class_t; + // Control structure for a heap, either a thread heap or a first class heap if enabled struct heap_t { //! Owning thread ID uintptr_t owner_thread; - //! Free list of active span - void* free_list[SIZE_CLASS_COUNT]; - //! Double linked list of partially used spans with free blocks for each size class. - // Previous span pointer in head points to tail span of list. - span_t* partial_span[SIZE_CLASS_COUNT]; + //! Free lists for each size class + heap_size_class_t size_class[SIZE_CLASS_COUNT]; #if RPMALLOC_FIRST_CLASS_HEAPS //! Double linked list of fully utilized spans with free blocks for each size class. // Previous span pointer in head points to tail span of list. span_t* full_span[SIZE_CLASS_COUNT]; -#endif -#if ENABLE_THREAD_CACHE - //! List of free spans (single linked list) - span_t* span_cache[LARGE_CLASS_COUNT]; #endif //! List of deferred free spans (single linked list) atomicptr_t span_free_deferred; -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; -#endif #if RPMALLOC_FIRST_CLASS_HEAPS //! Double linked list of large and huge spans allocated by this heap span_t* large_huge_span; @@ -506,6 +513,16 @@ struct heap_t { heap_t* master_heap; //! Child count atomic32_t child_count; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, single span + span_cache_t span_cache; + //! Arrays of fully freed spans, large spans with > 1 span count + span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif #if ENABLE_STATISTICS //! Number of bytes transitioned thread -> global atomic64_t thread_to_global; @@ -528,12 +545,16 @@ struct size_class_t { _Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); struct global_cache_t { - //! Cache list pointer - atomicptr_t cache; - //! Cache size - atomic32_t size; - //! ABA counter - atomic32_t counter; + //! Cache lock + atomic32_t lock; + //! Cache count + size_t count; + //! Cached spans + span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; +#if ENABLE_UNLIMITED_CACHE + //! Unlimited cache overflow + span_t* overflow; +#endif }; //////////// @@ -542,6 +563,11 @@ struct global_cache_t { /// ////// +//! 
Default span size (64KiB) +#define _memory_default_span_size (64 * 1024) +#define _memory_default_span_size_shift 16 +#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) + //! Initialized flag static int _rpmalloc_initialized; //! Configuration @@ -560,10 +586,10 @@ static size_t _memory_span_size_shift; //! Mask to get to start of a memory span static uintptr_t _memory_span_mask; #else -//! Hardwired span size (64KiB) -#define _memory_span_size (64 * 1024) -#define _memory_span_size_shift 16 -#define _memory_span_mask (~((uintptr_t)(_memory_span_size - 1))) +//! Hardwired span size +#define _memory_span_size _memory_default_span_size +#define _memory_span_size_shift _memory_default_span_size_shift +#define _memory_span_mask _memory_default_span_mask #endif //! Number of spans to map in each map call static size_t _memory_span_map_count; @@ -827,76 +853,6 @@ _rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { /// ////// -#if ENABLE_THREAD_CACHE - -static void -_rpmalloc_span_unmap(span_t* span); - -//! Unmap a single linked list of spans -static void -_rpmalloc_span_list_unmap_all(span_t* span) { - size_t list_size = span->list_size; - for (size_t ispan = 0; ispan < list_size; ++ispan) { - span_t* next_span = span->next; - _rpmalloc_span_unmap(span); - span = next_span; - } - assert(!span); -} - -//! Add span to head of single linked span list -static size_t -_rpmalloc_span_list_push(span_t** head, span_t* span) { - span->next = *head; - if (*head) - span->list_size = (*head)->list_size + 1; - else - span->list_size = 1; - *head = span; - return span->list_size; -} - -//! Remove span from head of single linked span list, returns the new list head -static span_t* -_rpmalloc_span_list_pop(span_t** head) { - span_t* span = *head; - span_t* next_span = 0; - if (span->list_size > 1) { - assert(span->next); - next_span = span->next; - assert(next_span); - next_span->list_size = span->list_size - 1; - } - *head = next_span; - return span; -} - -//! Split a single linked span list -static span_t* -_rpmalloc_span_list_split(span_t* span, size_t limit) { - span_t* next = 0; - if (limit < 2) - limit = 2; - if (span->list_size > limit) { - uint32_t list_size = 1; - span_t* last = span; - next = span->next; - while (list_size < limit) { - last = next; - next = next->next; - ++list_size; - } - last->next = 0; - assert(next); - next->list_size = span->list_size - list_size; - span->list_size = list_size; - span->prev = 0; - } - return next; -} - -#endif - //! 
Add a span to double linked list at the head static void _rpmalloc_span_double_link_list_add(span_t** head, span_t* span) { @@ -1075,10 +1031,19 @@ _rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS atomic_decr32(&heap->span_use[0].current); #endif - _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); - _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - _rpmalloc_heap_cache_insert(heap, span); + if (!heap->finalize) { + _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); + _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); + if (heap->size_class[span->size_class].cache[0]) { + if (heap->size_class[span->size_class].cache[1]) + _rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache[1]); + heap->size_class[span->size_class].cache[1] = heap->size_class[span->size_class].cache[0]; + } + heap->size_class[span->size_class].cache[0] = span; + } else { + _rpmalloc_span_unmap(span); + } } //! Initialize a (partial) free list up to next system memory page, while reserving the first block @@ -1129,11 +1094,11 @@ _rpmalloc_span_initialize_new(heap_t* heap, span_t* span, uint32_t class_idx) { //Setup free list. Only initialize one system page worth of free blocks in list void* block; - span->free_list_limit = free_list_partial_init(&heap->free_list[class_idx], &block, + span->free_list_limit = free_list_partial_init(&heap->size_class[class_idx].free_list, &block, span, pointer_offset(span, SPAN_HEADER_SIZE), size_class->block_count, size_class->block_size); //Link span as partial if there remains blocks to be initialized as free list, or full if fully initialized if (span->free_list_limit < span->block_count) { - _rpmalloc_span_double_link_list_add(&heap->partial_span[class_idx], span); + _rpmalloc_span_double_link_list_add(&heap->size_class[class_idx].partial_span, span); span->used_count = span->free_list_limit; } else { #if RPMALLOC_FIRST_CLASS_HEAPS @@ -1165,7 +1130,8 @@ _rpmalloc_span_is_fully_utilized(span_t* span) { static int _rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list_head) { - span_t* class_span = (span_t*)((uintptr_t)heap->free_list[iclass] & _memory_span_mask); + void* free_list = heap->size_class[iclass].free_list; + span_t* class_span = (span_t*)((uintptr_t)free_list & _memory_span_mask); if (span == class_span) { // Adopt the heap class free list back into the span free list void* block = span->free_list; @@ -1175,17 +1141,17 @@ _rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list block = *((void**)block); } uint32_t free_count = 0; - block = heap->free_list[iclass]; + block = free_list; while (block) { ++free_count; block = *((void**)block); } if (last_block) { - *((void**)last_block) = heap->free_list[iclass]; + *((void**)last_block) = free_list; } else { - span->free_list = heap->free_list[iclass]; + span->free_list = free_list; } - heap->free_list[iclass] = 0; + heap->size_class[iclass].free_list = 0; span->used_count -= free_count; } //If this assert triggers you have memory leaks @@ -1211,88 +1177,82 @@ _rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list #if ENABLE_GLOBAL_CACHE -//! 
Insert the given list of memory page spans in the global cache -static void -_rpmalloc_global_cache_insert(global_cache_t* cache, span_t* span, size_t cache_limit) { - assert((span->list_size == 1) || (span->next != 0)); - int32_t list_size = (int32_t)span->list_size; - //Unmap if cache has reached the limit. Does not need stronger synchronization, the worst - //case is that the span list is unmapped when it could have been cached (no real dependency - //between the two variables) - if (atomic_add32(&cache->size, list_size) > (int32_t)cache_limit) { -#if !ENABLE_UNLIMITED_GLOBAL_CACHE - _rpmalloc_span_list_unmap_all(span); - atomic_add32(&cache->size, -list_size); - return; -#endif - } - void* current_cache, *new_cache; - do { - current_cache = atomic_load_ptr(&cache->cache); - span->prev = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - new_cache = (void*)((uintptr_t)span | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - } while (!atomic_cas_ptr(&cache->cache, new_cache, current_cache)); -} - -//! Extract a number of memory page spans from the global cache -static span_t* -_rpmalloc_global_cache_extract(global_cache_t* cache) { - uintptr_t span_ptr; - do { - void* global_span = atomic_load_ptr(&cache->cache); - span_ptr = (uintptr_t)global_span & _memory_span_mask; - if (span_ptr) { - span_t* span = (span_t*)span_ptr; - //By accessing the span ptr before it is swapped out of list we assume that a contending thread - //does not manage to traverse the span to being unmapped before we access it - void* new_cache = (void*)((uintptr_t)span->prev | ((uintptr_t)atomic_incr32(&cache->counter) & ~_memory_span_mask)); - if (atomic_cas_ptr(&cache->cache, new_cache, global_span)) { - atomic_add32(&cache->size, -(int32_t)span->list_size); - return span; - } - } - } while (span_ptr); - return 0; -} - //! Finalize a global cache, only valid from allocator finalization (not thread safe) static void _rpmalloc_global_cache_finalize(global_cache_t* cache) { - void* current_cache = atomic_load_ptr(&cache->cache); - span_t* span = (span_t*)((uintptr_t)current_cache & _memory_span_mask); - while (span) { - span_t* skip_span = (span_t*)((uintptr_t)span->prev & _memory_span_mask); - atomic_add32(&cache->size, -(int32_t)span->list_size); - _rpmalloc_span_list_unmap_all(span); - span = skip_span; + for (size_t ispan = 0; ispan < cache->count; ++ispan) + _rpmalloc_span_unmap(cache->span[ispan]); + cache->count = 0; + +#if ENABLE_UNLIMITED_CACHE + while (cache->overflow) { + span_t* span = cache->overflow; + cache->overflow = span->next; + _rpmalloc_span_unmap(span); } - assert(!atomic_load32(&cache->size)); - atomic_store_ptr(&cache->cache, 0); - atomic_store32(&cache->size, 0); +#endif + + atomic_store32_release(&cache->lock, 0); } -//! Insert the given list of memory page spans in the global cache static void -_rpmalloc_global_cache_insert_span_list(span_t* span) { - size_t span_count = span->span_count; -#if ENABLE_UNLIMITED_GLOBAL_CACHE - _rpmalloc_global_cache_insert(&_memory_span_cache[span_count - 1], span, 0); +_rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t count) { + const size_t cache_limit = (span_count == 1) ? 
+ GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE : + GLOBAL_CACHE_MULTIPLIER * (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + + global_cache_t* cache = &_memory_span_cache[span_count - 1]; + + size_t insert_count = count; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + /* Spin */; + + if ((cache->count + insert_count) > cache_limit) + insert_count = cache_limit - cache->count; + + memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count); + cache->count += insert_count; + +#if ENABLE_UNLIMITED_CACHE + while (insert_count < count) { + span_t* current_span = span[insert_count++]; + current_span->next = cache->overflow; + cache->overflow = current_span; + } + atomic_store32_release(&cache->lock, 0); #else - const size_t cache_limit = (GLOBAL_CACHE_MULTIPLIER * ((span_count == 1) ? _memory_span_release_count : _memory_span_release_count_large)); - _rpmalloc_global_cache_insert(&_memory_span_cache[span_count - 1], span, cache_limit); + atomic_store32_release(&cache->lock, 0); + for (size_t ispan = insert_count; ispan < count; ++ispan) + _rpmalloc_span_unmap(span[ispan]); #endif } -//! Extract a number of memory page spans from the global cache for large blocks -static span_t* -_rpmalloc_global_cache_extract_span_list(size_t span_count) { - span_t* span = _rpmalloc_global_cache_extract(&_memory_span_cache[span_count - 1]); - assert(!span || (span->span_count == span_count)); - return span; -} +static size_t +_rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t count) { + global_cache_t* cache = &_memory_span_cache[span_count - 1]; + + size_t extract_count = count; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + /* Spin */; + if (extract_count > cache->count) + extract_count = cache->count; + + memcpy(span, cache->span + (cache->count - extract_count), sizeof(span_t*) * extract_count); + cache->count -= extract_count; +#if ENABLE_UNLIMITED_CACHE + while ((extract_count < count) && cache->overflow) { + span_t* current_span = cache->overflow; + span[extract_count++] = current_span; + cache->overflow = current_span->next; + } #endif + atomic_store32_release(&cache->lock, 0); + return extract_count; +} + +#endif //////////// /// @@ -1383,10 +1343,14 @@ _rpmalloc_heap_global_finalize(heap_t* heap) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_t* span = heap->span_cache[iclass]; - heap->span_cache[iclass] = 0; - if (span) - _rpmalloc_span_list_unmap_all(span); + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; } #endif @@ -1396,7 +1360,7 @@ _rpmalloc_heap_global_finalize(heap_t* heap) { } for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->free_list[iclass] || heap->partial_span[iclass]) { + if (heap->size_class[iclass].free_list || heap->size_class[iclass].partial_span) { --heap->finalize; return; } @@ -1412,7 +1376,7 @@ _rpmalloc_heap_global_finalize(heap_t* heap) { list_heap->next_heap = heap->next_heap; } - _rpmalloc_heap_unmap( heap ); + _rpmalloc_heap_unmap(heap); } //! 
Insert a single span into thread heap cache, releasing to global cache if overflow @@ -1425,37 +1389,42 @@ _rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) { } #if ENABLE_THREAD_CACHE size_t span_count = span->span_count; - size_t idx = span_count - 1; - _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_cache); -#if ENABLE_UNLIMITED_THREAD_CACHE - _rpmalloc_span_list_push(&heap->span_cache[idx], span); -#else - const size_t release_count = (!idx ? _memory_span_release_count : _memory_span_release_count_large); - size_t current_cache_size = _rpmalloc_span_list_push(&heap->span_cache[idx], span); - if (current_cache_size <= release_count) - return; - const size_t hard_limit = release_count * THREAD_CACHE_MULTIPLIER; - if (current_cache_size <= hard_limit) { -#if ENABLE_ADAPTIVE_THREAD_CACHE - //Require 25% of high water mark to remain in cache (and at least 1, if use is 0) - const size_t high_mark = heap->span_use[idx].high; - const size_t min_limit = (high_mark >> 2) + release_count + 1; - if (current_cache_size < min_limit) - return; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t* span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, THREAD_SPAN_CACHE_TRANSFER); #else - return; + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); #endif - } - heap->span_cache[idx] = _rpmalloc_span_list_split(span, release_count); - assert(span->list_size == release_count); + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t* span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit ? 
THREAD_SPAN_LARGE_CACHE_TRANSFER : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; #if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[idx].spans_to_global, span->list_size); - _rpmalloc_global_cache_insert_span_list(span); + _rpmalloc_stat_add64(&heap->thread_to_global, transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, span_count, transfer_count); #else - _rpmalloc_span_list_unmap_all(span); -#endif + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); #endif + span_cache->count = remain_count; + } + } #else (void)sizeof(heap); _rpmalloc_span_unmap(span); @@ -1466,13 +1435,20 @@ _rpmalloc_heap_cache_insert(heap_t* heap, span_t* span) { static span_t* _rpmalloc_heap_thread_cache_extract(heap_t* heap, size_t span_count) { span_t* span = 0; - size_t idx = span_count - 1; - if (!idx) + if (span_count == 1) { _rpmalloc_heap_cache_adopt_deferred(heap, &span); + if (span) + return span; + } #if ENABLE_THREAD_CACHE - if (!span && heap->span_cache[idx]) { - _rpmalloc_stat_inc(&heap->span_use[idx].spans_from_cache); - span = _rpmalloc_span_list_pop(&heap->span_cache[idx]); + span_cache_t* span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; } #endif return span; @@ -1489,13 +1465,31 @@ _rpmalloc_heap_reserved_extract(heap_t* heap, size_t span_count) { static span_t* _rpmalloc_heap_global_cache_extract(heap_t* heap, size_t span_count) { #if ENABLE_GLOBAL_CACHE - size_t idx = span_count - 1; - heap->span_cache[idx] = _rpmalloc_global_cache_extract_span_list(span_count); - if (heap->span_cache[idx]) { - _rpmalloc_stat_add64(&heap->global_to_thread, (size_t)heap->span_cache[idx]->list_size * span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[idx].spans_from_global, heap->span_cache[idx]->list_size); - return _rpmalloc_span_list_pop(&heap->span_cache[idx]); +#if ENABLE_THREAD_CACHE + span_cache_t* span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t*)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans(span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, span_cache->count); + return span_cache->span[--span_cache->count]; } +#else + span_t* span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, count); + return span; + } +#endif #endif (void)sizeof(heap); (void)sizeof(span_count); @@ -1505,7 +1499,7 @@ _rpmalloc_heap_global_cache_extract(heap_t* heap, size_t 
span_count) { //! Get a span from one of the cache levels (thread cache, reserved, global cache) or fallback to mapping more memory static span_t* _rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_idx) { - (void)sizeof(class_idx); + span_t* span; #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS uint32_t idx = (uint32_t)span_count - 1; uint32_t current_count = (uint32_t)atomic_incr32(&heap->span_use[idx].current); @@ -1513,7 +1507,26 @@ _rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_ atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, heap->size_class_use[class_idx].spans_peak); #endif - span_t* span = _rpmalloc_heap_thread_cache_extract(heap, span_count); +#if ENABLE_THREAD_CACHE + if (class_idx < SIZE_CLASS_COUNT) { + if (heap->size_class[class_idx].cache[0]) { + span = heap->size_class[class_idx].cache[0]; + span_t* new_cache = 0; + if (heap->span_cache.count) + new_cache = heap->span_cache.span[--heap->span_cache.count]; + if (heap->size_class[class_idx].cache[1]) { + heap->size_class[class_idx].cache[0] = heap->size_class[class_idx].cache[1]; + heap->size_class[class_idx].cache[1] = new_cache; + } else { + heap->size_class[class_idx].cache[0] = new_cache; + } + return span; + } + } +#else + (void)sizeof(class_idx); +#endif + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); if (EXPECTED(span != 0)) { _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache); return span; @@ -1642,26 +1655,27 @@ _rpmalloc_heap_release(void* heapptr, int first_class) { _rpmalloc_heap_cache_adopt_deferred(heap, 0); #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_t* span = heap->span_cache[iclass]; - heap->span_cache[iclass] = 0; - if (span && heap->finalize) { - _rpmalloc_span_list_unmap_all(span); + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) continue; - } + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { #if ENABLE_GLOBAL_CACHE - while (span) { - assert(span->span_count == (iclass + 1)); - size_t release_count = (!iclass ? 
_memory_span_release_count : _memory_span_release_count_large); - span_t* next = _rpmalloc_span_list_split(span, (uint32_t)release_count); - _rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span->list_size); - _rpmalloc_global_cache_insert_span_list(span); - span = next; - } + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); #else - if (span) - _rpmalloc_span_list_unmap_all(span); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); #endif + } + span_cache->count = 0; } #endif @@ -1692,15 +1706,21 @@ _rpmalloc_heap_finalize(heap_t* heap) { _rpmalloc_heap_cache_adopt_deferred(heap, 0); for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - span_t* span = heap->partial_span[iclass]; + if (heap->size_class[iclass].cache[0]) + _rpmalloc_span_unmap(heap->size_class[iclass].cache[0]); + if (heap->size_class[iclass].cache[1]) + _rpmalloc_span_unmap(heap->size_class[iclass].cache[1]); + heap->size_class[iclass].cache[0] = 0; + heap->size_class[iclass].cache[1] = 0; + span_t* span = heap->size_class[iclass].partial_span; while (span) { span_t* next = span->next; - _rpmalloc_span_finalize(heap, iclass, span, &heap->partial_span[iclass]); + _rpmalloc_span_finalize(heap, iclass, span, &heap->size_class[iclass].partial_span); span = next; } // If class still has a free list it must be a full span - if (heap->free_list[iclass]) { - span_t* class_span = (span_t*)((uintptr_t)heap->free_list[iclass] & _memory_span_mask); + if (heap->size_class[iclass].free_list) { + span_t* class_span = (span_t*)((uintptr_t)heap->size_class[iclass].free_list & _memory_span_mask); span_t** list = 0; #if RPMALLOC_FIRST_CLASS_HEAPS list = &heap->full_span[iclass]; @@ -1709,17 +1729,21 @@ _rpmalloc_heap_finalize(heap_t* heap) { if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { if (list) _rpmalloc_span_double_link_list_remove(list, class_span); - _rpmalloc_span_double_link_list_add(&heap->partial_span[iclass], class_span); + _rpmalloc_span_double_link_list_add(&heap->size_class[iclass].partial_span, class_span); } } } #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) { - _rpmalloc_span_list_unmap_all(heap->span_cache[iclass]); - heap->span_cache[iclass] = 0; - } + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; } #endif assert(!atomic_load_ptr(&heap->span_free_deferred)); @@ -1743,20 +1767,20 @@ free_list_pop(void** list) { //! 
Allocate a small/medium sized memory block from the given heap static void* _rpmalloc_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { - span_t* span = heap->partial_span[class_idx]; + span_t* span = heap->size_class[class_idx].partial_span; if (EXPECTED(span != 0)) { assert(span->block_count == _memory_size_class[span->size_class].block_count); assert(!_rpmalloc_span_is_fully_utilized(span)); void* block; if (span->free_list) { //Swap in free list if not empty - heap->free_list[class_idx] = span->free_list; + heap->size_class[class_idx].free_list = span->free_list; span->free_list = 0; - block = free_list_pop(&heap->free_list[class_idx]); + block = free_list_pop(&heap->size_class[class_idx].free_list); } else { //If the span did not fully initialize free list, link up another page worth of blocks void* block_start = pointer_offset(span, SPAN_HEADER_SIZE + ((size_t)span->free_list_limit * span->block_size)); - span->free_list_limit += free_list_partial_init(&heap->free_list[class_idx], &block, + span->free_list_limit += free_list_partial_init(&heap->size_class[class_idx].free_list, &block, (void*)((uintptr_t)block_start & ~(_memory_page_size - 1)), block_start, span->block_count - span->free_list_limit, span->block_size); } @@ -1772,7 +1796,7 @@ _rpmalloc_allocate_from_heap_fallback(heap_t* heap, uint32_t class_idx) { return block; //The span is fully utilized, unlink from partial list and add to fully utilized list - _rpmalloc_span_double_link_list_pop_head(&heap->partial_span[class_idx], span); + _rpmalloc_span_double_link_list_pop_head(&heap->size_class[class_idx].partial_span, span); #if RPMALLOC_FIRST_CLASS_HEAPS _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); #endif @@ -1797,8 +1821,8 @@ _rpmalloc_allocate_small(heap_t* heap, size_t size) { //Small sizes have unique size classes const uint32_t class_idx = (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); _rpmalloc_stat_inc_alloc(heap, class_idx); - if (EXPECTED(heap->free_list[class_idx] != 0)) - return free_list_pop(&heap->free_list[class_idx]); + if (EXPECTED(heap->size_class[class_idx].free_list != 0)) + return free_list_pop(&heap->size_class[class_idx].free_list); return _rpmalloc_allocate_from_heap_fallback(heap, class_idx); } @@ -1810,8 +1834,8 @@ _rpmalloc_allocate_medium(heap_t* heap, size_t size) { const uint32_t base_idx = (uint32_t)(SMALL_CLASS_COUNT + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); const uint32_t class_idx = _memory_size_class[base_idx].class_idx; _rpmalloc_stat_inc_alloc(heap, class_idx); - if (EXPECTED(heap->free_list[class_idx] != 0)) - return free_list_pop(&heap->free_list[class_idx]); + if (EXPECTED(heap->size_class[class_idx].free_list != 0)) + return free_list_pop(&heap->size_class[class_idx].free_list); return _rpmalloc_allocate_from_heap_fallback(heap, class_idx); } @@ -2019,14 +2043,14 @@ _rpmalloc_deallocate_direct_small_or_medium(span_t* span, void* block) { #if RPMALLOC_FIRST_CLASS_HEAPS _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], span); #endif - _rpmalloc_span_double_link_list_add(&heap->partial_span[span->size_class], span); + _rpmalloc_span_double_link_list_add(&heap->size_class[span->size_class].partial_span, span); --heap->full_span_count; } --span->used_count; *((void**)block) = span->free_list; span->free_list = block; if (UNEXPECTED(span->used_count == span->list_size)) { - _rpmalloc_span_double_link_list_remove(&heap->partial_span[span->size_class], span); + 
_rpmalloc_span_double_link_list_remove(&heap->size_class[span->size_class].partial_span, span); _rpmalloc_span_release_to_cache(heap, span); } } @@ -2465,18 +2489,22 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_page_size = ((size_t)1 << _memory_page_size_shift); #if RPMALLOC_CONFIGURABLE - size_t span_size = _memory_config.span_size; - if (!span_size) - span_size = (64 * 1024); - if (span_size > (256 * 1024)) - span_size = (256 * 1024); - _memory_span_size = 4096; - _memory_span_size_shift = 12; - while (_memory_span_size < span_size) { - _memory_span_size <<= 1; - ++_memory_span_size_shift; + if (!_memory_config.span_size) { + _memory_span_size = _memory_default_span_size; + _memory_span_size_shift = _memory_default_span_size_shift; + _memory_span_mask = _memory_default_span_mask; + } else { + size_t span_size = _memory_config.span_size; + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); } - _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); #endif _memory_span_map_count = ( _memory_config.span_map_count ? _memory_config.span_map_count : DEFAULT_SPAN_MAP_COUNT); @@ -2746,7 +2774,7 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { size_class_t* size_class = _memory_size_class + iclass; - span_t* span = heap->partial_span[iclass]; + span_t* span = heap->size_class[iclass].partial_span; while (span) { size_t free_count = span->list_size; size_t block_count = size_class->block_count; @@ -2760,8 +2788,12 @@ rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (heap->span_cache[iclass]) - stats->spancache = (size_t)heap->span_cache[iclass]->list_size * (iclass + 1) * _memory_span_size; + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + stats->spancache = span_cache->count * (iclass + 1) * _memory_span_size; } #endif @@ -2812,9 +2844,8 @@ rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats) { stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; #endif #if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - stats->cached += (size_t)atomic_load32(&_memory_span_cache[iclass].size) * (iclass + 1) * _memory_span_size; - } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + stats->cached += _memory_span_cache[iclass].count * (iclass + 1) * _memory_span_size; #endif } @@ -2851,7 +2882,7 @@ _memory_heap_dump_statistics(heap_t* heap, void* file) { atomic_load32(&heap->span_use[iclass].high), ((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), #if ENABLE_THREAD_CACHE - heap->span_cache[iclass] ? heap->span_cache[iclass]->list_size : 0, + (unsigned int)(iclass ? 
heap->span_cache.count : heap->span_large_cache[iclass - 1].count), ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), #else @@ -3032,13 +3063,13 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { _rpmalloc_heap_cache_adopt_deferred(heap, 0); for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - span = heap->partial_span[iclass]; + span = heap->size_class[iclass].partial_span; while (span) { next_span = span->next; _rpmalloc_heap_cache_insert(heap, span); span = next_span; } - heap->partial_span[iclass] = 0; + heap->size_class[iclass].partial_span = 0; span = heap->full_span[iclass]; while (span) { next_span = span->next; @@ -3046,8 +3077,7 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { span = next_span; } } - memset(heap->free_list, 0, sizeof(heap->free_list)); - memset(heap->partial_span, 0, sizeof(heap->partial_span)); + memset(heap->size_class, 0, sizeof(heap->size_class)); memset(heap->full_span, 0, sizeof(heap->full_span)); span = heap->large_huge_span; @@ -3064,22 +3094,22 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { #if ENABLE_THREAD_CACHE for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span = heap->span_cache[iclass]; + span_cache_t* span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; #if ENABLE_GLOBAL_CACHE - while (span) { - assert(span->span_count == (iclass + 1)); - size_t release_count = (!iclass ? _memory_span_release_count : _memory_span_release_count_large); - next_span = _rpmalloc_span_list_split(span, (uint32_t)release_count); - _rpmalloc_stat_add64(&heap->thread_to_global, (size_t)span->list_size * span->span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span->list_size); - _rpmalloc_global_cache_insert_span_list(span); - span = next_span; - } + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); #else - if (span) - _rpmalloc_span_list_unmap_all(span); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); #endif - heap->span_cache[iclass] = 0; + span_cache->count = 0; } #endif From 4ad9ca0b310296c26cbae233e10bc8831c34d0e7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 08:05:27 +0200 Subject: [PATCH 55/69] fix test compilation --- test/main-override.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/test/main-override.cc b/test/main-override.cc index 30c40787..fa84b637 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -53,11 +53,7 @@ test_alloc(void) { return test_fail("new[] failed"); if (rpmalloc_usable_size(p) != 32*sizeof(int)) return test_fail("usable size invalid (4)"); -#if (__cplusplus >= 201402L || _MSC_VER >= 1916) - ::operator delete[] (static_cast(p), sizeof(int) * 32); -#else delete[] static_cast(p); -#endif printf("Allocation tests passed\n"); return 0; From b54eacb778cbad012e189869d3d496dc2dbcfc7a Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 09:58:42 +0200 Subject: [PATCH 56/69] fix statistics class count --- 
rpmalloc/rpmalloc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index bfbdbdea..03b6c054 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -111,7 +111,7 @@ typedef struct rpmalloc_thread_statistics_t { size_t from_reserved; //! Number of raw memory map calls (not hitting the reserve spans but resulting in actual OS mmap calls) size_t map_calls; - } span_use[32]; + } span_use[64]; //! Per size class statistics (only if ENABLE_STATISTICS=1) struct { //! Current number of allocations From 185d8e7b94e504c1028fdcdfead67be01b6dc224 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 09:59:15 +0200 Subject: [PATCH 57/69] fix statistics collection --- rpmalloc/rpmalloc.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 4b83a0e1..c29b3c75 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -705,11 +705,11 @@ get_thread_id(void) { # elif defined(__aarch64__) __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tid)); # else - tid = (uintptr_t)get_thread_heap_raw(); + tid = (uintptr_t)((void*)get_thread_heap_raw()); # endif return tid; #else - return (uintptr_t)get_thread_heap_raw(); + return (uintptr_t)((void*)get_thread_heap_raw()); #endif } @@ -1177,9 +1177,12 @@ _rpmalloc_span_finalize(heap_t* heap, size_t iclass, span_t* span, span_t** list #if ENABLE_GLOBAL_CACHE -//! Finalize a global cache, only valid from allocator finalization (not thread safe) +//! Finalize a global cache static void _rpmalloc_global_cache_finalize(global_cache_t* cache) { + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + /* Spin */; + for (size_t ispan = 0; ispan < cache->count; ++ispan) _rpmalloc_span_unmap(cache->span[ispan]); cache->count = 0; @@ -2882,7 +2885,7 @@ _memory_heap_dump_statistics(heap_t* heap, void* file) { atomic_load32(&heap->span_use[iclass].high), ((size_t)atomic_load32(&heap->span_use[iclass].high) * (size_t)_memory_span_size * (iclass + 1)) / (size_t)(1024 * 1024), #if ENABLE_THREAD_CACHE - (unsigned int)(iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count), + (unsigned int)(!iclass ? heap->span_cache.count : heap->span_large_cache[iclass - 1].count), ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), #else From 6236183ee719db8c9a8eac5e645ab08cccb96a75 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 10:28:52 +0200 Subject: [PATCH 58/69] structure padding and reshuffling --- rpmalloc/rpmalloc.c | 35 ++++++++++++++++++----------------- rpmalloc/rpmalloc.h | 1 + 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index c29b3c75..9eb0b1e5 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -401,6 +401,7 @@ struct size_class_use_t { atomic32_t spans_from_reserved; //! Number of spans mapped atomic32_t spans_map_calls; + int32_t unused; }; typedef struct size_class_use_t size_class_use_t; #endif @@ -480,17 +481,12 @@ struct heap_t { uintptr_t owner_thread; //! Free lists for each size class heap_size_class_t size_class[SIZE_CLASS_COUNT]; -#if RPMALLOC_FIRST_CLASS_HEAPS - //! Double linked list of fully utilized spans with free blocks for each size class. - // Previous span pointer in head points to tail span of list. 
- span_t* full_span[SIZE_CLASS_COUNT]; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, single span + span_cache_t span_cache; #endif //! List of deferred free spans (single linked list) atomicptr_t span_free_deferred; -#if RPMALLOC_FIRST_CLASS_HEAPS - //! Double linked list of large and huge spans allocated by this heap - span_t* large_huge_span; -#endif //! Number of full spans size_t full_span_count; //! Mapped but unused spans @@ -498,7 +494,9 @@ struct heap_t { //! Master span for mapped but unused spans span_t* span_reserve_master; //! Number of mapped but unused spans - size_t spans_reserved; + uint32_t spans_reserved; + //! Child count + atomic32_t child_count; //! Next heap in id list heap_t* next_heap; //! Next heap in orphan list @@ -511,25 +509,28 @@ struct heap_t { int finalize; //! Master heap owning the memory pages heap_t* master_heap; - //! Child count - atomic32_t child_count; #if ENABLE_THREAD_CACHE - //! Arrays of fully freed spans, single span - span_cache_t span_cache; //! Arrays of fully freed spans, large spans with > 1 span count span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; #endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size class. + // Previous span pointer in head points to tail span of list. + span_t* full_span[SIZE_CLASS_COUNT]; + //! Double linked list of large and huge spans allocated by this heap + span_t* large_huge_span; +#endif #if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS //! Current and high water mark of spans used per span count span_use_t span_use[LARGE_CLASS_COUNT]; #endif #if ENABLE_STATISTICS + //! Allocation stats per size class + size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; //! Number of bytes transitioned thread -> global atomic64_t thread_to_global; //! Number of bytes transitioned global -> thread atomic64_t global_to_thread; - //! Allocation stats per size class - size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; #endif }; @@ -548,7 +549,7 @@ struct global_cache_t { //! Cache lock atomic32_t lock; //! Cache count - size_t count; + uint32_t count; //! Cached spans span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; #if ENABLE_UNLIMITED_CACHE @@ -1270,7 +1271,7 @@ static void _rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, size_t reserve_span_count) { heap->span_reserve_master = master; heap->span_reserve = reserve; - heap->spans_reserved = reserve_span_count; + heap->spans_reserved = (uint32_t)reserve_span_count; } //! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use diff --git a/rpmalloc/rpmalloc.h b/rpmalloc/rpmalloc.h index 03b6c054..6b85c0af 100644 --- a/rpmalloc/rpmalloc.h +++ b/rpmalloc/rpmalloc.h @@ -175,6 +175,7 @@ typedef struct rpmalloc_config_t { // For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt int enable_huge_pages; + int unused; } rpmalloc_config_t; //! 
Initialize allocator with default configuration From e741d892408ba5d933e9b6b61380e49c659d53ff Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 08:55:11 +0200 Subject: [PATCH 59/69] clang windows compatibility --- build/ninja/clang.py | 4 ++-- rpmalloc/rpmalloc.c | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/build/ninja/clang.py b/build/ninja/clang.py index bae4417d..bd4f8217 100644 --- a/build/ninja/clang.py +++ b/build/ninja/clang.py @@ -51,7 +51,7 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths '-fomit-frame-pointer', '-fno-math-errno','-ffinite-math-only', '-funsafe-math-optimizations', '-fno-trapping-math', '-ffast-math'] self.cwarnflags = ['-W', '-Werror', '-pedantic', '-Wall', '-Weverything', - '-Wno-padded', '-Wno-documentation-unknown-command', + '-Wno-c++98-compat', '-Wno-padded', '-Wno-documentation-unknown-command', '-Wno-implicit-fallthrough', '-Wno-static-in-inline', '-Wno-reserved-id-macro'] self.cmoreflags = [] self.mflags = [] @@ -99,7 +99,7 @@ def initialize(self, project, archs, configs, includepaths, dependlibs, libpaths if self.target.is_macos() or self.target.is_ios(): self.cxxflags += ['-std=c++14', '-stdlib=libc++'] else: - self.cxxflags += ['-std=gnu++14'] + self.cxxflags += ['-std=c++14'] #Overrides self.objext = '.o' diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 9eb0b1e5..dbed7966 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -89,7 +89,7 @@ #define ENABLE_ADAPTIVE_THREAD_CACHE 0 #endif -#if defined( _WIN32 ) || defined( __WIN32__ ) || defined( _WIN64 ) +#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) # define PLATFORM_WINDOWS 1 # define PLATFORM_POSIX 0 #else @@ -98,7 +98,7 @@ #endif /// Platform and arch specifics -#if defined(_MSC_VER) +#if defined(_MSC_VER) && !defined(__clang__) # ifndef FORCEINLINE # define FORCEINLINE inline __forceinline # endif @@ -924,7 +924,7 @@ _rpmalloc_span_map_from_reserve(heap_t* heap, size_t span_count) { //Update the heap span reserve span_t* span = heap->span_reserve; heap->span_reserve = (span_t*)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= span_count; + heap->spans_reserved -= (uint32_t)span_count; _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, span_count); if (span_count <= LARGE_CLASS_COUNT) @@ -1215,7 +1215,7 @@ _rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t cou insert_count = cache_limit - cache->count; memcpy(cache->span + cache->count, span, sizeof(span_t*) * insert_count); - cache->count += insert_count; + cache->count += (uint32_t)insert_count; #if ENABLE_UNLIMITED_CACHE while (insert_count < count) { @@ -1243,7 +1243,7 @@ _rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t co extract_count = cache->count; memcpy(span, cache->span + (cache->count - extract_count), sizeof(span_t*) * extract_count); - cache->count -= extract_count; + cache->count -= (uint32_t)extract_count; #if ENABLE_UNLIMITED_CACHE while ((extract_count < count) && cache->overflow) { span_t* current_span = cache->overflow; From 84b1519187309a878e14f453072ccad8696e2b1a Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 09:31:50 +0200 Subject: [PATCH 60/69] cleanup macro use --- rpmalloc/rpmalloc.c | 17 +++++++++++------ test/main.c | 5 +++++ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index dbed7966..a59fbfce 100644 --- 
a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -17,6 +17,14 @@ /// ////// +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#elif defined(__GCC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + #ifndef HEAP_ARRAY_SIZE //! Size of heap hashmap #define HEAP_ARRAY_SIZE 47 @@ -189,15 +197,13 @@ typedef volatile void* atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return *src; } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { *dst = val; } -static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { *dst = val; } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return (int32_t)InterlockedIncrement(val); } static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return (int32_t)InterlockedDecrement(val); } static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return (int32_t)InterlockedExchangeAdd(val, add) + add; } static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0; } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { *dst = val; } static FORCEINLINE int64_t atomic_load64(atomic64_t* src) { return *src; } static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return (int64_t)InterlockedExchangeAdd64(val, add) + add; } -#endif static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return (void*)*src; } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, void* val) { *dst = val; } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { *dst = val; } @@ -217,15 +223,13 @@ typedef volatile _Atomic(void*) atomicptr_t; static FORCEINLINE int32_t atomic_load32(atomic32_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store32(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_relaxed); } -static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_release); } static FORCEINLINE int32_t atomic_incr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1; } static FORCEINLINE int32_t atomic_decr32(atomic32_t* val) { return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1; } static FORCEINLINE int32_t atomic_add32(atomic32_t* val, int32_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } static FORCEINLINE int atomic_cas32_acquire(atomic32_t* dst, int32_t val, int32_t ref) { return atomic_compare_exchange_weak_explicit(dst, &ref, val, memory_order_acquire, memory_order_relaxed); } -#if ENABLE_STATISTICS || ENABLE_ADAPTIVE_THREAD_CACHE +static FORCEINLINE void atomic_store32_release(atomic32_t* dst, int32_t val) { atomic_store_explicit(dst, val, memory_order_release); } static FORCEINLINE int64_t atomic_load64(atomic64_t* val) { return atomic_load_explicit(val, memory_order_relaxed); } static FORCEINLINE int64_t atomic_add64(atomic64_t* val, int64_t add) { return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add; } -#endif static FORCEINLINE void* atomic_load_ptr(atomicptr_t* src) { return atomic_load_explicit(src, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr(atomicptr_t* dst, 
void* val) { atomic_store_explicit(dst, val, memory_order_relaxed); } static FORCEINLINE void atomic_store_ptr_release(atomicptr_t* dst, void* val) { atomic_store_explicit(dst, val, memory_order_release); } @@ -271,6 +275,7 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref # define _rpmalloc_stat_inc_free(heap, class_idx) do {} while(0) #endif + /// /// Preconfigured limits and sizes /// diff --git a/test/main.c b/test/main.c index 7e6fe642..bdd760b7 100644 --- a/test/main.c +++ b/test/main.c @@ -696,12 +696,17 @@ initfini_thread(void* argp) { thread_yield(); + if (arg.passes > (sizeof(addr) / sizeof(addr[0]))) + arg.passes = sizeof(addr) / sizeof(addr[0]); + for (iloop = 0; iloop < arg.loops; ++iloop) { rpmalloc_thread_initialize(); unsigned int max_datasize = 0; for (ipass = 0; ipass < arg.passes; ++ipass) { cursize = arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024); + if (cursize > sizeof(data)) + cursize = sizeof(data); if (cursize > max_datasize) max_datasize = cursize; From 439ec54cae7df8dc7f95de62d4500dd88bb2695a Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 13:39:03 +0200 Subject: [PATCH 61/69] use spin for orphan heap lists --- rpmalloc/rpmalloc.c | 63 ++++++++++++++++++--------------------------- test/main.c | 53 ++++++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 57 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index a59fbfce..b190c1c4 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -617,14 +617,14 @@ static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif //! All heaps static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +//! Orphan lock +static atomic32_t _memory_orphan_lock; //! Orphaned heaps -static atomicptr_t _memory_orphan_heaps; +static heap_t* _memory_orphan_heaps; #if RPMALLOC_FIRST_CLASS_HEAPS //! Orphaned heaps (first class heaps) -static atomicptr_t _memory_first_class_orphan_heaps; +static heap_t* _memory_first_class_orphan_heaps; #endif -//! Running orphan counter to avoid ABA issues in linked list -static atomic32_t _memory_orphan_counter; #if ENABLE_STATISTICS //! Active heap count static atomic32_t _memory_active_heaps; @@ -1574,22 +1574,18 @@ _rpmalloc_heap_initialize(heap_t* heap) { static void _rpmalloc_heap_orphan(heap_t* heap, int first_class) { - void* raw_heap; - uintptr_t orphan_counter; - heap_t* last_heap; heap->owner_thread = (uintptr_t)-1; #if RPMALLOC_FIRST_CLASS_HEAPS - atomicptr_t* heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); + heap_t** heap_list = (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps); #else (void)sizeof(first_class); - atomicptr_t* heap_list = &_memory_orphan_heaps; + heap_t** heap_list = &_memory_orphan_heaps; #endif - do { - last_heap = (heap_t*)atomic_load_ptr(heap_list); - heap->next_orphan = (heap_t*)((uintptr_t)last_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - raw_heap = (void*)((uintptr_t)heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); - } while (!atomic_cas_ptr(heap_list, raw_heap, last_heap)); + while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) + /* Spin */; + heap->next_orphan = *heap_list; + *heap_list = heap; + atomic_store32_release(&_memory_orphan_lock, 0); } //! 
Allocate a new heap from newly mapped memory pages @@ -1598,7 +1594,8 @@ _rpmalloc_heap_allocate_new(void) { //Map in pages for a new heap size_t align_offset = 0; size_t heap_size = sizeof(heap_t); - size_t block_size = _memory_page_size * ((heap_size + _memory_page_size - 1) >> _memory_page_size_shift); + size_t aligned_heap_size = 64 * ((heap_size + 63) / 64); + size_t block_size = _memory_page_size * ((aligned_heap_size + _memory_page_size - 1) >> _memory_page_size_shift); heap_t* heap = (heap_t*)_rpmalloc_mmap(block_size, &align_offset); if (!heap) return heap; @@ -1607,7 +1604,6 @@ _rpmalloc_heap_allocate_new(void) { heap->align_offset = align_offset; //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits - size_t aligned_heap_size = HEAP_ORPHAN_ABA_SIZE * ((heap_size + HEAP_ORPHAN_ABA_SIZE - 1) / HEAP_ORPHAN_ABA_SIZE); size_t num_heaps = block_size / aligned_heap_size; atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); @@ -1622,21 +1618,12 @@ _rpmalloc_heap_allocate_new(void) { } static heap_t* -_rpmalloc_heap_extract_orphan(atomicptr_t* heap_list) { - void* raw_heap; - void* next_raw_heap; - uintptr_t orphan_counter; - heap_t* heap; - heap_t* next_heap; - do { - raw_heap = atomic_load_ptr(heap_list); - heap = (heap_t*)((uintptr_t)raw_heap & ~(uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1)); - if (!heap) - break; - next_heap = heap->next_orphan; - orphan_counter = (uintptr_t)atomic_incr32(&_memory_orphan_counter); - next_raw_heap = (void*)((uintptr_t)next_heap | (orphan_counter & (uintptr_t)(HEAP_ORPHAN_ABA_SIZE - 1))); - } while (!atomic_cas_ptr(heap_list, next_raw_heap, raw_heap)); +_rpmalloc_heap_extract_orphan(heap_t** heap_list) { + while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) + /* Spin */; + heap_t* heap = *heap_list; + *heap_list = (heap ? 
heap->next_orphan : 0); + atomic_store32_release(&_memory_orphan_lock, 0); return heap; } @@ -1688,15 +1675,15 @@ _rpmalloc_heap_release(void* heapptr, int first_class) { } #endif - //Orphan the heap - _rpmalloc_heap_orphan(heap, first_class); - if (get_thread_heap_raw() == heap) set_thread_heap(0); + #if ENABLE_STATISTICS atomic_decr32(&_memory_active_heaps); assert(atomic_load32(&_memory_active_heaps) >= 0); #endif + + _rpmalloc_heap_orphan(heap, first_class); } static void @@ -2559,9 +2546,9 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); } - atomic_store_ptr(&_memory_orphan_heaps, 0); + _memory_orphan_heaps = 0; #if RPMALLOC_FIRST_CLASS_HEAPS - atomic_store_ptr(&_memory_first_class_orphan_heaps, 0); + _memory_first_class_orphan_heaps = 0; #endif for (size_t ilist = 0, lsize = (sizeof(_memory_heaps) / sizeof(_memory_heaps[0])); ilist < lsize; ++ilist) atomic_store_ptr(&_memory_heaps[ilist], 0); @@ -2895,7 +2882,7 @@ _memory_heap_dump_statistics(heap_t* heap, void* file) { ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), #else - 0, 0ULL, 0ULL, + 0, (size_t)0, (size_t)0, #endif ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * (iclass + 1) * _memory_span_size) / (size_t)(1024 * 1024), diff --git a/test/main.c b/test/main.c index bdd760b7..6d6b3233 100644 --- a/test/main.c +++ b/test/main.c @@ -423,9 +423,9 @@ typedef struct _allocator_thread_arg { unsigned int passes; //max 4096 unsigned int datasize[32]; unsigned int num_datasize; //max 32 + int init_fini_each_loop; void** pointers; void** crossthread_pointers; - int init_fini_each_loop; } allocator_thread_arg_t; static void @@ -681,14 +681,17 @@ crossallocator_thread(void* argp) { static void initfini_thread(void* argp) { allocator_thread_arg_t arg = *(allocator_thread_arg_t*)argp; - unsigned int iloop = 0; - unsigned int ipass = 0; - unsigned int icheck = 0; + unsigned int iloop; + unsigned int ipass; + unsigned int icheck; unsigned int id = 0; uint32_t* addr[4096]; + uint32_t blocksize[4096]; char data[8192]; unsigned int cursize; - unsigned int iwait = 0; + unsigned int max_datasize = 0; + uint32_t this_size; + uint32_t check_size; int ret = 0; for (id = 0; id < sizeof(data); ++id) @@ -702,31 +705,33 @@ initfini_thread(void* argp) { for (iloop = 0; iloop < arg.loops; ++iloop) { rpmalloc_thread_initialize(); - unsigned int max_datasize = 0; + max_datasize = 0; for (ipass = 0; ipass < arg.passes; ++ipass) { - cursize = arg.datasize[(iloop + ipass + iwait) % arg.num_datasize] + ((iloop + ipass) % 1024); + cursize = arg.datasize[(iloop + ipass) % arg.num_datasize] + ((iloop + ipass) % 1024); if (cursize > sizeof(data)) cursize = sizeof(data); if (cursize > max_datasize) max_datasize = cursize; - addr[ipass] = rpmalloc(4 + cursize); + addr[ipass] = rpmalloc(sizeof(uint32_t) + cursize); if (addr[ipass] == 0) { ret = test_fail("Allocation failed"); goto end; } + blocksize[ipass] = (uint32_t)cursize; addr[ipass][0] = (uint32_t)cursize; memcpy(addr[ipass] + 1, data, cursize); for (icheck = 0; icheck < ipass; ++icheck) { - size_t this_size = addr[ipass][0]; - size_t check_size = addr[icheck][0]; + this_size = addr[ipass][0]; + 
check_size = addr[icheck][0]; if (this_size != cursize) { ret = test_fail("Data corrupted in this block (size)"); goto end; } - if (check_size > max_datasize) { + if (check_size != blocksize[icheck]) { + printf("For %u:%u got previous block size %u (%x) wanted %u (%x)\n", iloop, ipass, check_size, check_size, blocksize[icheck], blocksize[icheck]); ret = test_fail("Data corrupted in previous block (size)"); goto end; } @@ -735,13 +740,12 @@ initfini_thread(void* argp) { goto end; } if (addr[icheck] < addr[ipass]) { - if (pointer_offset(addr[icheck], check_size + 4) > (void*)addr[ipass]) { + if (pointer_offset(addr[icheck], check_size + sizeof(uint32_t)) > (void*)addr[ipass]) { ret = test_fail("Invalid pointer inside another block returned from allocation"); goto end; } - } - else if (addr[icheck] > addr[ipass]) { - if (pointer_offset(addr[ipass], cursize + 4) > (void*)addr[icheck]) { + } else { + if (pointer_offset(addr[ipass], this_size + sizeof(uint32_t)) > (void*)addr[icheck]) { ret = test_fail("Invalid pointer inside another block returned from allocation"); goto end; } @@ -751,11 +755,17 @@ initfini_thread(void* argp) { for (ipass = 0; ipass < arg.passes; ++ipass) { cursize = addr[ipass][0]; + + if (cursize != blocksize[ipass]) { + printf("For %u:%u got size %u (%x) wanted %u (%x)\n", iloop, ipass, cursize, cursize, blocksize[ipass], blocksize[ipass]); + ret = test_fail("Data corrupted (size)"); + goto end; + } if (cursize > max_datasize) { + printf("For %u:%u got size %u (%x) >= %u\n", iloop, ipass, cursize, cursize, max_datasize); ret = test_fail("Data corrupted (size)"); goto end; } - if (memcmp(addr[ipass] + 1, data, cursize)) { ret = test_fail("Data corrupted"); goto end; @@ -925,8 +935,13 @@ test_threadspam(void) { num_alloc_threads = _hardware_threads; if (num_alloc_threads < 2) num_alloc_threads = 2; +#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) + if (num_alloc_threads > 32) + num_alloc_threads = 32; +#else if (num_alloc_threads > 16) num_alloc_threads = 16; +#endif arg.loops = 500; arg.passes = 10; @@ -946,7 +961,7 @@ test_threadspam(void) { thread[i] = thread_run(&targ); for (j = 0; j < num_passes; ++j) { - thread_sleep(10); + thread_sleep(100); for (i = 0; i < num_alloc_threads; ++i) { threadres[i] = thread_join(thread[i]); @@ -1056,10 +1071,10 @@ test_run(int argc, char** argv) { return -1; if (test_crossthread()) return -1; - if (test_threadspam()) - return -1; if (test_threaded()) return -1; + if (test_threadspam()) + return -1; if (test_first_class_heaps()) return -1; printf("All tests passed\n"); From 13d0e5321eb11e1caf20001c4a9d3822cc5ce163 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Tue, 4 Aug 2020 14:14:06 +0200 Subject: [PATCH 62/69] simplify orphan and size cache handling --- rpmalloc/rpmalloc.c | 71 +++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 45 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index b190c1c4..cf7310bb 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -302,8 +302,6 @@ static FORCEINLINE int atomic_cas_ptr(atomicptr_t* dst, void* val, void* ref #define MEDIUM_SIZE_LIMIT (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) //! Maximum size of a large block #define LARGE_SIZE_LIMIT ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! ABA protection size in orhpan heap list (also becomes limit of smallest page size) -#define HEAP_ORPHAN_ABA_SIZE 512 //! 
Size of a span header (must be a multiple of SMALL_GRANULARITY and a power of two) #define SPAN_HEADER_SIZE 128 //! Number of spans in thread cache @@ -476,7 +474,7 @@ struct heap_size_class_t { // Previous span pointer in head points to tail span of list. span_t* partial_span; //! Early level cache of fully free spans - span_t* cache[2]; + span_t* cache; }; typedef struct heap_size_class_t heap_size_class_t; @@ -616,7 +614,7 @@ static int _memory_huge_pages; static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif //! All heaps -static atomicptr_t _memory_heaps[HEAP_ARRAY_SIZE]; +static heap_t* _memory_heaps[HEAP_ARRAY_SIZE]; //! Orphan lock static atomic32_t _memory_orphan_lock; //! Orphaned heaps @@ -1041,12 +1039,9 @@ _rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { if (!heap->finalize) { _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); - if (heap->size_class[span->size_class].cache[0]) { - if (heap->size_class[span->size_class].cache[1]) - _rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache[1]); - heap->size_class[span->size_class].cache[1] = heap->size_class[span->size_class].cache[0]; - } - heap->size_class[span->size_class].cache[0] = span; + if (heap->size_class[span->size_class].cache) + _rpmalloc_heap_cache_insert(heap, heap->size_class[span->size_class].cache); + heap->size_class[span->size_class].cache = span; } else { _rpmalloc_span_unmap(span); } @@ -1376,9 +1371,9 @@ _rpmalloc_heap_global_finalize(heap_t* heap) { } //Heap is now completely free, unmap and remove from heap list size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - heap_t* list_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap_t* list_heap = _memory_heaps[list_idx]; if (list_heap == heap) { - atomic_store_ptr(&_memory_heaps[list_idx], heap->next_heap); + _memory_heaps[list_idx] = heap->next_heap; } else { while (list_heap->next_heap != heap) list_heap = list_heap->next_heap; @@ -1518,17 +1513,12 @@ _rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_ #endif #if ENABLE_THREAD_CACHE if (class_idx < SIZE_CLASS_COUNT) { - if (heap->size_class[class_idx].cache[0]) { - span = heap->size_class[class_idx].cache[0]; + if (heap->size_class[class_idx].cache) { + span = heap->size_class[class_idx].cache; span_t* new_cache = 0; if (heap->span_cache.count) new_cache = heap->span_cache.span[--heap->span_cache.count]; - if (heap->size_class[class_idx].cache[1]) { - heap->size_class[class_idx].cache[0] = heap->size_class[class_idx].cache[1]; - heap->size_class[class_idx].cache[1] = new_cache; - } else { - heap->size_class[class_idx].cache[0] = new_cache; - } + heap->size_class[class_idx].cache = new_cache; return span; } } @@ -1558,18 +1548,13 @@ _rpmalloc_heap_extract_new_span(heap_t* heap, size_t span_count, uint32_t class_ static void _rpmalloc_heap_initialize(heap_t* heap) { - memset(heap, 0, sizeof(heap_t)); - //Get a new heap ID heap->id = 1 + atomic_incr32(&_memory_heap_id); //Link in heap in heap ID map - heap_t* next_heap; size_t list_idx = heap->id % HEAP_ARRAY_SIZE; - do { - next_heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); - heap->next_heap = next_heap; - } while (!atomic_cas_ptr(&_memory_heaps[list_idx], heap, next_heap)); + heap->next_heap = _memory_heaps[list_idx]; + _memory_heaps[list_idx] = heap; } static void @@ -1581,11 +1566,8 @@ _rpmalloc_heap_orphan(heap_t* heap, int first_class) { (void)sizeof(first_class); heap_t** heap_list 
= &_memory_orphan_heaps; #endif - while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) - /* Spin */; heap->next_orphan = *heap_list; *heap_list = heap; - atomic_store32_release(&_memory_orphan_lock, 0); } //! Allocate a new heap from newly mapped memory pages @@ -1619,11 +1601,8 @@ _rpmalloc_heap_allocate_new(void) { static heap_t* _rpmalloc_heap_extract_orphan(heap_t** heap_list) { - while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) - /* Spin */; heap_t* heap = *heap_list; *heap_list = (heap ? heap->next_orphan : 0); - atomic_store32_release(&_memory_orphan_lock, 0); return heap; } @@ -1631,6 +1610,8 @@ _rpmalloc_heap_extract_orphan(heap_t** heap_list) { static heap_t* _rpmalloc_heap_allocate(int first_class) { heap_t* heap = 0; + while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) + /* Spin */; if (first_class == 0) heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); #if RPMALLOC_FIRST_CLASS_HEAPS @@ -1639,6 +1620,7 @@ _rpmalloc_heap_allocate(int first_class) { #endif if (!heap) heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_orphan_lock, 0); return heap; } @@ -1683,7 +1665,10 @@ _rpmalloc_heap_release(void* heapptr, int first_class) { assert(atomic_load32(&_memory_active_heaps) >= 0); #endif + while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) + /* Spin */; _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_orphan_lock, 0); } static void @@ -1702,12 +1687,9 @@ _rpmalloc_heap_finalize(heap_t* heap) { _rpmalloc_heap_cache_adopt_deferred(heap, 0); for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].cache[0]) - _rpmalloc_span_unmap(heap->size_class[iclass].cache[0]); - if (heap->size_class[iclass].cache[1]) - _rpmalloc_span_unmap(heap->size_class[iclass].cache[1]); - heap->size_class[iclass].cache[0] = 0; - heap->size_class[iclass].cache[1] = 0; + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; span_t* span = heap->size_class[iclass].partial_span; while (span) { span_t* next = span->next; @@ -2464,8 +2446,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { } #endif - //The ABA counter in heap orphan list is tied to using HEAP_ORPHAN_ABA_SIZE - size_t min_span_size = HEAP_ORPHAN_ABA_SIZE; + size_t min_span_size = 256; size_t max_page_size; #if UINTPTR_MAX > 0xFFFFFFFF max_page_size = 4096ULL * 1024ULL * 1024ULL; @@ -2550,8 +2531,8 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { #if RPMALLOC_FIRST_CLASS_HEAPS _memory_first_class_orphan_heaps = 0; #endif - for (size_t ilist = 0, lsize = (sizeof(_memory_heaps) / sizeof(_memory_heaps[0])); ilist < lsize; ++ilist) - atomic_store_ptr(&_memory_heaps[ilist], 0); + memset(_memory_heaps, 0, sizeof(_memory_heaps)); + atomic_store32_release(&_memory_orphan_lock, 0); //Initialize this thread rpmalloc_thread_initialize(); @@ -2566,7 +2547,7 @@ rpmalloc_finalize(void) { //Free all thread caches and fully free spans for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t* heap = (heap_t*)atomic_load_ptr(&_memory_heaps[list_idx]); + heap_t* heap = _memory_heaps[list_idx]; while (heap) { heap_t* next_heap = heap->next_heap; heap->finalize = 1; @@ -2902,7 +2883,7 @@ rpmalloc_dump_statistics(void* file) { //If you hit this assert, you still have active threads or forgot to finalize some thread(s) assert(atomic_load32(&_memory_active_heaps) == 0); for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; 
++list_idx) { - heap_t* heap = atomic_load_ptr(&_memory_heaps[list_idx]); + heap_t* heap = _memory_heaps[list_idx]; while (heap) { int need_dump = 0; for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); ++iclass) { From 3ff988c65c4f8a9fddc3e50fc65980913b250bca Mon Sep 17 00:00:00 2001 From: Eduardo Bart Date: Tue, 25 Aug 2020 15:40:14 -0300 Subject: [PATCH 63/69] Fix windows include on mingw-w64 (#182) --- test/main-override.cc | 2 +- test/main.c | 2 +- test/thread.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/test/main-override.cc b/test/main-override.cc index fa84b637..0323ff6e 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -139,7 +139,7 @@ main(int argc, char** argv) { #endif #ifdef _WIN32 -#include +#include static void test_initialize(void) { diff --git a/test/main.c b/test/main.c index 6d6b3233..805cbecb 100644 --- a/test/main.c +++ b/test/main.c @@ -1100,7 +1100,7 @@ main(int argc, char** argv) { #endif #ifdef _WIN32 -#include +#include static void test_initialize(void) { diff --git a/test/thread.c b/test/thread.c index 5e3ad8da..adaab444 100644 --- a/test/thread.c +++ b/test/thread.c @@ -8,7 +8,7 @@ #endif #ifdef _WIN32 -# include +# include # include static unsigned __stdcall From a0caea160397dfa56b0ee2f469ceedd7d14e71b7 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 26 Aug 2020 08:28:03 +0200 Subject: [PATCH 64/69] fix pvalloc and rearrange heap release code (#183) --- CHANGELOG | 2 ++ rpmalloc/malloc.c | 21 +++------------------ rpmalloc/rpmalloc.c | 8 ++++---- test/main-override.cc | 25 +++++++++++++++++++++++++ 4 files changed, 34 insertions(+), 22 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index c11f6134..4fd0bb36 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -28,6 +28,8 @@ and cache size control. Added missing C++ operator overloads with ENABLE_OVERRIDE when using Microsoft C++ runtimes +Fixed issue in pvalloc override that could return less than a memory page in usable size. 
+ 1.4.0 diff --git a/rpmalloc/malloc.c b/rpmalloc/malloc.c index 2e1f02a0..56becb8d 100644 --- a/rpmalloc/malloc.c +++ b/rpmalloc/malloc.c @@ -253,36 +253,21 @@ reallocarray(void* ptr, size_t count, size_t size) { extern inline void* RPMALLOC_CDECL valloc(size_t size) { get_thread_heap(); - const size_t page_size = _rpmalloc_page_size(); - if (!size) - size = page_size; - size_t total_size = size + page_size; -#if ENABLE_VALIDATE_ARGS - if (total_size < size) { - errno = EINVAL; - return 0; - } -#endif - void* buffer = rpmalloc(total_size); - if ((uintptr_t)buffer & (page_size - 1)) - return (void*)(((uintptr_t)buffer & ~(page_size - 1)) + page_size); - return buffer; + return rpaligned_alloc(_rpmalloc_page_size(), size); } extern inline void* RPMALLOC_CDECL pvalloc(size_t size) { get_thread_heap(); - size_t aligned_size = size; const size_t page_size = _rpmalloc_page_size(); - if (aligned_size % page_size) - aligned_size = (1 + (aligned_size / page_size)) * page_size; + const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size; #if ENABLE_VALIDATE_ARGS if (aligned_size < size) { errno = EINVAL; return 0; } #endif - return valloc(size); + return rpaligned_alloc(_rpmalloc_page_size(), aligned_size); } #endif // ENABLE_OVERRIDE diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index cf7310bb..3cce6884 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1640,19 +1640,19 @@ _rpmalloc_heap_release(void* heapptr, int first_class) { span_cache = (span_cache_t*)(heap->span_large_cache + (iclass - 1)); if (!span_cache->count) continue; +#if ENABLE_GLOBAL_CACHE if (heap->finalize) { for (size_t ispan = 0; ispan < span_cache->count; ++ispan) _rpmalloc_span_unmap(span_cache->span[ispan]); } else { -#if ENABLE_GLOBAL_CACHE _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * (iclass + 1) * _memory_span_size); _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, span_cache->count); _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, span_cache->count); + } #else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); #endif - } span_cache->count = 0; } #endif diff --git a/test/main-override.cc b/test/main-override.cc index 0323ff6e..b8df4f6e 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -14,6 +14,11 @@ #include #include +#if defined(_WIN32) +extern "C" void* RPMALLOC_CDECL pvalloc(size_t size); +extern "C" void* RPMALLOC_CDECL valloc(size_t size); +#endif + static size_t _hardware_threads; static void @@ -27,6 +32,8 @@ test_fail(const char* reason) { static int test_alloc(void) { + const rpmalloc_config_t* config = rpmalloc_config(); + void* p = malloc(371); if (!p) return test_fail("malloc failed"); @@ -55,6 +62,24 @@ test_alloc(void) { return test_fail("usable size invalid (4)"); delete[] static_cast(p); + p = valloc(873); + if ((uintptr_t)p & (config->page_size - 1)) { + fprintf(stderr, "FAIL: pvalloc did not align address to page size (%p)\n", p); + return -1; + } + free(p); + + p = pvalloc(275); + if ((uintptr_t)p & (config->page_size - 1)) { + fprintf(stderr, "FAIL: pvalloc did not align address to page size (%p)\n", p); + return -1; + } + if (rpmalloc_usable_size(p) < config->page_size) { + fprintf(stderr, "FAIL: pvalloc did not align size to page size (%llu)\n", rpmalloc_usable_size(p)); + return -1; + } + free(p); + printf("Allocation tests 
passed\n"); return 0; } From ff8a3dc53d313faee2c12acff4477ad2c75d5b94 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 26 Aug 2020 11:41:07 +0200 Subject: [PATCH 65/69] Improve huge page utilization (#178) --- rpmalloc/rpmalloc.c | 212 ++++++++++++++++++++++++++++++++++---------- test/main.c | 2 +- 2 files changed, 164 insertions(+), 50 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 3cce6884..3c559c9e 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -504,8 +504,6 @@ struct heap_t { heap_t* next_heap; //! Next heap in orphan list heap_t* next_orphan; - //! Memory pages alignment offset - size_t align_offset; //! Heap ID int32_t id; //! Finalization state flag @@ -555,10 +553,8 @@ struct global_cache_t { uint32_t count; //! Cached spans span_t* span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; -#if ENABLE_UNLIMITED_CACHE //! Unlimited cache overflow span_t* overflow; -#endif }; //////////// @@ -613,10 +609,16 @@ static int _memory_huge_pages; //! Global span cache static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; #endif +//! Global reserved spans +static span_t* _memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t* _memory_global_reserve_master; //! All heaps static heap_t* _memory_heaps[HEAP_ARRAY_SIZE]; -//! Orphan lock -static atomic32_t _memory_orphan_lock; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; //! Orphaned heaps static heap_t* _memory_orphan_heaps; #if RPMALLOC_FIRST_CLASS_HEAPS @@ -692,7 +694,7 @@ get_thread_heap(void) { } //! Fast thread ID -static inline uintptr_t +static inline uintptr_t get_thread_id(void) { #if defined(_WIN32) return (uintptr_t)((void*)NtCurrentTeb()); @@ -835,8 +837,7 @@ _rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { if (munmap(address, release)) { assert("Failed to unmap virtual memory block" == 0); } - } - else { + } else { #if defined(POSIX_MADV_FREE) if (posix_madvise(address, size, POSIX_MADV_FREE)) #endif @@ -850,6 +851,30 @@ _rpmalloc_unmap_os(void* address, size_t size, size_t offset, size_t release) { _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); } +static void +_rpmalloc_span_mark_as_subspan_unless_master(span_t* master, span_t* subspan, size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must be checked by caller) +static span_t* +_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t* span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new heap allocation, not thread safe) +static void +_rpmalloc_global_set_reserved_spans(span_t* master, span_t* reserve, size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + //////////// /// @@ -941,7 +966,7 @@ static size_t _rpmalloc_span_align_count(size_t span_count) { size_t request_count = (span_count > _memory_span_map_count) ? 
span_count : _memory_span_map_count; if ((_memory_page_size > _memory_span_size) && ((request_count * _memory_span_size) % _memory_page_size)) - request_count += _memory_span_map_count - (request_count % _memory_span_map_count); + request_count += _memory_span_map_count - (request_count % _memory_span_map_count); return request_count; } @@ -955,6 +980,9 @@ _rpmalloc_span_initialize(span_t* span, size_t total_span_count, size_t span_cou atomic_store32(&span->remaining_spans, (int32_t)total_span_count); } +static void +_rpmalloc_span_unmap(span_t* span); + //! Map an aligned set of spans, taking configured mapping granularity and the page size into account static span_t* _rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) { @@ -977,17 +1005,53 @@ _rpmalloc_span_map_aligned_count(heap_t* heap, size_t span_count) { _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, heap->span_reserve, heap->spans_reserved); _rpmalloc_heap_cache_insert(heap, heap->span_reserve); } + if (reserved_count > DEFAULT_SPAN_MAP_COUNT) { + size_t remain_count = reserved_count - DEFAULT_SPAN_MAP_COUNT; + reserved_count = DEFAULT_SPAN_MAP_COUNT; + span_t* remain_span = (span_t*)pointer_offset(reserved_spans, reserved_count * _memory_span_size); + if (_memory_global_reserve) + _rpmalloc_span_unmap(_memory_global_reserve); + _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count); + } _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans, reserved_count); } return span; } +static span_t* +_rpmalloc_global_get_reserved_spans(size_t span_count); + //! Map in memory pages for the given number of spans (or use previously reserved pages) static span_t* _rpmalloc_span_map(heap_t* heap, size_t span_count) { if (span_count <= heap->spans_reserved) return _rpmalloc_span_map_from_reserve(heap, span_count); - return _rpmalloc_span_map_aligned_count(heap, span_count); + span_t* span = 0; + if (_memory_page_size > _memory_span_size) { + // If huge pages, make sure only one thread maps more memory to avoid bloat + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) { + /* Spin */ + } + if (_memory_global_reserve_count >= span_count) { + size_t reserve_count = (!heap->spans_reserved ? DEFAULT_SPAN_MAP_COUNT : span_count); + if (_memory_global_reserve_count < reserve_count) + reserve_count = _memory_global_reserve_count; + span = _rpmalloc_global_get_reserved_spans(reserve_count); + if (span) { + if (reserve_count > span_count) { + span_t* reserved_span = (span_t*)pointer_offset(span, span_count << _memory_span_size_shift); + _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master, reserved_span, reserve_count - span_count); + } + // Already marked as subspan in _rpmalloc_global_get_reserved_spans + span->span_count = (uint32_t)span_count; + } + } + } + if (!span) + span = _rpmalloc_span_map_aligned_count(heap, span_count); + if (_memory_page_size > _memory_span_size) + atomic_store32_release(&_memory_global_lock, 0); + return span; } //! Unmap memory pages for the given number of spans (or mark as unused if no partial unmappings) @@ -1050,8 +1114,7 @@ _rpmalloc_span_release_to_cache(heap_t* heap, span_t* span) { //! Initialize a (partial) free list up to next system memory page, while reserving the first block //! 
as allocated, returning number of blocks in list static uint32_t -free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, - uint32_t block_count, uint32_t block_size) { +free_list_partial_init(void** list, void** first_block, void* page_start, void* block_start, uint32_t block_count, uint32_t block_size) { assert(block_count); *first_block = block_start; if (block_count > 1) { @@ -1188,13 +1251,11 @@ _rpmalloc_global_cache_finalize(global_cache_t* cache) { _rpmalloc_span_unmap(cache->span[ispan]); cache->count = 0; -#if ENABLE_UNLIMITED_CACHE while (cache->overflow) { span_t* span = cache->overflow; cache->overflow = span->next; _rpmalloc_span_unmap(span); } -#endif atomic_store32_release(&cache->lock, 0); } @@ -1219,16 +1280,19 @@ _rpmalloc_global_cache_insert_spans(span_t** span, size_t span_count, size_t cou #if ENABLE_UNLIMITED_CACHE while (insert_count < count) { +#else + // Enable unlimited cache if huge pages, or we will leak since it is unlikely that an entire huge page + // will be unmapped, and we're unable to partially decommit a huge page + while ((_memory_page_size > _memory_span_size) && (insert_count < count)) { +#endif span_t* current_span = span[insert_count++]; current_span->next = cache->overflow; cache->overflow = current_span; } atomic_store32_release(&cache->lock, 0); -#else - atomic_store32_release(&cache->lock, 0); + for (size_t ispan = insert_count; ispan < count; ++ispan) _rpmalloc_span_unmap(span[ispan]); -#endif } static size_t @@ -1244,13 +1308,12 @@ _rpmalloc_global_cache_extract_spans(span_t** span, size_t span_count, size_t co memcpy(span, cache->span + (cache->count - extract_count), sizeof(span_t*) * extract_count); cache->count -= (uint32_t)extract_count; -#if ENABLE_UNLIMITED_CACHE + while ((extract_count < count) && cache->overflow) { span_t* current_span = cache->overflow; span[extract_count++] = current_span; cache->overflow = current_span->next; } -#endif atomic_store32_release(&cache->lock, 0); return extract_count; @@ -1277,11 +1340,7 @@ _rpmalloc_heap_set_reserved_spans(heap_t* heap, span_t* master, span_t* reserve, //! Adopt the deferred span cache list, optionally extracting the first single span for immediate re-use static void _rpmalloc_heap_cache_adopt_deferred(heap_t* heap, span_t** single_span) { - span_t* span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); - if (!span) - return; - while (!atomic_cas_ptr(&heap->span_free_deferred, 0, span)) - span = (span_t*)atomic_load_ptr(&heap->span_free_deferred); + span_t* span = (span_t*)((void*)atomic_exchange_ptr_acquire(&heap->span_free_deferred, 0)); while (span) { span_t* next_span = (span_t*)span->free_list; assert(span->heap == heap); @@ -1325,9 +1384,8 @@ static void _rpmalloc_heap_unmap(heap_t* heap) { if (!heap->master_heap) { if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { - size_t heap_size = sizeof(heap_t); - size_t block_size = _memory_page_size * ((heap_size + _memory_page_size - 1) >> _memory_page_size_shift); - _rpmalloc_unmap(heap, block_size, heap->align_offset, block_size); + span_t* span = (span_t*)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); } } else { if (atomic_decr32(&heap->master_heap->child_count) == 0) { @@ -1573,20 +1631,52 @@ _rpmalloc_heap_orphan(heap_t* heap, int first_class) { //! Allocate a new heap from newly mapped memory pages static heap_t* _rpmalloc_heap_allocate_new(void) { - //Map in pages for a new heap - size_t align_offset = 0; + // Map in pages for a 16 heaps. 
If page size is greater than required size for this, map a page and + // use first part for heaps and remaining part for spans for allocations. Adds a lot of complexity, + // but saves a lot of memory on systems where page size > 64 spans (4MiB) size_t heap_size = sizeof(heap_t); - size_t aligned_heap_size = 64 * ((heap_size + 63) / 64); - size_t block_size = _memory_page_size * ((aligned_heap_size + _memory_page_size - 1) >> _memory_page_size_shift); - heap_t* heap = (heap_t*)_rpmalloc_mmap(block_size, &align_offset); - if (!heap) - return heap; + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t* span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid reallocating a huge page just to serve new heaps + size_t possible_heap_count = (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + sizeof(span_t) + _memory_span_size - 1) / _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t*)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_add(&_reserved_spans, span_count); + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t* heap = (heap_t*)pointer_offset(span, sizeof(span_t)); _rpmalloc_heap_initialize(heap); - heap->align_offset = align_offset; - //Put extra heaps as orphans, aligning to make sure ABA protection bits fit in pointer low bits - size_t num_heaps = block_size / aligned_heap_size; + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); heap_t* extra_heap = (heap_t*)pointer_offset(heap, aligned_heap_size); while (num_heaps > 1) { @@ -1596,6 +1686,22 @@ _rpmalloc_heap_allocate_new(void) { extra_heap = (heap_t*)pointer_offset(extra_heap, aligned_heap_size); --num_heaps; } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = (remain_count > DEFAULT_SPAN_MAP_COUNT ? 
DEFAULT_SPAN_MAP_COUNT : remain_count); + span_t* remain_span = (span_t*)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t*)pointer_offset(remain_span, reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + return heap; } @@ -1610,7 +1716,7 @@ _rpmalloc_heap_extract_orphan(heap_t** heap_list) { static heap_t* _rpmalloc_heap_allocate(int first_class) { heap_t* heap = 0; - while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) /* Spin */; if (first_class == 0) heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); @@ -1620,7 +1726,7 @@ _rpmalloc_heap_allocate(int first_class) { #endif if (!heap) heap = _rpmalloc_heap_allocate_new(); - atomic_store32_release(&_memory_orphan_lock, 0); + atomic_store32_release(&_memory_global_lock, 0); return heap; } @@ -1665,10 +1771,10 @@ _rpmalloc_heap_release(void* heapptr, int first_class) { assert(atomic_load32(&_memory_active_heaps) >= 0); #endif - while (!atomic_cas32_acquire(&_memory_orphan_lock, 1, 0)) + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) /* Spin */; _rpmalloc_heap_orphan(heap, first_class); - atomic_store32_release(&_memory_orphan_lock, 0); + atomic_store32_release(&_memory_global_lock, 0); } static void @@ -2037,7 +2143,7 @@ static void _rpmalloc_deallocate_defer_free_span(heap_t* heap, span_t* span) { //This list does not need ABA protection, no mutable side state do { - span->free_list = atomic_load_ptr(&heap->span_free_deferred); + span->free_list = (void*)atomic_load_ptr(&heap->span_free_deferred); } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); } @@ -2503,7 +2609,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { return -1; #endif #if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - fls_key = FlsAlloc(&_rpmalloc_thread_destructor); + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); #endif //Setup all small and medium size classes @@ -2532,7 +2638,7 @@ rpmalloc_initialize_config(const rpmalloc_config_t* config) { _memory_first_class_orphan_heaps = 0; #endif memset(_memory_heaps, 0, sizeof(_memory_heaps)); - atomic_store32_release(&_memory_orphan_lock, 0); + atomic_store32_release(&_memory_global_lock, 0); //Initialize this thread rpmalloc_thread_initialize(); @@ -2545,6 +2651,14 @@ rpmalloc_finalize(void) { rpmalloc_thread_finalize(); //rpmalloc_dump_statistics(stdout); + if (_memory_global_reserve) { + atomic_add32(&_memory_global_reserve_master->remaining_spans, -(int32_t)_memory_global_reserve_count); + _memory_global_reserve_master = 0; + _memory_global_reserve_count = 0; + _memory_global_reserve = 0; + } + atomic_store32_release(&_memory_global_lock, 0); + //Free all thread caches and fully free spans for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { heap_t* heap = _memory_heaps[list_idx]; @@ -2571,9 +2685,9 @@ rpmalloc_finalize(void) { #endif #if ENABLE_STATISTICS //If you hit these asserts you probably have memory leaks (perhaps global scope data doing dynamic allocations) or double frees in your code - assert(!atomic_load32(&_mapped_pages)); - assert(!atomic_load32(&_reserved_spans)); - assert(!atomic_load32(&_mapped_pages_os)); + assert(atomic_load32(&_mapped_pages) 
== 0); + assert(atomic_load32(&_reserved_spans) == 0); + assert(atomic_load32(&_mapped_pages_os) == 0); #endif _rpmalloc_initialized = 0; @@ -3096,7 +3210,7 @@ rpmalloc_heap_free_all(rpmalloc_heap_t* heap) { atomic_store32(&heap->size_class_use[iclass].spans_current, 0); } for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->span_use[iclass].current, 0 ); + atomic_store32(&heap->span_use[iclass].current, 0); } #endif } diff --git a/test/main.c b/test/main.c index 805cbecb..3d4a7196 100644 --- a/test/main.c +++ b/test/main.c @@ -370,7 +370,7 @@ test_realloc(void) { size_t bigsize = 1024 * 1024; void* bigptr = rpmalloc(bigsize); - while (bigsize < 3 * 1024 * 1024) { + while (bigsize < 3000000) { ++bigsize; bigptr = rprealloc(bigptr, bigsize); if (rpaligned_realloc(bigptr, 0, bigsize * 32, 0, RPMALLOC_GROW_OR_FAIL)) From 09cd31270aec2b18406c6ed24716e19cab948f94 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 26 Aug 2020 12:32:15 +0200 Subject: [PATCH 66/69] clean up changelog --- CHANGELOG | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/CHANGELOG b/CHANGELOG index 4fd0bb36..2820daff 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -2,13 +2,11 @@ Dual license as both released to public domain or under MIT license -Allow up to 4GiB page sizes +Allow up to 4GiB memory page sizes Fix an issue where large page sizes in conjunction with many threads waste a lot of memory (previously each heap occupied an entire memory page, now heaps can now share a memory page) -Added a missing null check in the non-hot allocation code paths - Fixed compilation issue on macOS when ENABLE_PRELOAD is set but not ENABLE_OVERRIDE New first class heap API allowing explicit heap control and release of entire heap in a single call @@ -17,18 +15,22 @@ Added rpaligned_calloc function for aligned and zero intialized allocations Fixed natural alignment check in rpaligned_realloc to 16 bytes (check was 32, which is wrong) -Minor performance improvements for all code paths by simplified span handling, and for aligned allocations -with alignment less or equal to 128 bytes by utilizing natural block alignments +Minor performance improvements for all code paths by simplified span handling + +Minor performance improvements and for aligned allocations with alignment less or equal to 128 bytes +by utilizing natural block alignments Refactor finalization to be compatible with global scope data causing dynamic allocations and frees, like -C++ objects with custom ctors/dtors. +C++ objects with custom ctors/dtors Refactor thread and global cache to be array based instead of list based for improved performance -and cache size control. +and cache size control Added missing C++ operator overloads with ENABLE_OVERRIDE when using Microsoft C++ runtimes -Fixed issue in pvalloc override that could return less than a memory page in usable size. 
+Fixed issue in pvalloc override that could return less than a memory page in usable size + +Added a missing null check in the non-hot allocation code paths 1.4.0 From 80b1f9be95fa01ca821de4e232a223e484252169 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 26 Aug 2020 13:00:21 +0200 Subject: [PATCH 67/69] clang compatibility --- rpmalloc/rpmalloc.c | 2 +- test/main-override.cc | 9 ++++++--- test/main.c | 3 +++ test/thread.c | 3 +++ 4 files changed, 13 insertions(+), 4 deletions(-) diff --git a/rpmalloc/rpmalloc.c b/rpmalloc/rpmalloc.c index 3c559c9e..a23d62af 100644 --- a/rpmalloc/rpmalloc.c +++ b/rpmalloc/rpmalloc.c @@ -1026,7 +1026,7 @@ static span_t* _rpmalloc_span_map(heap_t* heap, size_t span_count) { if (span_count <= heap->spans_reserved) return _rpmalloc_span_map_from_reserve(heap, span_count); - span_t* span = 0; + span_t* span = 0; if (_memory_page_size > _memory_span_size) { // If huge pages, make sure only one thread maps more memory to avoid bloat while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) { diff --git a/test/main-override.cc b/test/main-override.cc index b8df4f6e..3b394b28 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -63,18 +63,18 @@ test_alloc(void) { delete[] static_cast(p); p = valloc(873); - if ((uintptr_t)p & (config->page_size - 1)) { + if (reinterpret_cast(p) & (config->page_size - 1)) { fprintf(stderr, "FAIL: pvalloc did not align address to page size (%p)\n", p); return -1; } free(p); p = pvalloc(275); - if ((uintptr_t)p & (config->page_size - 1)) { + if (reinterpret_cast(p) & (config->page_size - 1)) { fprintf(stderr, "FAIL: pvalloc did not align address to page size (%p)\n", p); return -1; } - if (rpmalloc_usable_size(p) < config->page_size) { + if (reinterpret_cast(p) < config->page_size) { fprintf(stderr, "FAIL: pvalloc did not align size to page size (%llu)\n", rpmalloc_usable_size(p)); return -1; } @@ -164,6 +164,9 @@ main(int argc, char** argv) { #endif #ifdef _WIN32 +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wnonportable-system-include-path" +#endif #include static void diff --git a/test/main.c b/test/main.c index 3d4a7196..f8db4c7e 100644 --- a/test/main.c +++ b/test/main.c @@ -2,6 +2,9 @@ #if defined(_WIN32) && !defined(_CRT_SECURE_NO_WARNINGS) # define _CRT_SECURE_NO_WARNINGS #endif +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wnonportable-system-include-path" +#endif #include #include diff --git a/test/thread.c b/test/thread.c index adaab444..152b2682 100644 --- a/test/thread.c +++ b/test/thread.c @@ -6,6 +6,9 @@ #else # define ATTRIBUTE_NORETURN __attribute__((noreturn)) #endif +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wnonportable-system-include-path" +#endif #ifdef _WIN32 # include From bbd03e77367e49619d084a6998319c9344b9089c Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 26 Aug 2020 13:35:26 +0200 Subject: [PATCH 68/69] clang/gcc compatibility --- test/main-override.cc | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/test/main-override.cc b/test/main-override.cc index 3b394b28..760fa860 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -14,10 +14,8 @@ #include #include -#if defined(_WIN32) extern "C" void* RPMALLOC_CDECL pvalloc(size_t size); extern "C" void* RPMALLOC_CDECL valloc(size_t size); -#endif static size_t _hardware_threads; @@ -75,10 +73,10 @@ test_alloc(void) { return -1; } if (reinterpret_cast(p) < config->page_size) { - fprintf(stderr, "FAIL: pvalloc did not align size to page size (%llu)\n", 
rpmalloc_usable_size(p)); + fprintf(stderr, "FAIL: pvalloc did not align size to page size (%lu)\n", static_cast(rpmalloc_usable_size(p))); return -1; } - free(p); + rpfree(p); printf("Allocation tests passed\n"); return 0; @@ -89,6 +87,7 @@ test_free(void) { free(rpmalloc(371)); free(new int); free(new int[16]); + free(pvalloc(1275)); printf("Free tests passed\n"); return 0; } From 35dc09bd05c4a58f807968cb8895377a284c1413 Mon Sep 17 00:00:00 2001 From: Mattias Jansson Date: Wed, 26 Aug 2020 16:06:11 +0200 Subject: [PATCH 69/69] format compatibility --- test/main-override.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/main-override.cc b/test/main-override.cc index 760fa860..ece254c5 100644 --- a/test/main-override.cc +++ b/test/main-override.cc @@ -13,6 +13,7 @@ #include #include #include +#include extern "C" void* RPMALLOC_CDECL pvalloc(size_t size); extern "C" void* RPMALLOC_CDECL valloc(size_t size); @@ -73,7 +74,7 @@ test_alloc(void) { return -1; } if (reinterpret_cast(p) < config->page_size) { - fprintf(stderr, "FAIL: pvalloc did not align size to page size (%lu)\n", static_cast(rpmalloc_usable_size(p))); + fprintf(stderr, "FAIL: pvalloc did not align size to page size (%" PRIu64 ")\n", static_cast(rpmalloc_usable_size(p))); return -1; } rpfree(p);