From 6e360d34eea79e85b21352014a88c53c104c211d Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Sun, 25 Aug 2019 13:15:26 -0700
Subject: [PATCH 01/10] fix 1GB huge page flag on Linux

---
 src/os.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/os.c b/src/os.c
index a1b6cdf3..3a9bd30c 100644
--- a/src/os.c
+++ b/src/os.c
@@ -353,7 +353,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
       lflags |= MAP_HUGETLB;
       #endif
       #ifdef MAP_HUGE_1GB
-      if ((size % (uintptr_t)1 << 20) == 0) {
+      if ((size % ((uintptr_t)1 << 30)) == 0) {
         lflags |= MAP_HUGE_1GB;
       }
       else

From e8664001f76981079191b22aff6dbdada135e6fa Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Sun, 25 Aug 2019 22:59:12 -0700
Subject: [PATCH 02/10] Use standard _Atomic declarations and clean up atomic
 operations

---
 include/mimalloc-atomic.h | 186 +++++++++++++++++++-------------------
 include/mimalloc-types.h  |  11 ++-
 src/alloc.c               |   6 +-
 src/memory.c              |  54 +++++------
 src/options.c             |   2 +-
 src/os.c                  |  18 ++--
 src/page.c                |  13 +--
 src/segment.c             |  12 +--
 src/stats.c               |  22 ++---
 9 files changed, 165 insertions(+), 159 deletions(-)

diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index d504634c..739d0512 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -9,63 +9,98 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MIMALLOC_ATOMIC_H
 
 // ------------------------------------------------------
-// Atomics
+// Atomics 
+// We need to be portable between C, C++, and MSVC.
 // ------------------------------------------------------
 
-// Atomically increment a value; returns the incremented result.
-static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p);
+#if defined(_MSC_VER)
+#define _Atomic(tp)         tp
+#define ATOMIC_VAR_INIT(x)  x
+#elif defined(__cplusplus)
+#include <atomic>
+#define  _Atomic(tp)        std::atomic<tp>
+#else
+#include <stdatomic.h>
+#endif
 
-// Atomically increment a value; returns the incremented result.
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p);
+#define mi_atomic_cast(tp,x)  ((volatile _Atomic(tp)*)(x))  // cast `x` to a pointer-to-atomic `tp`; fully parenthesized so it composes safely in any expression
 
-// Atomically decrement a value; returns the decremented result.
-static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p);
+// ------------------------------------------------------
+// Atomic operations specialized for mimalloc
+// ------------------------------------------------------
 
-// Atomically add a 64-bit value; returns the added result.
-static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add);
+// Atomically add a 64-bit value; returns the previous value. 
+// Note: not using _Atomic(int64_t) as it is only used for stats. 
+static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add);
 
-// Atomically subtract a value; returns the subtracted result.
-static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub);
+// Atomically add a value; returns the previous value.
+static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add);
 
-// Atomically subtract a value; returns the subtracted result.
-static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub);
+// Atomically compare and exchange a value; returns `true` if successful. May fail spuriously.
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);
 
 // Atomically compare and exchange a value; returns `true` if successful.
-static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare);
-
-// Atomically compare and exchange a value; returns `true` if successful.
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare);
+static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);
 
 // Atomically exchange a value.
-static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange);
+static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange);
 
 // Atomically read a value
-static inline uintptr_t mi_atomic_read(volatile uintptr_t* p);
+static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p);
 
 // Atomically write a value
-static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x);
-
-// Atomically read a pointer
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  return (void*)mi_atomic_read( (volatile uintptr_t*)p );
-}
+static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x);
 
+// Yield
 static inline void mi_atomic_yield(void);
 
 
+
+// Atomically add a value; returns the previous value.
+static inline uintptr_t mi_atomic_addu(volatile _Atomic(uintptr_t)* p, uintptr_t add) {
+  return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, (intptr_t)add);
+}
+// Atomically subtract a value; returns the previous value (implemented as an atomic add of the negated amount).
+static inline uintptr_t mi_atomic_subu(volatile _Atomic(uintptr_t)* p, uintptr_t sub) {
+  return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, (intptr_t)((uintptr_t)0 - sub)); // negate in unsigned arithmetic: well-defined wrap-around, no signed-overflow UB
+}
+
+// Atomically increment a value; returns the previous value (i.e. the value before the increment).
+static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) {
+  return mi_atomic_addu(p, 1);
+}
+
+// Atomically decrement a value; returns the previous value (i.e. the value before the decrement).
+static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) {
+  return mi_atomic_subu(p, 1);
+}
+
+// Atomically read a pointer
+static inline void* mi_atomic_read_ptr_relaxed(volatile _Atomic(void*) const * p) {
+  return (void*)mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)p);
+}
+
 // Atomically write a pointer
-static inline void mi_atomic_write_ptr(volatile void** p, void* x) {
-  mi_atomic_write((volatile uintptr_t*)p, (uintptr_t)x );
+static inline void mi_atomic_write_ptr(volatile _Atomic(void*)* p, void* x) {
+  mi_atomic_write((volatile _Atomic(uintptr_t)*)p, (uintptr_t)x );
+}
+
+// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously.
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_ptr_weak(volatile _Atomic(void*)* p, void* desired, void* expected) {
+  return mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected);
 }
 
 // Atomically compare and exchange a pointer; returns `true` if successful.
-static inline bool mi_atomic_compare_exchange_ptr(volatile void** p, void* newp, void* compare) {
-  return mi_atomic_compare_exchange((volatile uintptr_t*)p, (uintptr_t)newp, (uintptr_t)compare);
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
+static inline bool mi_atomic_cas_ptr_strong(volatile _Atomic(void*)* p, void* desired, void* expected) {
+  return mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected);
 }
 
 // Atomically exchange a pointer value.
-static inline void* mi_atomic_exchange_ptr(volatile void** p, void* exchange) {
-  return (void*)mi_atomic_exchange((volatile uintptr_t*)p, (uintptr_t)exchange);
+static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exchange) {
+  return (void*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)p, (uintptr_t)exchange);
 }
 
 
@@ -73,49 +108,37 @@ static inline void* mi_atomic_exchange_ptr(volatile void** p, void* exchange) {
 #define WIN32_LEAN_AND_MEAN
 #include <windows.h>
 #include <intrin.h>
-#if (MI_INTPTR_SIZE==8)
+#ifdef _WIN64
 typedef LONG64   msc_intptr_t;
 #define RC64(f)  f##64
 #else
 typedef LONG     msc_intptr_t;
 #define RC64(f)  f
 #endif
-static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p) {
-  return (uintptr_t)RC64(_InterlockedIncrement)((volatile msc_intptr_t*)p);
+static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
+  return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
 }
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p) {
-  return (uint32_t)_InterlockedIncrement((volatile LONG*)p);
+static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
+  return (expected == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
 }
-static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p) {
-  return (uintptr_t)RC64(_InterlockedDecrement)((volatile msc_intptr_t*)p);
+static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
+  return mi_atomic_cas_strong(p,desired,expected);
 }
-static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub) {
-  return (uintptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)) - sub;
-}
-static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub) {
-  return (uint32_t)_InterlockedExchangeAdd((volatile LONG*)p, -((LONG)sub)) - sub;
-}
-static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare) {
-  return ((int32_t)compare == _InterlockedCompareExchange((volatile LONG*)p, (LONG)exchange, (LONG)compare));
-}
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare) {
-  return (compare == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange, (msc_intptr_t)compare));
-}
-static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) {
+static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
   return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
 }
-static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
+static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) {
   return *p;
 }
-static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
-  *p = x;
+static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
+  mi_atomic_exchange(p,x);
 }
 static inline void mi_atomic_yield(void) {
   YieldProcessor();
 }
-static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) {
-  #if (MI_INTPTR_SIZE==8)
-  return _InterlockedExchangeAdd64(p, add) + add;
+static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add) {  // plain int64_t*, matching the forward declaration (stats only)
+  #ifdef _WIN64
+  return mi_atomic_add(p,add);
   #else
   int64_t current;
   int64_t sum;
@@ -123,62 +146,43 @@ static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) {
     current = *p;
     sum = current + add;
   } while (_InterlockedCompareExchange64(p, sum, current) != current);
-  return sum;
+  return current;
   #endif
 }
 
 #else
 #ifdef __cplusplus
-#include <atomic>
 #define  MI_USING_STD   using namespace std;
-#define  _Atomic(tp)    atomic<tp>
 #else
-#include <stdatomic.h>
 #define  MI_USING_STD
 #endif
-static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p) {
+static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add) {
   MI_USING_STD
-  return atomic_fetch_add_explicit((volatile atomic_uintptr_t*)p, (uintptr_t)1, memory_order_relaxed) + 1;
+  return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed);
 }
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p) {
+static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
   MI_USING_STD
-  return atomic_fetch_add_explicit((volatile _Atomic(uint32_t)*)p, (uint32_t)1, memory_order_relaxed) + 1;
+  return atomic_fetch_add_explicit(p, add, memory_order_relaxed);
 }
-static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p) {
+static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
   MI_USING_STD
-  return atomic_fetch_sub_explicit((volatile atomic_uintptr_t*)p, (uintptr_t)1, memory_order_relaxed) - 1;
+  return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed);
 }
-static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) {
+static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
   MI_USING_STD
-  return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed) + add;
+  return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed);
 }
-static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub) {
+static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
   MI_USING_STD
-  return atomic_fetch_sub_explicit((volatile atomic_uintptr_t*)p, sub, memory_order_relaxed) - sub;
+  return atomic_exchange_explicit(p, exchange, memory_order_acq_rel);
 }
-static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub) {
+static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) {
   MI_USING_STD
-  return atomic_fetch_sub_explicit((volatile _Atomic(uint32_t)*)p, sub, memory_order_relaxed) - sub;
+  return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed);
 }
-static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare) {
+static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
   MI_USING_STD
-  return atomic_compare_exchange_weak_explicit((volatile _Atomic(uint32_t)*)p, &compare, exchange, memory_order_release, memory_order_relaxed);
-}
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare) {
-  MI_USING_STD
-  return atomic_compare_exchange_weak_explicit((volatile atomic_uintptr_t*)p, &compare, exchange, memory_order_release, memory_order_relaxed);
-}
-static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) {
-  MI_USING_STD
-  return atomic_exchange_explicit((volatile atomic_uintptr_t*)p, exchange, memory_order_acquire);
-}
-static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
-  MI_USING_STD
-  return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
-}
-static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
-  MI_USING_STD
-  return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
+  atomic_store_explicit(p, x, memory_order_release);  // void function: nothing to return
 }
 
 #if defined(__cplusplus)
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index db39b9c4..0b2334b8 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -10,6 +10,7 @@ terms of the MIT license. A copy of the license can be found in the file
 
 #include <stddef.h>   // ptrdiff_t
 #include <stdint.h>   // uintptr_t, uint16_t, etc
+#include <mimalloc-atomic.h>  // _Atomic
 
 // ------------------------------------------------------
 // Variants
@@ -177,8 +178,8 @@ typedef struct mi_page_s {
   size_t                used;              // number of blocks in use (including blocks in `local_free` and `thread_free`)
   
   mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
-  volatile uintptr_t    thread_freed;      // at least this number of blocks are in `thread_free`
-  volatile mi_thread_free_t thread_free;   // list of deferred free blocks freed by other threads
+  volatile _Atomic(uintptr_t)        thread_freed;  // at least this number of blocks are in `thread_free`
+  volatile _Atomic(mi_thread_free_t) thread_free;   // list of deferred free blocks freed by other threads
 
   // less accessed info
   size_t                block_size;        // size available in each block (always `>0`)
@@ -208,7 +209,7 @@ typedef enum mi_page_kind_e {
 typedef struct mi_segment_s {
   struct mi_segment_s* next;
   struct mi_segment_s* prev;
-  volatile struct mi_segment_s* abandoned_next;
+  volatile _Atomic(struct mi_segment_s*) abandoned_next;
   size_t          abandoned;   // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
   size_t          used;        // count of pages in use (`used <= capacity`)
   size_t          capacity;    // count of available pages (`#free + used`)
@@ -219,7 +220,7 @@ typedef struct mi_segment_s {
 
   // layout like this to optimize access in `mi_free`
   size_t          page_shift;  // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
-  volatile uintptr_t thread_id;   // unique id of the thread owning this segment
+  volatile _Atomic(uintptr_t) thread_id;   // unique id of the thread owning this segment
   mi_page_kind_t  page_kind;   // kind of pages: small, large, or huge
   mi_page_t       pages[1];    // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
 } mi_segment_t;
@@ -255,7 +256,7 @@ struct mi_heap_s {
   mi_tld_t*             tld;
   mi_page_t*            pages_free_direct[MI_SMALL_WSIZE_MAX + 2];   // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
   mi_page_queue_t       pages[MI_BIN_FULL + 1];                      // queue of pages for each size class (or "bin")
-  volatile mi_block_t*  thread_delayed_free;
+  volatile _Atomic(mi_block_t*) thread_delayed_free;
   uintptr_t             thread_id;                                   // thread this heap belongs too
   uintptr_t             cookie;
   uintptr_t             random;                                      // random number used for secure allocation
diff --git a/src/alloc.c b/src/alloc.c
index 76e093e7..97c5fcc4 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -144,7 +144,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
       mi_block_set_next(page, block, mi_tf_block(tfree));
       tfreex = mi_tf_set_block(tfree,block);
     }
-  } while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree));
+  } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
 
   if (mi_likely(!use_delayed)) {
     // increment the thread free count and return
@@ -160,7 +160,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
       do {
         dfree = (mi_block_t*)heap->thread_delayed_free;
         mi_block_set_nextx(heap->cookie,block,dfree);
-      } while (!mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, block, dfree));
+      } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
     }
 
     // and reset the MI_DELAYED_FREEING flag
@@ -168,7 +168,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
       tfreex = tfree = page->thread_free;
       mi_assert_internal(mi_tf_delayed(tfree) == MI_NEVER_DELAYED_FREE || mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
       if (mi_tf_delayed(tfree) != MI_NEVER_DELAYED_FREE) tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
-    } while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree));
+    } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
   }
 }
 
diff --git a/src/memory.c b/src/memory.c
index 26f87092..1ea6ee16 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -69,8 +69,8 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, mi_os_tld
 // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
 // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
 typedef struct mem_region_s {
-  volatile uintptr_t map;    // in-use bit per MI_SEGMENT_SIZE block
-  volatile void*     start;  // start of virtual memory area
+  volatile _Atomic(uintptr_t) map;    // in-use bit per MI_SEGMENT_SIZE block
+  volatile _Atomic(void*)     start;  // start of virtual memory area
 } mem_region_t;
 
 
@@ -78,7 +78,7 @@ typedef struct mem_region_s {
 // TODO: in the future, maintain a map per NUMA node for numa aware allocation
 static mem_region_t regions[MI_REGION_MAX];
 
-static volatile size_t regions_count = 0;        // allocated regions
+static volatile _Atomic(uintptr_t) regions_count; // = 0;        // allocated regions
 
 
 /* ----------------------------------------------------------------------------
@@ -106,9 +106,9 @@ static size_t mi_good_commit_size(size_t size) {
 // Return if a pointer points into a region reserved by us.
 bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
   if (p==NULL) return false;
-  size_t count = mi_atomic_read(&regions_count);
+  size_t count = mi_atomic_read_relaxed(&regions_count);
   for (size_t i = 0; i < count; i++) {
-    uint8_t* start = (uint8_t*)mi_atomic_read_ptr(&regions[i].start);
+    uint8_t* start = (uint8_t*)mi_atomic_read_ptr_relaxed(&regions[i].start);
     if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
   }
   return false;
@@ -127,11 +127,11 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
 {
   size_t mask = mi_region_block_mask(blocks,bitidx);
   mi_assert_internal(mask != 0);
-  mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask);
+  mi_assert_internal((mask & mi_atomic_read_relaxed(&region->map)) == mask);
   mi_assert_internal(&regions[idx] == region);
 
   // ensure the region is reserved
-  void* start = mi_atomic_read_ptr(&region->start);
+  void* start = mi_atomic_read_ptr_relaxed(&region->start);
   if (start == NULL) 
   {
     start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);    
@@ -139,13 +139,13 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
       // failure to allocate from the OS! unclaim the blocks and fail
       size_t map;
       do {
-        map = mi_atomic_read(&region->map);
-      } while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map));
+        map = mi_atomic_read_relaxed(&region->map);
+      } while (!mi_atomic_cas_weak(&region->map, map & ~mask, map));
       return false;
     }
 
     // set the newly allocated region
-    if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) {
+    if (mi_atomic_cas_ptr_strong(&region->start, start, NULL)) {
       // update the region count
       mi_atomic_increment(&regions_count);
     }
@@ -154,9 +154,9 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
       // we assign it to a later slot instead (up to 4 tries).
       // note: we don't need to increment the region count, this will happen on another allocation
       for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
-        void* s = mi_atomic_read_ptr(&regions[idx+i].start);
+        void* s = mi_atomic_read_ptr_relaxed(&regions[idx+i].start);
         if (s == NULL) { // quick test
-          if (mi_atomic_compare_exchange_ptr(&regions[idx+i].start, start, s)) {
+          if (mi_atomic_cas_ptr_strong(&regions[idx+i].start, start, s)) {  // single attempt without a retry loop: must not fail spuriously
             start = NULL;
             break;
           }
@@ -167,10 +167,10 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
         _mi_os_free(start, MI_REGION_SIZE, tld->stats);
       }
       // and continue with the memory at our index
-      start = mi_atomic_read_ptr(&region->start);
+      start = mi_atomic_read_ptr_relaxed(&region->start);
     }
   }
-  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
+  mi_assert_internal(start == mi_atomic_read_ptr_relaxed(&region->start));
   mi_assert_internal(start != NULL);
 
   // Commit the blocks to memory
@@ -230,7 +230,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
 
   const uintptr_t mask = mi_region_block_mask(blocks, 0);
   const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
-  uintptr_t map = mi_atomic_read(&region->map);
+  uintptr_t map = mi_atomic_read_relaxed(&region->map);
 
   #ifdef MI_HAVE_BITSCAN
   size_t bitidx = mi_bsf(~map);    // quickly find the first zero bit if possible
@@ -245,9 +245,9 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
       mi_assert_internal((m >> bitidx) == mask); // no overflow?
       uintptr_t newmap = map | m;
       mi_assert_internal((newmap^map) >> bitidx == mask);
-      if (!mi_atomic_compare_exchange(&region->map, newmap, map)) {
+      if (!mi_atomic_cas_strong(&region->map, newmap, map)) {
         // no success, another thread claimed concurrently.. keep going
-        map = mi_atomic_read(&region->map);
+        map = mi_atomic_read_relaxed(&region->map);
         continue;
       }
       else {
@@ -281,7 +281,7 @@ static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, b
   // check if there are available blocks in the region..
   mi_assert_internal(idx < MI_REGION_MAX);
   mem_region_t* region = &regions[idx];
-  uintptr_t m = mi_atomic_read(&region->map);
+  uintptr_t m = mi_atomic_read_relaxed(&region->map);
   if (m != MI_REGION_MAP_FULL) {  // some bits are zero
     return mi_region_alloc_blocks(region, idx, blocks, size, commit, p, id, tld);
   }
@@ -317,7 +317,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t*
 
   // find a range of free blocks
   void* p = NULL;
-  size_t count = mi_atomic_read(&regions_count);
+  size_t count = mi_atomic_read_relaxed(&regions_count);
   size_t idx = tld->region_idx; // start index is per-thread to reduce contention
   for (size_t visited = 0; visited < count; visited++, idx++) {
     if (idx >= count) idx = 0;  // wrap around
@@ -376,8 +376,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
     size_t mask = mi_region_block_mask(blocks, bitidx);
     mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`?
     mem_region_t* region = &regions[idx];
-    mi_assert_internal((mi_atomic_read(&region->map) & mask) == mask ); // claimed?
-    void* start = mi_atomic_read_ptr(&region->start);
+    mi_assert_internal((mi_atomic_read_relaxed(&region->map) & mask) == mask ); // claimed?
+    void* start = mi_atomic_read_ptr_relaxed(&region->start);
     mi_assert_internal(start != NULL);
     void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
     mi_assert_internal(blocks_start == p); // not a pointer in our area?
@@ -405,9 +405,9 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
     uintptr_t map;
     uintptr_t newmap;
     do {
-      map = mi_atomic_read(&region->map);
+      map = mi_atomic_read_relaxed(&region->map);
       newmap = map & ~mask;
-    } while (!mi_atomic_compare_exchange(&region->map, newmap, map));
+    } while (!mi_atomic_cas_weak(&region->map, newmap, map));
   }
 }
 
@@ -419,17 +419,17 @@ void _mi_mem_collect(mi_stats_t* stats) {
   // free every region that has no segments in use.
   for (size_t i = 0; i < regions_count; i++) {
     mem_region_t* region = &regions[i];
-    if (mi_atomic_read(&region->map) == 0 && region->start != NULL) {
+    if (mi_atomic_read_relaxed(&region->map) == 0 && region->start != NULL) {
       // if no segments used, try to claim the whole region
       uintptr_t m;
       do {
-        m = mi_atomic_read(&region->map);
-      } while(m == 0 && !mi_atomic_compare_exchange(&region->map, ~((uintptr_t)0), 0 ));
+        m = mi_atomic_read_relaxed(&region->map);
+      } while(m == 0 && !mi_atomic_cas_weak(&region->map, ~((uintptr_t)0), 0 ));
       if (m == 0) {
         // on success, free the whole region
         if (region->start != NULL) _mi_os_free((void*)region->start, MI_REGION_SIZE, stats);
         // and release
-        region->start = 0;
+        mi_atomic_write_ptr(&region->start,NULL);
         mi_atomic_write(&region->map,0);
       }
     }
diff --git a/src/options.c b/src/options.c
index b30ff1c6..88f2503e 100644
--- a/src/options.c
+++ b/src/options.c
@@ -127,7 +127,7 @@ void mi_option_disable(mi_option_t option) {
 // Messages
 // --------------------------------------------------------
 #define MAX_ERROR_COUNT (10)
-static uintptr_t error_count = 0;  // when MAX_ERROR_COUNT stop emitting errors and warnings
+static volatile _Atomic(uintptr_t) error_count; // = 0;  // when MAX_ERROR_COUNT stop emitting errors and warnings
 
 // When overriding malloc, we may recurse into mi_vfprintf if an allocation
 // inside the C runtime causes another message.
diff --git a/src/os.c b/src/os.c
index e7ed57b5..fc9c5acc 100644
--- a/src/os.c
+++ b/src/os.c
@@ -186,11 +186,11 @@ static bool mi_os_mem_free(void* addr, size_t size, mi_stats_t* stats)
 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
 #if (MI_INTPTR_SIZE >= 8) 
     // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations
-  static volatile intptr_t aligned_base = ((intptr_t)4 << 40); // starting at 4TiB
+  static volatile _Atomic(intptr_t) aligned_base = ATOMIC_VAR_INIT((intptr_t)4 << 40); // starting at 4TiB
   if (addr == NULL && try_alignment > 0 &&
       try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE) == 0) 
   {
-    intptr_t hint = mi_atomic_add(&aligned_base, size) - size;
+    intptr_t hint = mi_atomic_add(&aligned_base, size);
     if (hint%try_alignment == 0) {
       return VirtualAlloc((void*)hint, size, flags, PAGE_READWRITE);
     }
@@ -214,11 +214,11 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
   static volatile uintptr_t large_page_try_ok = 0;
   void* p = NULL;
   if (use_large_os_page(size, try_alignment)) {
-    uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
+    uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok);
     if (try_ok > 0) {
       // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
       // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
-      mi_atomic_compare_exchange(&large_page_try_ok, try_ok - 1, try_ok);
+      mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok);
     }
     else {
       // large OS pages must always reserve and commit.
@@ -253,9 +253,9 @@ static void* mi_unix_mmapx(size_t size, size_t try_alignment, int protect_flags,
   void* p = NULL;
   #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED)
   // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations
-  static volatile intptr_t aligned_base = ((intptr_t)1 << 42); // starting at 4TiB
+  static volatile _Atomic(intptr_t) aligned_base = ATOMIC_VAR_INIT((intptr_t)1 << 42); // starting at 4TiB
   if (try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE)==0) {
-    intptr_t hint = mi_atomic_add(&aligned_base,size) - size;
+    intptr_t hint = mi_atomic_add(&aligned_base,size);
     if (hint%try_alignment == 0) {
       p = mmap((void*)hint,size,protect_flags,flags,fd,0);
       if (p==MAP_FAILED) p = NULL; // fall back to regular mmap
@@ -291,14 +291,14 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags)
   fd = VM_MAKE_TAG(100);
   #endif
   if (use_large_os_page(size, try_alignment)) {
-    static volatile uintptr_t large_page_try_ok = 0;
-    uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
+    static volatile _Atomic(uintptr_t) large_page_try_ok = 0;
+    uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok);
     if (try_ok > 0) {
       // If the OS is not configured for large OS pages, or the user does not have
       // enough permission, the `mmap` will always fail (but it might also fail for other reasons).
       // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
       // to avoid too many failing calls to mmap.
-      mi_atomic_compare_exchange(&large_page_try_ok, try_ok - 1, try_ok);
+      mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok);
     }
     else {
       int lflags = flags;
diff --git a/src/page.c b/src/page.c
index 54897af5..a95f5b51 100644
--- a/src/page.c
+++ b/src/page.c
@@ -49,11 +49,12 @@ static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
   return count;
 }
 
+/*
 // Start of the page available memory
 static inline uint8_t* mi_page_area(const mi_page_t* page) {
   return _mi_page_start(_mi_page_segment(page), page, NULL);
 }
-
+*/
 
 static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) {
   size_t psize;
@@ -126,7 +127,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay  ) {
     }
   }
   while((mi_tf_delayed(tfreex) !=  mi_tf_delayed(tfree)) && // avoid atomic operation if already equal
-        !mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree));
+        !mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
 }
 
 
@@ -147,7 +148,7 @@ static void mi_page_thread_free_collect(mi_page_t* page)
     tfree = page->thread_free;
     head = mi_tf_block(tfree);
     tfreex = mi_tf_set_block(tfree,NULL);
-  } while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree));
+  } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree));
 
   // return if the list is empty
   if (head == NULL) return;
@@ -166,7 +167,7 @@ static void mi_page_thread_free_collect(mi_page_t* page)
   page->free = head;
 
   // update counts now
-  mi_atomic_subtract(&page->thread_freed, count);
+  mi_atomic_subu(&page->thread_freed, count);
   page->used -= count;
 }
 
@@ -257,7 +258,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
   mi_block_t* block;
   do {
     block = (mi_block_t*)heap->thread_delayed_free;
-  } while (block != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, NULL, block));
+  } while (block != NULL && !mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), NULL, block));
 
   // and free them all
   while(block != NULL) {
@@ -270,7 +271,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
       do {
         dfree = (mi_block_t*)heap->thread_delayed_free;
         mi_block_set_nextx(heap->cookie, block, dfree);
-      } while (!mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, block, dfree));
+      } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
 
     }
     block = next;
diff --git a/src/segment.c b/src/segment.c
index 18c06fbc..9a744ea6 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -542,8 +542,8 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
 // live blocks (reached through other threads). Such segments
 // are "abandoned" and will be reclaimed by other threads to
 // reuse their pages and/or free them eventually
-static volatile mi_segment_t* abandoned = NULL;
-static volatile uintptr_t     abandoned_count = 0;
+static volatile _Atomic(mi_segment_t*) abandoned; // = NULL;
+static volatile _Atomic(uintptr_t)     abandoned_count; // = 0;
 
 static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
   mi_assert_internal(segment->used == segment->abandoned);
@@ -561,9 +561,9 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
   segment->thread_id = 0;
   mi_segment_t* next;
   do {
-    next = (mi_segment_t*)abandoned;
-    mi_atomic_write_ptr((volatile void**)&segment->abandoned_next, next);
-  } while (!mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment, next));
+    next = (mi_segment_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&abandoned));
+    mi_atomic_write_ptr(mi_atomic_cast(void*,&segment->abandoned_next), next);
+  } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&abandoned), segment, next));
   mi_atomic_increment(&abandoned_count);
 }
 
@@ -597,7 +597,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
     mi_segment_t* segment;
     do {
       segment = (mi_segment_t*)abandoned;
-    } while(segment != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, (mi_segment_t*)segment->abandoned_next, segment));
+    } while(segment != NULL && !mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&abandoned), (mi_segment_t*)segment->abandoned_next, segment));
     if (segment==NULL) break; // stop early if no more segments available
 
     // got it.
diff --git a/src/stats.c b/src/stats.c
index 39015f94..2176ba17 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -38,13 +38,13 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
   if (mi_is_in_main(stat))
   {
     // add atomically (for abandoned pages)
-    int64_t current = mi_atomic_add(&stat->current,amount);
+    int64_t current = mi_atomic_add64(&stat->current,amount);
     if (current > stat->peak) stat->peak = stat->current;  // racing.. it's ok
     if (amount > 0) {
-      mi_atomic_add(&stat->allocated,amount);
+      mi_atomic_add64(&stat->allocated,amount);
     }
     else {
-      mi_atomic_add(&stat->freed, -amount);
+      mi_atomic_add64(&stat->freed, -amount);
     }
   }
   else {
@@ -62,8 +62,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
 
 void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {  
   if (mi_is_in_main(stat)) {
-    mi_atomic_add( &stat->count, 1 );
-    mi_atomic_add( &stat->total, (int64_t)amount );
+    mi_atomic_add64( &stat->count, 1 );
+    mi_atomic_add64( &stat->total, (int64_t)amount );
   }
   else {
     stat->count++;
@@ -82,16 +82,16 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
 // must be thread safe as it is called from stats_merge
 static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
   if (stat==src) return;
-  mi_atomic_add( &stat->allocated, src->allocated * unit);
-  mi_atomic_add( &stat->current, src->current * unit);
-  mi_atomic_add( &stat->freed, src->freed * unit);
-  mi_atomic_add( &stat->peak, src->peak * unit);
+  mi_atomic_add64( &stat->allocated, src->allocated * unit);
+  mi_atomic_add64( &stat->current, src->current * unit);
+  mi_atomic_add64( &stat->freed, src->freed * unit);
+  mi_atomic_add64( &stat->peak, src->peak * unit);
 }
 
 static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
   if (stat==src) return;
-  mi_atomic_add( &stat->total, src->total * unit);
-  mi_atomic_add( &stat->count, src->count * unit);
+  mi_atomic_add64( &stat->total, src->total * unit);
+  mi_atomic_add64( &stat->count, src->count * unit);
 }
 
 // must be thread safe as it is called from stats_merge

From baabc775034efeb55a93c8088492933e56d8334f Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Sun, 25 Aug 2019 23:02:41 -0700
Subject: [PATCH 03/10] use proper atomic initialization macros

---
 src/init.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/init.c b/src/init.c
index 76e586f2..290caeec 100644
--- a/src/init.c
+++ b/src/init.c
@@ -19,7 +19,8 @@ const mi_page_t _mi_page_empty = {
   0,
   #endif
   0,       // used
-  NULL, 0, 0,
+  NULL, 
+  ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0),
   0, NULL, NULL, NULL
   #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0)
   , { NULL } // padding
@@ -81,7 +82,7 @@ const mi_heap_t _mi_heap_empty = {
   NULL,
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY,
-  NULL,
+  ATOMIC_VAR_INIT(NULL),
   0,
   0,
   0,

From 2159c224151e5be1f3bcf73acefe62eef17d080f Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Sun, 25 Aug 2019 23:06:18 -0700
Subject: [PATCH 04/10] fix atomic declaration on windows

---
 src/os.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/os.c b/src/os.c
index fc9c5acc..fb36f3fc 100644
--- a/src/os.c
+++ b/src/os.c
@@ -211,7 +211,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
 }
 
 static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags) {
-  static volatile uintptr_t large_page_try_ok = 0;
+  static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
   void* p = NULL;
   if (use_large_os_page(size, try_alignment)) {
     uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok);
@@ -291,7 +291,7 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags)
   fd = VM_MAKE_TAG(100);
   #endif
   if (use_large_os_page(size, try_alignment)) {
-    static volatile _Atomic(uintptr_t) large_page_try_ok = 0;
+    static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
     uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok);
     if (try_ok > 0) {
       // If the OS is not configured for large OS pages, or the user does not have

From 5c7c106d62f70db566e337abd6575021ec55f1bf Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Mon, 26 Aug 2019 08:11:15 -0700
Subject: [PATCH 05/10] strengthen some atomic operations for weak memory
 models

---
 include/mimalloc-atomic.h | 45 +++++++++++++++++++++++++++------------
 src/alloc.c               | 18 +++++++++-------
 src/memory.c              | 20 ++++++++---------
 src/stats.c               |  4 ++--
 4 files changed, 53 insertions(+), 34 deletions(-)

diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index 739d0512..3a289feb 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -30,26 +30,32 @@ terms of the MIT license. A copy of the license can be found in the file
 // ------------------------------------------------------
 
 // Atomically add a 64-bit value; returns the previous value. 
-// Note: not using _Atomic(int64_t) as it is only used for stats. 
-static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add);
+// Note: not using _Atomic(int64_t) as it is only used for statistics.
+static inline void mi_atomic_add64(volatile int64_t* p, int64_t add);
 
-// Atomically add a value; returns the previous value.
+// Atomically add a value; returns the previous value. Memory ordering is relaxed.
 static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add);
 
-// Atomically compare and exchange a value; returns `true` if successful. May fail spuriously.
+// Atomically compare and exchange a value; returns `true` if successful. 
+// May fail spuriously. Memory ordering is release on success, and relaxed on failure.
 // (Note: expected and desired are in opposite order from atomic_compare_exchange)
 static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);
 
 // Atomically compare and exchange a value; returns `true` if successful.
+// Memory ordering is acquire-release
+// (Note: expected and desired are in opposite order from atomic_compare_exchange)
 static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);
 
-// Atomically exchange a value.
+// Atomically exchange a value. Memory ordering is acquire-release.
 static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange);
 
-// Atomically read a value
+// Atomically read a value. Memory ordering is relaxed.
 static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p);
 
-// Atomically write a value
+// Atomically read a value. Memory ordering is acquire.
+static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p);
+
+// Atomically write a value. Memory ordering is release.
 static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x);
 
 // Yield
@@ -76,11 +82,16 @@ static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) {
   return mi_atomic_subu(p, 1);
 }
 
-// Atomically read a pointer
+// Atomically read a pointer; Memory order is relaxed.
 static inline void* mi_atomic_read_ptr_relaxed(volatile _Atomic(void*) const * p) {
   return (void*)mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)p);
 }
 
+// Atomically read a pointer; Memory order is acquire.
+static inline void* mi_atomic_read_ptr(volatile _Atomic(void*) const * p) {
+  return (void*)mi_atomic_read((const volatile _Atomic(uintptr_t)*)p);
+}
+
 // Atomically write a pointer
 static inline void mi_atomic_write_ptr(volatile _Atomic(void*)* p, void* x) {
   mi_atomic_write((volatile _Atomic(uintptr_t)*)p, (uintptr_t)x );
@@ -127,18 +138,21 @@ static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t
 static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
   return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
 }
-static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) {
+static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) {
   return *p;
 }
+static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) {
+  return mi_atomic_read(p);
+}
 static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
   mi_atomic_exchange(p,x);
 }
 static inline void mi_atomic_yield(void) {
   YieldProcessor();
 }
-static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) {
+static inline void mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) {
   #ifdef _WIN64
-  return mi_atomic_add(p,add);
+  mi_atomic_add(p,add);
   #else
   int64_t current;
   int64_t sum;
@@ -146,7 +160,6 @@ static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add)
     current = *p;
     sum = current + add;
   } while (_InterlockedCompareExchange64(p, sum, current) != current);
-  return current;
   #endif
 }
 
@@ -156,9 +169,9 @@ static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add)
 #else
 #define  MI_USING_STD
 #endif
-static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add) {
+static inline void mi_atomic_add64(volatile int64_t* p, int64_t add) {
   MI_USING_STD
-  return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed);
+  atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed);
 }
 static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
   MI_USING_STD
@@ -180,6 +193,10 @@ static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)
   MI_USING_STD
   return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed);
 }
+static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) {
+  MI_USING_STD
+  return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire);
+}
 static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
   MI_USING_STD
   return atomic_store_explicit(p, x, memory_order_release);
diff --git a/src/alloc.c b/src/alloc.c
index 97c5fcc4..7e89a591 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -118,22 +118,24 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
   mi_segment_t* segment = _mi_page_segment(page);
   if (segment->page_kind==MI_PAGE_HUGE) {
     // huge page segments are always abandoned and can be freed immediately
-    mi_assert_internal(segment->thread_id==0);
-    mi_assert_internal(segment->abandoned_next==NULL);
+    mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
+    mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&segment->abandoned_next))==NULL);
     // claim it and free
-    mi_block_set_next(page, block, page->free);
-    page->free = block;
-    page->used--;
     mi_heap_t* heap = mi_get_default_heap();
-    segment->thread_id = heap->thread_id;
-    _mi_segment_page_free(page,true,&heap->tld->segments);
+    // paranoia: if this is the last reference, the cas should always succeed
+    if (mi_atomic_cas_strong(&segment->thread_id,heap->thread_id,0)) {
+      mi_block_set_next(page, block, page->free);
+      page->free = block;
+      page->used--;
+      _mi_segment_page_free(page,true,&heap->tld->segments);
+    }
     return;
   }
 
   do {
     tfree = page->thread_free;
     use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE ||
-                   (mi_tf_delayed(tfree) == MI_NO_DELAYED_FREE && page->used == page->thread_freed+1)
+                   (mi_tf_delayed(tfree) == MI_NO_DELAYED_FREE && page->used == mi_atomic_read_relaxed(&page->thread_freed)+1)  // data-race but ok, just optimizes early release of the page
                   );
     if (mi_unlikely(use_delayed)) {
       // unlikely: this only happens on the first concurrent free in a page that is in the full list
diff --git a/src/memory.c b/src/memory.c
index 1ea6ee16..268dc153 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -131,7 +131,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
   mi_assert_internal(&regions[idx] == region);
 
   // ensure the region is reserved
-  void* start = mi_atomic_read_ptr_relaxed(&region->start);
+  void* start = mi_atomic_read_ptr(&region->start);
   if (start == NULL) 
   {
     start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);    
@@ -154,9 +154,9 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
       // we assign it to a later slot instead (up to 4 tries).
       // note: we don't need to increment the region count, this will happen on another allocation
       for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
-        void* s = mi_atomic_read_ptr_relaxed(&regions[idx+i].start);
+        void* s = mi_atomic_read_ptr(&regions[idx+i].start);
         if (s == NULL) { // quick test
-          if (mi_atomic_cas_ptr_weak(&regions[idx+i].start, start, s)) {
+          if (mi_atomic_cas_ptr_strong(&regions[idx+i].start, start, NULL)) {
             start = NULL;
             break;
           }
@@ -167,10 +167,10 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
         _mi_os_free(start, MI_REGION_SIZE, tld->stats);
       }
       // and continue with the memory at our index
-      start = mi_atomic_read_ptr_relaxed(&region->start);
+      start = mi_atomic_read_ptr(&region->start);
     }
   }
-  mi_assert_internal(start == mi_atomic_read_ptr_relaxed(&region->start));
+  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
   mi_assert_internal(start != NULL);
 
   // Commit the blocks to memory
@@ -230,7 +230,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
 
   const uintptr_t mask = mi_region_block_mask(blocks, 0);
   const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
-  uintptr_t map = mi_atomic_read_relaxed(&region->map);
+  uintptr_t map = mi_atomic_read(&region->map);
 
   #ifdef MI_HAVE_BITSCAN
   size_t bitidx = mi_bsf(~map);    // quickly find the first zero bit if possible
@@ -245,9 +245,9 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
       mi_assert_internal((m >> bitidx) == mask); // no overflow?
       uintptr_t newmap = map | m;
       mi_assert_internal((newmap^map) >> bitidx == mask);
-      if (!mi_atomic_cas_strong(&region->map, newmap, map)) {
+      if (!mi_atomic_cas_weak(&region->map, newmap, map)) {
         // no success, another thread claimed concurrently.. keep going
-        map = mi_atomic_read_relaxed(&region->map);
+        map = mi_atomic_read(&region->map);
         continue;
       }
       else {
@@ -317,7 +317,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t*
 
   // find a range of free blocks
   void* p = NULL;
-  size_t count = mi_atomic_read_relaxed(&regions_count);
+  size_t count = mi_atomic_read(&regions_count);
   size_t idx = tld->region_idx; // start index is per-thread to reduce contention
   for (size_t visited = 0; visited < count; visited++, idx++) {
     if (idx >= count) idx = 0;  // wrap around
@@ -377,7 +377,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
     mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`?
     mem_region_t* region = &regions[idx];
     mi_assert_internal((mi_atomic_read_relaxed(&region->map) & mask) == mask ); // claimed?
-    void* start = mi_atomic_read_ptr_relaxed(&region->start);
+    void* start = mi_atomic_read_ptr(&region->start);
     mi_assert_internal(start != NULL);
     void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
     mi_assert_internal(blocks_start == p); // not a pointer in our area?
diff --git a/src/stats.c b/src/stats.c
index 2176ba17..4dddb4bc 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -38,8 +38,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
   if (mi_is_in_main(stat))
   {
     // add atomically (for abandoned pages)
-    int64_t current = mi_atomic_add64(&stat->current,amount);
-    if (current > stat->peak) stat->peak = stat->current;  // racing.. it's ok
+    mi_atomic_add64(&stat->current,amount);
+    if (stat->current > stat->peak) stat->peak = stat->current;  // racing.. it's ok
     if (amount > 0) {
       mi_atomic_add64(&stat->allocated,amount);
     }

From 7ce9c02fd40796e4392892c0d413a0ac3462d112 Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Mon, 26 Aug 2019 08:20:26 -0700
Subject: [PATCH 06/10] make cas weak use release memory order; improve free
 assembly

---
 include/mimalloc-atomic.h | 2 +-
 src/alloc.c               | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index 3a289feb..8b254d3e 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -179,7 +179,7 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add
 }
 static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
   MI_USING_STD
-  return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed);
+  return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed);
 }
 static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
   MI_USING_STD
diff --git a/src/alloc.c b/src/alloc.c
index 7e89a591..afc181dd 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -226,7 +226,7 @@ void mi_free(void* p) mi_attr_noexcept
 #endif
 
   const mi_segment_t* const segment = _mi_ptr_segment(p);
-  if (segment == NULL) return;  // checks for (p==NULL)
+  if (mi_unlikely(segment == NULL)) return;  // checks for (p==NULL)
 
 #if (MI_DEBUG>0)
   if (mi_unlikely(!mi_is_in_heap_region(p))) {

From 2c19388bcfc08fa2acb3b4e58c569b7ff4b060e7 Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Mon, 26 Aug 2019 11:44:41 -0700
Subject: [PATCH 07/10] initialize mimalloc options at process load

---
 include/mimalloc-internal.h | 1 +
 src/init.c                  | 1 +
 src/options.c               | 7 +++++++
 3 files changed, 9 insertions(+)

diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index d886bcec..3889c66e 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -27,6 +27,7 @@ void       _mi_error_message(const char* fmt, ...);
 void       _mi_warning_message(const char* fmt, ...);
 void       _mi_verbose_message(const char* fmt, ...);
 void       _mi_trace_message(const char* fmt, ...);
+void       _mi_options_init(void);
 
 // "init.c"
 extern mi_stats_t       _mi_stats_main;
diff --git a/src/init.c b/src/init.c
index 290caeec..4c7fdda0 100644
--- a/src/init.c
+++ b/src/init.c
@@ -416,6 +416,7 @@ static void mi_allocator_done() {
 static void mi_process_load(void) {
   os_preloading = false;
   atexit(&mi_process_done);
+  _mi_options_init();
   mi_process_init();
   //mi_stats_reset();
   if (mi_redirected) _mi_verbose_message("malloc is redirected.\n");
diff --git a/src/options.c b/src/options.c
index 88f2503e..16c50f11 100644
--- a/src/options.c
+++ b/src/options.c
@@ -73,6 +73,13 @@ static mi_option_desc_t options[_mi_option_last] =
 
 static void mi_option_init(mi_option_desc_t* desc);
 
+void _mi_options_init(void) {
+  // Called on process load: query every option once, from index 0 up to (not including) `_mi_option_last`.
+  for(int i = 0; i < _mi_option_last; i++ ) {
+    mi_option_get((mi_option_t)i); // initialize; the returned value is deliberately ignored
+  }
+}
+
 long mi_option_get(mi_option_t option) {
   mi_assert(option >= 0 && option < _mi_option_last);
   mi_option_desc_t* desc = &options[option];

From 8b06ab1e4946005e4bf8c067c33c53b2647aaf39 Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Mon, 26 Aug 2019 12:41:35 -0700
Subject: [PATCH 08/10] fix check on gigabyte alignment of huge os pages on
 windows

---
 src/os.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/os.c b/src/os.c
index 1c7696b4..5d1b7576 100644
--- a/src/os.c
+++ b/src/os.c
@@ -198,7 +198,7 @@ static bool mi_os_mem_free(void* addr, size_t size, mi_stats_t* stats)
 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
 #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
   // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
-  if ((size % (uintptr_t)1 << 30) == 0 /* 1GiB multiple */
+  if ((size % ((uintptr_t)1 << 30)) == 0 /* 1GiB multiple */
     && (flags & MEM_LARGE_PAGES) != 0 && (flags & MEM_COMMIT) != 0 
     && (addr != NULL || try_alignment == 0 || try_alignment % _mi_os_page_size() == 0)
     && pNtAllocateVirtualMemoryEx != NULL)
@@ -217,7 +217,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
     }
     else {
       // else fall back to regular large OS pages
-      _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %lx)\n", err);
+      _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error 0x%lx)\n", err);
     }
   }
 #endif

From 3d8c331a1c3994a8727528487c956fddf81e2519 Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Mon, 26 Aug 2019 12:41:59 -0700
Subject: [PATCH 09/10] search regions always from the lowest index

---
 src/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/memory.c b/src/memory.c
index 268dc153..222b87c2 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -318,7 +318,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t*
   // find a range of free blocks
   void* p = NULL;
   size_t count = mi_atomic_read(&regions_count);
-  size_t idx = tld->region_idx; // start index is per-thread to reduce contention
+  size_t idx = 0; // always start the search at the lowest index (previously tld->region_idx, a per-thread start index to reduce contention)
   for (size_t visited = 0; visited < count; visited++, idx++) {
     if (idx >= count) idx = 0;  // wrap around
     if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error

From f0a12699c208191afad6373a64a71c76af7bdb05 Mon Sep 17 00:00:00 2001
From: daan <daanl@outlook.com>
Date: Mon, 26 Aug 2019 12:42:25 -0700
Subject: [PATCH 10/10] remove atomic_iread

---
 include/mimalloc-atomic.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index 9549cbc3..8b254d3e 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -114,9 +114,6 @@ static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exc
   return (void*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)p, (uintptr_t)exchange);
 }
 
-static inline intptr_t mi_atomic_iread(volatile intptr_t* p) {
-  return (intptr_t)mi_atomic_read( (volatile uintptr_t*)p );
-}
 
 #ifdef _MSC_VER
 #define WIN32_LEAN_AND_MEAN