From 6e360d34eea79e85b21352014a88c53c104c211d Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 25 Aug 2019 13:15:26 -0700 Subject: [PATCH 01/10] fix 1GB huge page flag on Linux --- src/os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os.c b/src/os.c index a1b6cdf3..3a9bd30c 100644 --- a/src/os.c +++ b/src/os.c @@ -353,7 +353,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro lflags |= MAP_HUGETLB; #endif #ifdef MAP_HUGE_1GB - if ((size % (uintptr_t)1 << 20) == 0) { + if ((size % ((uintptr_t)1 << 30)) == 0) { lflags |= MAP_HUGE_1GB; } else From e8664001f76981079191b22aff6dbdada135e6fa Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 25 Aug 2019 22:59:12 -0700 Subject: [PATCH 02/10] Use standard _Atomic declarations and clean up atomic operations --- include/mimalloc-atomic.h | 186 +++++++++++++++++++------------------- include/mimalloc-types.h | 11 ++- src/alloc.c | 6 +- src/memory.c | 54 +++++------ src/options.c | 2 +- src/os.c | 18 ++-- src/page.c | 13 +-- src/segment.c | 12 +-- src/stats.c | 22 ++--- 9 files changed, 165 insertions(+), 159 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index d504634c..739d0512 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -9,63 +9,98 @@ terms of the MIT license. A copy of the license can be found in the file #define MIMALLOC_ATOMIC_H // ------------------------------------------------------ -// Atomics +// Atomics +// We need to be portable between C, C++, and MSVC. // ------------------------------------------------------ -// Atomically increment a value; returns the incremented result. -static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p); +#if defined(_MSC_VER) +#define _Atomic(tp) tp +#define ATOMIC_VAR_INIT(x) x +#elif defined(__cplusplus) +#include <atomic> +#define _Atomic(tp) std::atomic<tp> +#else +#include <stdatomic.h> +#endif -// Atomically increment a value; returns the incremented result. 
-static inline uint32_t mi_atomic_increment32(volatile uint32_t* p); +#define mi_atomic_cast(tp,x) (volatile _Atomic(tp)*)(x) -// Atomically decrement a value; returns the decremented result. -static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p); +// ------------------------------------------------------ +// Atomic operations specialized for mimalloc +// ------------------------------------------------------ -// Atomically add a 64-bit value; returns the added result. -static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add); +// Atomically add a 64-bit value; returns the previous value. +// Note: not using _Atomic(int64_t) as it is only used for stats. +static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add); -// Atomically subtract a value; returns the subtracted result. -static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub); +// Atomically add a value; returns the previous value. +static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add); -// Atomically subtract a value; returns the subtracted result. -static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub); +// Atomically compare and exchange a value; returns `true` if successful. May fail spuriously. +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); // Atomically compare and exchange a value; returns `true` if successful. -static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare); - -// Atomically compare and exchange a value; returns `true` if successful. 
-static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare); +static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); // Atomically exchange a value. -static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange); +static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange); // Atomically read a value -static inline uintptr_t mi_atomic_read(volatile uintptr_t* p); +static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p); // Atomically write a value -static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x); - -// Atomically read a pointer -static inline void* mi_atomic_read_ptr(volatile void** p) { - return (void*)mi_atomic_read( (volatile uintptr_t*)p ); -} +static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x); +// Yield static inline void mi_atomic_yield(void); + +// Atomically add a value; returns the previous value. +static inline uintptr_t mi_atomic_addu(volatile _Atomic(uintptr_t)* p, uintptr_t add) { + return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, (intptr_t)add); +} +// Atomically subtract a value; returns the previous value. +static inline uintptr_t mi_atomic_subu(volatile _Atomic(uintptr_t)* p, uintptr_t sub) { + return (uintptr_t)mi_atomic_add((volatile _Atomic(intptr_t)*)p, -((intptr_t)sub)); +} + +// Atomically increment a value; returns the previous value (as returned by `mi_atomic_addu`), not the incremented result. +static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) { + return mi_atomic_addu(p, 1); +} + +// Atomically decrement a value; returns the previous value (as returned by `mi_atomic_subu`), not the decremented result. 
+static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) { + return mi_atomic_subu(p, 1); +} + +// Atomically read a pointer +static inline void* mi_atomic_read_ptr_relaxed(volatile _Atomic(void*) const * p) { + return (void*)mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)p); +} + // Atomically write a pointer -static inline void mi_atomic_write_ptr(volatile void** p, void* x) { - mi_atomic_write((volatile uintptr_t*)p, (uintptr_t)x ); +static inline void mi_atomic_write_ptr(volatile _Atomic(void*)* p, void* x) { + mi_atomic_write((volatile _Atomic(uintptr_t)*)p, (uintptr_t)x ); +} + +// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously. +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_ptr_weak(volatile _Atomic(void*)* p, void* desired, void* expected) { + return mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected); } // Atomically compare and exchange a pointer; returns `true` if successful. -static inline bool mi_atomic_compare_exchange_ptr(volatile void** p, void* newp, void* compare) { - return mi_atomic_compare_exchange((volatile uintptr_t*)p, (uintptr_t)newp, (uintptr_t)compare); +// (Note: expected and desired are in opposite order from atomic_compare_exchange) +static inline bool mi_atomic_cas_ptr_strong(volatile _Atomic(void*)* p, void* desired, void* expected) { + return mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)p, (uintptr_t)desired, (uintptr_t)expected); } // Atomically exchange a pointer value. 
-static inline void* mi_atomic_exchange_ptr(volatile void** p, void* exchange) { - return (void*)mi_atomic_exchange((volatile uintptr_t*)p, (uintptr_t)exchange); +static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exchange) { + return (void*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)p, (uintptr_t)exchange); } @@ -73,49 +108,37 @@ static inline void* mi_atomic_exchange_ptr(volatile void** p, void* exchange) { #define WIN32_LEAN_AND_MEAN #include <windows.h> #include <intrin.h> -#if (MI_INTPTR_SIZE==8) +#ifdef _WIN64 typedef LONG64 msc_intptr_t; #define RC64(f) f##64 #else typedef LONG msc_intptr_t; #define RC64(f) f #endif -static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p) { - return (uintptr_t)RC64(_InterlockedIncrement)((volatile msc_intptr_t*)p); +static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { + return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); } -static inline uint32_t mi_atomic_increment32(volatile uint32_t* p) { - return (uint32_t)_InterlockedIncrement((volatile LONG*)p); +static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { + return (expected == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); } -static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p) { - return (uintptr_t)RC64(_InterlockedDecrement)((volatile msc_intptr_t*)p); +static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { + return mi_atomic_cas_strong(p,desired,expected); } -static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub) { - return (uintptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)) - sub; -} -static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub) { - return (uint32_t)_InterlockedExchangeAdd((volatile LONG*)p, 
-((LONG)sub)) - sub; -} -static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare) { - return ((int32_t)compare == _InterlockedCompareExchange((volatile LONG*)p, (LONG)exchange, (LONG)compare)); -} -static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare) { - return (compare == RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange, (msc_intptr_t)compare)); -} -static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) { +static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } -static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) { +static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) { return *p; } -static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) { - *p = x; +static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { + mi_atomic_exchange(p,x); } static inline void mi_atomic_yield(void) { YieldProcessor(); } -static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) { - #if (MI_INTPTR_SIZE==8) - return _InterlockedExchangeAdd64(p, add) + add; +static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) { + #ifdef _WIN64 + return mi_atomic_add(p,add); #else int64_t current; int64_t sum; @@ -123,62 +146,43 @@ static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) { current = *p; sum = current + add; } while (_InterlockedCompareExchange64(p, sum, current) != current); - return sum; + return current; #endif } #else #ifdef __cplusplus -#include <atomic> #define MI_USING_STD using namespace std; -#define _Atomic(tp) atomic<tp> #else -#include <stdatomic.h> #define MI_USING_STD #endif -static inline uintptr_t mi_atomic_increment(volatile uintptr_t* p) { +static inline int64_t 
mi_atomic_add64(volatile int64_t* p, int64_t add) { MI_USING_STD - return atomic_fetch_add_explicit((volatile atomic_uintptr_t*)p, (uintptr_t)1, memory_order_relaxed) + 1; + return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed); } -static inline uint32_t mi_atomic_increment32(volatile uint32_t* p) { +static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { MI_USING_STD - return atomic_fetch_add_explicit((volatile _Atomic(uint32_t)*)p, (uint32_t)1, memory_order_relaxed) + 1; + return atomic_fetch_add_explicit(p, add, memory_order_relaxed); } -static inline uintptr_t mi_atomic_decrement(volatile uintptr_t* p) { +static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { MI_USING_STD - return atomic_fetch_sub_explicit((volatile atomic_uintptr_t*)p, (uintptr_t)1, memory_order_relaxed) - 1; + return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed); } -static inline int64_t mi_atomic_add(volatile int64_t* p, int64_t add) { +static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { MI_USING_STD - return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed) + add; + return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed); } -static inline uintptr_t mi_atomic_subtract(volatile uintptr_t* p, uintptr_t sub) { +static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { MI_USING_STD - return atomic_fetch_sub_explicit((volatile atomic_uintptr_t*)p, sub, memory_order_relaxed) - sub; + return atomic_exchange_explicit(p, exchange, memory_order_acq_rel); } -static inline uint32_t mi_atomic_subtract32(volatile uint32_t* p, uint32_t sub) { +static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) { MI_USING_STD 
- return atomic_fetch_sub_explicit((volatile _Atomic(uint32_t)*)p, sub, memory_order_relaxed) - sub; + return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed); } -static inline bool mi_atomic_compare_exchange32(volatile uint32_t* p, uint32_t exchange, uint32_t compare) { +static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD - return atomic_compare_exchange_weak_explicit((volatile _Atomic(uint32_t)*)p, &compare, exchange, memory_order_release, memory_order_relaxed); -} -static inline bool mi_atomic_compare_exchange(volatile uintptr_t* p, uintptr_t exchange, uintptr_t compare) { - MI_USING_STD - return atomic_compare_exchange_weak_explicit((volatile atomic_uintptr_t*)p, &compare, exchange, memory_order_release, memory_order_relaxed); -} -static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exchange) { - MI_USING_STD - return atomic_exchange_explicit((volatile atomic_uintptr_t*)p, exchange, memory_order_acquire); -} -static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) { - MI_USING_STD - return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed); -} -static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) { - MI_USING_STD - return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed); + return atomic_store_explicit(p, x, memory_order_release); } #if defined(__cplusplus) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index db39b9c4..0b2334b8 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -10,6 +10,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include <stddef.h> // ptrdiff_t #include <stdint.h> // uintptr_t, uint16_t, etc +#include <mimalloc-atomic.h> // _Atomic // ------------------------------------------------------ // Variants @@ -177,8 +178,8 @@ typedef struct mi_page_s { size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - volatile uintptr_t thread_freed; // at least this number of blocks are in `thread_free` - volatile mi_thread_free_t thread_free; // list of deferred free blocks freed by other threads + volatile _Atomic(uintptr_t) thread_freed; // at least this number of blocks are in `thread_free` + volatile _Atomic(mi_thread_free_t) thread_free; // list of deferred free blocks freed by other threads // less accessed info size_t block_size; // size available in each block (always `>0`) @@ -208,7 +209,7 @@ typedef enum mi_page_kind_e { typedef struct mi_segment_s { struct mi_segment_s* next; struct mi_segment_s* prev; - volatile struct mi_segment_s* abandoned_next; + volatile _Atomic(struct mi_segment_s*) abandoned_next; size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) size_t used; // count of pages in use (`used <= capacity`) size_t capacity; // count of available pages (`#free + used`) @@ -219,7 +220,7 @@ typedef struct mi_segment_s { // layout like this to optimize access in `mi_free` size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
- volatile uintptr_t thread_id; // unique id of the thread owning this segment + volatile _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment mi_page_kind_t page_kind; // kind of pages: small, large, or huge mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -255,7 +256,7 @@ struct mi_heap_s { mi_tld_t* tld; mi_page_t* pages_free_direct[MI_SMALL_WSIZE_MAX + 2]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") - volatile mi_block_t* thread_delayed_free; + volatile _Atomic(mi_block_t*) thread_delayed_free; uintptr_t thread_id; // thread this heap belongs too uintptr_t cookie; uintptr_t random; // random number used for secure allocation diff --git a/src/alloc.c b/src/alloc.c index 76e093e7..97c5fcc4 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -144,7 +144,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_block_set_next(page, block, mi_tf_block(tfree)); tfreex = mi_tf_set_block(tfree,block); } - } while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); if (mi_likely(!use_delayed)) { // increment the thread free count and return @@ -160,7 +160,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc do { dfree = (mi_block_t*)heap->thread_delayed_free; mi_block_set_nextx(heap->cookie,block,dfree); - } while (!mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, block, dfree)); + } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree)); } // and reset the MI_DELAYED_FREEING flag @@ -168,7 +168,7 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc tfreex = tfree = 
page->thread_free; mi_assert_internal(mi_tf_delayed(tfree) == MI_NEVER_DELAYED_FREE || mi_tf_delayed(tfree) == MI_DELAYED_FREEING); if (mi_tf_delayed(tfree) != MI_NEVER_DELAYED_FREE) tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); } } diff --git a/src/memory.c b/src/memory.c index 26f87092..1ea6ee16 100644 --- a/src/memory.c +++ b/src/memory.c @@ -69,8 +69,8 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, mi_os_tld // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block. typedef struct mem_region_s { - volatile uintptr_t map; // in-use bit per MI_SEGMENT_SIZE block - volatile void* start; // start of virtual memory area + volatile _Atomic(uintptr_t) map; // in-use bit per MI_SEGMENT_SIZE block + volatile _Atomic(void*) start; // start of virtual memory area } mem_region_t; @@ -78,7 +78,7 @@ typedef struct mem_region_s { // TODO: in the future, maintain a map per NUMA node for numa aware allocation static mem_region_t regions[MI_REGION_MAX]; -static volatile size_t regions_count = 0; // allocated regions +static volatile _Atomic(uintptr_t) regions_count; // = 0; // allocated regions /* ---------------------------------------------------------------------------- @@ -106,9 +106,9 @@ static size_t mi_good_commit_size(size_t size) { // Return if a pointer points into a region reserved by us. 
bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { if (p==NULL) return false; - size_t count = mi_atomic_read(&regions_count); + size_t count = mi_atomic_read_relaxed(&regions_count); for (size_t i = 0; i < count; i++) { - uint8_t* start = (uint8_t*)mi_atomic_read_ptr(&regions[i].start); + uint8_t* start = (uint8_t*)mi_atomic_read_ptr_relaxed(&regions[i].start); if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true; } return false; @@ -127,11 +127,11 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit { size_t mask = mi_region_block_mask(blocks,bitidx); mi_assert_internal(mask != 0); - mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask); + mi_assert_internal((mask & mi_atomic_read_relaxed(&region->map)) == mask); mi_assert_internal(&regions[idx] == region); // ensure the region is reserved - void* start = mi_atomic_read_ptr(&region->start); + void* start = mi_atomic_read_ptr_relaxed(&region->start); if (start == NULL) { start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld); @@ -139,13 +139,13 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // failure to allocate from the OS! unclaim the blocks and fail size_t map; do { - map = mi_atomic_read(&region->map); - } while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map)); + map = mi_atomic_read_relaxed(&region->map); + } while (!mi_atomic_cas_weak(&region->map, map & ~mask, map)); return false; } // set the newly allocated region - if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) { + if (mi_atomic_cas_ptr_strong(&region->start, start, NULL)) { // update the region count mi_atomic_increment(&regions_count); } @@ -154,9 +154,9 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // we assign it to a later slot instead (up to 4 tries). 
// note: we don't need to increment the region count, this will happen on another allocation for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { - void* s = mi_atomic_read_ptr(&regions[idx+i].start); + void* s = mi_atomic_read_ptr_relaxed(&regions[idx+i].start); if (s == NULL) { // quick test - if (mi_atomic_compare_exchange_ptr(&regions[idx+i].start, start, s)) { + if (mi_atomic_cas_ptr_weak(&regions[idx+i].start, start, s)) { start = NULL; break; } @@ -167,10 +167,10 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit _mi_os_free(start, MI_REGION_SIZE, tld->stats); } // and continue with the memory at our index - start = mi_atomic_read_ptr(&region->start); + start = mi_atomic_read_ptr_relaxed(&region->start); } } - mi_assert_internal(start == mi_atomic_read_ptr(&region->start)); + mi_assert_internal(start == mi_atomic_read_ptr_relaxed(&region->start)); mi_assert_internal(start != NULL); // Commit the blocks to memory @@ -230,7 +230,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc const uintptr_t mask = mi_region_block_mask(blocks, 0); const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; - uintptr_t map = mi_atomic_read(&region->map); + uintptr_t map = mi_atomic_read_relaxed(&region->map); #ifdef MI_HAVE_BITSCAN size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible @@ -245,9 +245,9 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc mi_assert_internal((m >> bitidx) == mask); // no overflow? uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_compare_exchange(&region->map, newmap, map)) { + if (!mi_atomic_cas_strong(&region->map, newmap, map)) { // no success, another thread claimed concurrently.. 
keep going - map = mi_atomic_read(&region->map); + map = mi_atomic_read_relaxed(&region->map); continue; } else { @@ -281,7 +281,7 @@ static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, b // check if there are available blocks in the region.. mi_assert_internal(idx < MI_REGION_MAX); mem_region_t* region = &regions[idx]; - uintptr_t m = mi_atomic_read(&region->map); + uintptr_t m = mi_atomic_read_relaxed(&region->map); if (m != MI_REGION_MAP_FULL) { // some bits are zero return mi_region_alloc_blocks(region, idx, blocks, size, commit, p, id, tld); } @@ -317,7 +317,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* // find a range of free blocks void* p = NULL; - size_t count = mi_atomic_read(&regions_count); + size_t count = mi_atomic_read_relaxed(&regions_count); size_t idx = tld->region_idx; // start index is per-thread to reduce contention for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around @@ -376,8 +376,8 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { size_t mask = mi_region_block_mask(blocks, bitidx); mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? mem_region_t* region = &regions[idx]; - mi_assert_internal((mi_atomic_read(&region->map) & mask) == mask ); // claimed? - void* start = mi_atomic_read_ptr(&region->start); + mi_assert_internal((mi_atomic_read_relaxed(&region->map) & mask) == mask ); // claimed? + void* start = mi_atomic_read_ptr_relaxed(&region->start); mi_assert_internal(start != NULL); void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? 
@@ -405,9 +405,9 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { uintptr_t map; uintptr_t newmap; do { - map = mi_atomic_read(&region->map); + map = mi_atomic_read_relaxed(&region->map); newmap = map & ~mask; - } while (!mi_atomic_compare_exchange(&region->map, newmap, map)); + } while (!mi_atomic_cas_weak(&region->map, newmap, map)); } } @@ -419,17 +419,17 @@ void _mi_mem_collect(mi_stats_t* stats) { // free every region that has no segments in use. for (size_t i = 0; i < regions_count; i++) { mem_region_t* region = &regions[i]; - if (mi_atomic_read(&region->map) == 0 && region->start != NULL) { + if (mi_atomic_read_relaxed(&region->map) == 0 && region->start != NULL) { // if no segments used, try to claim the whole region uintptr_t m; do { - m = mi_atomic_read(&region->map); - } while(m == 0 && !mi_atomic_compare_exchange(&region->map, ~((uintptr_t)0), 0 )); + m = mi_atomic_read_relaxed(&region->map); + } while(m == 0 && !mi_atomic_cas_weak(&region->map, ~((uintptr_t)0), 0 )); if (m == 0) { // on success, free the whole region if (region->start != NULL) _mi_os_free((void*)region->start, MI_REGION_SIZE, stats); // and release - region->start = 0; + mi_atomic_write_ptr(&region->start,NULL); mi_atomic_write(&region->map,0); } } diff --git a/src/options.c b/src/options.c index b30ff1c6..88f2503e 100644 --- a/src/options.c +++ b/src/options.c @@ -127,7 +127,7 @@ void mi_option_disable(mi_option_t option) { // Messages // -------------------------------------------------------- #define MAX_ERROR_COUNT (10) -static uintptr_t error_count = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings +static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings // When overriding malloc, we may recurse into mi_vfprintf if an allocation // inside the C runtime causes another message. 
diff --git a/src/os.c b/src/os.c index e7ed57b5..fc9c5acc 100644 --- a/src/os.c +++ b/src/os.c @@ -186,11 +186,11 @@ static bool mi_os_mem_free(void* addr, size_t size, mi_stats_t* stats) static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { #if (MI_INTPTR_SIZE >= 8) // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations - static volatile intptr_t aligned_base = ((intptr_t)4 << 40); // starting at 4TiB + static volatile _Atomic(intptr_t) aligned_base = ATOMIC_VAR_INIT((intptr_t)4 << 40); // starting at 4TiB if (addr == NULL && try_alignment > 0 && try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE) == 0) { - intptr_t hint = mi_atomic_add(&aligned_base, size) - size; + intptr_t hint = mi_atomic_add(&aligned_base, size); if (hint%try_alignment == 0) { return VirtualAlloc((void*)hint, size, flags, PAGE_READWRITE); } @@ -214,11 +214,11 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, static volatile uintptr_t large_page_try_ok = 0; void* p = NULL; if (use_large_os_page(size, try_alignment)) { - uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok); if (try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. - mi_atomic_compare_exchange(&large_page_try_ok, try_ok - 1, try_ok); + mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); } else { // large OS pages must always reserve and commit. 
@@ -253,9 +253,9 @@ static void* mi_unix_mmapx(size_t size, size_t try_alignment, int protect_flags, void* p = NULL; #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) // on 64-bit systems, use the virtual address area after 4TiB for 4MiB aligned allocations - static volatile intptr_t aligned_base = ((intptr_t)1 << 42); // starting at 4TiB + static volatile _Atomic(intptr_t) aligned_base = ATOMIC_VAR_INIT((intptr_t)1 << 42); // starting at 4TiB if (try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE)==0) { - intptr_t hint = mi_atomic_add(&aligned_base,size) - size; + intptr_t hint = mi_atomic_add(&aligned_base,size); if (hint%try_alignment == 0) { p = mmap((void*)hint,size,protect_flags,flags,fd,0); if (p==MAP_FAILED) p = NULL; // fall back to regular mmap @@ -291,14 +291,14 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags) fd = VM_MAKE_TAG(100); #endif if (use_large_os_page(size, try_alignment)) { - static volatile uintptr_t large_page_try_ok = 0; - uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + static volatile _Atomic(uintptr_t) large_page_try_ok = 0; + uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok); if (try_ok > 0) { // If the OS is not configured for large OS pages, or the user does not have // enough permission, the `mmap` will always fail (but it might also fail for other reasons). // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times // to avoid too many failing calls to mmap. 
- mi_atomic_compare_exchange(&large_page_try_ok, try_ok - 1, try_ok); + mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); } else { int lflags = flags; diff --git a/src/page.c b/src/page.c index 54897af5..a95f5b51 100644 --- a/src/page.c +++ b/src/page.c @@ -49,11 +49,12 @@ static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { return count; } +/* // Start of the page available memory static inline uint8_t* mi_page_area(const mi_page_t* page) { return _mi_page_start(_mi_page_segment(page), page, NULL); } - +*/ static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; @@ -126,7 +127,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay ) { } } while((mi_tf_delayed(tfreex) != mi_tf_delayed(tfree)) && // avoid atomic operation if already equal - !mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree)); + !mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); } @@ -147,7 +148,7 @@ static void mi_page_thread_free_collect(mi_page_t* page) tfree = page->thread_free; head = mi_tf_block(tfree); tfreex = mi_tf_set_block(tfree,NULL); - } while (!mi_atomic_compare_exchange((volatile uintptr_t*)&page->thread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak(mi_atomic_cast(uintptr_t,&page->thread_free), tfreex, tfree)); // return if the list is empty if (head == NULL) return; @@ -166,7 +167,7 @@ static void mi_page_thread_free_collect(mi_page_t* page) page->free = head; // update counts now - mi_atomic_subtract(&page->thread_freed, count); + mi_atomic_subu(&page->thread_freed, count); page->used -= count; } @@ -257,7 +258,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { mi_block_t* block; do { block = (mi_block_t*)heap->thread_delayed_free; - } while (block != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, NULL, block)); + } while (block != NULL && 
!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), NULL, block)); // and free them all while(block != NULL) { @@ -270,7 +271,7 @@ void _mi_heap_delayed_free(mi_heap_t* heap) { do { dfree = (mi_block_t*)heap->thread_delayed_free; mi_block_set_nextx(heap->cookie, block, dfree); - } while (!mi_atomic_compare_exchange_ptr((volatile void**)&heap->thread_delayed_free, block, dfree)); + } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree)); } block = next; diff --git a/src/segment.c b/src/segment.c index 18c06fbc..9a744ea6 100644 --- a/src/segment.c +++ b/src/segment.c @@ -542,8 +542,8 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) // live blocks (reached through other threads). Such segments // are "abandoned" and will be reclaimed by other threads to // reuse their pages and/or free them eventually -static volatile mi_segment_t* abandoned = NULL; -static volatile uintptr_t abandoned_count = 0; +static volatile _Atomic(mi_segment_t*) abandoned; // = NULL; +static volatile _Atomic(uintptr_t) abandoned_count; // = 0; static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(segment->used == segment->abandoned); @@ -561,9 +561,9 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { segment->thread_id = 0; mi_segment_t* next; do { - next = (mi_segment_t*)abandoned; - mi_atomic_write_ptr((volatile void**)&segment->abandoned_next, next); - } while (!mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment, next)); + next = (mi_segment_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&abandoned)); + mi_atomic_write_ptr(mi_atomic_cast(void*,&segment->abandoned_next), next); + } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&abandoned), segment, next)); mi_atomic_increment(&abandoned_count); } @@ -597,7 +597,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, 
mi_segmen mi_segment_t* segment; do { segment = (mi_segment_t*)abandoned; - } while(segment != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, (mi_segment_t*)segment->abandoned_next, segment)); + } while(segment != NULL && !mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&abandoned), (mi_segment_t*)segment->abandoned_next, segment)); if (segment==NULL) break; // stop early if no more segments available // got it. diff --git a/src/stats.c b/src/stats.c index 39015f94..2176ba17 100644 --- a/src/stats.c +++ b/src/stats.c @@ -38,13 +38,13 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (mi_is_in_main(stat)) { // add atomically (for abandoned pages) - int64_t current = mi_atomic_add(&stat->current,amount); + int64_t current = mi_atomic_add64(&stat->current,amount); if (current > stat->peak) stat->peak = stat->current; // racing.. it's ok if (amount > 0) { - mi_atomic_add(&stat->allocated,amount); + mi_atomic_add64(&stat->allocated,amount); } else { - mi_atomic_add(&stat->freed, -amount); + mi_atomic_add64(&stat->freed, -amount); } } else { @@ -62,8 +62,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { if (mi_is_in_main(stat)) { - mi_atomic_add( &stat->count, 1 ); - mi_atomic_add( &stat->total, (int64_t)amount ); + mi_atomic_add64( &stat->count, 1 ); + mi_atomic_add64( &stat->total, (int64_t)amount ); } else { stat->count++; @@ -82,16 +82,16 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { // must be thread safe as it is called from stats_merge static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { if (stat==src) return; - mi_atomic_add( &stat->allocated, src->allocated * unit); - mi_atomic_add( &stat->current, src->current * unit); - mi_atomic_add( &stat->freed, src->freed * unit); - mi_atomic_add( &stat->peak, src->peak * unit); + mi_atomic_add64( &stat->allocated, src->allocated 
* unit); + mi_atomic_add64( &stat->current, src->current * unit); + mi_atomic_add64( &stat->freed, src->freed * unit); + mi_atomic_add64( &stat->peak, src->peak * unit); } static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) { if (stat==src) return; - mi_atomic_add( &stat->total, src->total * unit); - mi_atomic_add( &stat->count, src->count * unit); + mi_atomic_add64( &stat->total, src->total * unit); + mi_atomic_add64( &stat->count, src->count * unit); } // must be thread safe as it is called from stats_merge From baabc775034efeb55a93c8088492933e56d8334f Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 25 Aug 2019 23:02:41 -0700 Subject: [PATCH 03/10] use proper atomic initialization macros --- src/init.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index 76e586f2..290caeec 100644 --- a/src/init.c +++ b/src/init.c @@ -19,7 +19,8 @@ const mi_page_t _mi_page_empty = { 0, #endif 0, // used - NULL, 0, 0, + NULL, + ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0), 0, NULL, NULL, NULL #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0) , { NULL } // padding @@ -81,7 +82,7 @@ const mi_heap_t _mi_heap_empty = { NULL, MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY, - NULL, + ATOMIC_VAR_INIT(NULL), 0, 0, 0, From 2159c224151e5be1f3bcf73acefe62eef17d080f Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 25 Aug 2019 23:06:18 -0700 Subject: [PATCH 04/10] fix atomic declaration on windows --- src/os.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os.c b/src/os.c index fc9c5acc..fb36f3fc 100644 --- a/src/os.c +++ b/src/os.c @@ -211,7 +211,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment } static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags) { - static volatile uintptr_t large_page_try_ok = 0; + static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; void* p = NULL; 
if (use_large_os_page(size, try_alignment)) { uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok); @@ -291,7 +291,7 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags) fd = VM_MAKE_TAG(100); #endif if (use_large_os_page(size, try_alignment)) { - static volatile _Atomic(uintptr_t) large_page_try_ok = 0; + static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; uintptr_t try_ok = mi_atomic_read_relaxed(&large_page_try_ok); if (try_ok > 0) { // If the OS is not configured for large OS pages, or the user does not have From 5c7c106d62f70db566e337abd6575021ec55f1bf Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 26 Aug 2019 08:11:15 -0700 Subject: [PATCH 05/10] strengthen some atomic operations for weak memory models --- include/mimalloc-atomic.h | 45 +++++++++++++++++++++++++++------------ src/alloc.c | 18 +++++++++------- src/memory.c | 20 ++++++++--------- src/stats.c | 4 ++-- 4 files changed, 53 insertions(+), 34 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index 739d0512..3a289feb 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -30,26 +30,32 @@ terms of the MIT license. A copy of the license can be found in the file // ------------------------------------------------------ // Atomically add a 64-bit value; returns the previous value. -// Note: not using _Atomic(int64_t) as it is only used for stats. -static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add); +// Note: not using _Atomic(int64_t) as it is only used for statistics. +static inline void mi_atomic_add64(volatile int64_t* p, int64_t add); -// Atomically add a value; returns the previous value. +// Atomically add a value; returns the previous value. Memory ordering is relaxed. static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add); -// Atomically compare and exchange a value; returns `true` if successful. May fail spuriously. 
+// Atomically compare and exchange a value; returns `true` if successful. +// May fail spuriously. Memory ordering as release on success, and relaxed on failure. // (Note: expected and desired are in opposite order from atomic_compare_exchange) static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); // Atomically compare and exchange a value; returns `true` if successful. +// Memory ordering is acquire-release +// (Note: expected and desired are in opposite order from atomic_compare_exchange) static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); -// Atomically exchange a value. +// Atomically exchange a value. Memory ordering is acquire-release. static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange); -// Atomically read a value +// Atomically read a value. Memory ordering is relaxed. static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p); -// Atomically write a value +// Atomically read a value. Memory ordering is acquire. +static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p); + +// Atomically write a value. Memory ordering is release. static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x); // Yield @@ -76,11 +82,16 @@ static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) { return mi_atomic_subu(p, 1); } -// Atomically read a pointer +// Atomically read a pointer; Memory order is relaxed. static inline void* mi_atomic_read_ptr_relaxed(volatile _Atomic(void*) const * p) { return (void*)mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)p); } +// Atomically read a pointer; Memory order is acquire. 
+static inline void* mi_atomic_read_ptr(volatile _Atomic(void*) const * p) { + return (void*)mi_atomic_read((const volatile _Atomic(uintptr_t)*)p); +} + // Atomically write a pointer static inline void mi_atomic_write_ptr(volatile _Atomic(void*)* p, void* x) { mi_atomic_write((volatile _Atomic(uintptr_t)*)p, (uintptr_t)x ); @@ -127,18 +138,21 @@ static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } -static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) { +static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) { return *p; } +static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) { + return mi_atomic_read(p); +} static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { mi_atomic_exchange(p,x); } static inline void mi_atomic_yield(void) { YieldProcessor(); } -static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) { +static inline void mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) { #ifdef _WIN64 - return mi_atomic_add(p,add); + mi_atomic_add(p,add); #else int64_t current; int64_t sum; @@ -146,7 +160,6 @@ static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) current = *p; sum = current + add; } while (_InterlockedCompareExchange64(p, sum, current) != current); - return current; #endif } @@ -156,9 +169,9 @@ static inline int64_t mi_atomic_add64(volatile _Atomic(int64_t)* p, int64_t add) #else #define MI_USING_STD #endif -static inline int64_t mi_atomic_add64(volatile int64_t* p, int64_t add) { +static inline void mi_atomic_add64(volatile int64_t* p, int64_t add) { MI_USING_STD - return atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed); + 
atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed); } static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) { MI_USING_STD @@ -180,6 +193,10 @@ static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t) MI_USING_STD return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed); } +static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) { + MI_USING_STD + return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire); +} static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { MI_USING_STD return atomic_store_explicit(p, x, memory_order_release); diff --git a/src/alloc.c b/src/alloc.c index 97c5fcc4..7e89a591 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -118,22 +118,24 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_segment_t* segment = _mi_page_segment(page); if (segment->page_kind==MI_PAGE_HUGE) { // huge page segments are always abandoned and can be freed immediately - mi_assert_internal(segment->thread_id==0); - mi_assert_internal(segment->abandoned_next==NULL); + mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0); + mi_assert_internal(mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*,&segment->abandoned_next))==NULL); // claim it and free - mi_block_set_next(page, block, page->free); - page->free = block; - page->used--; mi_heap_t* heap = mi_get_default_heap(); - segment->thread_id = heap->thread_id; - _mi_segment_page_free(page,true,&heap->tld->segments); + // paranoia: if this it the last reference, the cas should always succeed + if (mi_atomic_cas_strong(&segment->thread_id,heap->thread_id,0)) { + mi_block_set_next(page, block, page->free); + page->free = block; + page->used--; + _mi_segment_page_free(page,true,&heap->tld->segments); + } return; } do { tfree = page->thread_free; use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE 
|| - (mi_tf_delayed(tfree) == MI_NO_DELAYED_FREE && page->used == page->thread_freed+1) + (mi_tf_delayed(tfree) == MI_NO_DELAYED_FREE && page->used == mi_atomic_read_relaxed(&page->thread_freed)+1) // data-race but ok, just optimizes early release of the page ); if (mi_unlikely(use_delayed)) { // unlikely: this only happens on the first concurrent free in a page that is in the full list diff --git a/src/memory.c b/src/memory.c index 1ea6ee16..268dc153 100644 --- a/src/memory.c +++ b/src/memory.c @@ -131,7 +131,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit mi_assert_internal(&regions[idx] == region); // ensure the region is reserved - void* start = mi_atomic_read_ptr_relaxed(&region->start); + void* start = mi_atomic_read_ptr(&region->start); if (start == NULL) { start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld); @@ -154,9 +154,9 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // we assign it to a later slot instead (up to 4 tries). 
// note: we don't need to increment the region count, this will happen on another allocation for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) { - void* s = mi_atomic_read_ptr_relaxed(&regions[idx+i].start); + void* s = mi_atomic_read_ptr(&regions[idx+i].start); if (s == NULL) { // quick test - if (mi_atomic_cas_ptr_weak(&regions[idx+i].start, start, s)) { + if (mi_atomic_cas_ptr_strong(&regions[idx+i].start, start, NULL)) { start = NULL; break; } @@ -167,10 +167,10 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit _mi_os_free(start, MI_REGION_SIZE, tld->stats); } // and continue with the memory at our index - start = mi_atomic_read_ptr_relaxed(&region->start); + start = mi_atomic_read_ptr(&region->start); } } - mi_assert_internal(start == mi_atomic_read_ptr_relaxed(&region->start)); + mi_assert_internal(start == mi_atomic_read_ptr(&region->start)); mi_assert_internal(start != NULL); // Commit the blocks to memory @@ -230,7 +230,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc const uintptr_t mask = mi_region_block_mask(blocks, 0); const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; - uintptr_t map = mi_atomic_read_relaxed(&region->map); + uintptr_t map = mi_atomic_read(&region->map); #ifdef MI_HAVE_BITSCAN size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible @@ -245,9 +245,9 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc mi_assert_internal((m >> bitidx) == mask); // no overflow? uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_strong(&region->map, newmap, map)) { + if (!mi_atomic_cas_weak(&region->map, newmap, map)) { // no success, another thread claimed concurrently.. 
keep going - map = mi_atomic_read_relaxed(&region->map); + map = mi_atomic_read(&region->map); continue; } else { @@ -317,7 +317,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* // find a range of free blocks void* p = NULL; - size_t count = mi_atomic_read_relaxed(&regions_count); + size_t count = mi_atomic_read(&regions_count); size_t idx = tld->region_idx; // start index is per-thread to reduce contention for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around @@ -377,7 +377,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`? mem_region_t* region = &regions[idx]; mi_assert_internal((mi_atomic_read_relaxed(&region->map) & mask) == mask ); // claimed? - void* start = mi_atomic_read_ptr_relaxed(&region->start); + void* start = mi_atomic_read_ptr(&region->start); mi_assert_internal(start != NULL); void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? diff --git a/src/stats.c b/src/stats.c index 2176ba17..4dddb4bc 100644 --- a/src/stats.c +++ b/src/stats.c @@ -38,8 +38,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (mi_is_in_main(stat)) { // add atomically (for abandoned pages) - int64_t current = mi_atomic_add64(&stat->current,amount); - if (current > stat->peak) stat->peak = stat->current; // racing.. it's ok + mi_atomic_add64(&stat->current,amount); + if (stat->current > stat->peak) stat->peak = stat->current; // racing.. 
it's ok if (amount > 0) { mi_atomic_add64(&stat->allocated,amount); } From 7ce9c02fd40796e4392892c0d413a0ac3462d112 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 26 Aug 2019 08:20:26 -0700 Subject: [PATCH 06/10] make cas weak use release memory order; improve free assembly --- include/mimalloc-atomic.h | 2 +- src/alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index 3a289feb..8b254d3e 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -179,7 +179,7 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add } static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { MI_USING_STD - return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed); + return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed); } static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { MI_USING_STD diff --git a/src/alloc.c b/src/alloc.c index 7e89a591..afc181dd 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -226,7 +226,7 @@ void mi_free(void* p) mi_attr_noexcept #endif const mi_segment_t* const segment = _mi_ptr_segment(p); - if (segment == NULL) return; // checks for (p==NULL) + if (mi_unlikely(segment == NULL)) return; // checks for (p==NULL) #if (MI_DEBUG>0) if (mi_unlikely(!mi_is_in_heap_region(p))) { From 2c19388bcfc08fa2acb3b4e58c569b7ff4b060e7 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 26 Aug 2019 11:44:41 -0700 Subject: [PATCH 07/10] initialize mimalloc options at process load --- include/mimalloc-internal.h | 1 + src/init.c | 1 + src/options.c | 7 +++++++ 3 files changed, 9 insertions(+) diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index d886bcec..3889c66e 100644 --- a/include/mimalloc-internal.h +++ 
b/include/mimalloc-internal.h @@ -27,6 +27,7 @@ void _mi_error_message(const char* fmt, ...); void _mi_warning_message(const char* fmt, ...); void _mi_verbose_message(const char* fmt, ...); void _mi_trace_message(const char* fmt, ...); +void _mi_options_init(void); // "init.c" extern mi_stats_t _mi_stats_main; diff --git a/src/init.c b/src/init.c index 290caeec..4c7fdda0 100644 --- a/src/init.c +++ b/src/init.c @@ -416,6 +416,7 @@ static void mi_allocator_done() { static void mi_process_load(void) { os_preloading = false; atexit(&mi_process_done); + _mi_options_init(); mi_process_init(); //mi_stats_reset(); if (mi_redirected) _mi_verbose_message("malloc is redirected.\n"); diff --git a/src/options.c b/src/options.c index 88f2503e..16c50f11 100644 --- a/src/options.c +++ b/src/options.c @@ -73,6 +73,13 @@ static mi_option_desc_t options[_mi_option_last] = static void mi_option_init(mi_option_desc_t* desc); +void _mi_options_init(void) { + // called on process load + for(int i = 0; i < _mi_option_last; i++ ) { + mi_option_get((mi_option_t)i); // initialize + } +} + long mi_option_get(mi_option_t option) { mi_assert(option >= 0 && option < _mi_option_last); mi_option_desc_t* desc = &options[option]; From 8b06ab1e4946005e4bf8c067c33c53b2647aaf39 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 26 Aug 2019 12:41:35 -0700 Subject: [PATCH 08/10] fix check on gigabyte alignment of huge os pages on windows --- src/os.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/os.c b/src/os.c index 1c7696b4..5d1b7576 100644 --- a/src/os.c +++ b/src/os.c @@ -198,7 +198,7 @@ static bool mi_os_mem_free(void* addr, size_t size, mi_stats_t* stats) static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages - if ((size % (uintptr_t)1 << 30) == 0 /* 1GiB multiple */ + if ((size % ((uintptr_t)1 << 30)) == 0 /* 
1GiB multiple */ && (flags & MEM_LARGE_PAGES) != 0 && (flags & MEM_COMMIT) != 0 && (addr != NULL || try_alignment == 0 || try_alignment % _mi_os_page_size() == 0) && pNtAllocateVirtualMemoryEx != NULL) @@ -217,7 +217,7 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment } else { // else fall back to regular large OS pages - _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %lx)\n", err); + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error 0x%lx)\n", err); } } #endif From 3d8c331a1c3994a8727528487c956fddf81e2519 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 26 Aug 2019 12:41:59 -0700 Subject: [PATCH 09/10] search regions always from the lowest index --- src/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/memory.c b/src/memory.c index 268dc153..222b87c2 100644 --- a/src/memory.c +++ b/src/memory.c @@ -318,7 +318,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* // find a range of free blocks void* p = NULL; size_t count = mi_atomic_read(&regions_count); - size_t idx = tld->region_idx; // start index is per-thread to reduce contention + size_t idx = 0; // tld->region_idx; // start index is per-thread to reduce contention for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error From f0a12699c208191afad6373a64a71c76af7bdb05 Mon Sep 17 00:00:00 2001 From: daan Date: Mon, 26 Aug 2019 12:42:25 -0700 Subject: [PATCH 10/10] remove atomic_iread --- include/mimalloc-atomic.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index 9549cbc3..8b254d3e 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -114,9 +114,6 @@ static inline void* mi_atomic_exchange_ptr(volatile 
_Atomic(void*)* p, void* exc return (void*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)p, (uintptr_t)exchange); } -static inline intptr_t mi_atomic_iread(volatile intptr_t* p) { - return (intptr_t)mi_atomic_read( (volatile uintptr_t*)p ); -} #ifdef _MSC_VER #define WIN32_LEAN_AND_MEAN