first working tls on macOS using interpose; still slow

daan 2020-01-29 22:46:44 -08:00
parent b3dae128de
commit 03b363a1c2
9 changed files with 155 additions and 120 deletions

View File

@ -247,7 +247,7 @@ if (MI_BUILD_TESTS MATCHES "ON")
target_compile_definitions(mimalloc-test-stress PRIVATE ${mi_defines})
target_compile_options(mimalloc-test-stress PRIVATE ${mi_cflags})
target_include_directories(mimalloc-test-stress PRIVATE include)
target_link_libraries(mimalloc-test-stress PRIVATE mimalloc-static ${mi_libraries})
target_link_libraries(mimalloc-test-stress PRIVATE mimalloc ${mi_libraries})
enable_testing()
add_test(test_api, mimalloc-test-api)

View File

@ -33,7 +33,7 @@ terms of the MIT license. A copy of the license can be found in the file
#else
#define mi_decl_noinline
#define mi_decl_thread __thread // hope for the best :-)
#define mi_decl_cache_align
#endif
@ -51,6 +51,7 @@ void _mi_random_init(mi_random_ctx_t* ctx);
void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
uintptr_t _mi_random_next(mi_random_ctx_t* ctx);
uintptr_t _mi_heap_random_next(mi_heap_t* heap);
uintptr_t _os_random_weak(uintptr_t extra_seed);
static inline uintptr_t _mi_random_shuffle(uintptr_t x);
// init.c
@ -233,7 +234,7 @@ static inline size_t _mi_wsize_from_size(size_t size) {
// Overflow detecting multiply
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
#include <limits.h> // UINT_MAX, ULONG_MAX
#if (SIZE_MAX == UINT_MAX)
@ -274,18 +275,24 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o
extern mi_heap_t _mi_heap_main; // statically allocated main backing heap
extern bool _mi_process_is_initialized;
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
static inline mi_heap_t* mi_get_default_heap(void) {
#ifdef MI_TLS_RECURSE_GUARD
extern mi_heap_t* _mi_get_default_heap_tls_safe(void);
static inline mi_heap_t* mi_get_default_heap(void) {
// on some BSD platforms, like macOS, the dynamic loader calls `malloc`
// to initialize thread local data. To avoid recursion, we need to avoid
// accessing the thread local `_mi_heap_default` until our module is loaded
// and use the statically allocated main heap until that time.
// TODO: patch ourselves dynamically to avoid this check every time?
if (!_mi_process_is_initialized) return &_mi_heap_main;
#endif
return _mi_get_default_heap_tls_safe();
#else
extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from
static inline mi_heap_t* mi_get_default_heap(void) {
return _mi_heap_default;
#endif
}
static inline bool mi_heap_is_default(const mi_heap_t* heap) {
@ -302,6 +309,7 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
}
static inline uintptr_t _mi_ptr_cookie(const void* p) {
mi_assert_internal(_mi_heap_main.cookie != 0);
return ((uintptr_t)p ^ _mi_heap_main.cookie);
}
@ -345,7 +353,7 @@ static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, con
// Get the page containing the pointer
static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) {
uintptr_t idx = _mi_segment_page_idx_of(segment, p);
return &((mi_segment_t*)segment)->pages[idx];
}
@ -411,14 +419,14 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t*
return mi_tf_make(block, mi_tf_delayed(tf));
}
// are all blocks in a page freed?
// note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
static inline bool mi_page_all_free(const mi_page_t* page) {
mi_assert_internal(page != NULL);
return (page->used == 0);
}
// are there any available blocks?
static inline bool mi_page_has_any_available(const mi_page_t* page) {
mi_assert_internal(page != NULL && page->reserved > 0);
return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
@ -466,11 +474,11 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
/* -------------------------------------------------------------------
Encoding/Decoding the free list next pointers
This is to protect against buffer overflow exploits where the
free list is mutated. Many hardened allocators xor the next pointer `p`
with a secret key `k1`, as `p^k1`. This prevents overwriting with known
values but might still be too weak: if the attacker can guess
the pointer `p` this can reveal `k1` (since `p^k1^p == k1`).
Moreover, if multiple blocks can be read as well, the attacker can
xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
about the pointers (and subsequently `k1`).
@ -478,9 +486,9 @@ about the pointers (and subsequently `k1`).
Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
Since these operations are not associative, the above approaches do not
work so well any more even if the `p` can be guesstimated. For example,
for the read case we can subtract two entries to discard the `+k1` term,
but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
We include the left-rotation since xor and addition are otherwise linear
in the lowest bit. Finally, both keys are unique per page which reduces
the re-use of keys by a large factor.
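
For illustration, the scheme can be captured in a few lines of standalone C. The helper names `encode_ptr`/`decode_ptr` and the plain key parameters are illustrative only; in mimalloc the keys live in the page and the rotation helpers are internal.

#include <stdint.h>

// Illustrative sketch (not part of the commit): encode as ((p^k2)<<<k1)+k1
// and invert it by undoing each step in reverse order.
static inline uintptr_t rotl_bits(uintptr_t x, uintptr_t shift) {
  const uintptr_t bits = sizeof(uintptr_t)*8;
  shift %= bits;
  return (shift==0 ? x : (x << shift) | (x >> (bits - shift)));
}
static inline uintptr_t rotr_bits(uintptr_t x, uintptr_t shift) {
  const uintptr_t bits = sizeof(uintptr_t)*8;
  shift %= bits;
  return (shift==0 ? x : (x >> shift) | (x << (bits - shift)));
}
static inline uintptr_t encode_ptr(const void* p, uintptr_t k1, uintptr_t k2) {
  return rotl_bits((uintptr_t)p ^ k2, k1) + k1;
}
static inline void* decode_ptr(uintptr_t e, uintptr_t k1, uintptr_t k2) {
  return (void*)(rotr_bits(e - k1, k1) ^ k2);
}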

View File

@ -41,6 +41,10 @@ terms of the MIT license. A copy of the license can be found in the file
#endif
#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE)
static void mi_free_tls_safe(void* p) {
if (mi_unlikely(_mi_preloading())) return;
mi_free(p);
}
// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1`
// See: <https://books.google.com/books?id=K8vUkpOXhN4C&pg=PA73>
struct mi_interpose_s {
@ -54,7 +58,7 @@ terms of the MIT license. A copy of the license can be found in the file
MI_INTERPOSE_MI(malloc),
MI_INTERPOSE_MI(calloc),
MI_INTERPOSE_MI(realloc),
MI_INTERPOSE_MI(free),
MI_INTERPOSEX(free,mi_free_tls_safe),
MI_INTERPOSE_MI(strdup),
MI_INTERPOSE_MI(strndup)
};
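
For readers unfamiliar with the mechanism: dyld interposition works by emitting replacement/target pairs into a `__DATA,__interpose` section of the dylib. A minimal self-contained sketch (the names `my_interpose_s`, `my_malloc`, and the counter are illustrative, not mimalloc's macros) looks roughly like this:

#include <stdlib.h>

// Illustrative sketch (not part of the commit): each entry pairs a
// replacement function with the system function it replaces.
struct my_interpose_s {
  const void* replacement;
  const void* target;
};

static size_t my_malloc_count = 0;
static void* my_malloc(size_t size) {
  my_malloc_count++;    // e.g. count allocations
  return malloc(size);  // dyld does not apply the interpose tuples to the
                        // image that defines them, so this still reaches
                        // the system malloc
}

__attribute__((used)) static const struct my_interpose_s my_interposes[]
  __attribute__((section("__DATA,__interpose"))) = {
  { (const void*)&my_malloc, (const void*)&malloc },
};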
@ -194,4 +198,3 @@ int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_me
#endif
#endif // MI_MALLOC_OVERRIDE && !_WIN32

View File

@ -21,7 +21,7 @@ terms of the MIT license. A copy of the license can be found in the file
// Fast allocation in a page: just pop from the free list.
// Fall back to generic allocation only if the list is empty.
extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
mi_block_t* block = page->free;
if (mi_unlikely(block == NULL)) {
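
The hunk is cut off here; the fast path it describes is essentially a free-list pop. A simplified sketch, ignoring the assertions, statistics, and padding handling of the real `_mi_page_malloc` (assumes the mimalloc-internal definitions):

// Illustrative sketch (not part of the commit) of the fast path: pop the
// head of the page-local free list, or fall back to the generic path.
static inline void* page_malloc_sketch(mi_heap_t* heap, mi_page_t* page, size_t size) {
  mi_block_t* block = page->free;
  if (mi_unlikely(block == NULL)) {
    return _mi_malloc_generic(heap, size);  // slow path: refill the list or pick another page
  }
  page->free = mi_block_next(page, block);  // advance the (encoded) free list
  page->used++;
  return block;
}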

View File

@ -104,9 +104,9 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
static mi_tld_t tld_main = {
0, false,
&_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
0, 0, 0, 0, 0, 0, NULL,
tld_main_stats, tld_main_os
}, // segments
{ 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
@ -124,9 +124,9 @@ mi_heap_t _mi_heap_main = {
MI_PAGE_QUEUES_EMPTY,
ATOMIC_VAR_INIT(NULL),
0, // thread id
MI_INIT_COOKIE, // initial cookie
{ MI_INIT_COOKIE, MI_INIT_COOKIE }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0}, {0}, 0 }, // random
0, // initial cookie
{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0 }, // random
0, // page count
false // can reclaim
};
@ -148,14 +148,15 @@ typedef struct mi_thread_data_s {
// Initialize the thread local default heap, called from `mi_thread_init`
static bool _mi_heap_init(void) {
if (mi_heap_is_initialized(_mi_heap_default)) return true;
if (mi_heap_is_initialized(mi_get_default_heap())) return true;
if (_mi_is_main_thread()) {
mi_assert_internal(_mi_heap_main.thread_id != 0);
// the main heap is statically allocated
_mi_heap_set_default_direct(&_mi_heap_main);
mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
//mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap());
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation?
if (td == NULL) {
_mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n");
@ -170,7 +171,7 @@ static bool _mi_heap_init(void) {
heap->cookie = _mi_heap_random_next(heap) | 1;
heap->key[0] = _mi_heap_random_next(heap);
heap->key[1] = _mi_heap_random_next(heap);
heap->tld = tld;
tld->heap_backing = heap;
tld->segments.stats = &tld->stats;
tld->segments.os = &tld->os;
@ -265,8 +266,9 @@ static void _mi_thread_done(mi_heap_t* default_heap);
#endif
// Set up handlers so `mi_thread_done` is called automatically
static bool tls_initialized = false; // fine if it races
static void mi_process_setup_auto_thread_done(void) {
static bool tls_initialized = false; // fine if it races
if (tls_initialized) return;
tls_initialized = true;
#if defined(_WIN32) && defined(MI_SHARED_LIB)
@ -317,7 +319,9 @@ static void _mi_thread_done(mi_heap_t* heap) {
void _mi_heap_set_default_direct(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
#ifndef MI_TLS_RECURSE_GUARD
_mi_heap_default = heap;
#endif
// ensure the default heap is passed to `_mi_thread_done`
// setting to a non-NULL value also ensures `mi_thread_done` is called.
@ -330,7 +334,11 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) {
#endif
}
mi_heap_t* _mi_get_default_heap_tls_safe(void) {
if (mi_unlikely(mi_pthread_key==0)) return (mi_heap_t*)&_mi_heap_empty;
mi_heap_t* heap = pthread_getspecific(mi_pthread_key);
return (mi_likely(heap!=NULL) ? heap : (mi_heap_t*)&_mi_heap_empty);
}
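
The `mi_pthread_key` read here is created during thread setup, which this hunk does not show. A hedged sketch of the usual pattern, where the key's destructor triggers the per-thread cleanup:

#include <pthread.h>

// Illustrative sketch (not part of the commit); the actual key creation and
// destructor live in mimalloc's thread-setup code.
static pthread_key_t mi_pthread_key_sketch;

static void pthread_done_sketch(void* value) {
  if (value != NULL) _mi_thread_done((mi_heap_t*)value);  // per-thread heap cleanup
}

static void pthread_key_init_sketch(void) {
  pthread_key_create(&mi_pthread_key_sketch, &pthread_done_sketch);
}

// _mi_heap_set_default_direct presumably stores the heap with
// pthread_setspecific(mi_pthread_key, heap), so the getter above can find it
// and the destructor fires on thread exit.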
// --------------------------------------------------------
// Run functions on process init/done, and thread init/done
@ -339,6 +347,7 @@ static void mi_process_done(void);
static bool os_preloading = true; // true until this module is initialized
static bool mi_redirected = false; // true if malloc redirects to mi_malloc
bool _mi_tls_initialized = false;
// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
bool _mi_preloading() {
@ -383,7 +392,10 @@ static void mi_allocator_done() {
// Called once by the process loader
static void mi_process_load(void) {
volatile mi_heap_t* dummy = _mi_heap_default; // access TLS to allocate it before setting tls_initialized to true;
UNUSED(dummy);
os_preloading = false;
_mi_tls_initialized = true;
atexit(&mi_process_done);
_mi_options_init();
mi_process_init();
@ -398,26 +410,26 @@ static void mi_process_load(void) {
}
}
void _mi_heap_main_init(void) {
if (_mi_heap_main.cookie == 0) {
_mi_heap_main.thread_id = _mi_thread_id();
_mi_heap_main.cookie = _os_random_weak((uintptr_t)&_mi_heap_main_init);
_mi_random_init(&_mi_heap_main.random);
_mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main);
}
}
// Initialize the process; called by thread_init or the process loader
void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
if (_mi_process_is_initialized) return;
// access _mi_heap_default before setting _mi_process_is_initialized to ensure
// that the TLS slot is allocated without getting into recursion on macOS
// when using dynamic linking with interpose.
mi_get_default_heap();
_mi_process_is_initialized = true;
_mi_heap_main.thread_id = _mi_thread_id();
_mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id);
_mi_random_init(&_mi_heap_main.random);
#ifndef __APPLE__ // TODO: fix this? cannot update cookie if allocation already happened..
_mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.key[0] = _mi_heap_random_next(&_mi_heap_main);
_mi_heap_main.key[1] = _mi_heap_random_next(&_mi_heap_main);
#endif
mi_process_setup_auto_thread_done();
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
_mi_os_init();
_mi_heap_main_init();
#if (MI_DEBUG)
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif

View File

@ -53,7 +53,7 @@ static mi_option_desc_t options[_mi_option_last] =
// stable options
{ MI_DEBUG, UNINIT, MI_OPTION(show_errors) },
{ 0, UNINIT, MI_OPTION(show_stats) },
{ 0, UNINIT, MI_OPTION(verbose) },
{ 1, UNINIT, MI_OPTION(verbose) },
// the following options are experimental and not all combinations make sense.
{ 1, UNINIT, MI_OPTION(eager_commit) }, // commit on demand
@ -239,16 +239,30 @@ static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT
// inside the C runtime causes another message.
static mi_decl_thread bool recurse = false;
static bool mi_recurse_enter(void) {
#ifdef MI_TLS_RECURSE_GUARD
if (_mi_preloading()) return true;
#endif
if (recurse) return false;
recurse = true;
return true;
}
static void mi_recurse_exit(void) {
#ifdef MI_TLS_RECURSE_GUARD
if (_mi_preloading()) return;
#endif
recurse = false;
}
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message) {
if (recurse) return;
if (!mi_recurse_enter()) return;
if (out==NULL || (FILE*)out==stdout || (FILE*)out==stderr) { // TODO: use mi_out_stderr for stderr?
out = mi_out_get_default(&arg);
}
recurse = true;
if (prefix != NULL) out(prefix,arg);
out(message,arg);
recurse = false;
return;
mi_recurse_exit();
}
// Define our own limited `fprintf` that avoids memory allocation.
@ -256,14 +270,12 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me
static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) {
char buf[512];
if (fmt==NULL) return;
if (recurse) return;
recurse = true;
if (!mi_recurse_enter()) return;
vsnprintf(buf,sizeof(buf)-1,fmt,args);
recurse = false;
mi_recurse_exit();
_mi_fputs(out,arg,prefix,buf);
}
void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) {
va_list args;
va_start(args,fmt);
@ -290,7 +302,7 @@ void _mi_verbose_message(const char* fmt, ...) {
static void mi_show_error_message(const char* fmt, va_list args) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);
}
void _mi_warning_message(const char* fmt, ...) {

View File

@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file
/* ----------------------------------------------------------------------------
We use our own PRNG to keep predictable performance of random number generation
and to avoid implementations that use a lock. We only use the OS provided
random source to initialize the initial seeds. Since we do not need ultimate
performance but we do rely on the security (for secret cookies in secure mode)
we use a cryptographically secure generator (chacha20).
@ -21,11 +21,11 @@ we use a cryptographically secure generator (chacha20).
/* ----------------------------------------------------------------------------
Chacha20 implementation as the original algorithm with a 64-bit nonce
and counter: https://en.wikipedia.org/wiki/Salsa20
The input matrix has sixteen 32-bit values:
Position 0 to 3: constant key
Position 4 to 11: the key
Position 12 to 13: the counter.
Position 14 to 15: the nonce.
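
The layout matches the standard ChaCha state; as a small sketch, filling the 16-word input matrix from a key, counter, and nonce looks like this (the constants are the usual "expand 32-byte k" words; `chacha_init` below builds the same matrix from the byte-level key):

#include <stdint.h>
#include <string.h>

// Illustrative sketch (not part of the commit) of the state layout above.
static void chacha_state_sketch(uint32_t input[16], const uint32_t key32[8],
                                uint64_t counter, uint64_t nonce) {
  input[0] = 0x61707865; input[1] = 0x3320646e;  // "expa", "nd 3"
  input[2] = 0x79622d32; input[3] = 0x6b206574;  // "2-by", "te k"
  memcpy(&input[4], key32, 8*sizeof(uint32_t));  // positions 4..11: the key
  input[12] = (uint32_t)counter;                 // positions 12..13: the counter
  input[13] = (uint32_t)(counter >> 32);
  input[14] = (uint32_t)nonce;                   // positions 14..15: the nonce
  input[15] = (uint32_t)(nonce >> 32);
}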
@ -44,8 +44,8 @@ static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d
x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
}
static void chacha_block(mi_random_ctx_t* ctx)
{
// scramble into `x`
uint32_t x[16];
for (size_t i = 0; i < 16; i++) {
@ -72,8 +72,8 @@ static void chacha_block(mi_random_ctx_t* ctx)
ctx->input[12] += 1;
if (ctx->input[12] == 0) {
ctx->input[13] += 1;
if (ctx->input[13] == 0) { // and keep increasing into the nonce
ctx->input[14] += 1;
}
}
}
@ -83,7 +83,7 @@ static uint32_t chacha_next32(mi_random_ctx_t* ctx) {
chacha_block(ctx);
ctx->output_available = 16; // (assign again to suppress static analysis warning)
}
const uint32_t x = ctx->output[16 - ctx->output_available];
ctx->output[16 - ctx->output_available] = 0; // reset once the data is handed out
ctx->output_available--;
return x;
@ -94,9 +94,9 @@ static inline uint32_t read32(const uint8_t* p, size_t idx32) {
return ((uint32_t)p[i+0] | (uint32_t)p[i+1] << 8 | (uint32_t)p[i+2] << 16 | (uint32_t)p[i+3] << 24);
}
static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t nonce)
{
// since we only use chacha for randomness (and not encryption) we
// do not _need_ to read 32-bit values as little endian but we do anyways
// just for being compatible :-)
memset(ctx, 0, sizeof(*ctx));
@ -110,7 +110,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
ctx->input[12] = 0;
ctx->input[13] = 0;
ctx->input[14] = (uint32_t)nonce;
ctx->input[15] = (uint32_t)(nonce >> 32);
}
static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
@ -184,7 +184,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
arc4random_buf(buf, buf_len);
return true;
}
#elif defined(__linux__)
#include <sys/syscall.h>
#include <unistd.h>
#include <sys/types.h>
@ -241,8 +241,8 @@ static bool os_random_buf(void* buf, size_t buf_len) {
#include <time.h>
#endif
static uintptr_t os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&os_random_weak ^ extra_seed; // ASLR makes the address random
uintptr_t _os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random
#if defined(_WIN32)
LARGE_INTEGER pcount;
QueryPerformanceCounter(&pcount);
@ -267,10 +267,10 @@ static uintptr_t os_random_weak(uintptr_t extra_seed) {
void _mi_random_init(mi_random_ctx_t* ctx) {
uint8_t key[32];
if (!os_random_buf(key, sizeof(key))) {
// if we fail to get random data from the OS, we fall back to a
// weak random source based on the current time
_mi_warning_message("unable to use secure randomness\n");
uintptr_t x = os_random_weak(0);
uintptr_t x = _os_random_weak(0);
for (size_t i = 0; i < 8; i++) { // key is eight 32-bit words.
x = _mi_random_shuffle(x);
((uint32_t*)key)[i] = (uint32_t)x;
@ -280,7 +280,7 @@ void _mi_random_init(mi_random_ctx_t* ctx) {
}
/* --------------------------------------------------------
test vectors from <https://tools.ietf.org/html/rfc8439>
----------------------------------------------------------- */
/*
static bool array_equals(uint32_t* x, uint32_t* y, size_t n) {

View File

@ -17,9 +17,9 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_
/* --------------------------------------------------------------------------------
Segment allocation
We allocate pages inside bigger "segments" (4mb on 64-bit). This is to avoid
splitting VMA's on Linux and reduce fragmentation on other OS's.
Each thread owns its own segments.
Currently we have:
- small pages (64kb), 64 in one segment
@ -154,14 +154,14 @@ static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t*
for (size_t i = 0; i < segment->capacity; i++) {
const mi_page_t* const page = &segment->pages[i];
if (!page->segment_in_use) {
nfree++;
}
if (page->segment_in_use || page->is_reset) {
mi_assert_expensive(!mi_pages_reset_contains(page, tld));
}
}
mi_assert_internal(nfree + segment->used == segment->capacity);
mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0
// mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0
mi_assert_internal(segment->page_kind == MI_PAGE_HUGE ||
(mi_segment_page_size(segment) * segment->capacity == segment->segment_size));
return true;
@ -286,7 +286,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen
mi_assert_expensive(!mi_pages_reset_contains(page, tld));
mi_assert_internal(_mi_page_segment(page)==segment);
if (!mi_option_is_enabled(mi_option_page_reset)) return;
if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return;
if (mi_option_get(mi_option_reset_delay) == 0) {
// reset immediately?
@ -295,7 +295,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen
else {
// otherwise push on the delayed page reset queue
mi_page_queue_t* pq = &tld->pages_reset;
// push on top
mi_page_reset_set_expire(page);
page->next = pq->first;
page->prev = NULL;
@ -316,7 +316,7 @@ static void mi_pages_reset_remove(mi_page_t* page, mi_segments_tld_t* tld) {
mi_page_queue_t* pq = &tld->pages_reset;
mi_assert_internal(pq!=NULL);
mi_assert_internal(!page->segment_in_use);
mi_assert_internal(mi_pages_reset_contains(page, tld));
if (page->prev != NULL) page->prev->next = page->next;
if (page->next != NULL) page->next->prev = page->prev;
if (page == pq->last) pq->last = page->prev;
@ -332,19 +332,19 @@ static void mi_pages_reset_remove_all_in_segment(mi_segment_t* segment, bool for
if (!page->segment_in_use && page->is_committed && !page->is_reset) {
mi_pages_reset_remove(page, tld);
if (force_reset) {
mi_page_reset(segment, page, 0, tld);
}
}
else {
mi_assert_internal(mi_page_not_in_queue(page,tld));
}
}
}
static void mi_reset_delayed(mi_segments_tld_t* tld) {
if (!mi_option_is_enabled(mi_option_page_reset)) return;
mi_msecs_t now = _mi_clock_now();
mi_page_queue_t* pq = &tld->pages_reset;
// from oldest up to the first that has not expired yet
mi_page_t* page = pq->last;
while (page != NULL && mi_page_reset_is_expired(page,now)) {
@ -358,7 +358,7 @@ static void mi_reset_delayed(mi_segments_tld_t* tld) {
pq->last = page;
if (page != NULL){
page->next = NULL;
}
else {
pq->first = NULL;
}
@ -540,7 +540,7 @@ void _mi_segment_thread_collect(mi_segments_tld_t* tld) {
}
mi_assert_internal(tld->cache_count == 0);
mi_assert_internal(tld->cache == NULL);
#if MI_DEBUG>=2
if (!_mi_is_main_thread()) {
mi_assert_internal(tld->pages_reset.first == NULL);
mi_assert_internal(tld->pages_reset.last == NULL);
@ -684,7 +684,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) {
UNUSED(force);
mi_assert(segment != NULL);
// note: don't reset pages even on abandon as the whole segment is freed? (and ready for reuse)
bool force_reset = (force && mi_option_is_enabled(mi_option_abandoned_page_reset));
mi_pages_reset_remove_all_in_segment(segment, force_reset, tld);
@ -716,7 +716,7 @@ static bool mi_segment_has_free(const mi_segment_t* segment) {
static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) {
mi_assert_internal(_mi_page_segment(page) == segment);
mi_assert_internal(!page->segment_in_use);
// set in-use before doing unreset to prevent delayed reset
mi_pages_reset_remove(page, tld);
page->segment_in_use = true;
@ -756,7 +756,7 @@ static void mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg
static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);
// clear page data; can be called on abandoned segments
static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool allow_reset, mi_segments_tld_t* tld)
{
mi_assert_internal(page->segment_in_use);
mi_assert_internal(mi_page_all_free(page));
@ -787,7 +787,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a
segment->used--;
// add to the free page list for reuse/reset
if (allow_reset) {
mi_pages_reset_add(segment, page, tld);
}
}
@ -841,12 +841,12 @@ Note: the current implementation is one possible design;
another way might be to keep track of abandoned segments
in the regions. This would have the advantage of keeping
all concurrent code in one place and not needing to deal
with ABA issues. The drawback is that it is unclear how to
scan abandoned segments efficiently in that case as they
would be spread among all other segments in the regions.
----------------------------------------------------------- */
// Use the bottom 20-bits (on 64-bit) of the aligned segment pointers
// to put in a tag that increments on update to avoid the A-B-A problem.
#define MI_TAGGED_MASK MI_SEGMENT_MASK
typedef uintptr_t mi_tagged_segment_t;
@ -862,7 +862,7 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se
}
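
Only the tail of `mi_tagged_segment` is visible in this hunk; the idea is to pack the segment pointer and a small counter into one word. A sketch under the assumption that segments are `MI_SEGMENT_SIZE`-aligned (so the low bits are free for the tag), with illustrative names:

// Illustrative sketch (not part of the commit): the tag occupies the low
// alignment bits and is bumped on every update, so a recycled pointer value
// still fails a compare-and-swap (avoiding the A-B-A problem).
static mi_segment_t* tagged_ptr_sketch(mi_tagged_segment_t ts) {
  return (mi_segment_t*)(ts & ~MI_TAGGED_MASK);
}

static mi_tagged_segment_t tagged_make_sketch(mi_segment_t* segment, mi_tagged_segment_t prev) {
  uintptr_t tag = ((prev & MI_TAGGED_MASK) + 1) & MI_TAGGED_MASK;
  return ((uintptr_t)segment | tag);
}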
// This is a list of visited abandoned pages that were full at the time.
// this list migrates to `abandoned` when that becomes NULL. The use of
// this list reduces contention and the rate at which segments are visited.
static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL
@ -888,7 +888,7 @@ static void mi_abandoned_visited_push(mi_segment_t* segment) {
}
// Move the visited list to the abandoned list.
static bool mi_abandoned_visited_revisit(void)
{
// quick check if the visited list is empty
if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false;
@ -954,12 +954,12 @@ static mi_segment_t* mi_abandoned_pop(void) {
segment = mi_tagged_segment_ptr(ts);
if (mi_likely(segment == NULL)) {
if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
return NULL;
}
}
// Do a pop. We use a reader count to prevent
// a segment to be decommitted while a read is still pending,
// and a tagged pointer to prevent A-B-A link corruption.
// (this is called from `memory.c:_mi_mem_free` for example)
mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted
@ -1024,7 +1024,7 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
----------------------------------------------------------- */
// Possibly clear pages and check if free space is available
static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free)
{
mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
bool has_page = false;
@ -1032,17 +1032,17 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool
size_t pages_used_empty = 0;
for (size_t i = 0; i < segment->capacity; i++) {
mi_page_t* page = &segment->pages[i];
if (page->segment_in_use) {
pages_used++;
// ensure used count is up to date and collect potential concurrent frees
_mi_page_free_collect(page, false);
if (mi_page_all_free(page)) {
// if everything free already, page can be reused for some block size
// note: don't clear the page yet as we can only OS reset it once it is reclaimed
pages_used_empty++;
has_page = true;
}
else if (page->xblock_size == block_size && mi_page_has_any_available(page)) {
// a page has available free blocks of the right size
has_page = true;
}
@ -1051,7 +1051,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool
// whole empty page
has_page = true;
}
}
mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty);
if (all_pages_free != NULL) {
*all_pages_free = ((pages_used - pages_used_empty) == 0);
@ -1100,7 +1100,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap,
if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; }
}
}
}
else if (page->is_committed && !page->is_reset) { // not in-use, and not reset yet
// note: do not reset as this includes pages that were not touched before
// mi_pages_reset_add(segment, page, tld);
@ -1141,17 +1141,17 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size,
// free the segment (by forced reclaim) to make it available to other threads.
// note1: we prefer to free a segment as that might lead to reclaiming another
// segment that is still partially used.
// note2: we could in principle optimize this by skipping reclaim and directly
// freeing but that would violate some invariants temporarily)
mi_segment_reclaim(segment, heap, 0, NULL, tld);
}
else if (has_page && segment->page_kind == page_kind) {
// found a free page of the right kind, or page of the right block_size with free space
// we return the result of reclaim (which is usually `segment`) as it might free
// the segment due to concurrent frees (in which case `NULL` is returned).
return mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
}
else if (segment->abandoned_visits >= 3) {
// always reclaim on 3rd visit to limit the list length.
mi_segment_reclaim(segment, heap, 0, NULL, tld);
}
@ -1165,12 +1165,12 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size,
/* -----------------------------------------------------------
Reclaim or allocate
----------------------------------------------------------- */
static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
{
mi_assert_internal(page_kind <= MI_PAGE_LARGE);
mi_assert_internal(block_size < MI_HUGE_BLOCK_SIZE);
// 1. try to get a segment from our cache
mi_segment_t* segment = mi_segment_cache_pop(MI_SEGMENT_SIZE, tld);
@ -1191,7 +1191,7 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_s
return segment;
}
// 3. otherwise allocate a fresh segment
return mi_segment_alloc(0, page_kind, page_shift, tld, os_tld);
}
@ -1216,11 +1216,11 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t*
// Allocate a page inside a segment. Requires that the page has free pages
static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) {
mi_assert_internal(mi_segment_has_free(segment));
return mi_segment_find_free(segment, tld);
}
static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
// find an available segment in the segment free queue
mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld);
if (mi_segment_queue_is_empty(free_queue)) {
// possibly allocate or reclaim a fresh segment
@ -1275,7 +1275,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
}
/* -----------------------------------------------------------
Page allocation
----------------------------------------------------------- */
mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {

View File

@ -20,7 +20,7 @@ terms of the MIT license.
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <mimalloc.h>
// #include <mimalloc.h>
// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
//
@ -38,7 +38,7 @@ static bool allow_large_objects = true; // allow very large objects?
static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`?
#ifdef USE_STD_MALLOC
#ifndef USE_STD_MALLOC
#define custom_calloc(n,s) calloc(n,s)
#define custom_realloc(p,s) realloc(p,s)
#define custom_free(p) free(p)
@ -188,7 +188,7 @@ static void test_stress(void) {
free_items(p);
}
}
mi_collect(false);
// mi_collect(false);
#ifndef NDEBUG
if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
#endif
@ -206,7 +206,7 @@ static void leak(intptr_t tid) {
}
}
static void test_leak(void) {
for (int n = 0; n < ITER; n++) {
run_os_threads(THREADS, &leak);
mi_collect(false);
@ -242,15 +242,15 @@ int main(int argc, char** argv) {
// Run ITER full iterations where half the objects in the transfer buffer survive to the next round.
srand(0x7feb352d);
mi_stats_reset();
// mi_stats_reset();
#ifdef STRESS
test_stress();
#else
test_leak();
#endif
mi_collect(true);
mi_stats_print(NULL);
// mi_collect(true);
// mi_stats_print(NULL);
//bench_end_program();
return 0;
}
@ -262,7 +262,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress;
#include <windows.h>
static DWORD WINAPI thread_entry(LPVOID param) {
thread_entry_fun((intptr_t)param);
return 0;
}