Merge branch 'dev-exp' into dev

2019-11-22 09:22:03 -08:00 · 2019-11-22 09:22:03 -08:00 · 31fbe9793d
commit 31fbe9793d
parent acb03c5497 50575b12c0
22 changed files with 1582 additions and 788 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.0)
 project(libmimalloc C CXX)
-include("cmake/mimalloc-config-version.cmake")
+
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 17)

@ -14,9 +14,12 @@ option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanis
 option(MI_BUILD_TESTS       "Build test executables" ON)
 option(MI_CHECK_FULL        "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)

+include("cmake/mimalloc-config-version.cmake")
+
 set(mi_sources
    src/stats.c
    src/os.c
+    src/arena.c
    src/memory.c
    src/segment.c
    src/page.c
--- a/ide/vs2017/mimalloc-override.vcxproj
+++ b/ide/vs2017/mimalloc-override.vcxproj
@ -231,6 +231,7 @@
    </ClCompile>
    <ClCompile Include="..\..\src\alloc-posix.c" />
    <ClCompile Include="..\..\src\alloc.c" />
+    <ClCompile Include="..\..\src\arena.c" />
    <ClCompile Include="..\..\src\heap.c" />
    <ClCompile Include="..\..\src\init.c" />
    <ClCompile Include="..\..\src\memory.c" />
--- a/ide/vs2017/mimalloc-override.vcxproj.filters
+++ b/ide/vs2017/mimalloc-override.vcxproj.filters
@ -70,5 +70,8 @@
    <ClCompile Include="..\..\src\alloc-posix.c">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\..\src\arena.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
 </Project>
--- a/ide/vs2017/mimalloc.vcxproj
+++ b/ide/vs2017/mimalloc.vcxproj
@ -217,6 +217,7 @@
    </ClCompile>
    <ClCompile Include="..\..\src\alloc-posix.c" />
    <ClCompile Include="..\..\src\alloc.c" />
+    <ClCompile Include="..\..\src\arena.c" />
    <ClCompile Include="..\..\src\heap.c" />
    <ClCompile Include="..\..\src\init.c" />
    <ClCompile Include="..\..\src\memory.c" />
--- a/ide/vs2017/mimalloc.vcxproj.filters
+++ b/ide/vs2017/mimalloc.vcxproj.filters
@ -53,6 +53,9 @@
    <ClCompile Include="..\..\src\alloc-posix.c">
      <Filter>Source Files</Filter>
    </ClCompile>
+    <ClCompile Include="..\..\src\arena.c">
+      <Filter>Source Files</Filter>
+    </ClCompile>
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
--- a/ide/vs2019/mimalloc-override.vcxproj
+++ b/ide/vs2019/mimalloc-override.vcxproj
@ -123,7 +123,7 @@
      <SDLCheck>true</SDLCheck>
      <ConformanceMode>true</ConformanceMode>
      <AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);</PreprocessorDefinitions>
+      <PreprocessorDefinitions>MI_DEBUG=3;MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions);</PreprocessorDefinitions>
      <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
      <SupportJustMyCode>false</SupportJustMyCode>
      <CompileAs>Default</CompileAs>
@ -231,6 +231,10 @@
    </ClCompile>
    <ClCompile Include="..\..\src\alloc-posix.c" />
    <ClCompile Include="..\..\src\alloc.c" />
+    <ClCompile Include="..\..\src\arena.c" />
+    <ClCompile Include="..\..\src\bitmap.inc.c"> <!-- textually included by arena.c: exclude from the build in every configuration -->
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </ClCompile>
    <ClCompile Include="..\..\src\heap.c" />
    <ClCompile Include="..\..\src\init.c" />
    <ClCompile Include="..\..\src\memory.c" />
--- a/ide/vs2019/mimalloc.vcxproj
+++ b/ide/vs2019/mimalloc.vcxproj
@ -116,7 +116,7 @@
      <SDLCheck>true</SDLCheck>
      <ConformanceMode>true</ConformanceMode>
      <AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
-      <PreprocessorDefinitions>MI_DEBUG=1;%(PreprocessorDefinitions);</PreprocessorDefinitions>
+      <PreprocessorDefinitions>MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
      <CompileAs>CompileAsCpp</CompileAs>
      <SupportJustMyCode>false</SupportJustMyCode>
      <LanguageStandard>stdcpp17</LanguageStandard>
@ -217,6 +217,10 @@
    </ClCompile>
    <ClCompile Include="..\..\src\alloc-posix.c" />
    <ClCompile Include="..\..\src\alloc.c" />
+    <ClCompile Include="..\..\src\arena.c" />
+    <ClCompile Include="..\..\src\bitmap.inc.c"> <!-- textually included by arena.c: exclude from the build in every configuration -->
+      <ExcludedFromBuild>true</ExcludedFromBuild>
+    </ClCompile>
    <ClCompile Include="..\..\src\heap.c" />
    <ClCompile Include="..\..\src\init.c" />
    <ClCompile Include="..\..\src\memory.c" />
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@ -36,6 +36,13 @@ static inline void mi_atomic_add64(volatile int64_t* p, int64_t add);
 // Atomically add a value; returns the previous value. Memory ordering is relaxed.
 static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add);

+// Atomically "and" a value; returns the previous value. Memory ordering is relaxed.
+static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x);
+
+// Atomically "or" a value; returns the previous value. Memory ordering is relaxed.
+static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x);
+
+
 // Atomically compare and exchange a value; returns `true` if successful. 
 // May fail spuriously. Memory ordering as release on success, and relaxed on failure.
 // (Note: expected and desired are in opposite order from atomic_compare_exchange)
@ -121,22 +128,28 @@ static inline void* mi_atomic_exchange_ptr(volatile _Atomic(void*)* p, void* exc
 #include <intrin.h>
 #ifdef _WIN64
 typedef LONG64   msc_intptr_t;
-#define RC64(f)  f##64
+#define MI_64(f) f##64
 #else
 typedef LONG     msc_intptr_t;
-#define RC64(f)  f
+#define MI_64(f) f
 #endif
 static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add) {
-  return (intptr_t)RC64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
+  return (intptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
+}
+static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
+  // pointer-sized interlocked "and"; the intrinsic hands back the value stored before the operation
+  volatile msc_intptr_t* const target = (volatile msc_intptr_t*)p;
+  return (uintptr_t)MI_64(_InterlockedAnd)(target, (msc_intptr_t)x);
+}
+static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
+  // pointer-sized interlocked "or"; the intrinsic hands back the value stored before the operation
+  volatile msc_intptr_t* const target = (volatile msc_intptr_t*)p;
+  return (uintptr_t)MI_64(_InterlockedOr)(target, (msc_intptr_t)x);
 }
 static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
-  return (expected == (uintptr_t)RC64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
+  return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
 }
 static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
  return mi_atomic_cas_strong(p,desired,expected);
 }
 static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
-  return (uintptr_t)RC64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
+  return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
 }
 static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) {
  return *p;
@ -177,6 +190,14 @@ static inline intptr_t mi_atomic_add(volatile _Atomic(intptr_t)* p, intptr_t add
  MI_USING_STD
  return atomic_fetch_add_explicit(p, add, memory_order_relaxed);
 }
+static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
+  MI_USING_STD
+  // relaxed fetch-and: yields the value held before the "and" was applied
+  const uintptr_t previous = atomic_fetch_and_explicit(p, x, memory_order_relaxed);
+  return previous;
+}
+static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
+  MI_USING_STD
+  // relaxed fetch-or: yields the value held before the "or" was applied
+  const uintptr_t previous = atomic_fetch_or_explicit(p, x, memory_order_relaxed);
+  return previous;
+}
 static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
  MI_USING_STD
  return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed);
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@ -17,18 +17,18 @@ terms of the MIT license. A copy of the license can be found in the file
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
-#define mi_trace_message(...)  
+#define mi_trace_message(...)
 #endif

 #if defined(_MSC_VER)
 #define mi_decl_noinline   __declspec(noinline)
-#define mi_attr_noreturn 
+#define mi_attr_noreturn
 #elif defined(__GNUC__) || defined(__clang__)
 #define mi_decl_noinline   __attribute__((noinline))
 #define mi_attr_noreturn   __attribute__((noreturn))
 #else
 #define mi_decl_noinline
-#define mi_attr_noreturn   
+#define mi_attr_noreturn
 #endif


@ -59,15 +59,15 @@ size_t     _mi_os_good_alloc_size(size_t size);

 // memory.c
 void*      _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld);
-void       _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats);
+void       _mi_mem_free(void* p, size_t size, size_t id, bool fully_committed, bool any_reset, mi_os_tld_t* tld);

-bool       _mi_mem_reset(void* p, size_t size, mi_stats_t* stats);
-bool       _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-bool       _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+bool       _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld);
+bool       _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld);
+bool       _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld);
 bool       _mi_mem_protect(void* addr, size_t size);
 bool       _mi_mem_unprotect(void* addr, size_t size);

-void        _mi_mem_collect(mi_stats_t* stats);
+void        _mi_mem_collect(mi_os_tld_t* tld);

 // "segment.c"
 mi_page_t* _mi_segment_page_alloc(size_t block_wsize, mi_segments_tld_t* tld, mi_os_tld_t* os_tld);
@ -75,7 +75,7 @@ void       _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t*
 void       _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld);
 bool       _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld);
 void       _mi_segment_thread_collect(mi_segments_tld_t* tld);
-uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size); // page start for any page
+uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size); // page start for any page

 // "page.c"
 void*      _mi_malloc_generic(mi_heap_t* heap, size_t size)  mi_attr_noexcept mi_attr_malloc;
@ -105,8 +105,10 @@ void       _mi_heap_set_default_direct(mi_heap_t* heap);

 // "stats.c"
 void       _mi_stats_done(mi_stats_t* stats);
-double     _mi_clock_end(double start);
-double     _mi_clock_start(void);
+
+mi_msecs_t  _mi_clock_now(void);
+mi_msecs_t  _mi_clock_end(mi_msecs_t start);
+mi_msecs_t  _mi_clock_start(void);

 // "alloc.c"
 void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept;  // called from `_mi_malloc_generic`
@ -143,8 +145,8 @@ bool        _mi_page_is_valid(mi_page_t* page);
  Inlined definitions
 ----------------------------------------------------------- */
 #define UNUSED(x)     (void)(x)
-#if (MI_DEBUG>0) 
-#define UNUSED_RELEASE(x)  
+#if (MI_DEBUG>0)
+#define UNUSED_RELEASE(x)
 #else
 #define UNUSED_RELEASE(x)  UNUSED(x)
 #endif
@ -159,7 +161,6 @@ bool        _mi_page_is_valid(mi_page_t* page);


 // Overflow detecting multiply
-#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
 static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
 #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
 #include <limits.h>   // UINT_MAX, ULONG_MAX
@ -171,6 +172,7 @@ static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
  return __builtin_umulll_overflow(count, size, total);
 #endif
 #else /* __builtin_umul_overflow is unavailable */
+  #define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))  // sqrt(SIZE_MAX)
  *total = count * size;
  return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
          && size > 0 && (SIZE_MAX / size) < count);
@ -184,6 +186,7 @@ static inline bool _mi_is_power_of_two(uintptr_t x) {

 // Align upwards
 static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
+  mi_assert_internal(alignment != 0);
  uintptr_t mask = alignment - 1;
  if ((alignment & mask) == 0) {  // power of two?
    return ((sz + mask) & ~mask);
@ -193,6 +196,12 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) {
  }
 }

+// Divide rounding upwards, so that `s <= _mi_divide_up(s,d)*d < s+d`.
+// A zero `divider` is a caller error (asserted); without assertions `size` is returned unchanged.
+static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) {
+  mi_assert_internal(divider != 0);
+  if (divider == 0) return size;
+  const uintptr_t rounded = size + divider - 1;
+  return (rounded / divider);
+}
+
 // Is memory zero initialized?
 static inline bool mi_mem_is_zero(void* p, size_t size) {
  for (size_t i = 0; i < size; i++) {
@ -279,7 +288,7 @@ static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) {
 static inline uintptr_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) {
  // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0];  // huge pages
  ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment;
-  mi_assert_internal(diff >= 0 && diff < MI_SEGMENT_SIZE);
+  mi_assert_internal(diff >= 0 && (size_t)diff < MI_SEGMENT_SIZE);
  uintptr_t idx = (uintptr_t)diff >> segment->page_shift;
  mi_assert_internal(idx < segment->capacity);
  mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0);
@ -294,7 +303,9 @@ static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const

 // Quick page start for initialized pages
 static inline uint8_t* _mi_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
-  return _mi_segment_page_start(segment, page, page->block_size, page_size);
+  const size_t bsize = page->block_size;
+  mi_assert_internal(bsize > 0 && (bsize%sizeof(void*)) == 0);
+  return _mi_segment_page_start(segment, page, bsize, page_size, NULL);
 }

 // Get the page containing the pointer
@ -426,7 +437,7 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t*
  if (next!=NULL && !mi_is_in_same_page(block, next)) {
    _mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next);
    next = NULL;
-  }   
+  }
  return next;
  #else
  UNUSED(page);
@ -443,6 +454,25 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c
  #endif
 }

+
+// -------------------------------------------------------------------
+// Optimize numa node access for the common case (= one node)
+// -------------------------------------------------------------------
+
+int    _mi_os_numa_node_get(mi_os_tld_t* tld);
+size_t _mi_os_numa_node_count_get(void);
+
+extern size_t _mi_numa_node_count;
+static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
+  // common case: exactly one node is known, so the answer is node 0 without any lookup
+  return (mi_likely(_mi_numa_node_count == 1) ? 0 : _mi_os_numa_node_get(tld));
+}
+static inline size_t _mi_os_numa_node_count(void) {
+  // use the cached count once it is non-zero; otherwise take the out-of-line path
+  return (mi_likely(_mi_numa_node_count>0) ? _mi_numa_node_count : _mi_os_numa_node_count_get());
+}
+
+
 // -------------------------------------------------------------------
 // Getting the thread id should be performant
 // as it is called in the fast path of `_mi_free`,
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@ -93,12 +93,12 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_SEGMENT_SHIFT                  ( MI_LARGE_PAGE_SHIFT)      // 4mb

 // Derived constants
-#define MI_SEGMENT_SIZE                   (1<<MI_SEGMENT_SHIFT)
+#define MI_SEGMENT_SIZE                   (1UL<<MI_SEGMENT_SHIFT)
 #define MI_SEGMENT_MASK                   ((uintptr_t)MI_SEGMENT_SIZE - 1)

-#define MI_SMALL_PAGE_SIZE                (1<<MI_SMALL_PAGE_SHIFT)
-#define MI_MEDIUM_PAGE_SIZE               (1<<MI_MEDIUM_PAGE_SHIFT)
-#define MI_LARGE_PAGE_SIZE                (1<<MI_LARGE_PAGE_SHIFT)
+#define MI_SMALL_PAGE_SIZE                (1UL<<MI_SMALL_PAGE_SHIFT)
+#define MI_MEDIUM_PAGE_SIZE               (1UL<<MI_MEDIUM_PAGE_SHIFT)
+#define MI_LARGE_PAGE_SIZE                (1UL<<MI_LARGE_PAGE_SHIFT)

 #define MI_SMALL_PAGES_PER_SEGMENT        (MI_SEGMENT_SIZE/MI_SMALL_PAGE_SIZE)
 #define MI_MEDIUM_PAGES_PER_SEGMENT       (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
@ -384,17 +384,23 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
 #define mi_heap_stat_increase(heap,stat,amount)  mi_stat_increase( (heap)->tld->stats.stat, amount)
 #define mi_heap_stat_decrease(heap,stat,amount)  mi_stat_decrease( (heap)->tld->stats.stat, amount)

-
 // ------------------------------------------------------
 // Thread Local data
 // ------------------------------------------------------

+typedef int64_t  mi_msecs_t;
+
 // Queue of segments
 typedef struct mi_segment_queue_s {
  mi_segment_t* first;
  mi_segment_t* last;
 } mi_segment_queue_t;

+// OS thread local data.
+// Declared ahead of `mi_segments_tld_t`, which holds a pointer to it (its `os` field).
+typedef struct mi_os_tld_s {
+  size_t                region_idx;   // start point for next allocation
+  mi_stats_t*           stats;        // points to tld stats
+} mi_os_tld_t;

 // Segments thread local data
 typedef struct mi_segments_tld_s {
@ -408,14 +414,9 @@ typedef struct mi_segments_tld_s {
  size_t              cache_size;   // total size of all segments in the cache
  mi_segment_t*       cache;        // (small) cache of segments
  mi_stats_t*         stats;        // points to tld stats
+  mi_os_tld_t*        os;           // points to os stats
 } mi_segments_tld_t;

-// OS thread local data
-typedef struct mi_os_tld_s {
-  size_t              region_idx;   // start point for next allocation
-  mi_stats_t*         stats;        // points to tld stats
-} mi_os_tld_t;
-
 // Thread local data
 struct mi_tld_s {
  unsigned long long  heartbeat;     // monotonic heartbeat count
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@ -228,9 +228,14 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b

 // Experimental
 mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
-mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
 mi_decl_export bool mi_is_redirected() mi_attr_noexcept;

+mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
+mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
+
+// deprecated
+mi_decl_export int  mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
+
 // ------------------------------------------------------
 // Convenience
 // ------------------------------------------------------
@ -266,10 +271,11 @@ typedef enum mi_option_e {
  mi_option_reserve_huge_os_pages,
  mi_option_segment_cache,
  mi_option_page_reset,
-  mi_option_cache_reset,
+  mi_option_segment_reset,
  mi_option_reset_decommits,
  mi_option_eager_commit_delay,
-  mi_option_segment_reset,
+  mi_option_reset_delay,
+  mi_option_use_numa_nodes,
  mi_option_os_tag,
  mi_option_max_errors,
  _mi_option_last
--- a/src/arena.c
+++ b/src/arena.c
@ -0,0 +1,354 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+"Arenas" are fixed areas of OS memory from which we can allocate
+large blocks (>= MI_ARENA_BLOCK_SIZE, 32MiB). 
+In contrast to the rest of mimalloc, the arenas are shared between 
+threads and need to be accessed using atomic operations.
+
+Currently arenas are only used for huge OS page (1GiB) reservations,
+otherwise it delegates to direct allocation from the OS.
+In the future, we can expose an API to manually add more kinds of arenas 
+which is sometimes needed for embedded devices or shared memory for example.
+(We can also employ this with WASI or `sbrk` systems to reserve large arenas
+ on demand and be able to reuse them efficiently).
+
+The arena allocation needs to be thread safe and we use an atomic
+bitmap to allocate. The current implementation of the bitmap can
+only do this within a field (`uintptr_t`) so we can allocate at most
+blocks of 2GiB (64*32MiB) and no object can cross the boundary. This
+can lead to fragmentation but fortunately most objects will be regions
+of 256MiB in practice.
+-----------------------------------------------------------------------------*/
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+#include "mimalloc-atomic.h"
+
+#include <string.h>  // memset
+
+#include "bitmap.inc.c"  // atomic bitmap
+
+
+// os.c
+void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
+void  _mi_os_free(void* p, size_t size, mi_stats_t* stats);
+
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize);
+void  _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
+
+bool  _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); 
+
+/* -----------------------------------------------------------
+  Arena allocation
+----------------------------------------------------------- */
+
+#define MI_SEGMENT_ALIGN      MI_SEGMENT_SIZE
+#define MI_ARENA_BLOCK_SIZE   (8*MI_SEGMENT_ALIGN)     // 32MiB: the allocation unit of an arena (one bit in its bitmaps)
+#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_FIELD_BITS * MI_ARENA_BLOCK_SIZE)  // 2GiB with 64-bit fields: a claim cannot span bitmap fields
+#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2)  // 16MiB: smaller requests go directly to the OS
+#define MI_MAX_ARENAS         (64)                     // must stay below 0xFE: the memid stores `arena index + 1` in its lower 8 bits
+
+// A memory arena descriptor.
+// The descriptor is allocated with extra room behind it: `blocks_inuse` is the
+// in-place start of the bitmap storage and must therefore remain the last member.
+typedef struct mi_arena_s {
+  uint8_t* start;                         // the start of the memory area
+  size_t   block_count;                   // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
+  size_t   field_count;                   // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
+  int      numa_node;                     // associated NUMA node (negative: no specific node, treated as local by every thread)
+  bool     is_zero_init;                  // is the arena zero initialized?
+  bool     is_committed;                  // is the memory committed (if so, `blocks_committed` is not consulted)
+  bool     is_large;                      // large OS page allocated
+  volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks (field index of the last successful claim)
+  mi_bitmap_field_t* blocks_dirty;        // are the blocks potentially non-zero?
+  mi_bitmap_field_t* blocks_committed;    // if `!is_committed`, are the blocks committed?
+  mi_bitmap_field_t  blocks_inuse[1];       // in-place bitmap of in-use blocks (of size `field_count`)
+} mi_arena_t;
+
+
+// The available arenas
+static _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS];
+static _Atomic(uintptr_t)   mi_arena_count; // = 0
+
+
+/* -----------------------------------------------------------
+  Arena allocations get a memory id where the lower 8 bits are
+  the arena index +1, and the upper bits the block index.
+----------------------------------------------------------- */
+
+// Use `0` as a special id for direct OS allocated memory.
+#define MI_MEMID_OS   0
+
+// Pack an arena index and a bitmap index into one memory id:
+// the lower 8 bits hold `arena_index + 1`, the remaining upper bits the bitmap index.
+static size_t mi_memid_create(size_t arena_index, mi_bitmap_index_t bitmap_index) {
+  mi_assert_internal(arena_index < 0xFE);
+  mi_assert_internal(((bitmap_index << 8) >> 8) == bitmap_index); // no overflow?
+  const size_t arena_bits = ((arena_index+1) & 0xFF);
+  const size_t block_bits = (bitmap_index << 8);
+  return (block_bits | arena_bits);
+}
+
+// Unpack a memory id made by `mi_memid_create` into its arena index and bitmap index.
+// Must not be called with `MI_MEMID_OS`.
+static void mi_memid_indices(size_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) {
+  mi_assert_internal(memid != MI_MEMID_OS);
+  const size_t arena_bits = (memid & 0xFF);   // stored as index + 1
+  *arena_index  = arena_bits - 1;
+  *bitmap_index = (memid >> 8);
+}
+
+// Number of arena blocks (of `MI_ARENA_BLOCK_SIZE`) needed to hold `size` bytes, rounded up.
+static size_t mi_block_count_of_size(size_t size) {
+  const size_t bcount = _mi_divide_up(size, MI_ARENA_BLOCK_SIZE);
+  return bcount;
+}
+
+/* -----------------------------------------------------------
+  Thread safe allocation in an arena
+----------------------------------------------------------- */
+// Try to claim `blocks` blocks in the in-use bitmap of `arena`, looking at every
+// bitmap field at most once and beginning at the field of the last successful claim.
+// On success the claimed position is stored in `*bitmap_idx` and `true` is returned.
+static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
+{
+  const size_t field_total = arena->field_count;
+  size_t field     = mi_atomic_read(&arena->search_idx);  // resume where the previous search ended
+  size_t remaining = field_total;
+  while (remaining > 0) {
+    if (field >= field_total) field = 0;  // wrap around
+    if (mi_bitmap_try_find_claim_field(arena->blocks_inuse, field, blocks, bitmap_idx)) {
+      mi_atomic_write(&arena->search_idx, field);  // next search starts from this field
+      return true;
+    }
+    remaining--;
+    field++;
+  }
+  return false;
+}
+
+
+/* -----------------------------------------------------------
+  Arena Allocation
+----------------------------------------------------------- */
+
+// Try to claim `needed_bcount` blocks in `arena`; returns a pointer to the claimed
+// range or NULL if the arena has no room (the out parameters are then left untouched).
+// On success:
+//  - `*memid` encodes `arena_index` and the claimed bitmap index,
+//  - `*large` is set to `arena->is_large`,
+//  - `*is_zero` is the result of claiming the range in the dirty bitmap
+//    (and is forced to `true` if a fresh commit reports zeroed memory),
+//  - `*commit` is in/out: on entry it tells whether the caller wants committed
+//    memory, on exit whether the returned range is committed.
+static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+                                 bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+  mi_bitmap_index_t bitmap_index;
+  if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
+
+  // claimed it! set the dirty bits (todo: no need for an atomic op here?)
+  void* p  = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
+  *memid   = mi_memid_create(arena_index, bitmap_index);
+  *is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
+  *large   = arena->is_large;
+  if (arena->is_committed) {
+    // always committed
+    *commit = true;
+  }
+  else if (*commit) {
+    // the caller wants committed memory: test the flag itself, the pointer is never NULL here
+    bool any_uncommitted = false;
+    mi_bitmap_claim(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
+    if (any_uncommitted) {
+      bool commit_zero = false;  // stays false unless the commit reports zeroed memory
+      _mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
+      if (commit_zero) *is_zero = true;
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    *commit = mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+  }
+  return p;
+}
+
+// One scan over the registered arenas (stopping at the first empty slot). When
+// `numa_local` is true only arenas without a NUMA node or on `numa_node` are tried,
+// when false only arenas bound to a different node. Arenas backed by large OS pages
+// are skipped unless `*large` allows them. Returns the first successful allocation or NULL.
+static void* mi_arena_try_alloc_pass(bool numa_local, int numa_node, size_t bcount, size_t alignment,
+                                     bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+  UNUSED_RELEASE(alignment);
+  for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
+    mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i]));
+    if (arena==NULL) break; // end reached
+    const bool is_local = (arena->numa_node<0 || arena->numa_node==numa_node);
+    if (is_local != numa_local) continue;        // belongs to the other pass
+    if (!*large && arena->is_large) continue;    // large OS pages not allowed by the caller
+    void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld);
+    mi_assert_internal((uintptr_t)p % alignment == 0);
+    if (p != NULL) return p;
+  }
+  return NULL;
+}
+
+// Allocate `size` bytes with the given alignment. Requests with an alignment of at most
+// `MI_SEGMENT_ALIGN` and a size within [MI_ARENA_MIN_OBJ_SIZE, MI_ARENA_MAX_OBJ_SIZE]
+// are first tried in the arenas (NUMA-local arenas before the others); everything else,
+// and any request the arenas cannot satisfy, is allocated directly from the OS with
+// `*memid == MI_MEMID_OS`.
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
+                              bool* commit, bool* large, bool* is_zero,
+                              size_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(commit != NULL && large != NULL && is_zero != NULL && memid != NULL && tld != NULL);
+  mi_assert_internal(size > 0);
+  *memid   = MI_MEMID_OS;
+  *is_zero = false;
+
+  // only suitably sized requests with a small enough alignment are served from an arena
+  const bool arena_candidate = (alignment <= MI_SEGMENT_ALIGN &&
+                                size <= MI_ARENA_MAX_OBJ_SIZE &&
+                                size >= MI_ARENA_MIN_OBJ_SIZE);
+  if (arena_candidate) {
+    const size_t bcount = mi_block_count_of_size(size);
+    const int numa_node = _mi_os_numa_node(tld); // current numa node
+    mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
+
+    // first the numa affine arenas ...
+    void* p = mi_arena_try_alloc_pass(true, numa_node, bcount, alignment, commit, large, is_zero, memid, tld);
+    if (p != NULL) return p;
+    // ... then the arenas of other numa nodes
+    p = mi_arena_try_alloc_pass(false, numa_node, bcount, alignment, commit, large, is_zero, memid, tld);
+    if (p != NULL) return p;
+  }
+
+  // finally, fall back to the OS
+  *is_zero = true;
+  *memid   = MI_MEMID_OS;
+  return _mi_os_alloc_aligned(size, alignment, *commit, large, tld);
+}
+
+// Allocate `size` bytes aligned to `MI_ARENA_BLOCK_SIZE`; a thin wrapper around
+// `_mi_arena_alloc_aligned` with the same in/out parameters.
+// NOTE(review): `MI_ARENA_BLOCK_SIZE` is larger than `MI_SEGMENT_ALIGN`, and
+// `_mi_arena_alloc_aligned` only uses the arenas for alignments up to `MI_SEGMENT_ALIGN`,
+// so this wrapper appears to always fall through to the OS -- confirm this is intended.
+void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+  return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_zero, memid, tld);
+}
+
+/* -----------------------------------------------------------
+  Arena free
+----------------------------------------------------------- */
+
+// Free memory that was returned together with `memid` by `_mi_arena_alloc(_aligned)`.
+// `MI_MEMID_OS` memory is passed on to `_mi_os_free`; for arena memory only the
+// in-use bits of the blocks are cleared so the range can be claimed again.
+// A NULL `p` or a zero `size` is ignored. A `memid` that does not name an existing
+// arena or bitmap field, and a double free, are reported with `_mi_fatal_error`.
+void _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats) {
+  mi_assert_internal(size > 0 && stats != NULL);
+  if (p==NULL) return;
+  if (size==0) return;
+  if (memid == MI_MEMID_OS) {
+    // was a direct OS allocation, pass through
+    _mi_os_free(p, size, stats);
+  }
+  else {
+    // allocated in an arena
+    size_t arena_idx;
+    size_t bitmap_idx;
+    mi_memid_indices(memid, &arena_idx, &bitmap_idx);
+    mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+    if (arena_idx >= MI_MAX_ARENAS) {
+      // the 8 arena bits of a corrupted memid can exceed the table size: never index past `mi_arenas`
+      _mi_fatal_error("trying to free from non-existent arena index: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx]));
+    mi_assert_internal(arena != NULL);
+    if (arena == NULL) {
+      _mi_fatal_error("trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx));
+    if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) {
+      _mi_fatal_error("trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    const size_t blocks = mi_block_count_of_size(size);
+    bool ones = mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
+    if (!ones) {
+      _mi_fatal_error("trying to free an already freed block: %p, size %zu\n", p, size);
+      return;
+    }
+  }
+}
+
+/* -----------------------------------------------------------
+  Add an arena.
+----------------------------------------------------------- */
+
+// Register `arena` in the global `mi_arenas` table.
+// A slot is reserved by bumping `mi_arena_count` (presumably `mi_atomic_addu` yields the
+// count before the increment -- it is used as the slot index); when the table is already
+// full the increment is undone and `false` is returned without publishing the arena.
+static bool mi_arena_add(mi_arena_t* arena) {
+  mi_assert_internal(arena != NULL);
+  mi_assert_internal((uintptr_t)arena->start % MI_SEGMENT_ALIGN == 0);
+  mi_assert_internal(arena->block_count > 0);
+
+  const uintptr_t slot = mi_atomic_addu(&mi_arena_count,1);
+  if (slot < MI_MAX_ARENAS) {
+    mi_atomic_write_ptr(mi_atomic_cast(void*,&mi_arenas[slot]), arena);
+    return true;
+  }
+  // table is full: give the reserved slot back
+  mi_atomic_subu(&mi_arena_count, 1);
+  return false;
+}
+
+
+/* -----------------------------------------------------------
+  Reserve a huge page arena.
+----------------------------------------------------------- */
+#include <errno.h> // ENOMEM
+
+// Reserve `pages` huge OS pages at a specific numa node and register them as an arena.
+// A `numa_node` below -1 is treated as -1; a non-negative node is taken modulo the
+// numa node count. `timeout_msecs` is handed to the OS layer as the time limit.
+// Returns 0 on success (also when `pages` is 0) and `ENOMEM` when the pages, the arena
+// descriptor, or a free slot in the arena table cannot be obtained.
+int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
+  if (pages==0) return 0;
+  if (numa_node < -1) numa_node = -1;
+  if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count();
+  size_t hsize = 0;
+  size_t pages_reserved = 0;
+  void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize);
+  if (p==NULL || pages_reserved==0) {
+    _mi_warning_message("failed to reserve %zu gb huge pages\n", pages);
+    return ENOMEM;
+  }
+  _mi_verbose_message("reserved %zu gb huge pages\n", pages_reserved);
+
+  // the descriptor is followed in-place by the bitmap storage: the struct itself holds
+  // one field, and room for another `2*fields` fields is allocated right behind it
+  size_t bcount = mi_block_count_of_size(hsize);
+  size_t fields = (bcount + MI_BITMAP_FIELD_BITS - 1) / MI_BITMAP_FIELD_BITS;
+  size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t));
+  mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
+  if (arena == NULL) {
+    _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
+    return ENOMEM;
+  }
+  arena->block_count = bcount;
+  arena->field_count = fields;
+  arena->start = (uint8_t*)p;
+  arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
+  arena->is_large = true;
+  arena->is_zero_init = true;
+  arena->is_committed = true;
+  arena->search_idx = 0;
+  // the dirty bitmap starts right after the `fields` in-use fields; indexing by the
+  // block count here would point far outside the allocation
+  arena->blocks_dirty = &arena->blocks_inuse[fields];
+  arena->blocks_committed = NULL;
+  // the bitmaps are already zero initialized due to os_alloc
+  // just claim leftover blocks if needed
+  size_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
+  if (post > 0) {
+    // don't use leftover bits at the end
+    mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
+    mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
+  }
+
+  if (!mi_arena_add(arena)) {
+    // no free slot in the arena table: the arena was never published, so release
+    // the descriptor and the huge pages again instead of leaking them
+    _mi_os_free(arena, asize, &_mi_stats_main);
+    _mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
+    return ENOMEM;
+  }
+  return 0;
+}
+
+
+// Reserve `pages` huge OS pages spread evenly over `numa_nodes` nodes (the detected
+// node count is used when `numa_nodes` is 0). The first `pages % nodes` nodes get one
+// extra page, and every node gets an equal share of `timeout_msecs` plus 50.
+// Returns 0 on success, or the first error returned for a node.
+int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept {
+  if (pages == 0) return 0;
+
+  // how many nodes, and how much per node
+  size_t numa_count = numa_nodes;
+  if (numa_count == 0) numa_count = _mi_os_numa_node_count();
+  if (numa_count <= 0) numa_count = 1;
+  const size_t pages_per   = pages / numa_count;
+  const size_t pages_mod   = pages % numa_count;
+  const size_t timeout_per = (timeout_msecs / numa_count) + 50;
+
+  // hand out the pages node by node until none are left
+  size_t numa_node = 0;
+  while (numa_node < numa_count && pages > 0) {
+    const size_t node_pages = pages_per + (numa_node < pages_mod ? 1 : 0);  // can be 0
+    const int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+    if (err) return err;
+    pages = (pages < node_pages ? 0 : pages - node_pages);
+    numa_node++;
+  }
+
+  return 0;
+}
+
+// Deprecated entry point: warns and forwards to `mi_reserve_huge_os_pages_interleave`
+// over the detected numa nodes, with `max_secs` converted to milliseconds.
+// If `pages_reserved` is not NULL it receives `pages` on success and 0 otherwise.
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  // a negative (or NaN) duration must not be converted to `size_t`: treat it as no extra time
+  const size_t timeout_msecs = (max_secs > 0.0 ? (size_t)(max_secs * 1000.0) : 0);
+  int err = mi_reserve_huge_os_pages_interleave(pages, 0, timeout_msecs);
+  if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+  return err;
+}
--- a/src/bitmap.inc.c
+++ b/src/bitmap.inc.c
@ -0,0 +1,240 @@
+/* ----------------------------------------------------------------------------
+Copyright (c) 2019, Microsoft Research, Daan Leijen
+This is free software; you can redistribute it and/or modify it under the
+terms of the MIT license. A copy of the license can be found in the file
+"LICENSE" at the root of this distribution.
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+This file is meant to be included in other files for efficiency.
+It implements a bitmap that can set/reset sequences of bits atomically
+and is used to concurrently claim memory ranges.
+
+A bitmap is an array of fields where each field is a machine word (`uintptr_t`)
+
+A current limitation is that the bit sequences cannot cross fields
+and that the sequence must be smaller or equal to the bits in a field.
+---------------------------------------------------------------------------- */
+#pragma once
+#ifndef MI_BITMAP_C
+#define MI_BITMAP_C
+
+#include "mimalloc.h"
+#include "mimalloc-internal.h"
+
+/* -----------------------------------------------------------
+  Bitmap definition
+----------------------------------------------------------- */
+
+#define MI_BITMAP_FIELD_BITS   (8*MI_INTPTR_SIZE)
+#define MI_BITMAP_FIELD_FULL   (~((uintptr_t)0))   // all bits set
+
+// An atomic bitmap of `uintptr_t` fields
+typedef volatile _Atomic(uintptr_t)  mi_bitmap_field_t;
+typedef mi_bitmap_field_t*           mi_bitmap_t;
+
+// A bitmap index is the index of the bit in a bitmap.
+typedef size_t mi_bitmap_index_t;
+
+// Build a bitmap index from a field index `idx` and a bit position `bitidx`
+// inside that field (`bitidx` must be less than MI_BITMAP_FIELD_BITS).
+static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
+  mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
+  const size_t field_offset = idx * MI_BITMAP_FIELD_BITS;  // index of the first bit of field `idx`
+  return field_offset + bitidx;
+}
+
+// Return the index of the field that contains bit `bitmap_idx`.
+static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
+  const size_t field_idx = bitmap_idx / MI_BITMAP_FIELD_BITS;
+  return field_idx;
+}
+
+// Return the position of bit `bitmap_idx` within its own field.
+static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
+  const size_t bit_in_field = bitmap_idx % MI_BITMAP_FIELD_BITS;
+  return bit_in_field;
+}
+
+// Return the bit index over the whole bitmap; a bitmap index already is that
+// bit index, so the value is passed through unchanged.
+static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
+  const size_t full_bit_idx = bitmap_idx;
+  return full_bit_idx;
+}
+
+
+// Compute the mask with `count` consecutive one-bits starting at bit `bitidx`.
+// A `count` equal to the field width yields the all-ones mask directly, since
+// shifting a one by the full word width is not allowed.
+static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
+  mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
+  uintptr_t mask;
+  if (count == MI_BITMAP_FIELD_BITS) {
+    mask = MI_BITMAP_FIELD_FULL;
+  }
+  else {
+    const uintptr_t low_ones = ((uintptr_t)1 << count) - 1;  // `count` ones in the lowest bits
+    mask = (low_ones << bitidx);
+  }
+  return mask;
+}
+
+
+/* -----------------------------------------------------------
+  Use bit scan forward/reverse to quickly find the first zero bit if it is available
+----------------------------------------------------------- */
+#if defined(_MSC_VER)
+#define MI_HAVE_BITSCAN
+#include <intrin.h>
+// Bit scan forward: index found by `_BitScanForward`, or the word size in bits if `x` is 0.
+static inline size_t mi_bsf(uintptr_t x) {
+  if (x != 0) {
+    DWORD pos;
+    MI_64(_BitScanForward)(&pos, x);
+    return pos;
+  }
+  return 8*MI_INTPTR_SIZE;
+}
+// Bit scan reverse: index found by `_BitScanReverse`, or the word size in bits if `x` is 0.
+static inline size_t mi_bsr(uintptr_t x) {
+  if (x != 0) {
+    DWORD pos;
+    MI_64(_BitScanReverse)(&pos, x);
+    return pos;
+  }
+  return 8*MI_INTPTR_SIZE;
+}
+#elif defined(__GNUC__) || defined(__clang__)
+#include <limits.h> // LONG_MAX
+#define MI_HAVE_BITSCAN
+// select the `l` or `ll` variant of a builtin depending on the width of `intptr_t`
+#if (INTPTR_MAX == LONG_MAX)
+# define MI_L(x)  x##l
+#else
+# define MI_L(x)  x##ll
+#endif
+// Bit scan forward: count of trailing zero bits, or the word size in bits if `x` is 0.
+static inline size_t mi_bsf(uintptr_t x) {
+  if (x == 0) return 8*MI_INTPTR_SIZE;
+  return MI_L(__builtin_ctz)(x);
+}
+// Bit scan reverse: index of the highest set bit, or the word size in bits if `x` is 0.
+static inline size_t mi_bsr(uintptr_t x) {
+  if (x == 0) return 8*MI_INTPTR_SIZE;
+  return (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x);
+}
+#endif
+
+/* -----------------------------------------------------------
+  Claim a bit sequence atomically
+----------------------------------------------------------- */
+
+// Try to atomically claim the `count` bits that start at `bitmap_idx`; the whole
+// sequence must lie within a single field. `bitmap_fields` is only used for an
+// internal bounds assertion. Makes a single attempt and returns `true` on success;
+// returns `false` if any of the bits is already set or the compare-and-swap fails.
+static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_fields, const size_t count, mi_bitmap_index_t bitmap_idx) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
+  mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS);
+
+  // take a plain snapshot of the field; a local copy should be an ordinary
+  // `uintptr_t` and not itself a `volatile _Atomic` object
+  const uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]);
+  if ((field & mask) == 0) { // free?
+    if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) {
+      // claimed!
+      return true;
+    }
+  }
+  return false;
+}
+
+
+// Search the single field at index `idx` of `bitmap` for a run of `count` zero bits
+// and try to claim it atomically (setting the bits to 1). On success `*bitmap_idx`
+// receives the bitmap index of the first claimed bit and `true` is returned.
+// Returns `false` if the field holds no free run of `count` bits.
+static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
+{
+  mi_assert_internal(bitmap_idx != NULL);
+  volatile _Atomic(uintptr_t)* field = &bitmap[idx];
+  uintptr_t map  = mi_atomic_read(field);
+  if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
+
+  // search for 0-bit sequence of length count
+  const uintptr_t mask = mi_bitmap_mask_(count, 0);
+  const size_t    bitidx_max = MI_BITMAP_FIELD_BITS - count;
+
+#ifdef MI_HAVE_BITSCAN
+  size_t bitidx = mi_bsf(~map);    // quickly find the first zero bit if possible
+#else
+  size_t bitidx = 0;               // otherwise start at 0
+#endif
+  uintptr_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx
+
+  // scan linearly for a free range of zero bits
+  while (bitidx <= bitidx_max) {
+    if ((map & m) == 0) {  // are the mask bits free at bitidx?
+      mi_assert_internal((m >> bitidx) == mask); // no overflow?
+      const uintptr_t newmap = map | m;
+      mi_assert_internal((newmap^map) >> bitidx == mask);
+      if (!mi_atomic_cas_weak(field, newmap, map)) {  // TODO: use strong cas here?
+        // no success, another thread claimed concurrently.. keep going
+        // (retry at the same position with a freshly read map)
+        map = mi_atomic_read(field);
+        continue;
+      }
+      else {
+        // success, we claimed the bits!
+        *bitmap_idx = mi_bitmap_index_create(idx, bitidx);
+        return true;
+      }
+    }
+    else {
+      // on to the next bit range
+#ifdef MI_HAVE_BITSCAN
+      // skip past the highest occupied bit inside the current window
+      const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
+      mi_assert_internal(shift > 0 && shift <= count);
+#else
+      const size_t shift = 1;
+#endif
+      bitidx += shift;
+      // Stop before shifting when no candidate position remains: the loop would end
+      // anyway, and for `count == MI_BITMAP_FIELD_BITS` the shift can equal the word
+      // width, which is undefined behaviour in C.
+      if (bitidx > bitidx_max) break;
+      m <<= shift;
+    }
+  }
+  // no bits found
+  return false;
+}
+
+
+// Scan the fields of `bitmap` in order and atomically claim the first run of `count`
+// zero bits that is found (setting them to 1). On success `*bitmap_idx` is set by
+// `mi_bitmap_try_find_claim_field` and `true` is returned; otherwise `false`.
+// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
+static inline bool mi_bitmap_try_find_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) {
+  size_t field_idx = 0;
+  while (field_idx < bitmap_fields) {
+    const bool claimed = mi_bitmap_try_find_claim_field(bitmap, field_idx, count, bitmap_idx);
+    if (claimed) return true;
+    field_idx++;
+  }
+  return false;
+}
+
+// Atomically clear the `count` bits that start at `bitmap_idx`.
+// `bitmap_fields` is only used for an internal bounds assertion.
+// Returns `true` if every one of the `count` bits was 1 before the call.
+static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  const size_t field_idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t first_bit = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const uintptr_t mask = mi_bitmap_mask_(count, first_bit);
+  mi_assert_internal(bitmap_fields > field_idx); UNUSED(bitmap_fields);
+  const uintptr_t before = mi_atomic_and(&bitmap[field_idx], ~mask);
+  const bool all_were_set = ((before & mask) == mask);
+  return all_were_set;
+}
+
+
+// Atomically set the `count` bits that start at `bitmap_idx`.
+// `bitmap_fields` is only used for an internal bounds assertion.
+// If `any_zero` is not NULL it is set to `true` when at least one of the bits was 0.
+// Returns `true` if every one of the `count` bits was 0 before the call.
+static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
+  const size_t field_idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t first_bit = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const uintptr_t mask = mi_bitmap_mask_(count, first_bit);
+  mi_assert_internal(bitmap_fields > field_idx); UNUSED(bitmap_fields);
+  const uintptr_t before = mi_atomic_or(&bitmap[field_idx], mask);
+  const uintptr_t before_bits = (before & mask);   // previous state of just our bits
+  if (any_zero != NULL) {
+    *any_zero = (before_bits != mask);
+  }
+  return (before_bits == 0);
+}
+
+// Inspect the `count` bits that start at `bitmap_idx`; the field is read once with
+// `mi_atomic_read_relaxed` and nothing is modified.
+// `bitmap_fields` is only used for an internal bounds assertion.
+// If `any_ones` is not NULL it is set to `true` when at least one of the bits is 1.
+// Returns `true` if all `count` bits are 1.
+static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
+  const size_t idx = mi_bitmap_index_field(bitmap_idx);
+  const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
+  const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
+  mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
+  // take a plain snapshot of the field; a local copy should be an ordinary
+  // `uintptr_t` and not itself a `volatile _Atomic` object
+  const uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]);
+  if (any_ones != NULL) *any_ones = ((field & mask) != 0);
+  return ((field & mask) == mask);
+}
+
+// Returns `true` if all `count` bits at `bitmap_idx` are 1
+// (forwards to `mi_bitmap_is_claimedx` without requesting the `any_ones` result).
+static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  const bool all_set = mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+  return all_set;
+}
+
+// Returns `true` if at least one of the `count` bits at `bitmap_idx` is 1.
+static inline bool mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
+  bool found_one;  // always written by the call below since a non-NULL pointer is passed
+  (void)mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &found_one);
+  return found_one;
+}
+
+
+#endif
--- a/src/heap.c
+++ b/src/heap.c
@ -45,7 +45,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void
 }


-#if MI_DEBUG>1
+#if MI_DEBUG>=3
 static bool _mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) {
  UNUSED(arg1);
  UNUSED(arg2);
@ -149,7 +149,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)

  // collect regions
  if (collect >= FORCE && _mi_is_main_thread()) {
-    _mi_mem_collect(&heap->tld->stats);
+    _mi_mem_collect(&heap->tld->os);
  }
 }

--- a/src/init.c
+++ b/src/init.c
@ -19,7 +19,7 @@ const mi_page_t _mi_page_empty = {
  0,
  #endif
  0,       // used
-  NULL, 
+  NULL,
  ATOMIC_VAR_INIT(0), ATOMIC_VAR_INIT(0),
  0, NULL, NULL, NULL
  #if (MI_INTPTR_SIZE==8 && defined(MI_ENCODE_FREELIST)) || (MI_INTPTR_SIZE==4 && !defined(MI_ENCODE_FREELIST))
@ -95,13 +95,14 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;


 #define tld_main_stats  ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
+#define tld_main_os     ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os)))

 static mi_tld_t tld_main = {
  0, false,
  &_mi_heap_main,
-  { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
-  { 0, tld_main_stats },       // os
-  { MI_STATS_NULL }            // stats
+  { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats, tld_main_os }, // segments
+  { 0, tld_main_stats },  // os
+  { MI_STATS_NULL }             // stats
 };

 mi_heap_t _mi_heap_main = {
@ -191,7 +192,7 @@ uintptr_t _mi_random_init(uintptr_t seed /* can be zero */) {

 typedef struct mi_thread_data_s {
  mi_heap_t  heap;  // must come first due to cast in `_mi_heap_done`
-  mi_tld_t   tld;
+  mi_tld_t   tld;  
 } mi_thread_data_t;

 // Initialize the thread local default heap, called from `mi_thread_init`
@ -219,6 +220,7 @@ static bool _mi_heap_init(void) {
    memset(tld, 0, sizeof(*tld));
    tld->heap_backing = heap;
    tld->segments.stats = &tld->stats;
+    tld->segments.os = &tld->os;
    tld->os.stats = &tld->stats;
    _mi_heap_set_default_direct(heap);
  }
@ -237,7 +239,7 @@ static bool _mi_heap_done(mi_heap_t* heap) {
  // switch to backing heap and free it
  heap = heap->tld->heap_backing;
  if (!mi_heap_is_initialized(heap)) return false;
-  
+
  // collect if not the main thread
  if (heap != &_mi_heap_main) {
    _mi_heap_collect_abandon(heap);
@ -334,7 +336,7 @@ void mi_thread_init(void) mi_attr_noexcept
  mi_process_init();

  // initialize the thread local default heap
-  // (this will call `_mi_heap_set_default_direct` and thus set the 
+  // (this will call `_mi_heap_set_default_direct` and thus set the
  //  fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
  if (_mi_heap_init()) return;  // returns true if already initialized

@ -368,9 +370,9 @@ void _mi_heap_set_default_direct(mi_heap_t* heap)  {
  #if defined(_WIN32) && defined(MI_SHARED_LIB)
    // nothing to do as it is done in DllMain
  #elif defined(_WIN32) && !defined(MI_SHARED_LIB)
-    FlsSetValue(mi_fls_key, heap); 
+    FlsSetValue(mi_fls_key, heap);
  #elif defined(MI_USE_PTHREADS)
-    pthread_setspecific(mi_pthread_key, heap); 
+    pthread_setspecific(mi_pthread_key, heap);
  #endif
 }

@ -394,7 +396,7 @@ bool mi_is_redirected() mi_attr_noexcept {
 }

 // Communicate with the redirection module on Windows
-#if defined(_WIN32) && defined(MI_SHARED_LIB) 
+#if defined(_WIN32) && defined(MI_SHARED_LIB)
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -440,12 +442,6 @@ static void mi_process_load(void) {
  if (msg != NULL && (mi_option_is_enabled(mi_option_verbose) || mi_option_is_enabled(mi_option_show_errors))) {
    _mi_fputs(NULL,NULL,msg);
  }
-
-  if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
-    size_t pages     = mi_option_get(mi_option_reserve_huge_os_pages);
-    double max_secs = (double)pages / 2.0; // 0.5s per page (1GiB)
-    mi_reserve_huge_os_pages(pages, max_secs, NULL);
-  }
 }

 // Initialize the process; called by thread_init or the process loader
@ -473,6 +469,11 @@ void mi_process_init(void) mi_attr_noexcept {
  _mi_verbose_message("secure level: %d\n", MI_SECURE);
  mi_thread_init();
  mi_stats_reset();  // only call stat reset *after* thread init (or the heap tld == NULL)
+
+  if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
+    size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
+    mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
+  }
 }

 // Called when the process is done (through `at_exit`)
@ -499,7 +500,7 @@ static void mi_process_done(void) {


 #if defined(_WIN32) && defined(MI_SHARED_LIB)
-  // Windows DLL: easy to hook into process_init and thread_done  
+  // Windows DLL: easy to hook into process_init and thread_done
  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
    UNUSED(reserved);
    UNUSED(inst);
--- a/src/memory.c
+++ b/src/memory.c
@ -16,10 +16,10 @@ We need this memory layer between the raw OS calls because of:
 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
   to reuse memory effectively.
 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still (much) too expensive relative to the accesses in that
-   object :-( (`malloc-large` tests this). This means we need a cheaper way to
-   reuse memory.
-3. This layer can help with a NUMA aware allocation in the future.
+   an OS allocation/free is still (much) too expensive relative to the accesses 
+   in that object :-( (`malloc-large` tests this). This means we need a cheaper 
+   way to reuse memory.
+3. This layer allows for NUMA aware allocation.

 Possible issues:
 - (2) can potentially be addressed too with a small cache per thread which is much
@ -37,6 +37,8 @@ Possible issues:

 #include <string.h>  // memset

+#include "bitmap.inc.c"
+
 // Internal raw OS interface
 size_t  _mi_os_large_page_size();
 bool    _mi_os_protect(void* addr, size_t size);
@ -45,56 +47,60 @@ bool    _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
 bool    _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
 bool    _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
 bool    _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
-void*   _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
-void    _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
-void*   _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment);
-bool    _mi_os_is_huge_reserved(void* p);
+
+// arena.c
+void    _mi_arena_free(void* p, size_t size, size_t memid, mi_stats_t* stats);
+void*   _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+void*   _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
+
+

 // Constants
 #if (MI_INTPTR_SIZE==8)
-#define MI_HEAP_REGION_MAX_SIZE    (256 * (1ULL << 30))  // 256GiB => 16KiB for the region map
+#define MI_HEAP_REGION_MAX_SIZE    (256 * GiB)  // 48KiB for the region map 
 #elif (MI_INTPTR_SIZE==4)
-#define MI_HEAP_REGION_MAX_SIZE    (3 * (1UL << 30))    // 3GiB => 196 bytes for the region map
+#define MI_HEAP_REGION_MAX_SIZE    (3 * GiB)    // ~ KiB for the region map
 #else
 #error "define the maximum heap space allowed for regions on this platform"
 #endif

 #define MI_SEGMENT_ALIGN          MI_SEGMENT_SIZE

-#define MI_REGION_MAP_BITS        (MI_INTPTR_SIZE * 8)
-#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_REGION_MAP_BITS)
-#define MI_REGION_MAX_ALLOC_SIZE  ((MI_REGION_MAP_BITS/4)*MI_SEGMENT_SIZE)  // 64MiB
-#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)
-#define MI_REGION_MAP_FULL        UINTPTR_MAX
+#define MI_REGION_MAX_BLOCKS      MI_BITMAP_FIELD_BITS
+#define MI_REGION_SIZE            (MI_SEGMENT_SIZE * MI_BITMAP_FIELD_BITS)    // 256MiB  (64MiB on 32 bits)
+#define MI_REGION_MAX             (MI_HEAP_REGION_MAX_SIZE / MI_REGION_SIZE)  // 1024  (48 on 32 bits)
+#define MI_REGION_MAX_OBJ_BLOCKS  (MI_REGION_MAX_BLOCKS/4)                    // 64MiB
+#define MI_REGION_MAX_OBJ_SIZE    (MI_REGION_MAX_OBJ_BLOCKS*MI_SEGMENT_SIZE)  

-
-typedef uintptr_t mi_region_info_t;
-
-static inline mi_region_info_t mi_region_info_create(void* start, bool is_large, bool is_committed) {
-  return ((uintptr_t)start | ((uintptr_t)(is_large?1:0) << 1) | (is_committed?1:0));
-}
-
-static inline void* mi_region_info_read(mi_region_info_t info, bool* is_large, bool* is_committed) {
-  if (is_large) *is_large = ((info&0x02) != 0);
-  if (is_committed) *is_committed = ((info&0x01) != 0);
-  return (void*)(info & ~0x03);
-}
+// Region info packs the per-region flags into a single word so that it can be
+// read and written through `value` in one atomic operation.
+// A `value` of 0 means the region is not initialized yet.
+typedef union mi_region_info_u {
+  uintptr_t value;
+  struct {
+    bool  valid;      // set once the region has been initialized
+    bool  is_large;   // the region memory uses large OS pages
+    short numa_node;  // associated NUMA node; kept as a `short` so the whole struct
+                      // also fits inside `value` when `uintptr_t` is only 32 bits wide
+  };
+} mi_region_info_t;


 // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
 // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
 typedef struct mem_region_s {
-  volatile _Atomic(uintptr_t)        map;   // in-use bit per MI_SEGMENT_SIZE block
-  volatile _Atomic(mi_region_info_t) info;  // start of virtual memory area, and flags
-  volatile _Atomic(uintptr_t)        dirty_mask; // bit per block if the contents are not zero'd
+  volatile _Atomic(uintptr_t)        info;        // is_large, and associated numa node + 1 (so 0 is no association)
+  volatile _Atomic(void*)            start;       // start of the memory area (and flags)
+  mi_bitmap_field_t                  in_use;      // bit per in-use block
+  mi_bitmap_field_t                  dirty;       // track if non-zero per block
+  mi_bitmap_field_t                  commit;      // track if committed per block (if `!info.is_committed))
+  mi_bitmap_field_t                  reset;       // track reset per block
+  volatile _Atomic(uintptr_t)        arena_memid; // if allocated from a (huge page) arena-
 } mem_region_t;

-
-// The region map; 16KiB for a 256GiB HEAP_REGION_MAX
-// TODO: in the future, maintain a map per NUMA node for numa aware allocation
+// The region map
 static mem_region_t regions[MI_REGION_MAX];

-static volatile _Atomic(uintptr_t) regions_count; // = 0;        // allocated regions
+// Allocated regions
+static volatile _Atomic(uintptr_t) regions_count; // = 0;        


 /* ----------------------------------------------------------------------------
@ -103,257 +109,221 @@ Utility functions

 // Blocks (of 4MiB) needed for the given size.
 static size_t mi_region_block_count(size_t size) {
-  mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE);
-  return (size + MI_SEGMENT_SIZE - 1) / MI_SEGMENT_SIZE;
-}
-
-// The bit mask for a given number of blocks at a specified bit index.
-static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) {
-  mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS);
-  return ((((uintptr_t)1 << blocks) - 1) << bitidx);
+  return _mi_divide_up(size, MI_SEGMENT_SIZE);
 }

+/*
 // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
 static size_t mi_good_commit_size(size_t size) {
  if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
  return _mi_align_up(size, _mi_os_large_page_size());
 }
+*/

 // Return if a pointer points into a region reserved by us.
 bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
  if (p==NULL) return false;
  size_t count = mi_atomic_read_relaxed(&regions_count);
  for (size_t i = 0; i < count; i++) {
-    uint8_t* start = (uint8_t*)mi_region_info_read( mi_atomic_read_relaxed(&regions[i].info), NULL, NULL);
+    uint8_t* start = (uint8_t*)mi_atomic_read_ptr_relaxed(&regions[i].start);
    if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
  }
  return false;
 }


+// Return the address of block `bit_idx` inside `region`: the region start plus
+// `bit_idx` blocks of MI_SEGMENT_SIZE bytes each.
+static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
+  uint8_t* const base = (uint8_t*)mi_atomic_read_ptr(&region->start);
+  mi_assert_internal(base != NULL);
+  const size_t byte_offset = (bit_idx * MI_SEGMENT_SIZE);
+  return (base + byte_offset);
+}
+
+// Encode a region and a block index within it as a memory id.
+// The lowest bit of the result is 0, which marks the id as a region id
+// (arena ids created by `mi_memid_create_from_arena` have that bit set to 1).
+static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
+  mi_assert_internal(bit_idx < MI_BITMAP_FIELD_BITS);
+  const size_t region_idx = region - regions;   // position of `region` in the `regions` array
+  mi_assert_internal(&regions[region_idx] == region);
+  const size_t raw = (region_idx*MI_BITMAP_FIELD_BITS + bit_idx);
+  return (raw << 1);
+}
+
+// Encode an arena memory id as a memory id: shift it left by one and
+// set the lowest bit to 1 to mark it as coming from an arena.
+static size_t mi_memid_create_from_arena(size_t arena_memid) {
+  const size_t shifted = (arena_memid << 1);
+  return (shifted | 1);
+}
+
+
+// Decode a memory id.
+// If the lowest bit of `id` is 1 the id denotes an arena: `*arena_memid` (when not
+// NULL) receives the arena id and `true` is returned; `region` and `bit_idx` are
+// left untouched. Otherwise `*region` and `*bit_idx` receive the decoded region and
+// block index and `false` is returned (`arena_memid` is then left untouched).
+static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_t* bit_idx, size_t* arena_memid) {
+  const size_t payload = (id >> 1);  // the id without its tag bit
+  if ((id & 1) != 0) {
+    if (arena_memid != NULL) { *arena_memid = payload; }
+    return true;
+  }
+  *bit_idx = (mi_bitmap_index_t)payload % MI_BITMAP_FIELD_BITS;
+  *region  = &regions[payload / MI_BITMAP_FIELD_BITS];
+  return false;
+}
+
+
 /* ----------------------------------------------------------------------------
-Commit from a region
+  Allocate a region from the OS (or an arena)
 -----------------------------------------------------------------------------*/

-// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
-// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
-// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
-// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
-static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, 
-                                    size_t size, bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld)
+static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
 {
-  size_t mask = mi_region_block_mask(blocks,bitidx);
-  mi_assert_internal(mask != 0);
-  mi_assert_internal((mask & mi_atomic_read_relaxed(&region->map)) == mask);
-  mi_assert_internal(&regions[idx] == region);
+  // not out of regions yet?
+  if (mi_atomic_read_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;

-  // ensure the region is reserved
-  mi_region_info_t info = mi_atomic_read(&region->info);
-  if (info == 0) 
-  {
-    bool region_commit = mi_option_is_enabled(mi_option_eager_region_commit);
-    bool region_large  = *allow_large;
-    void* start = NULL;
-    if (region_large) {
-      start = _mi_os_try_alloc_from_huge_reserved(MI_REGION_SIZE, MI_SEGMENT_ALIGN);
-      if (start != NULL) { region_commit = true; }
-    }
-    if (start == NULL) {
-      start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, region_commit, &region_large, tld);
-    }
-    mi_assert_internal(!(region_large && !*allow_large));
+  // try to allocate a fresh region from the OS
+  bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
+  bool region_large = (commit && allow_large);
+  bool is_zero = false;
+  size_t arena_memid = 0;
+  void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
+  if (start == NULL) return false;
+  mi_assert_internal(!(region_large && !allow_large));
+  mi_assert_internal(!region_large || region_commit);

-    if (start == NULL) {
-      // failure to allocate from the OS! unclaim the blocks and fail
-      size_t map;
-      do {
-        map = mi_atomic_read_relaxed(&region->map);
-      } while (!mi_atomic_cas_weak(&region->map, map & ~mask, map));
-      return false;
-    }
+  // claim a fresh slot
+  const uintptr_t idx = mi_atomic_increment(&regions_count);
+  if (idx >= MI_REGION_MAX) {
+    mi_atomic_decrement(&regions_count);
+    _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
+    return false;
+  }

-    // set the newly allocated region
-    info = mi_region_info_create(start,region_large,region_commit);
-    if (mi_atomic_cas_strong(&region->info, info, 0)) {
-      // update the region count
-      mi_atomic_increment(&regions_count);
-    }
-    else {
-      // failed, another thread allocated just before us!
-      // we assign it to a later slot instead (up to 4 tries).
-      for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
-        if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
-          mi_atomic_increment(&regions_count);
-          start = NULL;
-          break;
-        }
+  // allocated, initialize and claim the initial blocks
+  mem_region_t* r = &regions[idx];
+  r->arena_memid  = arena_memid;
+  mi_atomic_write(&r->in_use, 0);
+  mi_atomic_write(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
+  mi_atomic_write(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
+  mi_atomic_write(&r->reset, 0);
+  *bit_idx = 0;
+  mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
+  mi_atomic_write_ptr(&r->start, start);
+
+  // and share it 
+  mi_region_info_t info;
+  info.valid = true;
+  info.is_large = region_large;
+  info.numa_node = _mi_os_numa_node(tld);
+  mi_atomic_write(&r->info, info.value); // now make it available to others
+  *region = r;
+  return true;
+}
+
+/* ----------------------------------------------------------------------------
+  Try to claim blocks in suitable regions
+-----------------------------------------------------------------------------*/
+
+// Decide whether `region` may be used for an allocation.
+// Returns `false` if the region info is still 0 (not initialized), if the region
+// is large while `allow_large` is false, or if both `numa_node` and the region's
+// node are non-negative but differ. A negative `numa_node` matches every region.
+static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
+  mi_region_info_t info;
+  info.value = mi_atomic_read_relaxed(&region->info);
+
+  // not initialized yet?
+  if (info.value == 0) return false;
+
+  // large pages are only acceptable when the caller allows them
+  if (info.is_large && !allow_large) return false;
+
+  // a negative numa node always succeeds
+  if (numa_node < 0) return true;
+
+  // otherwise the region must have no node association or a matching one
+  const int region_node = info.numa_node;
+  return (region_node < 0 || region_node == numa_node);
+}
+
+
+static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
+{
+  // try all regions for a free slot  
+  const size_t count = mi_atomic_read(&regions_count);
+  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? 
+  for (size_t visited = 0; visited < count; visited++, idx++) {
+    if (idx >= count) idx = 0;  // wrap around
+    mem_region_t* r = &regions[idx];
+    if (mi_region_is_suitable(r, numa_node, allow_large)) {
+      if (mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
+        tld->region_idx = idx;    // remember the last found position
+        *region = r;
+        return true;
      }
-      if (start != NULL) {
-        // free it if we didn't succeed to save it to some other region
-        _mi_os_free_ex(start, MI_REGION_SIZE, region_commit, tld->stats);
-      }
-      // and continue with the memory at our index
-      info = mi_atomic_read(&region->info);
    }
  }
-  mi_assert_internal(info == mi_atomic_read(&region->info));
-  mi_assert_internal(info != 0);
+  return false;
+}

-  // Commit the blocks to memory
-  bool region_is_committed = false;
-  bool region_is_large = false;
-  void* start = mi_region_info_read(info,&region_is_large,&region_is_committed);  
-  mi_assert_internal(!(region_is_large && !*allow_large));
-  mi_assert_internal(start!=NULL);

-  // set dirty bits
-  uintptr_t m;
-  do {
-    m = mi_atomic_read(&region->dirty_mask);
-  } while (!mi_atomic_cas_weak(&region->dirty_mask, m | mask, m));
-  *is_zero = ((m & mask) == 0); // no dirty bit set in our claimed range?
-
-  void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
-  if (*commit && !region_is_committed) {
-    // ensure commit 
-    bool commit_zero = false;
-    _mi_os_commit(blocks_start, mi_good_commit_size(size), &commit_zero, tld->stats);  // only commit needed size (unless using large OS pages)
-    if (commit_zero) *is_zero = true;
-  }
-  else if (!*commit && region_is_committed) {
-    // but even when no commit is requested, we might have committed anyway (in a huge OS page for example)
-    *commit = true;
+// Try to allocate `blocks` contiguous blocks from the regions: first try to claim them in an
+// existing region, and otherwise try to allocate a fresh region.
+// Returns the start address of the claimed blocks, or NULL when out of regions or memory.
+// On success: `*memid` is created from the region and bit index, `*is_large` is taken from the
+// region info, `*is_zero` is derived from the region's dirty bits and the commit/unreset results,
+// and `*commit` tells if the blocks are committed (it can become true even when no commit was
+// requested, if the blocks turn out to be fully committed already).
+static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
+  mem_region_t* region;
+  mi_bitmap_index_t bit_idx;
+  const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
+  // try to claim in existing regions
+  if (!mi_region_try_claim(numa_node, blocks, *is_large, &region, &bit_idx, tld)) {
+    // otherwise try to allocate a fresh region
+    if (!mi_region_try_alloc_os(blocks, *commit, *is_large, &region, &bit_idx, tld)) {
+      // out of regions or memory
+      return NULL;
+    }
  }
+  
+  
+  // found a region and claimed `blocks` at `bit_idx`
+  mi_assert_internal(region != NULL);
+  mi_assert_internal(mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));

+  mi_region_info_t info;
+  info.value = mi_atomic_read(&region->info);
+  void* start = mi_atomic_read_ptr(&region->start);
+  mi_assert_internal(!(info.is_large && !*is_large));
+  mi_assert_internal(start != NULL);
+
+  *is_zero = mi_bitmap_unclaim(&region->dirty, 1, blocks, bit_idx);  
+  *is_large = info.is_large;
+  *memid = mi_memid_create(region, bit_idx);
+  void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
+
+  // commit
+  if (*commit) {
+    // ensure commit
+    bool any_uncommitted;
+    mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
+    if (any_uncommitted) {
+      mi_assert_internal(!info.is_large);
+      bool commit_zero;
+      _mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld);
+      if (commit_zero) *is_zero = true;
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    *commit = mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
+  }  
+  // the commit bits are only guaranteed to be set when we report committed memory: if no commit
+  // was requested and the blocks were not committed yet, `*commit` is false and the bits are unset.
+  mi_assert_internal(!*commit || mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
+
+  // unreset reset blocks
+  if (mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
+    mi_assert_internal(!info.is_large);
+    mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit); 
+    mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
+    bool reset_zero;
+    _mi_mem_unreset(p, blocks * MI_SEGMENT_SIZE, &reset_zero, tld);
+    if (reset_zero) *is_zero = true;
+  }
+  mi_assert_internal(!mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
+
+  #if (MI_DEBUG>=2)
+  if (*commit) { ((uint8_t*)p)[0] = 0; }  // debug-only probe: check the committed memory is writable
+  #endif
+  
  // and return the allocation  
-  mi_assert_internal(blocks_start != NULL);
-  *allow_large = region_is_large;
-  *p  = blocks_start;
-  *id = (idx*MI_REGION_MAP_BITS) + bitidx;
-  return true;
+  mi_assert_internal(p != NULL);  
+  return p;
 }

-// Use bit scan forward to quickly find the first zero bit if it is available
-#if defined(_MSC_VER)
-#define MI_HAVE_BITSCAN
-#include <intrin.h>
-static inline size_t mi_bsf(uintptr_t x) {
-  if (x==0) return 8*MI_INTPTR_SIZE;
-  DWORD idx;
-  #if (MI_INTPTR_SIZE==8)
-  _BitScanForward64(&idx, x);
-  #else
-  _BitScanForward(&idx, x);
-  #endif
-  return idx;
-}
-static inline size_t mi_bsr(uintptr_t x) {
-  if (x==0) return 8*MI_INTPTR_SIZE;
-  DWORD idx;
-  #if (MI_INTPTR_SIZE==8)
-  _BitScanReverse64(&idx, x);
-  #else
-  _BitScanReverse(&idx, x);
-  #endif
-  return idx;
-}
-#elif defined(__GNUC__) || defined(__clang__)
-#define MI_HAVE_BITSCAN
-static inline size_t mi_bsf(uintptr_t x) {
-  return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctzl(x));
-}
-static inline size_t mi_bsr(uintptr_t x) {
-  return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - __builtin_clzl(x));
-}
-#endif
-
-// Allocate `blocks` in a `region` at `idx` of a given `size`.
-// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
-// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
-// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
-static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, 
-                                   bool* commit, bool* allow_large, bool* is_zero, void** p, size_t* id, mi_os_tld_t* tld)
-{
-  mi_assert_internal(p != NULL && id != NULL);
-  mi_assert_internal(blocks < MI_REGION_MAP_BITS);
-
-  const uintptr_t mask = mi_region_block_mask(blocks, 0);
-  const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
-  uintptr_t map = mi_atomic_read(&region->map);
-  if (map==MI_REGION_MAP_FULL) return true;
-
-  #ifdef MI_HAVE_BITSCAN
-  size_t bitidx = mi_bsf(~map);    // quickly find the first zero bit if possible
-  #else
-  size_t bitidx = 0;               // otherwise start at 0
-  #endif
-  uintptr_t m = (mask << bitidx);     // invariant: m == mask shifted by bitidx
-
-  // scan linearly for a free range of zero bits
-  while(bitidx <= bitidx_max) {
-    if ((map & m) == 0) {  // are the mask bits free at bitidx?
-      mi_assert_internal((m >> bitidx) == mask); // no overflow?
-      uintptr_t newmap = map | m;
-      mi_assert_internal((newmap^map) >> bitidx == mask);
-      if (!mi_atomic_cas_weak(&region->map, newmap, map)) {  // TODO: use strong cas here?
-        // no success, another thread claimed concurrently.. keep going
-        map = mi_atomic_read(&region->map);
-        continue;
-      }
-      else {
-        // success, we claimed the bits
-        // now commit the block memory -- this can still fail
-        return mi_region_commit_blocks(region, idx, bitidx, blocks, 
-                                       size, commit, allow_large, is_zero, p, id, tld);
-      }
-    }
-    else {
-      // on to the next bit range
-      #ifdef MI_HAVE_BITSCAN
-      size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
-      mi_assert_internal(shift > 0 && shift <= blocks);
-      #else
-      size_t shift = 1;
-      #endif
-      bitidx += shift;
-      m <<= shift;
-    }
-  }
-  // no error, but also no bits found
-  return true;
-}
-
-// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.
-// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
-// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call.
-// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
-static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, 
-                                       bool* commit, bool* allow_large, bool* is_zero, 
-                                       void** p, size_t* id, mi_os_tld_t* tld)
-{
-  // check if there are available blocks in the region..
-  mi_assert_internal(idx < MI_REGION_MAX);
-  mem_region_t* region = &regions[idx];
-  uintptr_t m = mi_atomic_read_relaxed(&region->map);
-  if (m != MI_REGION_MAP_FULL) {  // some bits are zero    
-    bool ok = (*commit || *allow_large); // committing or allow-large is always ok
-    if (!ok) {
-      // otherwise skip incompatible regions if possible. 
-      // this is not guaranteed due to multiple threads allocating at the same time but
-      // that's ok. In secure mode, large is never allowed for any thread, so that works out; 
-      // otherwise we might just not be able to reset/decommit individual pages sometimes.
-      mi_region_info_t info = mi_atomic_read_relaxed(&region->info);
-      bool is_large;
-      bool is_committed;
-      void* start = mi_region_info_read(info,&is_large,&is_committed);
-      ok = (start == NULL || (*commit || !is_committed) || (*allow_large || !is_large)); // Todo: test with one bitmap operation?
-    }
-    if (ok) {
-      return mi_region_alloc_blocks(region, idx, blocks, size, commit, allow_large, is_zero, p, id, tld);
-    }
-  }
-  return true;  // no error, but no success either
-}

 /* ----------------------------------------------------------------------------
 Allocation
@ -361,59 +331,37 @@ static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size,

 // Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
 // (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
-void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, 
-                            size_t* id, mi_os_tld_t* tld)
+// `*commit`, `*large` and `*is_zero` are updated to describe the memory that is actually returned;
+// `large` may be NULL. `*memid` is always written (0 on early exit) and is created either by the
+// region allocator or from the arena memid; it is the id that `_mi_mem_free` decodes.
+// Returns NULL if `size` is 0 or if both the region and the arena allocation fail.
+void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
 {
-  mi_assert_internal(id != NULL && tld != NULL);
+  mi_assert_internal(memid != NULL && tld != NULL);
  mi_assert_internal(size > 0);
-  *id = SIZE_MAX;
+  *memid = 0;
  *is_zero = false;
  bool default_large = false;
  if (large==NULL) large = &default_large;  // ensure `large != NULL`  
-
-  // use direct OS allocation for huge blocks or alignment (with `id = SIZE_MAX`)
-  if (size > MI_REGION_MAX_ALLOC_SIZE || alignment > MI_SEGMENT_ALIGN) {
-    *is_zero = true;
-    return _mi_os_alloc_aligned(mi_good_commit_size(size), alignment, *commit, large, tld);  // round up size
-  }
-
-  // always round size to OS page size multiple (so commit/decommit go over the entire range)
-  // TODO: use large OS page size here?
+  if (size == 0) return NULL;
  size = _mi_align_up(size, _mi_os_page_size());

-  // calculate the number of needed blocks
-  size_t blocks = mi_region_block_count(size);
-  mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE);
-
-  // find a range of free blocks
-  void* p = NULL;
-  size_t count = mi_atomic_read(&regions_count);
-  size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention?
-  for (size_t visited = 0; visited < count; visited++, idx++) {
-    if (idx >= count) idx = 0;  // wrap around
-    if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error
-    if (p != NULL) break;
-  }
-
-  if (p == NULL) {
-    // no free range in existing regions -- try to extend beyond the count.. but at most 8 regions
-    for (idx = count; idx < mi_atomic_read_relaxed(&regions_count) + 8 && idx < MI_REGION_MAX; idx++) {
-      if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, large, is_zero, &p, id, tld)) return NULL; // error
-      if (p != NULL) break;
+  // allocate from regions if possible
+  size_t arena_memid = 0;  // initialized (as in `_mi_mem_free`) so a failed arena allocation never yields an indeterminate id
+  const size_t blocks = mi_region_block_count(size);
+  if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) {
+    void* p = mi_region_try_alloc(blocks, commit, large, is_zero, memid, tld);
+    mi_assert_internal(p == NULL || (uintptr_t)p % alignment == 0);    
+    if (p != NULL) {
+      #if (MI_DEBUG>=2)
+      if (*commit) { ((uint8_t*)p)[0] = 0; }
+      #endif
+      return p;
    }
-  }
-
-  if (p == NULL) {
-    // we could not find a place to allocate, fall back to the os directly
    _mi_warning_message("unable to allocate from region: size %zu\n", size);
-    *is_zero = true;
-    p = _mi_os_alloc_aligned(size, alignment, commit, large, tld);
-  }
-  else {
-    tld->region_idx = idx;  // next start of search? currently not used as we use first-fit
  }

+  // and otherwise fall back to the OS
+  void* p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld);
+  *memid = mi_memid_create_from_arena(arena_memid);
  mi_assert_internal( p == NULL || (uintptr_t)p % alignment == 0);
+  // debug-only probe write, guarded like the region path above so that release builds
+  // do not write to the freshly allocated memory
+  #if (MI_DEBUG>=2)
+  if (p != NULL && *commit) { ((uint8_t*)p)[0] = 0; }
+  #endif
  return p;
 }

@ -424,67 +372,56 @@ Free
 -----------------------------------------------------------------------------*/

 // Free previously allocated memory with a given id.
-void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
-  mi_assert_internal(size > 0 && stats != NULL);
+void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_reset, mi_os_tld_t* tld) {  // `full_commit`/`any_reset` are recorded in the region's `commit`/`reset` bitmaps below
+  mi_assert_internal(size > 0 && tld != NULL);
  if (p==NULL) return;
  if (size==0) return;
-  if (id == SIZE_MAX) {
-   // was a direct OS allocation, pass through
-    _mi_os_free(p, size, stats);
+  size = _mi_align_up(size, _mi_os_page_size());  // round up to a multiple of the OS page size
+  
+  size_t arena_memid = 0;
+  mi_bitmap_index_t bit_idx;
+  mem_region_t* region;
+  if (mi_memid_is_arena(id,&region,&bit_idx,&arena_memid)) {  // decodes `id`; presumably sets `arena_memid` for arena ids and `region`/`bit_idx` otherwise -- confirm
+   // was a direct arena allocation, pass through
+    _mi_arena_free(p, size, arena_memid, tld->stats);
  }
  else {
    // allocated in a region
-    mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return;
-    // we can align the size up to page size (as we allocate that way too)
-    // this ensures we fully commit/decommit/reset
-    size = _mi_align_up(size, _mi_os_page_size());
-    size_t idx = (id / MI_REGION_MAP_BITS);
-    size_t bitidx = (id % MI_REGION_MAP_BITS);
-    size_t blocks = mi_region_block_count(size);
-    size_t mask = mi_region_block_mask(blocks, bitidx);
-    mi_assert_internal(idx < MI_REGION_MAX); if (idx >= MI_REGION_MAX) return; // or `abort`?
-    mem_region_t* region = &regions[idx];
-    mi_assert_internal((mi_atomic_read_relaxed(&region->map) & mask) == mask ); // claimed?
-    mi_region_info_t info = mi_atomic_read(&region->info);
-    bool is_large;
-    bool is_eager_committed;
-    void* start = mi_region_info_read(info,&is_large,&is_eager_committed);
-    mi_assert_internal(start != NULL);
-    void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
+    mi_assert_internal(size <= MI_REGION_MAX_OBJ_SIZE); if (size > MI_REGION_MAX_OBJ_SIZE) return;
+    const size_t blocks = mi_region_block_count(size);
+    mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
+    mi_region_info_t info;
+    info.value = mi_atomic_read(&region->info);  // only `info.is_large` is used below
+    mi_assert_internal(info.value != 0);
+    void* blocks_start = mi_region_blocks_start(region, bit_idx);
    mi_assert_internal(blocks_start == p); // not a pointer in our area?
-    mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS);
-    if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`?
+    mi_assert_internal(bit_idx + blocks <= MI_BITMAP_FIELD_BITS);
+    if (blocks_start != p || bit_idx + blocks > MI_BITMAP_FIELD_BITS) return; // or `abort`?

-    // decommit (or reset) the blocks to reduce the working set.
-    // TODO: implement delayed decommit/reset as these calls are too expensive
-    // if the memory is reused soon.
-    // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
-    if (!is_large) {
-      if (mi_option_is_enabled(mi_option_segment_reset)) {
-        if (!is_eager_committed &&  // cannot reset large pages
-          (mi_option_is_enabled(mi_option_eager_commit) ||  // cannot reset halfway committed segments, use `option_page_reset` instead
-            mi_option_is_enabled(mi_option_reset_decommits))) // but we can decommit halfway committed segments
-        {
-          _mi_os_reset(p, size, stats);
-          //_mi_os_decommit(p, size, stats);  // todo: and clear dirty bits?
-        }
-      }
-    }    
-    if (!is_eager_committed) {
-      // adjust commit statistics as we commit again when re-using the same slot
-      _mi_stat_decrease(&stats->committed, mi_good_commit_size(size));
+    // committed?
+    if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {  // only sizes that are a whole multiple of the segment size are marked committed
+      mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
    }

-    // TODO: should we free empty regions? currently only done _mi_mem_collect.
-    // this frees up virtual address space which might be useful on 32-bit systems?
+    if (any_reset) {
+      // set the is_reset bits if any pages were reset
+      mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
+    }
+
+    // reset the blocks to reduce the working set.
+    if (!info.is_large && mi_option_is_enabled(mi_option_segment_reset) &&
+        mi_option_is_enabled(mi_option_eager_commit))  // cannot reset halfway committed segments, use only `option_page_reset` instead            
+    {
+      bool any_unreset;
+      mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);  // mark all blocks as reset
+      if (any_unreset) {  // only call into the OS when some block was not yet marked as reset
+        _mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
+      }
+    }    

    // and unclaim
-    uintptr_t map;
-    uintptr_t newmap;
-    do {
-      map = mi_atomic_read_relaxed(&region->map);
-      newmap = map & ~mask;
-    } while (!mi_atomic_cas_weak(&region->map, newmap, map));
+    bool all_unclaimed = mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);  // release the blocks; done last, after the commit/reset bits are updated
+    mi_assert_internal(all_unclaimed); UNUSED(all_unclaimed);
  }
 }

@ -492,49 +429,51 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
 /* ----------------------------------------------------------------------------
  collection
 -----------------------------------------------------------------------------*/
-void _mi_mem_collect(mi_stats_t* stats) {
+void _mi_mem_collect(mi_os_tld_t* tld) {  // `tld` is only used for `tld->stats` when freeing a region
  // free every region that has no segments in use.
-  for (size_t i = 0; i < regions_count; i++) {
+  uintptr_t rcount = mi_atomic_read_relaxed(&regions_count);  // read the count once so the loop bound stays fixed
+  for (size_t i = 0; i < rcount; i++) {
    mem_region_t* region = &regions[i];
-    if (mi_atomic_read_relaxed(&region->map) == 0) {
+    if (mi_atomic_read_relaxed(&region->info) != 0) {  // skip slots whose `info` is zero (presumably unused or already released -- confirm)
      // if no segments used, try to claim the whole region
      uintptr_t m;
      do {
-        m = mi_atomic_read_relaxed(&region->map);
-      } while(m == 0 && !mi_atomic_cas_weak(&region->map, ~((uintptr_t)0), 0 ));
+        m = mi_atomic_read_relaxed(&region->in_use);
+      } while(m == 0 && !mi_atomic_cas_weak(&region->in_use, MI_BITMAP_FIELD_FULL, 0 ));  // retry the 0 -> full swap until it succeeds or a block is seen in use
      if (m == 0) {
-        // on success, free the whole region (unless it was huge reserved)
-        bool is_eager_committed;
-        void* start = mi_region_info_read(mi_atomic_read(&region->info), NULL, &is_eager_committed);
-        if (start != NULL && !_mi_os_is_huge_reserved(start)) {
-          _mi_os_free_ex(start, MI_REGION_SIZE, is_eager_committed, stats);
+        // on success, free the whole region
+        void* start = mi_atomic_read_ptr(&regions[i].start);  // read `start` and `arena_memid` before the slot is cleared below
+        size_t arena_memid = mi_atomic_read_relaxed(&regions[i].arena_memid);
+        memset(&regions[i], 0, sizeof(mem_region_t));  // clears all fields of the slot, including `in_use`
+        // and release the whole region
+        mi_atomic_write(&region->info, 0);
+        if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {          
+          _mi_arena_free(start, MI_REGION_SIZE, arena_memid, tld->stats);
        }
-        // and release
-        mi_atomic_write(&region->info,0);
-        mi_atomic_write(&region->map,0);
      }
    }
  }
 }

+
 /* ----------------------------------------------------------------------------
  Other
 -----------------------------------------------------------------------------*/

-bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats) {
-  return _mi_os_commit(p, size, is_zero, stats);
+bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld) {  // thin wrapper: forwards to `_mi_os_reset` with the stats of `tld`
+  return _mi_os_reset(p, size, tld->stats);
 }

-bool _mi_mem_decommit(void* p, size_t size, mi_stats_t* stats) {
-  return _mi_os_decommit(p, size, stats);
+bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {  // thin wrapper: forwards to `_mi_os_unreset` with the stats of `tld`
+  return _mi_os_unreset(p, size, is_zero, tld->stats);
 }

-bool _mi_mem_reset(void* p, size_t size, mi_stats_t* stats) {
-  return _mi_os_reset(p, size, stats);
+bool _mi_mem_commit(void* p, size_t size, bool* is_zero, mi_os_tld_t* tld) {  // thin wrapper: forwards to `_mi_os_commit` with the stats of `tld`
+  return _mi_os_commit(p, size, is_zero, tld->stats);
 }

-bool _mi_mem_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats) {
-  return _mi_os_unreset(p, size, is_zero, stats);
+bool _mi_mem_decommit(void* p, size_t size, mi_os_tld_t* tld) {  // thin wrapper: forwards to `_mi_os_decommit` with the stats of `tld`
+  return _mi_os_decommit(p, size, tld->stats);
 }

 bool _mi_mem_protect(void* p, size_t size) {
--- a/src/options.c
+++ b/src/options.c
@ -65,13 +65,14 @@ static mi_option_desc_t options[_mi_option_last] =
  { 0, UNINIT, MI_OPTION(large_os_pages) },      // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
  { 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },
  { 0, UNINIT, MI_OPTION(segment_cache) },       // cache N segments per thread
-  { 0, UNINIT, MI_OPTION(page_reset) },
-  { 0, UNINIT, MI_OPTION(cache_reset) },
-  { 0, UNINIT, MI_OPTION(reset_decommits) },     // note: cannot enable this if secure is on
-  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+  { 0, UNINIT, MI_OPTION(page_reset) },          // reset pages on free
  { 0, UNINIT, MI_OPTION(segment_reset) },       // reset segment memory on free (needs eager commit)
+  { 1, UNINIT, MI_OPTION(reset_decommits) },     // reset decommits memory
+  { 0, UNINIT, MI_OPTION(eager_commit_delay) },  // the first N segments per thread are not eagerly committed
+  { 500,UNINIT, MI_OPTION(reset_delay) },        // reset delay in milli-seconds
+  { 0,   UNINIT, MI_OPTION(use_numa_nodes) },    // 0 = use available numa nodes, otherwise use at most N nodes. 
  { 100, UNINIT, MI_OPTION(os_tag) },            // only apple specific for now but might serve more or less related purpose
-  { 16, UNINIT, MI_OPTION(max_errors) }          // maximum errors that are output
+  { 16,  UNINIT, MI_OPTION(max_errors) }         // maximum errors that are output
 };

 static void mi_option_init(mi_option_desc_t* desc);
@ -87,7 +88,7 @@ void _mi_options_init(void) {
      mi_option_desc_t* desc = &options[option];
      _mi_verbose_message("option '%s': %ld\n", desc->name, desc->value);
    }
-  }
+  }  
  mi_max_error_count = mi_option_get(mi_option_max_errors);
 }

@ -224,7 +225,6 @@ static void mi_add_stderr_output() {
 // --------------------------------------------------------
 // Messages, all end up calling `_mi_fputs`.
 // --------------------------------------------------------
-#define MAX_ERROR_COUNT (10)
 static volatile _Atomic(uintptr_t) error_count; // = 0;  // when MAX_ERROR_COUNT stop emitting errors and warnings

 // When overriding malloc, we may recurse into mi_vfprintf if an allocation
--- a/src/os.c
+++ b/src/os.c
@ -36,8 +36,6 @@ terms of the MIT license. A copy of the license can be found in the file
  large OS pages (if MIMALLOC_LARGE_OS_PAGES is true).
 ----------------------------------------------------------- */
 bool    _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
-bool    _mi_os_is_huge_reserved(void* p);
-void*   _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment);

 static void* mi_align_up_ptr(void* p, size_t alignment) {
  return (void*)_mi_align_up((uintptr_t)p, alignment);
@ -99,7 +97,7 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*
 static PVirtualAlloc2 pVirtualAlloc2 = NULL;
 static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;

-static bool mi_win_enable_large_os_pages() 
+static bool mi_win_enable_large_os_pages()
 {
  if (large_os_page_size > 0) return true;

@ -150,10 +148,10 @@ void _mi_os_init(void) {
    FreeLibrary(hDll);
  }
  hDll = LoadLibrary(TEXT("ntdll.dll"));
-  if (hDll != NULL) {    
+  if (hDll != NULL) {
    pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
    FreeLibrary(hDll);
-  }  
+  }
  if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
    mi_win_enable_large_os_pages();
  }
@ -172,7 +170,7 @@ void _mi_os_init() {
    os_alloc_granularity = os_page_size;
  }
  if (mi_option_is_enabled(mi_option_large_os_pages)) {
-    large_os_page_size = (1UL << 21); // 2MiB
+    large_os_page_size = 2*MiB;
  }
 }
 #endif
@ -184,7 +182,7 @@ void _mi_os_init() {

 static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats)
 {
-  if (addr == NULL || size == 0 || _mi_os_is_huge_reserved(addr)) return true;
+  if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr)
  bool err = false;
 #if defined(_WIN32)
  err = (VirtualFree(addr, 0, MEM_RELEASE) == 0);
@ -193,7 +191,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
 #else
  err = (munmap(addr, size) == -1);
 #endif
-  if (was_committed) _mi_stat_decrease(&stats->committed, size); 
+  if (was_committed) _mi_stat_decrease(&stats->committed, size);
  _mi_stat_decrease(&stats->reserved, size);
  if (err) {
 #pragma warning(suppress:4996)
@ -209,39 +207,14 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size);

 #ifdef _WIN32
 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
-#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
-  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
-  if ((size % ((uintptr_t)1 << 30)) == 0 /* 1GiB multiple */
-    && (flags & MEM_LARGE_PAGES) != 0 && (flags & MEM_COMMIT) != 0 && (flags & MEM_RESERVE) != 0
-    && (addr != NULL || try_alignment == 0 || try_alignment % _mi_os_page_size() == 0)
-    && pNtAllocateVirtualMemoryEx != NULL)
-  {
-    #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE
-    #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE  (0x10)
-    #endif
-    MEM_EXTENDED_PARAMETER param = { 0, 0 };
-    param.Type = 5; // == MemExtendedParameterAttributeFlags;
-    param.ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
-    SIZE_T psize = size;
-    void*  base  = addr;
-    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, &param, 1);
-    if (err == 0) {
-      return base;
-    }
-    else {
-      // else fall back to regular large OS pages
-      _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error 0x%lx)\n", err);
-    }
-  }
-#endif
-#if (MI_INTPTR_SIZE >= 8) 
+#if (MI_INTPTR_SIZE >= 8)
  // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
  void* hint;
  if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) {
    return VirtualAlloc(hint, size, flags, PAGE_READWRITE);
  }
 #endif
-#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)  
+#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
  // on modern Windows try use VirtualAlloc2 for aligned allocation
  if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
    MEM_ADDRESS_REQUIREMENTS reqs = { 0 };
@ -259,7 +232,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
  mi_assert_internal(!(large_only && !allow_large));
  static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
  void* p = NULL;
-  if ((large_only || use_large_os_page(size, try_alignment)) 
+  if ((large_only || use_large_os_page(size, try_alignment))
      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
    uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
    if (!large_only && try_ok > 0) {
@ -327,7 +300,10 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
  #if !defined(MAP_ANONYMOUS)
  #define MAP_ANONYMOUS  MAP_ANON
  #endif
-  int flags = MAP_PRIVATE | MAP_ANONYMOUS;
+  #if !defined(MAP_NORESERVE)
+  #define MAP_NORESERVE  0
+  #endif
+  int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE;
  int fd = -1;
  #if defined(MAP_ALIGNED)  // BSD
  if (try_alignment > 0) {
@ -366,7 +342,8 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
      lflags |= MAP_HUGETLB;
      #endif
      #ifdef MAP_HUGE_1GB
-      if ((size % ((uintptr_t)1 << 30)) == 0) {
+      static bool mi_huge_pages_available = true;
+      if ((size % GiB) == 0 && mi_huge_pages_available) {
        lflags |= MAP_HUGE_1GB;
      }
      else
@ -385,6 +362,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
        p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
        #ifdef MAP_HUGE_1GB
        if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) {
+          mi_huge_pages_available = false; // don't try huge 1GiB pages again
          _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno);
          lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB);
          p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd);
@ -399,13 +377,13 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
  }
  if (p == NULL) {
    *is_large = false;
-    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);    
+    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
    #if defined(MADV_HUGEPAGE)
    // Many Linux systems don't allow MAP_HUGETLB but they support instead
-    // transparent huge pages (TPH). It is not required to call `madvise` with MADV_HUGE
+    // transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE
    // though since properly aligned allocations will already use large pages if available
    // in that case -- in particular for our large regions (in `memory.c`).
-    // However, some systems only allow TPH if called with explicit `madvise`, so
+    // However, some systems only allow THP if called with explicit `madvise`, so
    // when large OS pages are enabled for mimalloc, we call `madvice` anyways.
    if (allow_large && use_large_os_page(size, try_alignment)) {
      if (madvise(p, size, MADV_HUGEPAGE) == 0) {
@ -418,7 +396,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
 }
 #endif

-// On 64-bit systems, we can do efficient aligned allocation by using 
+// On 64-bit systems, we can do efficient aligned allocation by using
 // the 4TiB to 30TiB area to allocate them.
 #if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED)))
 static volatile _Atomic(intptr_t) aligned_base;
@ -628,7 +606,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
  *is_zero = false;
  size_t csize;
  void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
-  if (csize == 0 || _mi_os_is_huge_reserved(addr)) return true;
+  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr))
  int err = 0;
  if (commit) {
    _mi_stat_increase(&stats->committed, csize);
@ -651,31 +629,41 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
  }
  #elif defined(__wasi__)
  // WebAssembly guests can't control memory protection
+  #elif defined(MAP_FIXED)
+  if (!commit) {
+    // use mmap with MAP_FIXED to discard the existing memory (and reduce commit charge)
+    // note: remap exactly the page-aligned area (`start`, `csize`); using the caller's `size`
+    // here could replace pages beyond the aligned range.
+    void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), -1, 0);
+    if (p != start) { err = errno; }
+  }
+  else {
+    // for commit, just change the protection
+    err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
+    if (err != 0) { err = errno; }
+  }
  #else
  err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE));
  if (err != 0) { err = errno; }
  #endif
  if (err != 0) {
-    _mi_warning_message("commit/decommit error: start: 0x%p, csize: 0x%x, err: %i\n", start, csize, err);
+    // `csize` is a `size_t`, so it is printed with the `z` length modifier
+    _mi_warning_message("%s error: start: 0x%p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err);
  }
  mi_assert_internal(err == 0);
  return (err == 0);
 }

 bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
-  return mi_os_commitx(addr, size, true, false /* conservative? */, is_zero, stats);
+  return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
 }

 bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) {
  bool is_zero;
-  return mi_os_commitx(addr, size, false, true /* conservative? */, &is_zero, stats);
+  return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
 }

 bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
-  return mi_os_commitx(addr, size, true, true /* conservative? */, is_zero, stats);
+  return mi_os_commitx(addr, size, true, true /* conservative */, is_zero, stats);
 }

-
 // Signal to the OS that the address range is no longer in use
 // but may be used later again. This will release physical memory
 // pages and reduce swapping while keeping the memory committed.
@ -684,7 +672,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
  // page align conservatively within the range
  size_t csize;
  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
-  if (csize == 0 || _mi_os_is_huge_reserved(addr)) return true;
+  if (csize == 0) return true;  // || _mi_os_is_huge_reserved(addr)
  if (reset) _mi_stat_increase(&stats->reset, csize);
        else _mi_stat_decrease(&stats->reset, csize);
  if (!reset) return true; // nothing to do on unreset!
@ -734,7 +722,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
 // We page align to a conservative area inside the range to reset.
 bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
  if (mi_option_is_enabled(mi_option_reset_decommits)) {
-    return _mi_os_decommit(addr,size,stats);
+    return _mi_os_decommit(addr, size, stats);
  }
  else {
    return mi_os_resetx(addr, size, true, stats);
@ -758,9 +746,11 @@ static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
  size_t csize = 0;
  void* start = mi_os_page_align_area_conservative(addr, size, &csize);
  if (csize == 0) return false;
+  /*
  if (_mi_os_is_huge_reserved(addr)) {
 	  _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
  }
+  */
  int err = 0;
 #ifdef _WIN32
  DWORD oldprotect = 0;
@ -810,141 +800,267 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {


 /* ----------------------------------------------------------------------------
-Support for huge OS pages (1Gib) that are reserved up-front and never
-released. Only regions are allocated in here (see `memory.c`) so the memory
-will be reused.
+Support for allocating huge OS pages (1GiB) that are reserved up-front
+and possibly associated with a specific NUMA node (use `numa_node>=0`).
 -----------------------------------------------------------------------------*/
-#define MI_HUGE_OS_PAGE_SIZE ((size_t)1 << 30)  // 1GiB
+#define MI_HUGE_OS_PAGE_SIZE  (GiB)

-typedef struct mi_huge_info_s {
-  volatile _Atomic(void*)  start;     // start of huge page area (32TiB)
-  volatile _Atomic(size_t) reserved;  // total reserved size
-  volatile _Atomic(size_t) used;      // currently allocated
-} mi_huge_info_t;
-
-static mi_huge_info_t os_huge_reserved = { NULL, 0, ATOMIC_VAR_INIT(0) };
-
-bool _mi_os_is_huge_reserved(void* p) {
-  return (mi_atomic_read_ptr(&os_huge_reserved.start) != NULL && 
-          p >= mi_atomic_read_ptr(&os_huge_reserved.start) &&
-          (uint8_t*)p < (uint8_t*)mi_atomic_read_ptr(&os_huge_reserved.start) + mi_atomic_read(&os_huge_reserved.reserved));
-}
-
-void* _mi_os_try_alloc_from_huge_reserved(size_t size, size_t try_alignment)
+#if defined(WIN32) && (MI_INTPTR_SIZE >= 8)
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
 {
-  // only allow large aligned allocations (e.g. regions)
-  if (size < MI_SEGMENT_SIZE || (size % MI_SEGMENT_SIZE) != 0) return NULL;
-  if (try_alignment > MI_SEGMENT_SIZE) return NULL;  
-  if (mi_atomic_read_ptr(&os_huge_reserved.start)==NULL) return NULL;
-  if (mi_atomic_read(&os_huge_reserved.used) >= mi_atomic_read(&os_huge_reserved.reserved)) return NULL; // already full
+  mi_assert_internal(size%GiB == 0);
+  mi_assert_internal(addr != NULL);
+  const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;

-  // always aligned
-  mi_assert_internal(mi_atomic_read(&os_huge_reserved.used) % MI_SEGMENT_SIZE == 0 );
-  mi_assert_internal( (uintptr_t)mi_atomic_read_ptr(&os_huge_reserved.start) % MI_SEGMENT_SIZE == 0 );
-  
-  // try to reserve space
-  size_t base = mi_atomic_addu( &os_huge_reserved.used, size );
-  if ((base + size) > os_huge_reserved.reserved) {
-    // "free" our over-allocation
-    mi_atomic_subu( &os_huge_reserved.used, size);
-    return NULL;
-  }
+  mi_win_enable_large_os_pages();

-  // success!
-  uint8_t* p = (uint8_t*)mi_atomic_read_ptr(&os_huge_reserved.start) + base;
-  mi_assert_internal( (uintptr_t)p % MI_SEGMENT_SIZE == 0 );
-  return p;
-}
-
-/*
-static void mi_os_free_huge_reserved() {
-  uint8_t* addr = os_huge_reserved.start;
-  size_t total  = os_huge_reserved.reserved;
-  os_huge_reserved.reserved = 0;
-  os_huge_reserved.start = NULL;
-  for( size_t current = 0; current < total; current += MI_HUGE_OS_PAGE_SIZE) {
-    _mi_os_free(addr + current, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
-  }
-}
-*/
-
-#if !(MI_INTPTR_SIZE >= 8 && (defined(_WIN32) || defined(MI_OS_USE_MMAP)))
-int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
-  UNUSED(pages); UNUSED(max_secs);
-  if (pages_reserved != NULL) *pages_reserved = 0;
-  return ENOMEM; 
-}
-#else
-int mi_reserve_huge_os_pages( size_t pages, double max_secs, size_t* pages_reserved ) mi_attr_noexcept
-{
-  if (pages_reserved != NULL) *pages_reserved = 0;
-  if (max_secs==0) return ETIMEDOUT; // timeout 
-  if (pages==0) return 0;            // ok
-  if (!mi_atomic_cas_ptr_strong(&os_huge_reserved.start,(void*)1,NULL)) return ETIMEDOUT; // already reserved
-
-  // Set the start address after the 32TiB area
-  uint8_t* start = (uint8_t*)((uintptr_t)32 << 40); // 32TiB virtual start address
-  #if (MI_SECURE>0 || MI_DEBUG==0)     // security: randomize start of huge pages unless in debug mode
-  uintptr_t r = _mi_random_init((uintptr_t)&mi_reserve_huge_os_pages);
-  start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF));  // (randomly 0-1024)*1GiB == 0 to 1TiB
-  #endif
-
-  // Allocate one page at the time but try to place them contiguously
-  // We allocate one page at the time to be able to abort if it takes too long
-  double start_t = _mi_clock_start();
-  uint8_t* addr = start;  // current top of the allocations
-  for (size_t page = 0; page < pages; page++, addr += MI_HUGE_OS_PAGE_SIZE ) {
-    // allocate a page
-    void* p = NULL; 
-    bool is_large = true;
-    #ifdef _WIN32
-    if (page==0) { mi_win_enable_large_os_pages(); }
-    p = mi_win_virtual_alloc(addr, MI_HUGE_OS_PAGE_SIZE, 0, MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE, true, true, &is_large);
-    #elif defined(MI_OS_USE_MMAP)
-    p = mi_unix_mmap(addr, MI_HUGE_OS_PAGE_SIZE, 0, PROT_READ | PROT_WRITE, true, true, &is_large);
-    #else 
-    // always fail
-    #endif  
-    
-    // Did we succeed at a contiguous address?
-    if (p != addr) {
-      // no success, issue a warning and return with an error 
-      if (p != NULL) {
-        _mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr); 
-        _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main );
-      }
-      else {
-        #ifdef _WIN32
-        int err = GetLastError();
-        #else
-        int err = errno;
-        #endif
-        _mi_warning_message("could not allocate huge page %zu at 0x%p, error: %i\n", page, addr, err);
-      }
-      return ENOMEM;  
+  #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
+  MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} };
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
+  static bool mi_huge_pages_available = true;
+  if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
+    #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE
+    #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE  (0x10)
+    #endif
+    params[0].Type = 5; // == MemExtendedParameterAttributeFlags;
+    params[0].ULong64 = MEM_EXTENDED_PARAMETER_NONPAGED_HUGE;
+    ULONG param_count = 1;
+    if (numa_node >= 0) {
+      param_count++;
+      params[1].Type = MemExtendedParameterNumaNode;
+      params[1].ULong = (unsigned)numa_node;
    }
-    // success, record it
-    if (page==0) {
-      mi_atomic_write_ptr(&os_huge_reserved.start, addr);  // don't switch the order of these writes
-      mi_atomic_write(&os_huge_reserved.reserved, MI_HUGE_OS_PAGE_SIZE);
+    SIZE_T psize = size;
+    void* base = addr;
+    NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count);
+    if (err == 0 && base != NULL) {
+      return base;
    }
    else {
-      mi_atomic_addu(&os_huge_reserved.reserved,MI_HUGE_OS_PAGE_SIZE);
+      // fall back to regular large pages
+      mi_huge_pages_available = false; // don't try further huge pages
+      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err);
    }
-    _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); 
-    _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
-    if (pages_reserved != NULL) { *pages_reserved = page + 1; }
+  }
+  // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
+  if (pVirtualAlloc2 != NULL && numa_node >= 0) {
+    params[0].Type = MemExtendedParameterNumaNode;
+    params[0].ULong = (unsigned)numa_node;
+    return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
+  }
+  #endif
+  // otherwise use regular virtual alloc on older windows
+  return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
+}

-    // check for timeout
-    double elapsed = _mi_clock_end(start_t);
-    if (elapsed > max_secs) return ETIMEDOUT; 
-    if (page >= 1) {
-      double estimate = ((elapsed / (double)(page+1)) * (double)pages);
-      if (estimate > 1.5*max_secs) return ETIMEDOUT; // seems like we are going to timeout
-    }
-  }  
-  _mi_verbose_message("reserved %zu huge pages\n", pages);
+#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8)
+#include <sys/syscall.h>
+#ifndef MPOL_PREFERRED
+#define MPOL_PREFERRED 1
+#endif
+// `mi_os_mbind`: issue the `mbind` system call directly through `syscall(SYS_mbind, ...)`.
+// All arguments are passed to the kernel unchanged and the raw `syscall` result is returned.
+#if defined(SYS_mbind)
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags);
+}
+#else
+// `SYS_mbind` is not defined: ignore every argument and report success (0) so that
+// callers treat the binding request as a no-op instead of an error.
+static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) {
+  UNUSED(start); UNUSED(len); UNUSED(mode); UNUSED(nmask); UNUSED(maxnode); UNUSED(flags);
  return 0;
 }
 #endif
+// mmap variant: map `size` bytes (a multiple of GiB) at `addr` via `mi_unix_mmap` and,
+// if `numa_node` lies in [0, 8*MI_INTPTR_SIZE), ask `mi_os_mbind` to prefer that node.
+// A failed bind only produces a warning; the mapped pointer is returned regardless.
+// Returns NULL when the mapping itself fails.
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+  mi_assert_internal(size%GiB == 0);
+  bool large = true;
+  void* const mem = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &large);
+  if (mem == NULL) return NULL;
+
+  // no node requested, or one that does not fit in a single mask word (at most 64 nodes)
+  if (numa_node < 0 || numa_node >= 8*MI_INTPTR_SIZE) return mem;
+
+  // TODO: does `mbind` work correctly for huge OS pages? should we
+  // use `set_mempolicy` before calling mmap instead?
+  // see: <https://lkml.org/lkml/2017/2/9/875>
+  uintptr_t node_mask = (1UL << numa_node);
+  if (mi_os_mbind(mem, size, MPOL_PREFERRED, &node_mask, 8*MI_INTPTR_SIZE, 0) != 0) {
+    _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno));
+  }
+  return mem;
+}
+#else
+// Fallback used when none of the platform-specific variants above is compiled in:
+// huge OS pages cannot be allocated, so always return NULL.
+static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) {
+  UNUSED(addr); UNUSED(size); UNUSED(numa_node);  // silence unused-parameter warnings
+  return NULL;
+}
+#endif

+#if (MI_INTPTR_SIZE >= 8)
+// To ensure proper alignment, use our own area for huge OS pages
+static _Atomic(uintptr_t)  mi_huge_start; // = 0
+
+// Claim an aligned address range for huge pages
+// Only bookkeeping happens here: the shared cursor `mi_huge_start` is advanced by
+// `pages * MI_HUGE_OS_PAGE_SIZE` bytes and the previous cursor value is returned as the
+// start of the claimed range. No OS allocation call is made in this function.
+//   pages:      number of huge pages to claim address space for.
+//   total_size: (optional out) set to 0 on entry and to the claimed size in bytes on return.
+// Returns the start address of the claimed range.
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+  if (total_size != NULL) *total_size = 0;
+  const size_t size = pages * MI_HUGE_OS_PAGE_SIZE;
+
+  uintptr_t start = 0;
+  uintptr_t end = 0;
+  uintptr_t expected;
+  do {
+    // remember the value we read so the update below can detect a concurrent change
+    start = expected = mi_atomic_read_relaxed(&mi_huge_start);
+    if (start == 0) {
+      // Initialize the start address after the 32TiB area
+      start = ((uintptr_t)32 << 40);  // 32TiB virtual start address
+#if (MI_SECURE>0 || MI_DEBUG==0)      // security: randomize start of huge pages unless in debug mode
+      uintptr_t r = _mi_random_init((uintptr_t)&mi_os_claim_huge_pages);
+      start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x3FF));  // (randomly 0-1024)*1GiB == 0 to 1TiB
+#endif
+    }
+    end = start + size;
+    mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
+    // retry until the cursor could be moved from `expected` to `end`
+    // (presumably a compare-and-swap that fails if another thread moved it first -- confirm in the atomics header)
+  } while (!mi_atomic_cas_strong(&mi_huge_start, end, expected));
+
+  if (total_size != NULL) *total_size = size;
+  return (uint8_t*)start;
+}
+#else
+// Variant for builds where `MI_INTPTR_SIZE < 8`: no address range is claimed.
+// Sets `*total_size` (if given) to 0 and returns NULL.
+static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
+  UNUSED(pages);  // silence unused-parameter warnings
+  if (total_size != NULL) *total_size = 0;
+  return NULL;
+}
+#endif
+
+// Allocate MI_SEGMENT_SIZE aligned huge pages
+// Tries to allocate up to `pages` huge OS pages (MI_HUGE_OS_PAGE_SIZE each), one page at a
+// time, at consecutive addresses inside a range claimed with `mi_os_claim_huge_pages`.
+//   numa_node:      passed on to `mi_os_alloc_huge_os_pagesx` for every page.
+//   max_msecs:      time budget; the timeout check is skipped when `max_msecs <= 0`.
+//   pages_reserved: (optional out) number of pages that were actually allocated.
+//   psize:          (optional out) total size in bytes of the allocated pages.
+// Returns the start of the allocated area, or NULL if not even one page was allocated.
+void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize) {
+  if (psize != NULL) *psize = 0;
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  size_t size = 0;
+  uint8_t* start = mi_os_claim_huge_pages(pages, &size);
+  if (start == NULL) return NULL; // or 32-bit systems
+
+  // Allocate one page at the time but try to place them contiguously
+  // We allocate one page at the time to be able to abort if it takes too long
+  // or to at least allocate as many as available on the system.
+  mi_msecs_t start_t = _mi_clock_start();
+  size_t page;
+  for (page = 0; page < pages; page++) {
+    // allocate a page
+    void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
+    void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node);
+
+    // Did we succeed at a contiguous address?
+    if (p != addr) {
+      // no success, issue a warning and break
+      if (p != NULL) {
+        _mi_warning_message("could not allocate contiguous huge page %zu at 0x%p\n", page, addr);
+        _mi_os_free(p, MI_HUGE_OS_PAGE_SIZE, &_mi_stats_main);
+      }
+      break;
+    }
+
+    // success, record it
+    _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
+    _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
+
+    // check for timeout
+    if (max_msecs > 0) {
+      mi_msecs_t elapsed = _mi_clock_end(start_t);
+      if (page >= 1) {
+        // extrapolate from the `page+1` pages allocated so far to all requested pages
+        mi_msecs_t estimate = ((elapsed / (page+1)) * pages);
+        if (estimate > 2*max_msecs) { // seems like we are going to timeout, break
+          elapsed = max_msecs + 1;
+        }
+      }
+      if (elapsed > max_msecs) {
+        _mi_warning_message("huge page allocation timed out\n");
+        // the page at index `page` was allocated successfully: count it before leaving the
+        // loop, otherwise it is neither reported to the caller nor ever freed.
+        page++;
+        break;
+      }
+    }
+  }
+  // here `page` is the number of pages that were allocated
+  mi_assert_internal(page*MI_HUGE_OS_PAGE_SIZE <= size);
+  if (pages_reserved != NULL) *pages_reserved = page;
+  if (psize != NULL) *psize = page * MI_HUGE_OS_PAGE_SIZE;
+  return (page == 0 ? NULL : start);
+}
+
+// free every huge page in a range individually (as we allocated per page)
+// note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems.
+// `p` is the start of the range and `size` its length in bytes; only whole huge pages are
+// freed, so a trailing remainder smaller than MI_HUGE_OS_PAGE_SIZE is left untouched.
+// Does nothing when `p` is NULL or `size` is 0.
+void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
+  if (p==NULL || size==0) return;
+  uint8_t* base = (uint8_t*)p;
+  while (size >= MI_HUGE_OS_PAGE_SIZE) {
+    _mi_os_free(base, MI_HUGE_OS_PAGE_SIZE, stats);
+    size -= MI_HUGE_OS_PAGE_SIZE;
+    base += MI_HUGE_OS_PAGE_SIZE;  // move on to the next page (otherwise the first page is freed over and over)
+  }
+}
+
+/* ----------------------------------------------------------------------------
+Support NUMA aware allocation
+-----------------------------------------------------------------------------*/
+#ifdef WIN32
+// Returns the NUMA node number reported for the processor the calling thread is running on.
+// The results of the two Win32 calls are not checked; `numa_node` keeps its initial value 0
+// if they do not set it.
+static size_t mi_os_numa_nodex(void) {
+  PROCESSOR_NUMBER pnum;
+  USHORT numa_node = 0;
+  GetCurrentProcessorNumberEx(&pnum);
+  GetNumaProcessorNodeEx(&pnum,&numa_node);
+  return numa_node;
+}
+
+// Number of NUMA nodes: the highest node number reported by `GetNumaHighestNodeNumber`
+// plus one. The call's result is not checked, so the answer is 1 if it leaves the
+// variable at its initial value.
+static size_t mi_os_numa_node_countx(void) {
+  ULONG highest = 0;
+  GetNumaHighestNodeNumber(&highest);
+  const ULONG count = highest + 1;
+  return count;
+}
+#elif defined(__linux__)
+#include <sys/syscall.h>  // getcpu
+#include <stdio.h>        // snprintf
+
+// Returns the NUMA node of the CPU the calling thread currently runs on, obtained with the
+// raw `getcpu` system call. Yields 0 if the call fails or `SYS_getcpu` is not defined.
+static size_t mi_os_numa_nodex(void) {
+#ifdef SYS_getcpu
+  // `getcpu` stores `unsigned int` values through both pointers (see getcpu(2)), so the
+  // variables must have exactly that type: with `unsigned long` only part of each variable
+  // would be written, which gives a wrong node number on 64-bit big-endian targets.
+  unsigned node = 0;
+  unsigned ncpu = 0;
+  long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
+  if (err != 0) return 0;
+  return node;
+#else
+  return 0;
+#endif
+}
+// Count NUMA nodes by probing `/sys/devices/system/node/node<N>` for N = 1, 2, ... (at most
+// 256 probes) and stopping at the first entry that is not readable; node 0 is never probed
+// and is always counted. Works on a stack buffer only, so no allocation takes place.
+static size_t mi_os_numa_node_countx(void) {
+  char path[128];
+  unsigned last = 0;   // highest node number found so far
+  while (last < 256) {
+    // todo: is there a more efficient way to enumerate the node entries? (but ensure there is no allocation)
+    snprintf(path, 127, "/sys/devices/system/node/node%u", last + 1);
+    if (access(path,R_OK) != 0) break;
+    last++;
+  }
+  return (last+1);
+}
+#else
+// Generic fallback (neither the Windows nor the Linux variant is compiled): always node 0.
+static size_t mi_os_numa_nodex(void) {
+  return 0;
+}
+// Generic fallback (neither the Windows nor the Linux variant is compiled): report one node.
+static size_t mi_os_numa_node_countx(void) {
+  return 1;
+}
+#endif
+
+size_t _mi_numa_node_count = 0;   // cache the node count (0 == not determined yet)
+
+// Returns the number of NUMA nodes to use (always >= 1) and caches it in `_mi_numa_node_count`.
+// On the first call the count is taken from the `mi_option_use_numa_nodes` option when that is
+// positive, otherwise from `mi_os_numa_node_countx`; a non-positive result is replaced by 1.
+size_t _mi_os_numa_node_count_get(void) {
+  if (mi_unlikely(_mi_numa_node_count == 0)) {   // unsigned: "not yet set" is exactly 0
+    long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
+    if (ncount <= 0) ncount = (long)mi_os_numa_node_countx();        // or detect dynamically
+    _mi_numa_node_count = (size_t)(ncount <= 0 ? 1 : ncount);
+    _mi_verbose_message("using %zu numa regions\n", _mi_numa_node_count);  // %zu: the argument is a size_t
+  }
+  mi_assert_internal(_mi_numa_node_count >= 1);
+  return _mi_numa_node_count;
+}
+
+// Returns the NUMA node for the calling thread as a value in [0, node count).
+// `tld` is not used. With at most one node the answer is always 0 and the OS is not queried.
+int _mi_os_numa_node_get(mi_os_tld_t* tld) {
+  UNUSED(tld);
+  const size_t total = _mi_os_numa_node_count();
+  if (total <= 1) return 0;   // single numa node systems: always node 0
+  // wrap an out-of-range node number reported by the OS back into [0, total)
+  const size_t current = mi_os_numa_nodex();
+  return (int)(current >= total ? current % total : current);
+}
--- a/src/page.c
+++ b/src/page.c
@ -38,7 +38,7 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta
 static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_stats_t* stats);


-#if (MI_DEBUG>1)
+#if (MI_DEBUG>=3)
 static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
  size_t count = 0;
  while (head != NULL) {
@ -75,7 +75,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {

  mi_segment_t* segment = _mi_page_segment(page);
  uint8_t* start = _mi_page_start(segment,page,NULL);
-  mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL));
+  mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL,NULL));
  //mi_assert_internal(start + page->capacity*page->block_size == page->top);

  mi_assert_internal(mi_page_list_is_valid(page,page->free));
@ -229,6 +229,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
  mi_assert_expensive(mi_page_is_valid_init(page));
  mi_assert_internal(page->heap == NULL);
  mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE);
+  mi_assert_internal(!page->is_reset);  
  _mi_page_free_collect(page,false);
  mi_page_queue_t* pq = mi_page_queue(heap, page->block_size);
  mi_page_queue_push(heap, pq, page);
@ -342,7 +343,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) {
  mi_assert_expensive(_mi_page_is_valid(page));
  mi_assert_internal(pq == mi_page_queue_of(page));
  mi_assert_internal(page->heap != NULL);
-
+  
 #if MI_DEBUG > 1
  mi_heap_t* pheap = (mi_heap_t*)mi_atomic_read_ptr(mi_atomic_cast(void*, &page->heap));
 #endif
@ -597,7 +598,7 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
  mi_assert_internal(block_size > 0);
  // set fields
  size_t page_size;
-  _mi_segment_page_start(segment, page, block_size, &page_size);
+  _mi_segment_page_start(segment, page, block_size, &page_size, NULL);
  page->block_size = block_size;
  mi_assert_internal(page_size / block_size < (1L<<16));
  page->reserved = (uint16_t)(page_size / block_size);
--- a/src/segment.c
+++ b/src/segment.c
@ -13,6 +13,8 @@ terms of the MIT license. A copy of the license can be found in the file

 #define MI_PAGE_HUGE_ALIGN  (256*1024)

+static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size);
+
 /* -----------------------------------------------------------
  Segment allocation
  We allocate pages inside big OS allocated "segments"
@ -40,8 +42,7 @@ terms of the MIT license. A copy of the license can be found in the file
  Queue of segments containing free pages
 ----------------------------------------------------------- */

-
-#if (MI_DEBUG>1)
+#if (MI_DEBUG>=3)
 static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, mi_segment_t* segment) {
  mi_assert_internal(segment != NULL);
  mi_segment_t* list = queue->first;
@ -111,7 +112,7 @@ static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_t
 Invariant checking
 ----------------------------------------------------------- */

-#if (MI_DEBUG > 1)
+#if (MI_DEBUG>=2)
 static bool mi_segment_is_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) {
  mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld);
  bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment));
@ -120,7 +121,9 @@ static bool mi_segment_is_in_free_queue(mi_segment_t* segment, mi_segments_tld_t
  }
  return in_queue;
 }
+#endif

+#if (MI_DEBUG>=3)
 static size_t mi_segment_pagesize(mi_segment_t* segment) {
  return ((size_t)1 << segment->page_shift);
 }
@ -141,31 +144,50 @@ static bool mi_segment_is_valid(mi_segment_t* segment) {
 }
 #endif

+
+/* -----------------------------------------------------------
+  Page reset
+----------------------------------------------------------- */
+
+// Reset the memory of `page` and mark it with `is_reset`.
+// Nothing happens when the `mi_option_page_reset` option is off, when the segment memory is
+// fixed, or when the page is in use or already reset. A `size` of 0 (or one larger than the
+// page) means the whole raw page area; otherwise only the first `size` bytes are reset.
+static void mi_page_reset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) {
+  if (!mi_option_is_enabled(mi_option_page_reset) ||
+      segment->mem_is_fixed || page->segment_in_use || page->is_reset) {
+    return;
+  }
+  size_t page_bytes;
+  void* const area = mi_segment_raw_page_start(segment, page, &page_bytes);
+  page->is_reset = true;
+  mi_assert_internal(size <= page_bytes);
+  size_t reset_bytes = size;
+  if (size == 0 || size > page_bytes) reset_bytes = page_bytes;
+  _mi_mem_reset(area, reset_bytes, tld->os);
+}
+
+// Undo an earlier reset of `page`: clears `is_reset` and calls `_mi_mem_unreset` on the raw
+// page area (all of it when `size` is 0 or larger than the page, else the first `size` bytes).
+// If the unreset reports zeroed memory, the page is flagged `is_zero_init`.
+// The page must currently be reset and the segment memory must not be fixed.
+static void mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld)
+{
+  mi_assert_internal(page->is_reset);
+  mi_assert_internal(!segment->mem_is_fixed);
+  page->is_reset = false;
+
+  size_t page_bytes;
+  uint8_t* const area = mi_segment_raw_page_start(segment, page, &page_bytes);
+  size_t unreset_bytes = size;
+  if (size == 0 || size > page_bytes) unreset_bytes = page_bytes;
+
+  bool zeroed = false;
+  _mi_mem_unreset(area, unreset_bytes, &zeroed, tld->os);
+  if (zeroed) {
+    page->is_zero_init = true;
+  }
+}
+
+
 /* -----------------------------------------------------------
 Segment size calculations
 ----------------------------------------------------------- */

-// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set)
-uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size)
-{
+// Raw start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set)
+// The raw start is not taking aligned block allocation into consideration.
+static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) {
  size_t   psize = (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift);
-  uint8_t* p     = (uint8_t*)segment + page->segment_idx*psize;
+  uint8_t* p = (uint8_t*)segment + page->segment_idx * psize;

  if (page->segment_idx == 0) {
    // the first page starts after the segment info (and possible guard page)
-    p     += segment->segment_info_size;
+    p += segment->segment_info_size;
    psize -= segment->segment_info_size;
-    // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore)
-    if (block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) {
-      size_t adjust = block_size - ((uintptr_t)p % block_size);
-      if (adjust < block_size) {
-        p     += adjust;
-        psize -= adjust;
-      }
-      mi_assert_internal((uintptr_t)p % block_size == 0);
-    }
  }
-  
+
  if (MI_SECURE > 1 || (MI_SECURE == 1 && page->segment_idx == segment->capacity - 1)) {
    // secure == 1: the last page has an os guard page at the end
    // secure >  1: every page has an os guard page
@ -173,19 +195,36 @@ uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* pa
  }

  if (page_size != NULL) *page_size = psize;
-  mi_assert_internal(_mi_ptr_page(p) == page);
+  mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page);
  mi_assert_internal(_mi_ptr_segment(p) == segment);
  return p;
 }

-static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) {
-  /*
-  if (mi_option_is_enabled(mi_option_secure)) {
-    // always reserve maximally so the protection falls on
-    // the same address area, as we need to reuse them from the caches interchangably.
-    capacity = MI_SMALL_PAGES_PER_SEGMENT;
+// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set)
+// Takes the raw start from `mi_segment_raw_page_start` and, for the first page of a segment
+// (`segment_idx == 0`) with page kind at most MI_PAGE_MEDIUM and a non-zero `block_size`,
+// moves it forward to the next multiple of `block_size`.
+//   page_size: (optional out) number of bytes available from the returned start.
+//   pre_size:  (optional out) number of bytes skipped for that alignment (0 if none).
+uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t block_size, size_t* page_size, size_t* pre_size)
+{
+  size_t   psize;
+  uint8_t* p = mi_segment_raw_page_start(segment, page, &psize);
+  if (pre_size != NULL) *pre_size = 0;
+  if (page->segment_idx == 0 && block_size > 0 && segment->page_kind <= MI_PAGE_MEDIUM) {
+    // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore)
+    size_t adjust = block_size - ((uintptr_t)p % block_size);
+    // `adjust == block_size` means `p` is already a multiple of `block_size`: nothing to skip
+    if (adjust < block_size) {
+      p += adjust;
+      psize -= adjust;
+      if (pre_size != NULL) *pre_size = adjust;
+    }
+    mi_assert_internal((uintptr_t)p % block_size == 0);
  }
-  */
+    
+  if (page_size != NULL) *page_size = psize;
+  mi_assert_internal(page->block_size==0 || _mi_ptr_page(p) == page);
+  mi_assert_internal(_mi_ptr_segment(p) == segment);
+  return p;
+}
+
+static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) 
+{
  const size_t minsize   = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */;
  size_t guardsize = 0;
  size_t isize     = 0;
@ -232,7 +271,15 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se
    mi_assert_internal(!segment->mem_is_fixed);
    _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set
  }
-  _mi_mem_free(segment, segment_size, segment->memid, tld->stats);
+  
+  bool fully_committed = true;
+  bool any_reset = false;
+  for (size_t i = 0; i < segment->capacity; i++) {
+    const mi_page_t* page = &segment->pages[i];    
+    if (!page->is_committed) fully_committed = false;
+    if (page->is_reset) any_reset = true;
+  }
+  _mi_mem_free(segment, segment_size, segment->memid, fully_committed, any_reset, tld->os);
 }


@ -273,14 +320,11 @@ static bool mi_segment_cache_full(mi_segments_tld_t* tld)

 static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) {
  mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld));
-  mi_assert_internal(segment->next == NULL);
+  mi_assert_internal(segment->next == NULL);  
  if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) {
    return false;
  }
  mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE);
-  if (!segment->mem_is_fixed && mi_option_is_enabled(mi_option_cache_reset)) {
-    _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats);
-  }
  segment->next = tld->cache;
  tld->cache = segment;
  tld->cache_count++;
@ -325,75 +369,91 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
  mi_assert_internal(segment_size >= required);
  size_t page_size = (page_kind == MI_PAGE_HUGE ? segment_size : (size_t)1 << page_shift);

-  // Try to get it from our thread local cache first
-  bool eager_delay = (tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
-  bool eager  = !eager_delay && mi_option_is_enabled(mi_option_eager_commit);
-  bool commit = eager || (page_kind > MI_PAGE_MEDIUM);
-  bool protection_still_good = false;
+  // Initialize parameters
+  bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && tld->count < (size_t)mi_option_get(mi_option_eager_commit_delay));
+  bool eager  = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit);
+  bool commit = eager || (page_kind >= MI_PAGE_LARGE);
+  bool pages_still_good = false;
  bool is_zero = false;
-  mi_segment_t* segment = mi_segment_cache_pop(segment_size, tld);
+  
+  // Try to get it from our thread local cache first
+  mi_segment_t* segment = NULL; // mi_segment_cache_pop(segment_size, tld);
  if (segment != NULL) {
-    if (MI_SECURE!=0) {
-      mi_assert_internal(!segment->mem_is_fixed);
-      if (segment->page_kind != page_kind) {
+    if (page_kind <= MI_PAGE_MEDIUM && segment->page_kind == page_kind && segment->segment_size == segment_size) {
+      pages_still_good = true;
+    }
+    else 
+    {
+      // different page kinds; unreset any reset pages, and unprotect
+      // TODO: optimize cache pop to return fitting pages if possible?
+      for (size_t i = 0; i < segment->capacity; i++) {
+        mi_page_t* page = &segment->pages[i];
+        if (page->is_reset) { 
+          mi_page_unreset(segment, page, 0, tld);  // todo: only unreset the part that was reset? (instead of the full page)
+        }
+      }
+      if (MI_SECURE!=0) {
+        mi_assert_internal(!segment->mem_is_fixed);
+        // TODO: should we unprotect per page? (with is_protected flag?)
        _mi_mem_unprotect(segment, segment->segment_size); // reset protection if the page kind differs
      }
-      else {
-        protection_still_good = true; // otherwise, the guard pages are still in place
-      }
-    }
-    if (!segment->mem_is_committed && page_kind > MI_PAGE_MEDIUM) {
-      mi_assert_internal(!segment->mem_is_fixed);
-      _mi_mem_commit(segment, segment->segment_size, &is_zero, tld->stats);
-      segment->mem_is_committed = true;
-    }
-    if (!segment->mem_is_fixed &&
-        (mi_option_is_enabled(mi_option_cache_reset) || mi_option_is_enabled(mi_option_page_reset))) {
-      bool reset_zero = false;
-      _mi_mem_unreset(segment, segment->segment_size, &reset_zero, tld->stats);
-      if (reset_zero) is_zero = true;
-    }
+    }    
  }
  else {
    // Allocate the segment from the OS
    size_t memid;
-    bool   mem_large = (!eager_delay && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy    
+    bool   mem_large = (!eager_delayed && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy    
    segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_zero, &memid, os_tld);
    if (segment == NULL) return NULL;  // failed to allocate
    if (!commit) {
      // ensure the initial info is committed
      bool commit_zero = false;
-      _mi_mem_commit(segment, info_size, &commit_zero, tld->stats);
+      _mi_mem_commit(segment, info_size, &commit_zero, tld->os);
      if (commit_zero) is_zero = true;
    }
    segment->memid = memid;
    segment->mem_is_fixed = mem_large;
-    segment->mem_is_committed = commit;
+    segment->mem_is_committed = commit;    
    mi_segments_track_size((long)segment_size, tld);
  }
  mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);

-  // zero the segment info (but not the `mem` fields)
-  ptrdiff_t ofs = offsetof(mi_segment_t,next);
-  memset((uint8_t*)segment + ofs, 0, info_size - ofs);    
-
-  // guard pages
-  if ((MI_SECURE != 0) && !protection_still_good) {
-    // in secure mode, we set up a protected page in between the segment info
-    // and the page data
-    mi_assert_internal( info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0);
-    _mi_mem_protect( (uint8_t*)segment + info_size, (pre_size - info_size) );
-    size_t os_page_size = _mi_os_page_size();
-    if (MI_SECURE <= 1) {
-      // and protect the last page too
-      _mi_mem_protect( (uint8_t*)segment + segment_size - os_page_size, os_page_size );
-    }
-    else {
-      // protect every page
-      for (size_t i = 0; i < capacity; i++) {
-        _mi_mem_protect( (uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size );
+  if (!pages_still_good) {    
+    // guard pages
+    if (MI_SECURE != 0) {
+      // in secure mode, we set up a protected page in between the segment info
+      // and the page data
+      mi_assert_internal(info_size == pre_size - _mi_os_page_size() && info_size % _mi_os_page_size() == 0);
+      _mi_mem_protect((uint8_t*)segment + info_size, (pre_size - info_size));
+      const size_t os_page_size = _mi_os_page_size();
+      if (MI_SECURE <= 1) {
+        // and protect the last page too
+        _mi_mem_protect((uint8_t*)segment + segment_size - os_page_size, os_page_size);
+      }
+      else {
+        // protect every page
+        for (size_t i = 0; i < capacity; i++) {
+          _mi_mem_protect((uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size);
+        }
      }
    }
+
+    // zero the segment info (but not the `mem` fields)
+    ptrdiff_t ofs = offsetof(mi_segment_t, next);
+    memset((uint8_t*)segment + ofs, 0, info_size - ofs);
+
+    // initialize pages info
+    for (uint8_t i = 0; i < capacity; i++) {
+      segment->pages[i].segment_idx = i;
+      segment->pages[i].is_reset = false;
+      segment->pages[i].is_committed = commit;
+      segment->pages[i].is_zero_init = is_zero;
+    }
+  }
+  else {
+    // zero the segment info but not the pages info (and mem fields)
+    ptrdiff_t ofs = offsetof(mi_segment_t, next);
+    memset((uint8_t*)segment + ofs, 0, offsetof(mi_segment_t,pages) - ofs);
  }

  // initialize
@ -404,13 +464,8 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind,
  segment->segment_info_size = pre_size;
  segment->thread_id  = _mi_thread_id();
  segment->cookie = _mi_ptr_cookie(segment);
-  for (uint8_t i = 0; i < segment->capacity; i++) {
-    segment->pages[i].segment_idx = i;
-    segment->pages[i].is_reset = false;
-    segment->pages[i].is_committed = commit;
-    segment->pages[i].is_zero_init = is_zero;
-  }
  _mi_stat_increase(&tld->stats->page_committed, segment->segment_info_size);
+  
  //fprintf(stderr,"mimalloc: alloc segment at %p\n", (void*)segment);
  return segment;
 }
@ -457,30 +512,28 @@ static bool mi_segment_has_free(const mi_segment_t* segment) {
  return (segment->used < segment->capacity);
 }

-static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_stats_t* stats) {
+static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) {
  mi_assert_internal(mi_segment_has_free(segment));
  mi_assert_expensive(mi_segment_is_valid(segment));
  for (size_t i = 0; i < segment->capacity; i++) {
    mi_page_t* page = &segment->pages[i];
    if (!page->segment_in_use) {
-      if (page->is_reset || !page->is_committed) {
+      // set in-use before doing unreset to prevent delayed reset
+      page->segment_in_use = true;
+      segment->used++;                
+      if (!page->is_committed) {
+        mi_assert_internal(!segment->mem_is_fixed);
+        mi_assert_internal(!page->is_reset);
        size_t psize;
-        uint8_t* start = _mi_page_start(segment, page, &psize);        
-        if (!page->is_committed) {
-          mi_assert_internal(!segment->mem_is_fixed);
-          page->is_committed = true;
-          bool is_zero = false;
-          _mi_mem_commit(start,psize,&is_zero,stats);
-          if (is_zero) page->is_zero_init = true;
-        }
-        if (page->is_reset) {
-          mi_assert_internal(!segment->mem_is_fixed);
-          page->is_reset = false;
-          bool is_zero = false;
-          _mi_mem_unreset(start, psize, &is_zero, stats);
-          if (is_zero) page->is_zero_init = true;
-        }
+        uint8_t* start = _mi_page_start(segment, page, &psize);
+        page->is_committed = true;
+        bool is_zero = false;
+        _mi_mem_commit(start,psize,&is_zero,tld->os);
+        if (is_zero) page->is_zero_init = true;
      }
+      if (page->is_reset) {
+        mi_page_unreset(segment, page, 0, tld); // todo: only unreset the part that was reset?
+      }      
      return page;
    }
  }
@ -495,29 +548,29 @@ static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_stats_t* stats)

 static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld);

-static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_stats_t* stats) {
-  UNUSED(stats);
+// Give an in-use page whose blocks are all free back to its segment:
+// update the statistics, clear the page meta-data, mark the page as
+// not in use, and finally pass the page memory to `mi_page_reset`.
+static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) {
  mi_assert_internal(page->segment_in_use);
  mi_assert_internal(mi_page_all_free(page));
  mi_assert_internal(page->is_committed);
  size_t inuse = page->capacity * page->block_size;
-  _mi_stat_decrease(&stats->page_committed, inuse);
-  _mi_stat_decrease(&stats->pages, 1);
+  _mi_stat_decrease(&tld->stats->page_committed, inuse);
+  _mi_stat_decrease(&tld->stats->pages, 1);
  
-  // reset the page memory to reduce memory pressure?
-  if (!segment->mem_is_fixed && !page->is_reset && mi_option_is_enabled(mi_option_page_reset)) {
-    size_t psize;
-    uint8_t* start = _mi_page_start(segment, page, &psize);
-    page->is_reset = true;
-    _mi_mem_reset(start, psize, stats);
-  }
+  // calculate the used size from the raw (non-aligned) start of the page
+  // (this has to happen before the memset below zeroes `page->capacity`)
+  size_t pre_size;
+  _mi_segment_page_start(segment, page, page->block_size, NULL, &pre_size);
+  size_t used_size = pre_size + (page->capacity * page->block_size);

-  // zero the page data, but not the segment fields
+  // zero the page data, but not the segment fields  
  page->is_zero_init = false;
  ptrdiff_t ofs = offsetof(mi_page_t,capacity);
  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);
  page->segment_in_use = false;
  segment->used--;
+
+  // reset the page memory to reduce memory pressure?
+  // note: must come after setting `segment_in_use` to false
+  mi_page_reset(segment, page, used_size, tld);
 }

 void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
@ -527,7 +580,7 @@ void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld)
  mi_assert_expensive(mi_segment_is_valid(segment));

  // mark it as free now
-  mi_segment_page_clear(segment, page, tld->stats);
+  mi_segment_page_clear(segment, page, tld);

  if (segment->used == 0) {
    // no more used pages; remove from the free list and free the segment
@ -567,7 +620,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
  // remove the segment from the free page queue if needed
  mi_segment_remove_from_free_queue(segment,tld);
  mi_assert_internal(segment->next == NULL && segment->prev == NULL);
-
+  
  // all pages in the segment are abandoned; add it to the abandoned list
  _mi_stat_increase(&tld->stats->segments_abandoned, 1);
  mi_segments_track_size(-((long)segment->segment_size), tld);
@ -627,15 +680,17 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
    for (size_t i = 0; i < segment->capacity; i++) {
      mi_page_t* page = &segment->pages[i];
      if (page->segment_in_use) {
+        mi_assert_internal(!page->is_reset);
+        mi_assert_internal(page->is_committed);
        segment->abandoned--;
        mi_assert(page->next == NULL);
        _mi_stat_decrease(&tld->stats->pages_abandoned, 1);
        if (mi_page_all_free(page)) {
          // if everything free by now, free the page
-          mi_segment_page_clear(segment,page,tld->stats);
+          mi_segment_page_clear(segment,page,tld);
        }
        else {
-          // otherwise reclaim it
+          // otherwise reclaim it          
          _mi_page_reclaim(heap,page);
        }
      }
@ -664,9 +719,8 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
 // Requires that the page has free pages
 static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) {
  mi_assert_internal(mi_segment_has_free(segment));
-  mi_page_t* page = mi_segment_find_free(segment, tld->stats);
-  page->segment_in_use = true;  
-  segment->used++;
+  mi_page_t* page = mi_segment_find_free(segment, tld);
+  mi_assert_internal(page->segment_in_use);  
  mi_assert_internal(segment->used <= segment->capacity);
  if (segment->used == segment->capacity) {
    // if no more free pages, remove from the queue
@ -684,7 +738,11 @@ static mi_page_t* mi_segment_page_alloc(mi_page_kind_t kind, size_t page_shift,
    mi_segment_enqueue(free_queue, segment);
  }
  mi_assert_internal(free_queue->first != NULL);
-  return mi_segment_page_alloc_in(free_queue->first,tld);
+  mi_page_t* page = mi_segment_page_alloc_in(free_queue->first,tld);
+#if MI_DEBUG>=2
+  _mi_segment_page_start(_mi_page_segment(page), page, sizeof(void*), NULL, NULL)[0] = 0;
+#endif
+  return page;
 }

 static mi_page_t* mi_segment_small_page_alloc(mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
@ -705,6 +763,9 @@ static mi_page_t* mi_segment_large_page_alloc(mi_segments_tld_t* tld, mi_os_tld_
  segment->used = 1;
  mi_page_t* page = &segment->pages[0];
  page->segment_in_use = true;
+#if MI_DEBUG>=2
+  _mi_segment_page_start(segment, page, sizeof(void*), NULL, NULL)[0] = 0;
+#endif
  return page;
 }

@ -716,7 +777,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
  segment->used = 1;
  segment->thread_id = 0; // huge pages are immediately abandoned
  mi_page_t* page = &segment->pages[0];
-  page->segment_in_use = true;
+  page->segment_in_use = true;  
  return page;
 }

--- a/src/stats.c
+++ b/src/stats.c
@ -130,19 +130,23 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, const
  char buf[32];
  int  len = 32;
  const char* suffix = (unit <= 0 ? " " : "b");
-  double base = (unit == 0 ? 1000.0 : 1024.0);
+  const int64_t base = (unit == 0 ? 1000 : 1024);
  if (unit>0) n *= unit;

-  double pos = (double)(n < 0 ? -n : n);
-  if (pos < base)
-    snprintf(buf,len, "%d %s ", (int)n, suffix);
-  else if (pos < base*base)
-    snprintf(buf, len, "%.1f k%s", (double)n / base, suffix);
-  else if (pos < base*base*base)
-    snprintf(buf, len, "%.1f m%s", (double)n / (base*base), suffix);
-  else
-    snprintf(buf, len, "%.1f g%s", (double)n / (base*base*base), suffix);
-
+  // format `n` with one decimal and a k/m/g magnitude using integer
+  // arithmetic only; `pos` is the absolute value of `n`
+  const int64_t pos = (n < 0 ? -n : n);
+  if (pos < base) {
+    snprintf(buf, len, "%d %s ", (int)n, suffix);
+  }
+  else {
+    // pick the largest magnitude that still leaves a non-zero whole part
+    int64_t divider = base;
+    const char* magnitude = "k";
+    if (pos >= divider*base) { divider *= base; magnitude = "m"; }
+    if (pos >= divider*base) { divider *= base; magnitude = "g"; }
+    // compute the tenths from the absolute value and print the sign once;
+    // dividing a negative `n` directly would make `tens%10` negative and
+    // produce output like "-1.-5 kb"
+    const int64_t tens = (pos / (divider/10));
+    const long whole = (long)(tens/10);
+    const long frac1 = (long)(tens%10);
+    snprintf(buf, len, "%s%ld.%ld %s%s", (n < 0 ? "-" : ""), whole, frac1, magnitude, suffix);
+  }
  _mi_fprintf(out, (fmt==NULL ? "%11s" : fmt), buf);
 }

@ -199,8 +203,10 @@ static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg
 }

 static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, mi_output_fun* out) {
-  double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count);
-  _mi_fprintf(out, "%10s: %7.1f avg\n", msg, avg);
+  // average in tenths (total*10/count, truncated) so that one decimal can be
+  // printed with integer arithmetic only; an empty counter prints as 0.0
+  const int64_t avg_tens = (stat->count == 0 ? 0 : (stat->total*10 / stat->count)); 
+  const long avg_whole = (long)(avg_tens/10);
+  const long avg_frac1 = (long)(avg_tens%10);
+  _mi_fprintf(out, "%10s: %5ld.%ld avg\n", msg, avg_whole, avg_frac1);
 }


@ -231,9 +237,9 @@ static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bin
 #endif


-static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit);
+static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit);

-static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out) mi_attr_noexcept {
+static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out) mi_attr_noexcept {
  mi_print_header(out);
  #if MI_STAT>1
  mi_stat_count_t normal = { 0,0,0,0 };
@ -265,17 +271,17 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out)
  mi_stat_counter_print(&stats->commit_calls, "commits", out);
  mi_stat_print(&stats->threads, "threads", -1, out);
  mi_stat_counter_print_avg(&stats->searches, "searches", out);
+  _mi_fprintf(out, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count());
+  if (elapsed > 0) _mi_fprintf(out, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);

-  if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs);
-
-  double user_time;
-  double sys_time;
+  mi_msecs_t user_time;
+  mi_msecs_t sys_time;
  size_t peak_rss;
  size_t page_faults;
  size_t page_reclaim;
  size_t peak_commit;
  mi_process_info(&user_time, &sys_time, &peak_rss, &page_faults, &page_reclaim, &peak_commit);
-  _mi_fprintf(out,"%10s: user: %.3f s, system: %.3f s, faults: %lu, reclaims: %lu, rss: ", "process", user_time, sys_time, (unsigned long)page_faults, (unsigned long)page_reclaim );
+  _mi_fprintf(out,"%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, reclaims: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults, (unsigned long)page_reclaim );
  mi_printf_amount((int64_t)peak_rss, 1, out, "%s");
  if (peak_commit > 0) {
    _mi_fprintf(out,", commit charge: ");
@ -284,9 +290,7 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, mi_output_fun* out)
  _mi_fprintf(out,"\n");
 }

-double _mi_clock_end(double start);
-double _mi_clock_start(void);
-static double mi_time_start = 0.0;
+static mi_msecs_t mi_time_start; // = 0

 static mi_stats_t* mi_stats_get_default(void) {
  mi_heap_t* heap = mi_heap_get_default();
@ -316,71 +320,72 @@ void _mi_stats_done(mi_stats_t* stats) {  // called from `mi_thread_done`
 }


-static void mi_stats_print_ex(mi_stats_t* stats, double secs, mi_output_fun* out) {
+// Pass `stats` to `mi_stats_merge_from` and then print the global
+// `_mi_stats_main`; `elapsed` is the elapsed time in milli-seconds.
+static void mi_stats_print_ex(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out) {
  mi_stats_merge_from(stats);
-  _mi_stats_print(&_mi_stats_main, secs, out);
+  _mi_stats_print(&_mi_stats_main, elapsed, out);
 }

 void mi_stats_print(mi_output_fun* out) mi_attr_noexcept {
-  mi_stats_print_ex(mi_stats_get_default(),_mi_clock_end(mi_time_start),out);
+  // elapsed milli-seconds relative to `mi_time_start`
+  mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
+  mi_stats_print_ex(mi_stats_get_default(),elapsed,out);
 }

 void mi_thread_stats_print(mi_output_fun* out) mi_attr_noexcept {
-  _mi_stats_print(mi_stats_get_default(), _mi_clock_end(mi_time_start), out);
+  // prints the statistics returned by `mi_stats_get_default` directly
+  // (no call to `mi_stats_merge_from`, unlike `mi_stats_print`)
+  mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
+  _mi_stats_print(mi_stats_get_default(), elapsed, out);
 }


-
-// --------------------------------------------------------
-// Basic timer for convenience
-// --------------------------------------------------------
-
+// ----------------------------------------------------------------
+// Basic timer for convenience; use milli-seconds to avoid doubles
+// ----------------------------------------------------------------
 #ifdef _WIN32
 #include <windows.h>
-static double mi_to_seconds(LARGE_INTEGER t) {
-  static double freq = 0.0;
-  if (freq <= 0.0) {
+// Convert a performance counter value to milli-seconds. The counter
+// frequency is queried on first use and cached as ticks per milli-second.
+static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
+  static LARGE_INTEGER mfreq; // = 0
+  if (mfreq.QuadPart == 0LL) {
    LARGE_INTEGER f;
    QueryPerformanceFrequency(&f);
-    freq = (double)(f.QuadPart);
+    mfreq.QuadPart = f.QuadPart/1000LL;
+    // clamp to 1 so a frequency below 1000 ticks per second cannot
+    // lead to a division by zero below
+    if (mfreq.QuadPart == 0) mfreq.QuadPart = 1;
  }
-  return ((double)(t.QuadPart) / freq);
+  return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart);  
 }

-static double mi_clock_now(void) {
+// Current value of the Windows performance counter in milli-seconds.
+mi_msecs_t _mi_clock_now(void) {
  LARGE_INTEGER t;
  QueryPerformanceCounter(&t);
-  return mi_to_seconds(t);
+  return mi_to_msecs(t);
 }
 #else
 #include <time.h>
 #ifdef CLOCK_REALTIME
-static double mi_clock_now(void) {
+// Current `CLOCK_REALTIME` time in milli-seconds (nanoseconds are truncated).
+// NOTE(review): this reads the wall clock, so differences between two calls
+// are presumably affected by system time adjustments -- confirm whether a
+// monotonic clock is wanted here.
+mi_msecs_t _mi_clock_now(void) {
  struct timespec t;
  clock_gettime(CLOCK_REALTIME, &t);
-  return (double)t.tv_sec + (1.0e-9 * (double)t.tv_nsec);
+  return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000);
 }
 #else
 // low resolution timer
-static double mi_clock_now(void) {
-  return ((double)clock() / (double)CLOCKS_PER_SEC);
+// Processor time from `clock()` converted to milli-seconds.
+mi_msecs_t _mi_clock_now(void) {
+  // multiply before dividing: `CLOCKS_PER_SEC / 1000` would be zero (and the
+  // division would trap) when CLOCKS_PER_SEC is below 1000, and it would
+  // lose precision when CLOCKS_PER_SEC is not a multiple of 1000
+  return (((mi_msecs_t)clock() * 1000) / (mi_msecs_t)CLOCKS_PER_SEC);
 }
 #endif
 #endif


-static double mi_clock_diff = 0.0;
+static mi_msecs_t mi_clock_diff;

-double _mi_clock_start(void) {
-  if (mi_clock_diff == 0.0) {
-    double t0 = mi_clock_now();
-    mi_clock_diff = mi_clock_now() - t0;
+// Return the current time in milli-seconds as the start of a measurement.
+// While `mi_clock_diff` is still zero, first measure the time taken by a
+// `_mi_clock_now` call; `_mi_clock_end` subtracts that value again.
+mi_msecs_t _mi_clock_start(void) {
+  // compare against an integer zero: `mi_clock_diff` is no longer a double
+  if (mi_clock_diff == 0) {
+    mi_msecs_t t0 = _mi_clock_now();
+    mi_clock_diff = _mi_clock_now() - t0;
  }
-  return mi_clock_now();
+  return _mi_clock_now();
 }

-double _mi_clock_end(double start) {
-  double end = mi_clock_now();
+// Milli-seconds elapsed since `start`, minus `mi_clock_diff`.
+mi_msecs_t _mi_clock_end(mi_msecs_t start) {
+  mi_msecs_t end = _mi_clock_now();
  return (end - start - mi_clock_diff);
 }

@ -394,21 +399,21 @@ double _mi_clock_end(double start) {
 #include <psapi.h>
 #pragma comment(lib,"psapi.lib")

-static double filetime_secs(const FILETIME* ftime) {
+// Convert a FILETIME value to milli-seconds (truncating).
+static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
  ULARGE_INTEGER i;
  i.LowPart = ftime->dwLowDateTime;
  i.HighPart = ftime->dwHighDateTime;
-  double secs = (double)(i.QuadPart) * 1.0e-7; // FILETIME is in 100 nano seconds
-  return secs;
+  // 10000 units of 100 nano seconds make up one milli-second
+  mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
+  return msecs;
 }
-static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
+static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
  FILETIME ct;
  FILETIME ut;
  FILETIME st;
  FILETIME et;
  GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
-  *utime = filetime_secs(&ut);
-  *stime = filetime_secs(&st);
+  *utime = filetime_msecs(&ut);
+  *stime = filetime_msecs(&st);

  PROCESS_MEMORY_COUNTERS info;
  GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
@ -427,11 +432,11 @@ static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size
 #include <mach/mach.h>
 #endif

-static double timeval_secs(const struct timeval* tv) {
-  return (double)tv->tv_sec + ((double)tv->tv_usec * 1.0e-6);
+// Convert a `timeval` to milli-seconds (micro-seconds are truncated).
+// note: despite the `_secs` suffix in the name the result is in milli-seconds.
+static mi_msecs_t timeval_secs(const struct timeval* tv) {
+  return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
 }

-static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
+static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
  struct rusage rusage;
  getrusage(RUSAGE_SELF, &rusage);
 #if defined(__APPLE__) && defined(__MACH__)
@ -452,12 +457,12 @@ static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size
 #pragma message("define a way to get process info")
 #endif

-static void mi_process_info(double* utime, double* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
+// Fallback when no platform specific implementation is selected above:
+// every output is set to zero.
+static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
  *peak_rss = 0;
  *page_faults = 0;
  *page_reclaim = 0;
  *peak_commit = 0;
-  *utime = 0.0;
-  *stime = 0.0;
+  *utime = 0;
+  *stime = 0;
 }
 #endif
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@ -24,7 +24,7 @@ public:


 int main() {
-  //mi_stats_reset();  // ignore earlier allocations
+  mi_stats_reset();  // ignore earlier allocations
  atexit(free_p);
  void* p1 = malloc(78);
  void* p2 = mi_malloc_aligned(16,24);