Merge branch 'dev' into patch-1

Daan 2021-01-29 15:49:57 -08:00 committed by GitHub
commit 71d80e914d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
62 changed files with 2600 additions and 1341 deletions

1
.gitattributes vendored
View File

@ -2,6 +2,7 @@
* text eol=lf
*.png binary
*.pdn binary
*.jpg binary
*.sln binary
*.suo binary
*.vcproj binary

View File

@ -19,6 +19,8 @@ option(MI_BUILD_SHARED "Build shared library" ON)
option(MI_BUILD_STATIC "Build static library" ON)
option(MI_BUILD_OBJECT "Build object library" ON)
option(MI_BUILD_TESTS "Build test executables" ON)
option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF)
option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF)
option(MI_CHECK_FULL "Use full internal invariant checking in DEBUG mode (deprecated, use MI_DEBUG_FULL instead)" OFF)
option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF)
@ -28,6 +30,7 @@ set(mi_sources
src/stats.c
src/random.c
src/os.c
src/bitmap.c
src/arena.c
src/region.c
src/segment.c
@ -44,7 +47,7 @@ set(mi_sources
# -----------------------------------------------------------------------------
if (NOT CMAKE_BUILD_TYPE)
if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL MATCHES "ON")
if ("${CMAKE_BINARY_DIR}" MATCHES ".*(D|d)ebug$" OR MI_DEBUG_FULL)
message(STATUS "No build type selected, default to: Debug")
set(CMAKE_BUILD_TYPE "Debug")
else()
@ -66,20 +69,20 @@ if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
set(MI_USE_CXX "ON")
endif()
if(MI_OVERRIDE MATCHES "ON")
if(MI_OVERRIDE)
message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
if(APPLE)
if(MI_OSX_ZONE MATCHES "ON")
if(MI_OSX_ZONE)
# use zone's on macOS
message(STATUS " Use malloc zone to override malloc (MI_OSX_ZONE=ON)")
list(APPEND mi_sources src/alloc-override-osx.c)
list(APPEND mi_defines MI_OSX_ZONE=1)
if(NOT MI_INTERPOSE MATCHES "ON")
if(NOT MI_INTERPOSE)
message(STATUS " (enabling INTERPOSE as well since zone's require this)")
set(MI_INTERPOSE "ON")
endif()
endif()
if(MI_INTERPOSE MATCHES "ON")
if(MI_INTERPOSE)
# use interpose on macOS
message(STATUS " Use interpose to override malloc (MI_INTERPOSE=ON)")
list(APPEND mi_defines MI_INTERPOSE)
@ -87,42 +90,71 @@ if(MI_OVERRIDE MATCHES "ON")
endif()
endif()
if(MI_SECURE MATCHES "ON")
if(MI_SECURE)
message(STATUS "Set full secure build (MI_SECURE=ON)")
list(APPEND mi_defines MI_SECURE=4)
endif()
if(MI_SEE_ASM MATCHES "ON")
if(MI_SEE_ASM)
message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)")
list(APPEND mi_cflags -save-temps)
endif()
if(MI_CHECK_FULL MATCHES "ON")
if(MI_CHECK_FULL)
message(STATUS "The MI_CHECK_FULL option is deprecated, use MI_DEBUG_FULL instead")
set(MI_DEBUG_FULL "ON")
endif()
if(MI_DEBUG_FULL MATCHES "ON")
if(MI_DEBUG_FULL)
message(STATUS "Set debug level to full internal invariant checking (MI_DEBUG_FULL=ON)")
list(APPEND mi_defines MI_DEBUG=3) # full invariant checking
endif()
if(MI_PADDING MATCHES "OFF")
if(NOT MI_PADDING)
message(STATUS "Disable padding of heap blocks in debug mode (MI_PADDING=OFF)")
list(APPEND mi_defines MI_PADDING=0)
endif()
if(MI_XMALLOC MATCHES "ON")
if(MI_XMALLOC)
message(STATUS "Enable abort() calls on memory allocation failure (MI_XMALLOC=ON)")
list(APPEND mi_defines MI_XMALLOC=1)
endif()
if(MI_SHOW_ERRORS MATCHES "ON")
if(MI_SHOW_ERRORS)
message(STATUS "Enable printing of error and warning messages by default (MI_SHOW_ERRORS=ON)")
list(APPEND mi_defines MI_SHOW_ERRORS=1)
endif()
if(MI_USE_CXX MATCHES "ON")
if(MI_DEBUG_TSAN)
if(CMAKE_C_COMPILER_ID MATCHES "Clang")
message(STATUS "Build with thread sanitizer (MI_DEBUG_TSAN=ON)")
list(APPEND mi_defines MI_TSAN=1)
list(APPEND mi_cflags -fsanitize=thread -g -O1)
list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=thread)
else()
message(WARNING "Can only use thread sanitizer with clang (MI_DEBUG_TSAN=ON but ignored)")
endif()
endif()
if(MI_DEBUG_UBSAN)
if(CMAKE_BUILD_TYPE MATCHES "Debug")
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
message(STATUS "Build with undefined-behavior sanitizer (MI_DEBUG_UBSAN=ON)")
list(APPEND mi_cflags -fsanitize=undefined -g)
list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=undefined)
if (NOT MI_USE_CXX)
message(STATUS "(switch to use C++ due to MI_DEBUG_UBSAN)")
set(MI_USE_CXX "ON")
endif()
else()
message(WARNING "Can only use undefined-behavior sanitizer with clang++ (MI_DEBUG_UBSAN=ON but ignored)")
endif()
else()
message(WARNING "Can only use thread sanitizer with a debug build (CMAKE_BUILD_TYPE=${CMAKE_BUILD_TYPE})")
endif()
endif()
if(MI_USE_CXX)
message(STATUS "Use the C++ compiler to compile (MI_USE_CXX=ON)")
set_source_files_properties(${mi_sources} PROPERTIES LANGUAGE CXX )
set_source_files_properties(src/static.c test/test-api.c test/test-stress PROPERTIES LANGUAGE CXX )
@ -146,22 +178,29 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel")
list(APPEND mi_cflags -Wall -fvisibility=hidden)
endif()
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
if(MI_LOCAL_DYNAMIC_TLS MATCHES "ON")
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
if(MI_LOCAL_DYNAMIC_TLS)
list(APPEND mi_cflags -ftls-model=local-dynamic)
else()
list(APPEND mi_cflags -ftls-model=initial-exec)
endif()
endif()
if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914)
list(APPEND mi_cflags /Zc:__cplusplus)
endif()
# Architecture flags
if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm")
list(APPEND mi_cflags -march=native)
if(${CMAKE_HOST_SYSTEM_PROCESSOR} MATCHES "arm" AND NOT APPLE)
check_cxx_compiler_flag(-march=native CXX_SUPPORTS_MARCH_NATIVE)
if (CXX_SUPPORTS_MARCH_NATIVE)
list(APPEND mi_cflags -march=native)
endif()
endif()
# extra needed libraries
if(WIN32)
list(APPEND mi_libraries psapi shell32 user32 bcrypt)
list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt)
else()
if(NOT ${CMAKE_C_COMPILER} MATCHES "android")
list(APPEND mi_libraries pthread)
@ -176,16 +215,18 @@ endif()
# Install and output names
# -----------------------------------------------------------------------------
if (MI_INSTALL_TOPLEVEL MATCHES "ON")
if (MI_INSTALL_TOPLEVEL)
set(mi_install_dir "${CMAKE_INSTALL_PREFIX}")
else()
set(mi_install_dir "${CMAKE_INSTALL_PREFIX}/lib/mimalloc-${mi_version}")
endif()
if(MI_SECURE MATCHES "ON")
if(MI_SECURE)
set(mi_basename "mimalloc-secure")
else()
set(mi_basename "mimalloc")
endif()
string(TOLOWER "${CMAKE_BUILD_TYPE}" CMAKE_BUILD_TYPE_LC)
if(NOT(CMAKE_BUILD_TYPE_LC MATCHES "^(release|relwithdebinfo|minsizerel)$"))
set(mi_basename "${mi_basename}-${CMAKE_BUILD_TYPE_LC}") #append build type (e.g. -debug) if not a release version
@ -202,9 +243,15 @@ endif()
if(MI_BUILD_TESTS)
list(APPEND mi_build_targets "tests")
endif()
message(STATUS "")
message(STATUS "Library base name: ${mi_basename}")
message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}")
if(MI_USE_CXX)
message(STATUS "Compiler : ${CMAKE_CXX_COMPILER}")
else()
message(STATUS "Compiler : ${CMAKE_C_COMPILER}")
endif()
message(STATUS "Install directory: ${mi_install_dir}")
message(STATUS "Build targets : ${mi_build_targets}")
message(STATUS "")
@ -306,7 +353,7 @@ endif()
# API surface testing
# -----------------------------------------------------------------------------
if (MI_BUILD_TESTS MATCHES "ON")
if (MI_BUILD_TESTS)
add_executable(mimalloc-test-api test/test-api.c)
target_compile_definitions(mimalloc-test-api PRIVATE ${mi_defines})
target_compile_options(mimalloc-test-api PRIVATE ${mi_cflags})
@ -327,7 +374,7 @@ endif()
# -----------------------------------------------------------------------------
# Set override properties
# -----------------------------------------------------------------------------
if (MI_OVERRIDE MATCHES "ON")
if (MI_OVERRIDE)
if (MI_BUILD_SHARED)
target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE)
endif()

View File

@ -7,12 +7,12 @@ trigger:
- master
- dev
jobs:
jobs:
- job:
displayName: Windows
pool:
vmImage:
windows-2019
windows-2019
strategy:
matrix:
Debug:
@ -36,18 +36,20 @@ jobs:
inputs:
solution: $(BuildType)/libmimalloc.sln
configuration: '$(MSBuildConfiguration)'
- script: |
cd $(BuildType)
ctest
msbuildArguments: -m
- script: ctest --verbose --timeout 120
workingDirectory: $(BuildType)
displayName: CTest
- upload: $(Build.SourcesDirectory)/$(BuildType)
artifact: mimalloc-windows-$(BuildType)
#- script: $(BuildType)\$(BuildType)\mimalloc-test-stress
# displayName: TestStress
#- upload: $(Build.SourcesDirectory)/$(BuildType)
# artifact: mimalloc-windows-$(BuildType)
- job:
displayName: Linux
pool:
vmImage:
ubuntu-16.04
ubuntu-18.04
strategy:
matrix:
Debug:
@ -97,10 +99,11 @@ jobs:
cmakeArgs: .. $(cmakeExtraArgs)
- script: make -j$(nproc) -C $(BuildType)
displayName: Make
- script: make test -C $(BuildType)
- script: ctest --verbose --timeout 120
workingDirectory: $(BuildType)
displayName: CTest
- upload: $(Build.SourcesDirectory)/$(BuildType)
artifact: mimalloc-ubuntu-$(BuildType)
# - upload: $(Build.SourcesDirectory)/$(BuildType)
# artifact: mimalloc-ubuntu-$(BuildType)
- job:
displayName: macOS
@ -125,7 +128,41 @@ jobs:
cmakeArgs: .. $(cmakeExtraArgs)
- script: make -j$(sysctl -n hw.ncpu) -C $(BuildType)
displayName: Make
- script: make test -C $(BuildType)
- script: ctest --verbose --timeout 120
workingDirectory: $(BuildType)
displayName: CTest
- upload: $(Build.SourcesDirectory)/$(BuildType)
artifact: mimalloc-macos-$(BuildType)
# - upload: $(Build.SourcesDirectory)/$(BuildType)
# artifact: mimalloc-macos-$(BuildType)
# - job:
# displayName: Windows-2017
# pool:
# vmImage:
# vs2017-win2016
# strategy:
# matrix:
# Debug:
# BuildType: debug
# cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON
# MSBuildConfiguration: Debug
# Release:
# BuildType: release
# cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Release
# MSBuildConfiguration: Release
# Secure:
# BuildType: secure
# cmakeExtraArgs: -A x64 -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON
# MSBuildConfiguration: Release
# steps:
# - task: CMake@1
# inputs:
# workingDirectory: $(BuildType)
# cmakeArgs: .. $(cmakeExtraArgs)
# - task: MSBuild@1
# inputs:
# solution: $(BuildType)/libmimalloc.sln
# configuration: '$(MSBuildConfiguration)'
# - script: |
# cd $(BuildType)
# ctest --verbose --timeout 120
# displayName: CTest

View File

@ -3,14 +3,16 @@ set(mi_version_minor 6)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})
if("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "${mi_version_major}")
if ("${PACKAGE_FIND_VERSION_MINOR}" EQUAL "${mi_version_minor}")
set(PACKAGE_VERSION_EXACT TRUE)
elseif("${PACKAGE_FIND_VERSION_MINOR}" LESS "${mi_version_minor}")
set(PACKAGE_VERSION_COMPATIBLE TRUE)
if(PACKAGE_FIND_VERSION_MAJOR)
if("${PACKAGE_FIND_VERSION_MAJOR}" EQUAL "${mi_version_major}")
if ("${PACKAGE_FIND_VERSION_MINOR}" EQUAL "${mi_version_minor}")
set(PACKAGE_VERSION_EXACT TRUE)
elseif("${PACKAGE_FIND_VERSION_MINOR}" LESS "${mi_version_minor}")
set(PACKAGE_VERSION_COMPATIBLE TRUE)
else()
set(PACKAGE_VERSION_UNSUITABLE TRUE)
endif()
else()
set(PACKAGE_VERSION_UNSUITABLE TRUE)
endif()
else()
set(PACKAGE_VERSION_UNSUITABLE TRUE)
endif()

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='167.731pt' version='1.1' viewBox='52.938 54.996 381.624 167.731' width='381.624pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip1'>
<path d='M82.148 206.586H434.164V81.34H82.148Z'/>

Size: 75 KiB before, 76 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='258.383pt' version='1.1' viewBox='106.736 54.996 381.623 258.383' width='381.623pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip2'>
<path d='M135.949 251.93H487.961V84.164H135.949Z'/>

Size: 93 KiB before, 93 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='193.064pt' version='1.1' viewBox='52.938 51.67 381.624 193.064' width='381.624pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip7'>
<path d='M82.148 228.594H434.164V60.828H82.148Z'/>

Size: 63 KiB before, 63 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='255.738pt' version='1.1' viewBox='106.736 54.996 381.624 255.738' width='381.624pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip8'>
<path d='M135.949 249.281H487.961V81.519H135.949Z'/>

Size: 81 KiB before, 82 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='164.687pt' version='1.1' viewBox='52.938 54.996 381.625 164.687' width='381.625pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip1'>
<path d='M82.148 203.937H434.164V78.691H82.148Z'/>

Size: 68 KiB before, 68 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='165.084pt' version='1.1' viewBox='52.938 54.996 381.624 165.084' width='381.624pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip5'>
<path d='M82.148 203.937H434.164V78.691H82.148Z'/>

Size: 75 KiB before, 75 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='258.383pt' version='1.1' viewBox='106.736 54.996 381.623 258.383' width='381.623pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip6'>
<path d='M135.949 251.93H487.961V84.164H135.949Z'/>

Size: 93 KiB before, 93 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='243.704pt' version='1.1' viewBox='106.737 54.995 381.623 243.704' width='381.623pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip2'>
<path d='M135.949 249.281H487.961V81.515H135.949Z'/>

Size: 82 KiB before, 82 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='164.687pt' version='1.1' viewBox='52.938 54.996 381.625 164.687' width='381.625pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip7'>
<path d='M82.148 203.937H434.164V78.691H82.148Z'/>

Size: 57 KiB before, 57 KiB after

View File

@ -1,6 +1,7 @@
<?xml version='1.0' encoding='UTF-8'?>
<!-- This file was generated by dvisvgm 2.4.2 -->
<svg height='196.567pt' version='1.1' viewBox='106.737 54.996 381.623 196.567' width='381.623pt' xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink'>
<rect width="1000%" height="1000%" fill="white"/>
<defs>
<clipPath id='clip8'>
<path d='M135.949 203.938H487.961V78.692H135.949Z'/>

Size: 72 KiB before, 72 KiB after

BIN doc/ds-logo.jpg (new file)
Binary file not shown. Size: 164 KiB

View File

@ -26,17 +26,25 @@ without code changes, for example, on Unix you can use it as:
Notable aspects of the design include:
- __small and consistent__: the library is less than 6k LOC using simple and
- __small and consistent__: the library is about 8k LOC using simple and
consistent data structures. This makes it very suitable
to integrate and adapt in other projects. For runtime systems it
provides hooks for a monotonic _heartbeat_ and deferred freeing (for
bounded worst-case times with reference counting).
- __free list sharding__: the big idea: instead of one big free list (per size class) we have
many smaller lists per memory "page" which both reduces fragmentation
and increases locality --
- __free list sharding__: instead of one big free list (per size class) we have
many smaller lists per "mimalloc page" which reduces fragmentation and
increases locality --
things that are allocated close in time get allocated close in memory.
(A memory "page" in _mimalloc_ contains blocks of one size class and is
usually 64KiB on a 64-bit system).
(A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).
- __free list multi-sharding__: the big idea! Not only do we shard the free list
per mimalloc page, but for each page we have multiple free lists. In particular, there
is one list for thread-local `free` operations, and another one for concurrent `free`
operations. Free-ing from another thread can now be a single CAS without needing
sophisticated coordination between threads. Since there will be
thousands of separate free lists, contention is naturally distributed over the heap,
and the chance of contending on a single location will be low -- this is quite
similar to randomized algorithms like skip lists where adding
a random oracle removes the need for a more complex algorithm.
- __eager page reset__: when a "page" becomes empty (with increased chance
due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged")
reducing (real) memory pressure and fragmentation, especially in long running
@ -51,7 +59,7 @@ Notable aspects of the design include:
times (_wcat_), bounded space overhead (~0.2% meta-data, with at most 12.5% waste in allocation sizes),
and has no internal points of contention using only atomic operations.
- __fast__: In our benchmarks (see [below](#performance)),
_mimalloc_ always outperforms all other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
_mimalloc_ outperforms all other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
and usually uses less memory (up to 25% more in the worst case). A nice property
is that it does consistently well over a wide range of benchmarks.
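The free-list multi-sharding bullet above carries the core concurrency idea of this change, so a small illustrative sketch may help. This is not mimalloc's actual code; the names (block_t, page_t, page_remote_free, page_collect) are hypothetical. Each page keeps a plain local free list for its owning thread plus a separate atomic list that other threads push onto with a single CAS:

```c
/* Minimal sketch of free-list multi-sharding (illustrative only, not
 * mimalloc's implementation): a page has a local free list used only
 * by its owning thread, and an atomic "thread free" list that remote
 * threads push onto with one CAS. */
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

typedef struct block_s {
  struct block_s* next;
} block_t;

typedef struct page_s {
  block_t* free;                  /* local free list: owning thread only  */
  _Atomic(block_t*) thread_free;  /* concurrent free list: other threads  */
} page_t;

/* Owning thread: pop from the local list (no atomics needed). */
static block_t* page_local_alloc(page_t* page) {
  block_t* b = page->free;
  if (b != NULL) page->free = b->next;
  return b;
}

/* Owning thread: push back onto the local list. */
static void page_local_free(page_t* page, block_t* b) {
  b->next = page->free;
  page->free = b;
}

/* Any other thread: a single CAS pushes onto the page's thread-free list. */
static void page_remote_free(page_t* page, block_t* b) {
  block_t* head = atomic_load_explicit(&page->thread_free, memory_order_relaxed);
  do {
    b->next = head;
  } while (!atomic_compare_exchange_weak_explicit(
      &page->thread_free, &head, b,
      memory_order_release, memory_order_relaxed));
}

/* Owning thread: take over the whole thread-free list with one exchange. */
static void page_collect(page_t* page) {
  block_t* list = atomic_exchange_explicit(&page->thread_free, NULL, memory_order_acquire);
  while (list != NULL) {
    block_t* next = list->next;
    page_local_free(page, list);
    list = next;
  }
}

int main(void) {
  static block_t blocks[4];
  page_t page = { NULL, NULL };
  for (int i = 0; i < 4; i++) page_local_free(&page, &blocks[i]);
  block_t* b = page_local_alloc(&page);
  page_remote_free(&page, b);   /* as if freed from another thread */
  page_collect(&page);
  printf("blocks back on local list: %s\n", page.free ? "yes" : "no");
  return 0;
}
```

Because every page carries its own pair of lists, contention from cross-thread frees is spread over thousands of such CAS targets rather than one central lock, which is the point the bullet makes.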
@ -298,7 +306,7 @@ size_t mi_good_size(size_t size);
/// resource usage by calling this every once in a while.
void mi_collect(bool force);
/// Print the main statistics.
/// Deprecated
/// @param out Ignored, outputs to the registered output function or stderr by default.
///
/// Most detailed when using a debug build.
@ -309,7 +317,7 @@ void mi_stats_print(void* out);
/// @param arg Optional argument passed to \a out (if not \a NULL)
///
/// Most detailed when using a debug build.
void mi_stats_print(mi_output_fun* out, void* arg);
void mi_stats_print_out(mi_output_fun* out, void* arg);
/// Reset statistics.
void mi_stats_reset(void);
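Since mi_stats_print(void*) is now deprecated in favor of mi_stats_print_out, a brief usage sketch may be useful. It assumes mi_output_fun is the `void (const char* msg, void* arg)` callback declared in mimalloc.h and that the header is on the include path:

```c
/* Hedged usage sketch: route statistics through a custom output
 * function with mi_stats_print_out instead of the deprecated
 * mi_stats_print(void*). */
#include <stdio.h>
#include <mimalloc.h>

static void log_to_file(const char* msg, void* arg) {
  fputs(msg, (FILE*)arg);   /* arg is the FILE* passed below */
}

int main(void) {
  void* p = mi_malloc(1024);
  mi_free(p);
  FILE* f = fopen("mimalloc-stats.txt", "w");
  if (f != NULL) {
    mi_stats_print_out(log_to_file, f);
    fclose(f);
  }
  return 0;
}
```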
@ -441,6 +449,20 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
/// Currently only used on Windows.
bool mi_is_redirected();
/// Return process information (time and memory usage).
/// @param elapsed_msecs Optional. Elapsed wall-clock time of the process in milli-seconds.
/// @param user_msecs Optional. User time in milli-seconds (as the sum over all threads).
/// @param system_msecs Optional. System time in milli-seconds.
/// @param current_rss Optional. Current working set size (touched pages).
/// @param peak_rss Optional. Peak working set size (touched pages).
/// @param current_commit Optional. Current committed memory (backed by the page file).
/// @param peak_commit Optional. Peak committed memory (backed by the page file).
/// @param page_faults Optional. Count of hard page faults.
///
/// The \a current_rss is precise on Windows and MacOSX; other systems estimate
/// this using \a current_commit. The \a commit is precise on Windows but estimated
/// on other systems as the amount of read/write accessible memory reserved by mimalloc.
void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
/// \}
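A minimal usage sketch for the new mi_process_info, passing NULL for the fields that are not needed (all parameters are documented as optional); the output formatting here is illustrative only:

```c
/* Query a few process metrics via mi_process_info; unwanted fields
 * are skipped by passing NULL. */
#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  size_t elapsed = 0, peak_rss = 0, page_faults = 0;
  mi_process_info(&elapsed, NULL, NULL, NULL, &peak_rss, NULL, NULL, &page_faults);
  printf("elapsed: %zu ms, peak rss: %zu, hard page faults: %zu\n",
         elapsed, peak_rss, page_faults);
  return 0;
}
```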
@ -752,8 +774,8 @@ bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block
/// Runtime options.
typedef enum mi_option_e {
// stable options
mi_option_show_stats, ///< Print statistics to `stderr` when the program is done.
mi_option_show_errors, ///< Print error messages to `stderr`.
mi_option_show_stats, ///< Print statistics to `stderr` when the program is done.
mi_option_verbose, ///< Print verbose messages to `stderr`.
// the following options are experimental
mi_option_eager_commit, ///< Eagerly commit segments (4MiB) (enabled by default).
@ -772,9 +794,11 @@ typedef enum mi_option_e {
} mi_option_t;
bool mi_option_enabled(mi_option_t option);
void mi_option_enable(mi_option_t option, bool enable);
void mi_option_enable_default(mi_option_t option, bool enable);
bool mi_option_is_enabled(mi_option_t option);
void mi_option_enable(mi_option_t option);
void mi_option_disable(mi_option_t option);
void mi_option_set_enabled(mi_option_t option, bool enable);
void mi_option_set_enabled_default(mi_option_t option, bool enable);
long mi_option_get(mi_option_t option);
void mi_option_set(mi_option_t option, long value);
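The option API is reshaped here: mi_option_enabled and the two-argument mi_option_enable/mi_option_enable_default are replaced by mi_option_is_enabled, mi_option_enable, mi_option_disable, mi_option_set_enabled and mi_option_set_enabled_default. A short sketch of the new calls, using option names from the enum above:

```c
/* Sketch of the revised option API (names as declared in the diff
 * above); mimalloc.h is assumed to be on the include path. */
#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  mi_option_enable(mi_option_show_stats);     /* print statistics at exit */
  mi_option_disable(mi_option_show_errors);   /* silence error messages   */
  mi_option_set(mi_option_verbose, 0);        /* numeric options use get/set */

  if (mi_option_is_enabled(mi_option_show_stats)) {
    printf("statistics will be printed when the program ends\n");
  }
  printf("verbose level: %ld\n", mi_option_get(mi_option_verbose));
  return 0;
}
```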

View File

@ -146,11 +146,11 @@ Functions</h2></td></tr>
<tr class="memdesc:ga421430e2226d7d468529cec457396756"><td class="mdescLeft">&#160;</td><td class="mdescRight">Eagerly free memory. <a href="#ga421430e2226d7d468529cec457396756">More...</a><br /></td></tr>
<tr class="separator:ga421430e2226d7d468529cec457396756"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga2d126e5c62d3badc35445e5d84166df2"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga2d126e5c62d3badc35445e5d84166df2">mi_stats_print</a> (void *out)</td></tr>
<tr class="memdesc:ga2d126e5c62d3badc35445e5d84166df2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Print the main statistics. <a href="#ga2d126e5c62d3badc35445e5d84166df2">More...</a><br /></td></tr>
<tr class="memdesc:ga2d126e5c62d3badc35445e5d84166df2"><td class="mdescLeft">&#160;</td><td class="mdescRight">Deprecated. <a href="#ga2d126e5c62d3badc35445e5d84166df2">More...</a><br /></td></tr>
<tr class="separator:ga2d126e5c62d3badc35445e5d84166df2"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga256cc6f13a142deabbadd954a217e228"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga256cc6f13a142deabbadd954a217e228">mi_stats_print</a> (<a class="el" href="group__extended.html#gad823d23444a4b77a40f66bf075a98a0c">mi_output_fun</a> *out, void *arg)</td></tr>
<tr class="memdesc:ga256cc6f13a142deabbadd954a217e228"><td class="mdescLeft">&#160;</td><td class="mdescRight">Print the main statistics. <a href="#ga256cc6f13a142deabbadd954a217e228">More...</a><br /></td></tr>
<tr class="separator:ga256cc6f13a142deabbadd954a217e228"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga537f13b299ddf801e49a5a94fde02c79"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga537f13b299ddf801e49a5a94fde02c79">mi_stats_print_out</a> (<a class="el" href="group__extended.html#gad823d23444a4b77a40f66bf075a98a0c">mi_output_fun</a> *out, void *arg)</td></tr>
<tr class="memdesc:ga537f13b299ddf801e49a5a94fde02c79"><td class="mdescLeft">&#160;</td><td class="mdescRight">Print the main statistics. <a href="#ga537f13b299ddf801e49a5a94fde02c79">More...</a><br /></td></tr>
<tr class="separator:ga537f13b299ddf801e49a5a94fde02c79"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga3bb8468b8cfcc6e2a61d98aee85c5f99"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga3bb8468b8cfcc6e2a61d98aee85c5f99">mi_stats_reset</a> (void)</td></tr>
<tr class="memdesc:ga3bb8468b8cfcc6e2a61d98aee85c5f99"><td class="mdescLeft">&#160;</td><td class="mdescRight">Reset statistics. <a href="#ga3bb8468b8cfcc6e2a61d98aee85c5f99">More...</a><br /></td></tr>
<tr class="separator:ga3bb8468b8cfcc6e2a61d98aee85c5f99"><td class="memSeparator" colspan="2">&#160;</td></tr>
@ -187,6 +187,9 @@ Functions</h2></td></tr>
<tr class="memitem:gaad25050b19f30cd79397b227e0157a3f"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#gaad25050b19f30cd79397b227e0157a3f">mi_is_redirected</a> ()</td></tr>
<tr class="memdesc:gaad25050b19f30cd79397b227e0157a3f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Is the C runtime <em>malloc</em> API redirected? <a href="#gaad25050b19f30cd79397b227e0157a3f">More...</a><br /></td></tr>
<tr class="separator:gaad25050b19f30cd79397b227e0157a3f"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga7d862c2affd5790381da14eb102a364d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga7d862c2affd5790381da14eb102a364d">mi_process_info</a> (size_t *elapsed_msecs, size_t *user_msecs, size_t *system_msecs, size_t *current_rss, size_t *peak_rss, size_t *current_commit, size_t *peak_commit, size_t *page_faults)</td></tr>
<tr class="memdesc:ga7d862c2affd5790381da14eb102a364d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Return process information (time and memory usage). <a href="#ga7d862c2affd5790381da14eb102a364d">More...</a><br /></td></tr>
<tr class="separator:ga7d862c2affd5790381da14eb102a364d"><td class="memSeparator" colspan="2">&#160;</td></tr>
</table>
<a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
<p>Extended functionality. </p>
@ -411,6 +414,86 @@ Functions</h2></td></tr>
</dl>
<dl class="section return"><dt>Returns</dt><dd>a pointer to newly allocated memory of at least <em>size</em> bytes, or <em>NULL</em> if out of memory. This function is meant for use in run-time systems for best performance and does not check if <em>size</em> was indeed small &ndash; use with care! </dd></dl>
</div>
</div>
<a id="ga7d862c2affd5790381da14eb102a364d"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga7d862c2affd5790381da14eb102a364d">&#9670;&nbsp;</a></span>mi_process_info()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">void mi_process_info </td>
<td>(</td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>elapsed_msecs</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>user_msecs</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>system_msecs</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>current_rss</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>peak_rss</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>current_commit</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>peak_commit</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">size_t *&#160;</td>
<td class="paramname"><em>page_faults</em>&#160;</td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
<p>Return process information (time and memory usage). </p>
<dl class="params"><dt>Parameters</dt><dd>
<table class="params">
<tr><td class="paramname">elapsed_msecs</td><td>Optional. Elapsed wall-clock time of the process in milli-seconds. </td></tr>
<tr><td class="paramname">user_msecs</td><td>Optional. User time in milli-seconds (as the sum over all threads). </td></tr>
<tr><td class="paramname">system_msecs</td><td>Optional. System time in milli-seconds. </td></tr>
<tr><td class="paramname">current_rss</td><td>Optional. Current working set size (touched pages). </td></tr>
<tr><td class="paramname">peak_rss</td><td>Optional. Peak working set size (touched pages). </td></tr>
<tr><td class="paramname">current_commit</td><td>Optional. Current committed memory (backed by the page file). </td></tr>
<tr><td class="paramname">peak_commit</td><td>Optional. Peak committed memory (backed by the page file). </td></tr>
<tr><td class="paramname">page_faults</td><td>Optional. Count of hard page faults.</td></tr>
</table>
</dd>
</dl>
<p>The <em>current_rss</em> is precise on Windows and MacOSX; other systems estimate this using <em>current_commit</em>. The <em>commit</em> is precise on Windows but estimated on other systems as the amount of read/write accessible memory reserved by mimalloc. </p>
</div>
</div>
<a id="ga3460a6ca91af97be4058f523d3cb8ece"></a>
@ -646,7 +729,7 @@ Functions</h2></td></tr>
</div>
</div>
<a id="ga2d126e5c62d3badc35445e5d84166df2"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga2d126e5c62d3badc35445e5d84166df2">&#9670;&nbsp;</a></span>mi_stats_print() <span class="overload">[1/2]</span></h2>
<h2 class="memtitle"><span class="permalink"><a href="#ga2d126e5c62d3badc35445e5d84166df2">&#9670;&nbsp;</a></span>mi_stats_print()</h2>
<div class="memitem">
<div class="memproto">
@ -661,7 +744,7 @@ Functions</h2></td></tr>
</table>
</div><div class="memdoc">
<p>Print the main statistics. </p>
<p>Deprecated. </p>
<dl class="params"><dt>Parameters</dt><dd>
<table class="params">
<tr><td class="paramname">out</td><td>Ignored, outputs to the registered output function or stderr by default.</td></tr>
@ -672,14 +755,14 @@ Functions</h2></td></tr>
</div>
</div>
<a id="ga256cc6f13a142deabbadd954a217e228"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga256cc6f13a142deabbadd954a217e228">&#9670;&nbsp;</a></span>mi_stats_print() <span class="overload">[2/2]</span></h2>
<a id="ga537f13b299ddf801e49a5a94fde02c79"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga537f13b299ddf801e49a5a94fde02c79">&#9670;&nbsp;</a></span>mi_stats_print_out()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">void mi_stats_print </td>
<td class="memname">void mi_stats_print_out </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__extended.html#gad823d23444a4b77a40f66bf075a98a0c">mi_output_fun</a> *&#160;</td>
<td class="paramname"><em>out</em>, </td>

View File

@ -9,6 +9,7 @@ var group__extended =
[ "mi_is_in_heap_region", "group__extended.html#ga5f071b10d4df1c3658e04e7fd67a94e6", null ],
[ "mi_is_redirected", "group__extended.html#gaad25050b19f30cd79397b227e0157a3f", null ],
[ "mi_malloc_small", "group__extended.html#ga7136c2e55cb22c98ecf95d08d6debb99", null ],
[ "mi_process_info", "group__extended.html#ga7d862c2affd5790381da14eb102a364d", null ],
[ "mi_register_deferred_free", "group__extended.html#ga3460a6ca91af97be4058f523d3cb8ece", null ],
[ "mi_register_error", "group__extended.html#gaa1d55e0e894be240827e5d87ec3a1f45", null ],
[ "mi_register_output", "group__extended.html#gae5b17ff027cd2150b43a33040250cf3f", null ],
@ -16,7 +17,7 @@ var group__extended =
[ "mi_reserve_huge_os_pages_interleave", "group__extended.html#ga3132f521fb756fc0e8ec0b74fb58df50", null ],
[ "mi_stats_merge", "group__extended.html#ga854b1de8cb067c7316286c28b2fcd3d1", null ],
[ "mi_stats_print", "group__extended.html#ga2d126e5c62d3badc35445e5d84166df2", null ],
[ "mi_stats_print", "group__extended.html#ga256cc6f13a142deabbadd954a217e228", null ],
[ "mi_stats_print_out", "group__extended.html#ga537f13b299ddf801e49a5a94fde02c79", null ],
[ "mi_stats_reset", "group__extended.html#ga3bb8468b8cfcc6e2a61d98aee85c5f99", null ],
[ "mi_thread_done", "group__extended.html#ga0ae4581e85453456a0d658b2b98bf7bf", null ],
[ "mi_thread_init", "group__extended.html#gaf8e73efc2cbca9ebfdfb166983a04c17", null ],

View File

@ -112,8 +112,8 @@ $(document).ready(function(){initNavTree('group__options.html','');});
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="enum-members"></a>
Enumerations</h2></td></tr>
<tr class="memitem:gafebf7ed116adb38ae5218bc3ce06884c"><td class="memItemLeft" align="right" valign="top">enum &#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> { <br />
&#160;&#160;<a class="el" href="group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda">mi_option_show_stats</a>,
<a class="el" href="group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0">mi_option_show_errors</a>,
&#160;&#160;<a class="el" href="group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0">mi_option_show_errors</a>,
<a class="el" href="group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda">mi_option_show_stats</a>,
<a class="el" href="group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca7c8b7bf5281c581bad64f5daa6442777">mi_option_verbose</a>,
<a class="el" href="group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca1e8de72c93da7ff22d91e1e27b52ac2b">mi_option_eager_commit</a>,
<br />
@ -138,12 +138,16 @@ Enumerations</h2></td></tr>
</table><table class="memberdecls">
<tr class="heading"><td colspan="2"><h2 class="groupheader"><a name="func-members"></a>
Functions</h2></td></tr>
<tr class="memitem:gacebe3f6d91b4a50b54eb84e2a1da1b30"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#gacebe3f6d91b4a50b54eb84e2a1da1b30">mi_option_enabled</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option)</td></tr>
<tr class="separator:gacebe3f6d91b4a50b54eb84e2a1da1b30"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga6d45a20a3131f18bc351b69763b38ce4"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga6d45a20a3131f18bc351b69763b38ce4">mi_option_enable</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option, bool enable)</td></tr>
<tr class="separator:ga6d45a20a3131f18bc351b69763b38ce4"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga37988264b915a7db92530cc02d5494cb"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga37988264b915a7db92530cc02d5494cb">mi_option_enable_default</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option, bool enable)</td></tr>
<tr class="separator:ga37988264b915a7db92530cc02d5494cb"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga459ad98f18b3fc9275474807fe0ca188"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga459ad98f18b3fc9275474807fe0ca188">mi_option_is_enabled</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option)</td></tr>
<tr class="separator:ga459ad98f18b3fc9275474807fe0ca188"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga04180ae41b0d601421dd62ced40ca050"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga04180ae41b0d601421dd62ced40ca050">mi_option_enable</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option)</td></tr>
<tr class="separator:ga04180ae41b0d601421dd62ced40ca050"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:gaebf6ff707a2e688ebb1a2296ca564054"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#gaebf6ff707a2e688ebb1a2296ca564054">mi_option_disable</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option)</td></tr>
<tr class="separator:gaebf6ff707a2e688ebb1a2296ca564054"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga9a13d05fcb77489cb06d4d017ebd8bed"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga9a13d05fcb77489cb06d4d017ebd8bed">mi_option_set_enabled</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option, bool enable)</td></tr>
<tr class="separator:ga9a13d05fcb77489cb06d4d017ebd8bed"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga65518b69ec5d32336b50e07f74b3f629"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga65518b69ec5d32336b50e07f74b3f629">mi_option_set_enabled_default</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option, bool enable)</td></tr>
<tr class="separator:ga65518b69ec5d32336b50e07f74b3f629"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:ga7e8af195cc81d3fa64ccf2662caa565a"><td class="memItemLeft" align="right" valign="top">long&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#ga7e8af195cc81d3fa64ccf2662caa565a">mi_option_get</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option)</td></tr>
<tr class="separator:ga7e8af195cc81d3fa64ccf2662caa565a"><td class="memSeparator" colspan="2">&#160;</td></tr>
<tr class="memitem:gaf84921c32375e25754dc2ee6a911fa60"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__options.html#gaf84921c32375e25754dc2ee6a911fa60">mi_option_set</a> (<a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a> option, long value)</td></tr>
@ -168,9 +172,9 @@ Functions</h2></td></tr>
<p>Runtime options. </p>
<table class="fieldtable">
<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><a id="ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda"></a>mi_option_show_stats&#160;</td><td class="fielddoc"><p>Print statistics to <code>stderr</code> when the program is done. </p>
<tr><th colspan="2">Enumerator</th></tr><tr><td class="fieldname"><a id="ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0"></a>mi_option_show_errors&#160;</td><td class="fielddoc"><p>Print error messages to <code>stderr</code>. </p>
</td></tr>
<tr><td class="fieldname"><a id="ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0"></a>mi_option_show_errors&#160;</td><td class="fielddoc"><p>Print error messages to <code>stderr</code>. </p>
<tr><td class="fieldname"><a id="ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda"></a>mi_option_show_stats&#160;</td><td class="fielddoc"><p>Print statistics to <code>stderr</code> when the program is done. </p>
</td></tr>
<tr><td class="fieldname"><a id="ggafebf7ed116adb38ae5218bc3ce06884ca7c8b7bf5281c581bad64f5daa6442777"></a>mi_option_verbose&#160;</td><td class="fielddoc"><p>Print verbose messages to <code>stderr</code>. </p>
</td></tr>
@ -204,8 +208,26 @@ Functions</h2></td></tr>
</div>
</div>
<h2 class="groupheader">Function Documentation</h2>
<a id="ga6d45a20a3131f18bc351b69763b38ce4"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga6d45a20a3131f18bc351b69763b38ce4">&#9670;&nbsp;</a></span>mi_option_enable()</h2>
<a id="gaebf6ff707a2e688ebb1a2296ca564054"></a>
<h2 class="memtitle"><span class="permalink"><a href="#gaebf6ff707a2e688ebb1a2296ca564054">&#9670;&nbsp;</a></span>mi_option_disable()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">void mi_option_disable </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em></td><td>)</td>
<td></td>
</tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="ga04180ae41b0d601421dd62ced40ca050"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga04180ae41b0d601421dd62ced40ca050">&#9670;&nbsp;</a></span>mi_option_enable()</h2>
<div class="memitem">
<div class="memproto">
@ -214,62 +236,6 @@ Functions</h2></td></tr>
<td class="memname">void mi_option_enable </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">bool&#160;</td>
<td class="paramname"><em>enable</em>&#160;</td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="ga37988264b915a7db92530cc02d5494cb"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga37988264b915a7db92530cc02d5494cb">&#9670;&nbsp;</a></span>mi_option_enable_default()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">void mi_option_enable_default </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">bool&#160;</td>
<td class="paramname"><em>enable</em>&#160;</td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="gacebe3f6d91b4a50b54eb84e2a1da1b30"></a>
<h2 class="memtitle"><span class="permalink"><a href="#gacebe3f6d91b4a50b54eb84e2a1da1b30">&#9670;&nbsp;</a></span>mi_option_enabled()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">bool mi_option_enabled </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em></td><td>)</td>
<td></td>
</tr>
@ -294,6 +260,24 @@ Functions</h2></td></tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="ga459ad98f18b3fc9275474807fe0ca188"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga459ad98f18b3fc9275474807fe0ca188">&#9670;&nbsp;</a></span>mi_option_is_enabled()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">bool mi_option_is_enabled </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em></td><td>)</td>
<td></td>
</tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="gaf84921c32375e25754dc2ee6a911fa60"></a>
@ -350,6 +334,62 @@ Functions</h2></td></tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="ga9a13d05fcb77489cb06d4d017ebd8bed"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga9a13d05fcb77489cb06d4d017ebd8bed">&#9670;&nbsp;</a></span>mi_option_set_enabled()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">void mi_option_set_enabled </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">bool&#160;</td>
<td class="paramname"><em>enable</em>&#160;</td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
</div>
</div>
<a id="ga65518b69ec5d32336b50e07f74b3f629"></a>
<h2 class="memtitle"><span class="permalink"><a href="#ga65518b69ec5d32336b50e07f74b3f629">&#9670;&nbsp;</a></span>mi_option_set_enabled_default()</h2>
<div class="memitem">
<div class="memproto">
<table class="memname">
<tr>
<td class="memname">void mi_option_set_enabled_default </td>
<td>(</td>
<td class="paramtype"><a class="el" href="group__options.html#gafebf7ed116adb38ae5218bc3ce06884c">mi_option_t</a>&#160;</td>
<td class="paramname"><em>option</em>, </td>
</tr>
<tr>
<td class="paramkey"></td>
<td></td>
<td class="paramtype">bool&#160;</td>
<td class="paramname"><em>enable</em>&#160;</td>
</tr>
<tr>
<td></td>
<td>)</td>
<td></td><td></td>
</tr>
</table>
</div><div class="memdoc">
</div>
</div>
</div><!-- contents -->

View File

@ -1,8 +1,8 @@
var group__options =
[
[ "mi_option_t", "group__options.html#gafebf7ed116adb38ae5218bc3ce06884c", [
[ "mi_option_show_stats", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda", null ],
[ "mi_option_show_errors", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0", null ],
[ "mi_option_show_stats", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda", null ],
[ "mi_option_verbose", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca7c8b7bf5281c581bad64f5daa6442777", null ],
[ "mi_option_eager_commit", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca1e8de72c93da7ff22d91e1e27b52ac2b", null ],
[ "mi_option_eager_region_commit", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca32ce97ece29f69e82579679cf8a307ad", null ],
@ -18,10 +18,12 @@ var group__options =
[ "mi_option_os_tag", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca4b74ae2a69e445de6c2361b73c1d14bf", null ],
[ "_mi_option_last", "group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca5b4357b74be0d87568036c32eb1a2e4a", null ]
] ],
[ "mi_option_enable", "group__options.html#ga6d45a20a3131f18bc351b69763b38ce4", null ],
[ "mi_option_enable_default", "group__options.html#ga37988264b915a7db92530cc02d5494cb", null ],
[ "mi_option_enabled", "group__options.html#gacebe3f6d91b4a50b54eb84e2a1da1b30", null ],
[ "mi_option_disable", "group__options.html#gaebf6ff707a2e688ebb1a2296ca564054", null ],
[ "mi_option_enable", "group__options.html#ga04180ae41b0d601421dd62ced40ca050", null ],
[ "mi_option_get", "group__options.html#ga7e8af195cc81d3fa64ccf2662caa565a", null ],
[ "mi_option_is_enabled", "group__options.html#ga459ad98f18b3fc9275474807fe0ca188", null ],
[ "mi_option_set", "group__options.html#gaf84921c32375e25754dc2ee6a911fa60", null ],
[ "mi_option_set_default", "group__options.html#ga7ef623e440e6e5545cb08c94e71e4b90", null ]
[ "mi_option_set_default", "group__options.html#ga7ef623e440e6e5545cb08c94e71e4b90", null ],
[ "mi_option_set_enabled", "group__options.html#ga9a13d05fcb77489cb06d4d017ebd8bed", null ],
[ "mi_option_set_enabled_default", "group__options.html#ga65518b69ec5d32336b50e07f74b3f629", null ]
];

View File

@ -105,13 +105,14 @@ $(document).ready(function(){initNavTree('index.html','');});
<div class="textblock"><p>This is the API documentation of the <a href="https://github.com/microsoft/mimalloc">mimalloc</a> allocator (pronounced "me-malloc") &ndash; a general purpose allocator with excellent <a href="bench.html">performance</a> characteristics. Initially developed by Daan Leijen for the run-time systems of the <a href="https://github.com/koka-lang/koka">Koka</a> and <a href="https://github.com/leanprover/lean">Lean</a> languages.</p>
<p>It is a drop-in replacement for <code>malloc</code> and can be used in other programs without code changes, for example, on Unix you can use it as: </p><div class="fragment"><div class="line">&gt; LD_PRELOAD=/usr/bin/libmimalloc.so myprogram</div></div><!-- fragment --><p>Notable aspects of the design include:</p>
<ul>
<li><b>small and consistent</b>: the library is less than 6k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic <em>heartbeat</em> and deferred freeing (for bounded worst-case times with reference counting).</li>
<li><b>free list sharding</b>: the big idea: instead of one big free list (per size class) we have many smaller lists per memory "page" which both reduces fragmentation and increases locality &ndash; things that are allocated close in time get allocated close in memory. (A memory "page" in <em>mimalloc</em> contains blocks of one size class and is usually 64KiB on a 64-bit system).</li>
<li><b>small and consistent</b>: the library is about 8k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic <em>heartbeat</em> and deferred freeing (for bounded worst-case times with reference counting).</li>
<li><b>free list sharding</b>: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality &ndash; things that are allocated close in time get allocated close in memory. (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).</li>
<li><b>free list multi-sharding</b>: the big idea! Not only do we shard the free list per mimalloc page, but for each page we have multiple free lists. In particular, there is one list for thread-local <code>free</code> operations, and another one for concurrent <code>free</code> operations. Free-ing from another thread can now be a single CAS without needing sophisticated coordination between threads. Since there will be thousands of separate free lists, contention is naturally distributed over the heap, and the chance of contending on a single location will be low &ndash; this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm.</li>
<li><b>eager page reset</b>: when a "page" becomes empty (with increased chance due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged") reducing (real) memory pressure and fragmentation, especially in long running programs.</li>
<li><b>secure</b>: <em>mimalloc</em> can be build in secure mode, adding guard pages, randomized allocation, encrypted free lists, etc. to protect against various heap vulnerabilities. The performance penalty is only around 3% on average over our benchmarks.</li>
<li><b>first-class heaps</b>: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately.</li>
<li><b>bounded</b>: it does not suffer from <em>blowup</em> [1], has bounded worst-case allocation times (<em>wcat</em>), bounded space overhead (~0.2% meta-data, with at most 12.5% waste in allocation sizes), and has no internal points of contention using only atomic operations.</li>
<li><b>fast</b>: In our benchmarks (see <a href="#performance">below</a>), <em>mimalloc</em> always outperforms all other leading allocators (<em>jemalloc</em>, <em>tcmalloc</em>, <em>Hoard</em>, etc), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does consistently well over a wide range of benchmarks.</li>
<li><b>fast</b>: In our benchmarks (see <a href="#performance">below</a>), <em>mimalloc</em> outperforms all other leading allocators (<em>jemalloc</em>, <em>tcmalloc</em>, <em>Hoard</em>, etc), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does consistently well over a wide range of benchmarks.</li>
</ul>
<p>You can read more on the design of <em>mimalloc</em> in the <a href="https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action">technical report</a> which also has detailed benchmark results.</p>
<p>Further information:</p>

File diff suppressed because one or more lines are too long

View File

@ -38,29 +38,30 @@ var NAVTREEINDEX0 =
"group__cpp.html#gaef2c2bdb4f70857902d3c8903ac095f3":[5,9,2],
"group__cpp.html#structmi__stl__allocator":[5,9,0],
"group__extended.html":[5,1],
"group__extended.html#ga089c859d9eddc5f9b4bd946cd53cebee":[5,1,21],
"group__extended.html#ga0ae4581e85453456a0d658b2b98bf7bf":[5,1,18],
"group__extended.html#ga089c859d9eddc5f9b4bd946cd53cebee":[5,1,22],
"group__extended.html#ga0ae4581e85453456a0d658b2b98bf7bf":[5,1,19],
"group__extended.html#ga1ea64283508718d9d645c38efc2f4305":[5,1,0],
"group__extended.html#ga220f29f40a44404b0061c15bc1c31152":[5,1,22],
"group__extended.html#ga220f29f40a44404b0061c15bc1c31152":[5,1,23],
"group__extended.html#ga251d369cda3f1c2a955c555486ed90e5":[5,1,2],
"group__extended.html#ga256cc6f13a142deabbadd954a217e228":[5,1,16],
"group__extended.html#ga299dae78d25ce112e384a98b7309c5be":[5,1,1],
"group__extended.html#ga2d126e5c62d3badc35445e5d84166df2":[5,1,15],
"group__extended.html#ga3132f521fb756fc0e8ec0b74fb58df50":[5,1,13],
"group__extended.html#ga3460a6ca91af97be4058f523d3cb8ece":[5,1,9],
"group__extended.html#ga3bb8468b8cfcc6e2a61d98aee85c5f99":[5,1,17],
"group__extended.html#ga2d126e5c62d3badc35445e5d84166df2":[5,1,16],
"group__extended.html#ga3132f521fb756fc0e8ec0b74fb58df50":[5,1,14],
"group__extended.html#ga3460a6ca91af97be4058f523d3cb8ece":[5,1,10],
"group__extended.html#ga3bb8468b8cfcc6e2a61d98aee85c5f99":[5,1,18],
"group__extended.html#ga421430e2226d7d468529cec457396756":[5,1,4],
"group__extended.html#ga537f13b299ddf801e49a5a94fde02c79":[5,1,17],
"group__extended.html#ga5f071b10d4df1c3658e04e7fd67a94e6":[5,1,6],
"group__extended.html#ga7136c2e55cb22c98ecf95d08d6debb99":[5,1,8],
"group__extended.html#ga7795a13d20087447281858d2c771cca1":[5,1,12],
"group__extended.html#ga854b1de8cb067c7316286c28b2fcd3d1":[5,1,14],
"group__extended.html#gaa1d55e0e894be240827e5d87ec3a1f45":[5,1,10],
"group__extended.html#ga7795a13d20087447281858d2c771cca1":[5,1,13],
"group__extended.html#ga7d862c2affd5790381da14eb102a364d":[5,1,9],
"group__extended.html#ga854b1de8cb067c7316286c28b2fcd3d1":[5,1,15],
"group__extended.html#gaa1d55e0e894be240827e5d87ec3a1f45":[5,1,11],
"group__extended.html#gaad25050b19f30cd79397b227e0157a3f":[5,1,7],
"group__extended.html#gab1dac8476c46cb9eecab767eb40c1525":[5,1,20],
"group__extended.html#gab1dac8476c46cb9eecab767eb40c1525":[5,1,21],
"group__extended.html#gac057927cd06c854b45fe7847e921bd47":[5,1,5],
"group__extended.html#gad823d23444a4b77a40f66bf075a98a0c":[5,1,3],
"group__extended.html#gae5b17ff027cd2150b43a33040250cf3f":[5,1,11],
"group__extended.html#gaf8e73efc2cbca9ebfdfb166983a04c17":[5,1,19],
"group__extended.html#gae5b17ff027cd2150b43a33040250cf3f":[5,1,12],
"group__extended.html#gaf8e73efc2cbca9ebfdfb166983a04c17":[5,1,20],
"group__heap.html":[5,3],
"group__heap.html#ga00e95ba1e01acac3cfd95bb7a357a6f0":[5,3,20],
"group__heap.html#ga08ca6419a5c057a4d965868998eef487":[5,3,3],
@ -104,14 +105,16 @@ var NAVTREEINDEX0 =
"group__malloc.html#gafdd9d8bb2986e668ba9884f28af38000":[5,0,12],
"group__malloc.html#gafe68ac7c5e24a65cd55c9d6b152211a0":[5,0,6],
"group__options.html":[5,7],
"group__options.html#ga37988264b915a7db92530cc02d5494cb":[5,7,2],
"group__options.html#ga6d45a20a3131f18bc351b69763b38ce4":[5,7,1],
"group__options.html#ga7e8af195cc81d3fa64ccf2662caa565a":[5,7,4],
"group__options.html#ga04180ae41b0d601421dd62ced40ca050":[5,7,2],
"group__options.html#ga459ad98f18b3fc9275474807fe0ca188":[5,7,4],
"group__options.html#ga65518b69ec5d32336b50e07f74b3f629":[5,7,8],
"group__options.html#ga7e8af195cc81d3fa64ccf2662caa565a":[5,7,3],
"group__options.html#ga7ef623e440e6e5545cb08c94e71e4b90":[5,7,6],
"group__options.html#gacebe3f6d91b4a50b54eb84e2a1da1b30":[5,7,3],
"group__options.html#ga9a13d05fcb77489cb06d4d017ebd8bed":[5,7,7],
"group__options.html#gaebf6ff707a2e688ebb1a2296ca564054":[5,7,1],
"group__options.html#gaf84921c32375e25754dc2ee6a911fa60":[5,7,5],
"group__options.html#gafebf7ed116adb38ae5218bc3ce06884c":[5,7,0],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda":[5,7,0,0],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda":[5,7,0,1],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0ac33a18f6b659fcfaf44efb0bab1b74":[5,7,0,11],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca154fe170131d5212cff57e22b99523c5":[5,7,0,10],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca17a190c25be381142d87e0468c4c068c":[5,7,0,13],
@ -126,7 +129,7 @@ var NAVTREEINDEX0 =
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884caca7ed041be3b0b9d0b82432c7bf41af2":[5,7,0,6],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cada854dd272c66342f18a93ee254a2968":[5,7,0,8],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafb121d30d87591850d5410ccc3a95c6d":[5,7,0,9],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0":[5,7,0,1],
"group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0":[5,7,0,0],
"group__posix.html":[5,8],
"group__posix.html#ga06d07cf357bbac5c73ba5d0c0c421e17":[5,8,7],
"group__posix.html#ga0d28d5cf61e6bfbb18c63092939fe5c9":[5,8,3],

View File

@ -80,13 +80,13 @@ var searchData=
['mi_5fnew_5fnothrow',['mi_new_nothrow',['../group__cpp.html#gaeaded64eda71ed6b1d569d3e723abc4a',1,'mimalloc-doc.h']]],
['mi_5fnew_5frealloc',['mi_new_realloc',['../group__cpp.html#gaab78a32f55149e9fbf432d5288e38e1e',1,'mimalloc-doc.h']]],
['mi_5fnew_5freallocn',['mi_new_reallocn',['../group__cpp.html#ga756f4b2bc6a7ecd0a90baea8e90c7907',1,'mimalloc-doc.h']]],
['mi_5foption_5fdisable',['mi_option_disable',['../group__options.html#gaebf6ff707a2e688ebb1a2296ca564054',1,'mimalloc-doc.h']]],
['mi_5foption_5feager_5fcommit',['mi_option_eager_commit',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca1e8de72c93da7ff22d91e1e27b52ac2b',1,'mimalloc-doc.h']]],
['mi_5foption_5feager_5fcommit_5fdelay',['mi_option_eager_commit_delay',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca17a190c25be381142d87e0468c4c068c',1,'mimalloc-doc.h']]],
['mi_5foption_5feager_5fregion_5fcommit',['mi_option_eager_region_commit',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca32ce97ece29f69e82579679cf8a307ad',1,'mimalloc-doc.h']]],
['mi_5foption_5fenable',['mi_option_enable',['../group__options.html#ga6d45a20a3131f18bc351b69763b38ce4',1,'mimalloc-doc.h']]],
['mi_5foption_5fenable_5fdefault',['mi_option_enable_default',['../group__options.html#ga37988264b915a7db92530cc02d5494cb',1,'mimalloc-doc.h']]],
['mi_5foption_5fenabled',['mi_option_enabled',['../group__options.html#gacebe3f6d91b4a50b54eb84e2a1da1b30',1,'mimalloc-doc.h']]],
['mi_5foption_5fenable',['mi_option_enable',['../group__options.html#ga04180ae41b0d601421dd62ced40ca050',1,'mimalloc-doc.h']]],
['mi_5foption_5fget',['mi_option_get',['../group__options.html#ga7e8af195cc81d3fa64ccf2662caa565a',1,'mimalloc-doc.h']]],
['mi_5foption_5fis_5fenabled',['mi_option_is_enabled',['../group__options.html#ga459ad98f18b3fc9275474807fe0ca188',1,'mimalloc-doc.h']]],
['mi_5foption_5flarge_5fos_5fpages',['mi_option_large_os_pages',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca4192d491200d0055df0554d4cf65054e',1,'mimalloc-doc.h']]],
['mi_5foption_5fos_5ftag',['mi_option_os_tag',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca4b74ae2a69e445de6c2361b73c1d14bf',1,'mimalloc-doc.h']]],
['mi_5foption_5fpage_5freset',['mi_option_page_reset',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cada854dd272c66342f18a93ee254a2968',1,'mimalloc-doc.h']]],
@ -97,6 +97,8 @@ var searchData=
['mi_5foption_5fsegment_5freset',['mi_option_segment_reset',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafb121d30d87591850d5410ccc3a95c6d',1,'mimalloc-doc.h']]],
['mi_5foption_5fset',['mi_option_set',['../group__options.html#gaf84921c32375e25754dc2ee6a911fa60',1,'mimalloc-doc.h']]],
['mi_5foption_5fset_5fdefault',['mi_option_set_default',['../group__options.html#ga7ef623e440e6e5545cb08c94e71e4b90',1,'mimalloc-doc.h']]],
['mi_5foption_5fset_5fenabled',['mi_option_set_enabled',['../group__options.html#ga9a13d05fcb77489cb06d4d017ebd8bed',1,'mimalloc-doc.h']]],
['mi_5foption_5fset_5fenabled_5fdefault',['mi_option_set_enabled_default',['../group__options.html#ga65518b69ec5d32336b50e07f74b3f629',1,'mimalloc-doc.h']]],
['mi_5foption_5fshow_5ferrors',['mi_option_show_errors',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884cafbf4822e5c00732c5984b32a032837f0',1,'mimalloc-doc.h']]],
['mi_5foption_5fshow_5fstats',['mi_option_show_stats',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca0957ef73b2550764b4840edf48422fda',1,'mimalloc-doc.h']]],
['mi_5foption_5ft',['mi_option_t',['../group__options.html#gafebf7ed116adb38ae5218bc3ce06884c',1,'mimalloc-doc.h']]],
@ -104,6 +106,7 @@ var searchData=
['mi_5foption_5fverbose',['mi_option_verbose',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca7c8b7bf5281c581bad64f5daa6442777',1,'mimalloc-doc.h']]],
['mi_5foutput_5ffun',['mi_output_fun',['../group__extended.html#gad823d23444a4b77a40f66bf075a98a0c',1,'mimalloc-doc.h']]],
['mi_5fposix_5fmemalign',['mi_posix_memalign',['../group__posix.html#gacff84f226ba9feb2031b8992e5579447',1,'mimalloc-doc.h']]],
['mi_5fprocess_5finfo',['mi_process_info',['../group__extended.html#ga7d862c2affd5790381da14eb102a364d',1,'mimalloc-doc.h']]],
['mi_5fpvalloc',['mi_pvalloc',['../group__posix.html#gaeb325c39b887d3b90d85d1eb1712fb1e',1,'mimalloc-doc.h']]],
['mi_5frealloc',['mi_realloc',['../group__malloc.html#gaf11eb497da57bdfb2de65eb191c69db6',1,'mimalloc-doc.h']]],
['mi_5frealloc_5faligned',['mi_realloc_aligned',['../group__aligned.html#ga4028d1cf4aa4c87c880747044a8322ae',1,'mimalloc-doc.h']]],
@ -126,7 +129,8 @@ var searchData=
['mi_5frezalloc_5faligned_5fat',['mi_rezalloc_aligned_at',['../group__zeroinit.html#gae8b358c417e61d5307da002702b0a8e1',1,'mimalloc-doc.h']]],
['mi_5fsmall_5fsize_5fmax',['MI_SMALL_SIZE_MAX',['../group__extended.html#ga1ea64283508718d9d645c38efc2f4305',1,'mimalloc-doc.h']]],
['mi_5fstats_5fmerge',['mi_stats_merge',['../group__extended.html#ga854b1de8cb067c7316286c28b2fcd3d1',1,'mimalloc-doc.h']]],
['mi_5fstats_5fprint',['mi_stats_print',['../group__extended.html#ga2d126e5c62d3badc35445e5d84166df2',1,'mi_stats_print(void *out):&#160;mimalloc-doc.h'],['../group__extended.html#ga256cc6f13a142deabbadd954a217e228',1,'mi_stats_print(mi_output_fun *out, void *arg):&#160;mimalloc-doc.h']]],
['mi_5fstats_5fprint',['mi_stats_print',['../group__extended.html#ga2d126e5c62d3badc35445e5d84166df2',1,'mimalloc-doc.h']]],
['mi_5fstats_5fprint_5fout',['mi_stats_print_out',['../group__extended.html#ga537f13b299ddf801e49a5a94fde02c79',1,'mimalloc-doc.h']]],
['mi_5fstats_5freset',['mi_stats_reset',['../group__extended.html#ga3bb8468b8cfcc6e2a61d98aee85c5f99',1,'mimalloc-doc.h']]],
['mi_5fstl_5fallocator',['mi_stl_allocator',['../group__cpp.html#structmi__stl__allocator',1,'']]],
['mi_5fstrdup',['mi_strdup',['../group__malloc.html#gac7cffe13f1f458ed16789488bf92b9b2',1,'mimalloc-doc.h']]],

View File

@ -66,13 +66,16 @@ var searchData=
['mi_5fnew_5fnothrow',['mi_new_nothrow',['../group__cpp.html#gaeaded64eda71ed6b1d569d3e723abc4a',1,'mimalloc-doc.h']]],
['mi_5fnew_5frealloc',['mi_new_realloc',['../group__cpp.html#gaab78a32f55149e9fbf432d5288e38e1e',1,'mimalloc-doc.h']]],
['mi_5fnew_5freallocn',['mi_new_reallocn',['../group__cpp.html#ga756f4b2bc6a7ecd0a90baea8e90c7907',1,'mimalloc-doc.h']]],
['mi_5foption_5fenable',['mi_option_enable',['../group__options.html#ga6d45a20a3131f18bc351b69763b38ce4',1,'mimalloc-doc.h']]],
['mi_5foption_5fenable_5fdefault',['mi_option_enable_default',['../group__options.html#ga37988264b915a7db92530cc02d5494cb',1,'mimalloc-doc.h']]],
['mi_5foption_5fenabled',['mi_option_enabled',['../group__options.html#gacebe3f6d91b4a50b54eb84e2a1da1b30',1,'mimalloc-doc.h']]],
['mi_5foption_5fdisable',['mi_option_disable',['../group__options.html#gaebf6ff707a2e688ebb1a2296ca564054',1,'mimalloc-doc.h']]],
['mi_5foption_5fenable',['mi_option_enable',['../group__options.html#ga04180ae41b0d601421dd62ced40ca050',1,'mimalloc-doc.h']]],
['mi_5foption_5fget',['mi_option_get',['../group__options.html#ga7e8af195cc81d3fa64ccf2662caa565a',1,'mimalloc-doc.h']]],
['mi_5foption_5fis_5fenabled',['mi_option_is_enabled',['../group__options.html#ga459ad98f18b3fc9275474807fe0ca188',1,'mimalloc-doc.h']]],
['mi_5foption_5fset',['mi_option_set',['../group__options.html#gaf84921c32375e25754dc2ee6a911fa60',1,'mimalloc-doc.h']]],
['mi_5foption_5fset_5fdefault',['mi_option_set_default',['../group__options.html#ga7ef623e440e6e5545cb08c94e71e4b90',1,'mimalloc-doc.h']]],
['mi_5foption_5fset_5fenabled',['mi_option_set_enabled',['../group__options.html#ga9a13d05fcb77489cb06d4d017ebd8bed',1,'mimalloc-doc.h']]],
['mi_5foption_5fset_5fenabled_5fdefault',['mi_option_set_enabled_default',['../group__options.html#ga65518b69ec5d32336b50e07f74b3f629',1,'mimalloc-doc.h']]],
['mi_5fposix_5fmemalign',['mi_posix_memalign',['../group__posix.html#gacff84f226ba9feb2031b8992e5579447',1,'mimalloc-doc.h']]],
['mi_5fprocess_5finfo',['mi_process_info',['../group__extended.html#ga7d862c2affd5790381da14eb102a364d',1,'mimalloc-doc.h']]],
['mi_5fpvalloc',['mi_pvalloc',['../group__posix.html#gaeb325c39b887d3b90d85d1eb1712fb1e',1,'mimalloc-doc.h']]],
['mi_5frealloc',['mi_realloc',['../group__malloc.html#gaf11eb497da57bdfb2de65eb191c69db6',1,'mimalloc-doc.h']]],
['mi_5frealloc_5faligned',['mi_realloc_aligned',['../group__aligned.html#ga4028d1cf4aa4c87c880747044a8322ae',1,'mimalloc-doc.h']]],
@ -93,7 +96,8 @@ var searchData=
['mi_5frezalloc_5faligned',['mi_rezalloc_aligned',['../group__zeroinit.html#gacd71a7bce96aab38ae6de17af2eb2cf0',1,'mimalloc-doc.h']]],
['mi_5frezalloc_5faligned_5fat',['mi_rezalloc_aligned_at',['../group__zeroinit.html#gae8b358c417e61d5307da002702b0a8e1',1,'mimalloc-doc.h']]],
['mi_5fstats_5fmerge',['mi_stats_merge',['../group__extended.html#ga854b1de8cb067c7316286c28b2fcd3d1',1,'mimalloc-doc.h']]],
['mi_5fstats_5fprint',['mi_stats_print',['../group__extended.html#ga2d126e5c62d3badc35445e5d84166df2',1,'mi_stats_print(void *out):&#160;mimalloc-doc.h'],['../group__extended.html#ga256cc6f13a142deabbadd954a217e228',1,'mi_stats_print(mi_output_fun *out, void *arg):&#160;mimalloc-doc.h']]],
['mi_5fstats_5fprint',['mi_stats_print',['../group__extended.html#ga2d126e5c62d3badc35445e5d84166df2',1,'mimalloc-doc.h']]],
['mi_5fstats_5fprint_5fout',['mi_stats_print_out',['../group__extended.html#ga537f13b299ddf801e49a5a94fde02c79',1,'mimalloc-doc.h']]],
['mi_5fstats_5freset',['mi_stats_reset',['../group__extended.html#ga3bb8468b8cfcc6e2a61d98aee85c5f99',1,'mimalloc-doc.h']]],
['mi_5fstrdup',['mi_strdup',['../group__malloc.html#gac7cffe13f1f458ed16789488bf92b9b2',1,'mimalloc-doc.h']]],
['mi_5fstrndup',['mi_strndup',['../group__malloc.html#gaaabf971c2571891433477e2d21a35266',1,'mimalloc-doc.h']]],

View File

@ -215,6 +215,7 @@
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc-types.h" />
<ClInclude Include="..\..\src\bitmap.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\alloc-aligned.c">
@ -232,6 +233,7 @@
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\region.c" />
@ -251,4 +253,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View File

@ -29,6 +29,9 @@
<ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\src\bitmap.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\alloc.c">
@ -76,5 +79,8 @@
<ClCompile Include="..\..\src\random.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>
</Project>

View File

@ -110,7 +110,7 @@
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
<CompileAs>CompileAsCpp</CompileAs>
<CompileAs>CompileAsC</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
@ -129,7 +129,7 @@
<ConformanceMode>true</ConformanceMode>
<AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
<PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
<CompileAs>CompileAsCpp</CompileAs>
<CompileAs>CompileAsC</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp17</LanguageStandard>
</ClCompile>
@ -161,7 +161,7 @@
<WholeProgramOptimization>false</WholeProgramOptimization>
<BufferSecurityCheck>false</BufferSecurityCheck>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
<CompileAs>CompileAsCpp</CompileAs>
<CompileAs>CompileAsC</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions>
</ClCompile>
<Link>
@ -188,7 +188,7 @@
<WholeProgramOptimization>false</WholeProgramOptimization>
<BufferSecurityCheck>false</BufferSecurityCheck>
<InlineFunctionExpansion>Default</InlineFunctionExpansion>
<CompileAs>CompileAsCpp</CompileAs>
<CompileAs>CompileAsC</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions>
</ClCompile>
<Link>
@ -230,6 +230,7 @@
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\region.c" />
@ -253,8 +254,9 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-override.h" />
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-types.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\src\bitmap.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View File

@ -59,6 +59,9 @@
<ClCompile Include="..\..\src\random.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
@ -79,5 +82,8 @@
<ClInclude Include="..\..\include\mimalloc-new-delete.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\src\bitmap.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>
</Project>

View File

@ -215,6 +215,7 @@
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\include\mimalloc-override.h" />
<ClInclude Include="..\..\include\mimalloc-types.h" />
<ClInclude Include="..\..\src\bitmap.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\src\alloc-aligned.c">
@ -232,9 +233,7 @@
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.inc.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c" />
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
<ClCompile Include="..\..\src\region.c" />

View File

@ -40,15 +40,15 @@
<ClCompile Include="..\..\src\arena.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.inc.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\random.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\options.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc.h">
@ -69,6 +69,9 @@
<ClInclude Include="..\..\include\mimalloc-types.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\src\bitmap.h">
<Filter>Source Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Header Files">

View File

@ -220,8 +220,8 @@
<ClCompile Include="..\..\src\alloc-posix.c" />
<ClCompile Include="..\..\src\alloc.c" />
<ClCompile Include="..\..\src\arena.c" />
<ClCompile Include="..\..\src\bitmap.inc.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ClCompile Include="..\..\src\bitmap.c">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\..\src\heap.c" />
<ClCompile Include="..\..\src\init.c" />
@ -246,6 +246,7 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-override.h" />
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-types.h" />
<ClInclude Include="..\..\include\mimalloc-new-delete.h" />
<ClInclude Include="..\..\src\bitmap.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

View File

@ -46,10 +46,10 @@
<ClCompile Include="..\..\src\arena.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\bitmap.inc.c">
<ClCompile Include="..\..\src\random.c">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\src\random.c">
<ClCompile Include="..\..\src\bitmap.c">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
@ -72,6 +72,9 @@
<ClInclude Include="$(ProjectDir)..\..\include\mimalloc-types.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="..\..\src\bitmap.h">
<Filter>Source Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Header Files">
@ -81,4 +84,4 @@
<UniqueIdentifier>{852a14ae-6dde-4e95-8077-ca705e97e5af}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
</Project>

View File

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
Copyright (c) 2018,2020 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -8,120 +8,117 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_ATOMIC_H
#define MIMALLOC_ATOMIC_H
// ------------------------------------------------------
// --------------------------------------------------------------------------------------------
// Atomics
// We need to be portable between C, C++, and MSVC.
// ------------------------------------------------------
// We base the primitives on the C/C++ atomics and create a minimal wrapper for MSVC in C compilation mode.
// This is why we try to use only `uintptr_t` and `<type>*` as atomic types.
// To gain better insight into the range of used atomics, we use explicitly named memory order operations
// instead of passing the memory order as a parameter.
// -----------------------------------------------------------------------------------------------
#if defined(_MSC_VER)
#define _Atomic(tp) tp
#define ATOMIC_VAR_INIT(x) x
#elif defined(__cplusplus)
#if defined(__cplusplus)
// Use C++ atomics
#include <atomic>
#define _Atomic(tp) std::atomic<tp>
#define _Atomic(tp) std::atomic<tp>
#define mi_atomic(name) std::atomic_##name
#define mi_memory_order(name) std::memory_order_##name
#elif defined(_MSC_VER)
// Use MSVC C wrapper for C11 atomics
#define _Atomic(tp) tp
#define ATOMIC_VAR_INIT(x) x
#define mi_atomic(name) mi_atomic_##name
#define mi_memory_order(name) mi_memory_order_##name
#else
// Use C11 atomics
#include <stdatomic.h>
#define mi_atomic(name) atomic_##name
#define mi_memory_order(name) memory_order_##name
#endif
// ------------------------------------------------------
// Atomic operations specialized for mimalloc
// ------------------------------------------------------
// Various defines for all used memory orders in mimalloc
#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail) \
mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail)
// Atomically add a 64-bit value; returns the previous value.
// Note: not using _Atomic(int64_t) as it is only used for statistics.
static inline void mi_atomic_addi64(volatile int64_t* p, int64_t add);
#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail) \
mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail)
// Atomically add a value; returns the previous value. Memory ordering is relaxed.
static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add);
#define mi_atomic_load_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
// Atomically "and" a value; returns the previous value. Memory ordering is relaxed.
static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x);
#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
// Atomically "or" a value; returns the previous value. Memory ordering is relaxed.
static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x);
#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1)
#define mi_atomic_decrement_relaxed(p) mi_atomic_sub_relaxed(p,(uintptr_t)1)
#define mi_atomic_increment_acq_rel(p) mi_atomic_add_acq_rel(p,(uintptr_t)1)
#define mi_atomic_decrement_acq_rel(p) mi_atomic_sub_acq_rel(p,(uintptr_t)1)
// Atomically compare and exchange a value; returns `true` if successful.
// May fail spuriously. Memory ordering as release on success, and relaxed on failure.
// (Note: expected and desired are in opposite order from atomic_compare_exchange)
static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);
// Atomically compare and exchange a value; returns `true` if successful.
// Memory ordering is acquire-release
// (Note: expected and desired are in opposite order from atomic_compare_exchange)
static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected);
// Atomically exchange a value. Memory ordering is acquire-release.
static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange);
// Atomically read a value. Memory ordering is relaxed.
static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p);
// Atomically read a value. Memory ordering is acquire.
static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p);
// Atomically write a value. Memory ordering is release.
static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x);
// Yield
static inline void mi_atomic_yield(void);
static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add);
static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
// Atomically subtract a value; returns the previous value.
static inline uintptr_t mi_atomic_sub(volatile _Atomic(uintptr_t)* p, uintptr_t sub) {
return mi_atomic_add(p, (uintptr_t)(-((intptr_t)sub)));
#if defined(__cplusplus) || !defined(_MSC_VER)
// In C++/C11 atomics we have polymorphic atomics so can use the typed `ptr` variants (where `tp` is the type of atomic value)
// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
#define mi_atomic_load_ptr_acquire(tp,p) mi_atomic_load_acquire(p)
#define mi_atomic_load_ptr_relaxed(tp,p) mi_atomic_load_relaxed(p)
// In C++ we need to add casts to help resolve templates if NULL is passed
#if defined(__cplusplus)
#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,(tp*)x)
#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,(tp*)x)
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,(tp*)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,(tp*)des)
#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,(tp*)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,(tp*)x)
#else
#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release(p,x)
#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed(p,x)
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release(p,exp,des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel(p,exp,des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release(p,exp,des)
#define mi_atomic_exchange_ptr_release(tp,p,x) mi_atomic_exchange_release(p,x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) mi_atomic_exchange_acq_rel(p,x)
#endif
// These are used by the statistics
static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
}
static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
}
// Atomically increment a value; returns the incremented result.
static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) {
return mi_atomic_add(p, 1);
}
// Atomically decrement a value; returns the decremented result.
static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) {
return mi_atomic_sub(p, 1);
}
// Atomically add a signed value; returns the previous value.
static inline intptr_t mi_atomic_addi(volatile _Atomic(intptr_t)* p, intptr_t add) {
return (intptr_t)mi_atomic_add((volatile _Atomic(uintptr_t)*)p, (uintptr_t)add);
}
// Atomically subtract a signed value; returns the previous value.
static inline intptr_t mi_atomic_subi(volatile _Atomic(intptr_t)* p, intptr_t sub) {
return (intptr_t)mi_atomic_addi(p,-sub);
}
// Atomically read a pointer; Memory order is relaxed (i.e. no fence, only atomic).
#define mi_atomic_read_ptr_relaxed(T,p) \
(T*)(mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)(p)))
// Atomically read a pointer; Memory order is acquire.
#define mi_atomic_read_ptr(T,p) \
(T*)(mi_atomic_read((const volatile _Atomic(uintptr_t)*)(p)))
// Atomically write a pointer; Memory order is acquire.
#define mi_atomic_write_ptr(T,p,x) \
mi_atomic_write((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)x))
// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously.
// Memory order is release. (like a write)
// (Note: expected and desired are in opposite order from atomic_compare_exchange)
#define mi_atomic_cas_ptr_weak(T,p,desired,expected) \
mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)(desired)), (uintptr_t)((T*)(expected)))
// Atomically compare and exchange a pointer; returns `true` if successful. Memory order is acquire_release.
// (Note: expected and desired are in opposite order from atomic_compare_exchange)
#define mi_atomic_cas_ptr_strong(T,p,desired,expected) \
mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)(p),(uintptr_t)((T*)(desired)), (uintptr_t)((T*)(expected)))
// Atomically exchange a pointer value.
#define mi_atomic_exchange_ptr(T,p,exchange) \
(T*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)exchange))
// Used by timers
#define mi_atomic_loadi64_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_loadi64_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_storei64_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_storei64_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
#ifdef _MSC_VER
#elif defined(_MSC_VER)
// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics.
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#include <Windows.h>
#include <intrin.h>
#ifdef _WIN64
typedef LONG64 msc_intptr_t;
@ -130,128 +127,206 @@ typedef LONG64 msc_intptr_t;
typedef LONG msc_intptr_t;
#define MI_64(f) f
#endif
static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add) {
typedef enum mi_memory_order_e {
mi_memory_order_relaxed,
mi_memory_order_consume,
mi_memory_order_acquire,
mi_memory_order_release,
mi_memory_order_acq_rel,
mi_memory_order_seq_cst
} mi_memory_order;
static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)*p, uintptr_t add, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add);
}
static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub));
}
static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x);
}
static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected));
static inline bool mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
(void)(mo1); (void)(mo2);
uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected));
if (read == *expected) {
return true;
}
else {
*expected = read;
return false;
}
}
static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
return mi_atomic_cas_strong(p,desired,expected);
static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) {
return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2);
}
static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)*p, uintptr_t exchange, mi_memory_order mo) {
(void)(mo);
return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange);
}
static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) {
return *p;
static inline void mi_atomic_thread_fence(mi_memory_order mo) {
(void)(mo);
_Atomic(uintptr_t)x = 0;
mi_atomic_exchange_explicit(&x, 1, mo);
}
static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) {
static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) {
(void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
return *p;
#else
uintptr_t x = *p;
if (mo > mi_memory_order_relaxed) {
while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ };
}
return x;
#endif
}
static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
#if defined(_M_IX86) || defined(_M_X64)
static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)*p, uintptr_t x, mi_memory_order mo) {
(void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
*p = x;
#else
mi_atomic_exchange(p,x);
#endif
#else
mi_atomic_exchange_explicit(p, x, mo);
#endif
}
static inline void mi_atomic_yield(void) {
YieldProcessor();
static inline int64_t mi_atomic_loadi64_explicit(_Atomic(int64_t)*p, mi_memory_order mo) {
(void)(mo);
#if defined(_M_X64)
return *p;
#else
int64_t old = *p;
int64_t x = old;
while ((old = InterlockedCompareExchange64(p, x, old)) != x) {
x = old;
}
return x;
#endif
}
static inline void mi_atomic_addi64(volatile _Atomic(int64_t)* p, int64_t add) {
#ifdef _WIN64
mi_atomic_addi(p,add);
#else
static inline void mi_atomic_storei64_explicit(_Atomic(int64_t)*p, int64_t x, mi_memory_order mo) {
(void)(mo);
#if defined(_M_IX86) || defined(_M_X64)
*p = x;
#else
InterlockedExchange64(p, x);
#endif
}
// These are used by the statistics
static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)*p, int64_t add) {
#ifdef _WIN64
return (int64_t)mi_atomic_addi((int64_t*)p, add);
#else
int64_t current;
int64_t sum;
do {
current = *p;
sum = current + add;
} while (_InterlockedCompareExchange64(p, sum, current) != current);
#endif
}
#else
#ifdef __cplusplus
#define MI_USING_STD using namespace std;
#else
#define MI_USING_STD
return current;
#endif
static inline void mi_atomic_addi64(volatile int64_t* p, int64_t add) {
MI_USING_STD
atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add) {
MI_USING_STD
return atomic_fetch_add_explicit(p, add, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
MI_USING_STD
return atomic_fetch_and_explicit(p, x, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
MI_USING_STD
return atomic_fetch_or_explicit(p, x, memory_order_relaxed);
}
static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
MI_USING_STD
return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_release, memory_order_relaxed);
}
static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) {
MI_USING_STD
return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) {
MI_USING_STD
return atomic_exchange_explicit(p, exchange, memory_order_acq_rel);
}
static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) {
MI_USING_STD
return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed);
}
static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) {
MI_USING_STD
return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire);
}
static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) {
MI_USING_STD
return atomic_store_explicit(p, x, memory_order_release);
static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) {
int64_t current;
do {
current = *p;
} while (current < x && _InterlockedCompareExchange64(p, x, current) != current);
}
// The pointer macros cast to `uintptr_t`.
#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p))
#define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p))
#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x))
#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des)
#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x)
#define mi_atomic_loadi64_acquire(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(acquire))
#define mi_atomic_loadi64_relaxed(p) mi_atomic(loadi64_explicit)(p,mi_memory_order(relaxed))
#define mi_atomic_storei64_release(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(release))
#define mi_atomic_storei64_relaxed(p,x) mi_atomic(storei64_explicit)(p,x,mi_memory_order(relaxed))
#endif
// Atomically add a signed value; returns the previous value.
static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) {
return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add);
}
// Atomically subtract a signed value; returns the previous value.
static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
return (intptr_t)mi_atomic_addi(p, -sub);
}
// Yield
#if defined(__cplusplus)
#include <thread>
static inline void mi_atomic_yield(void) {
std::this_thread::yield();
}
#include <thread>
static inline void mi_atomic_yield(void) {
std::this_thread::yield();
}
#elif defined(_WIN32)
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
static inline void mi_atomic_yield(void) {
YieldProcessor();
}
#elif defined(__SSE2__)
#include <emmintrin.h>
static inline void mi_atomic_yield(void) {
_mm_pause();
}
#elif (defined(__GNUC__) || defined(__clang__)) && \
(defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))
(defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__armel__) || defined(__ARMEL__) || \
defined(__aarch64__) || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__))
#if defined(__x86_64__) || defined(__i386__)
static inline void mi_atomic_yield(void) {
asm volatile ("pause" ::: "memory");
}
#elif defined(__arm__) || defined(__aarch64__)
static inline void mi_atomic_yield(void) {
asm volatile("yield");
}
static inline void mi_atomic_yield(void) {
__asm__ volatile ("pause" ::: "memory");
}
#elif defined(__aarch64__)
static inline void mi_atomic_yield(void) {
asm volatile("wfe");
}
#elif (defined(__arm__) && __ARM_ARCH__ >= 7)
static inline void mi_atomic_yield(void) {
__asm__ volatile("yield" ::: "memory");
}
#elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__)
static inline void mi_atomic_yield(void) {
__asm__ __volatile__ ("or 27,27,27" ::: "memory");
}
#elif defined(__armel__) || defined(__ARMEL__)
static inline void mi_atomic_yield(void) {
asm volatile ("nop" ::: "memory");
}
#endif
#elif defined(__sun)
// Fallback for other archs
#include <synch.h>
static inline void mi_atomic_yield(void) {
smt_pause();
}
#elif defined(__wasi__)
#include <sched.h>
static inline void mi_atomic_yield(void) {
sched_yield();
}
#include <sched.h>
static inline void mi_atomic_yield(void) {
sched_yield();
}
#else
#include <unistd.h>
static inline void mi_atomic_yield(void) {
sleep(0);
}
#include <unistd.h>
static inline void mi_atomic_yield(void) {
sleep(0);
}
#endif
#endif
#endif // __MIMALLOC_ATOMIC_H
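
The header above funnels every atomic operation through macros that spell out the memory order at each call site (with an Interlocked-based shim when MSVC compiles in C mode). The standalone sketch below is illustrative only -- plain C11 <stdatomic.h>, not mimalloc's actual macros -- but it shows the same explicit-ordering style, including a weak CAS loop that relies on `expected` being updated on failure.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic(uintptr_t) counter;   // zero-initialized, static storage

/* Relaxed increment: we only need atomicity here, not ordering. */
static uintptr_t counter_increment_relaxed(void) {
  return atomic_fetch_add_explicit(&counter, (uintptr_t)1, memory_order_relaxed);
}

/* Publish a new maximum with a weak CAS loop: acq_rel on success, relaxed on
   failure; a failed CAS stores the observed value back into `current`. */
static void update_max(_Atomic(uintptr_t)* max, uintptr_t x) {
  uintptr_t current = atomic_load_explicit(max, memory_order_relaxed);
  while (current < x &&
         !atomic_compare_exchange_weak_explicit(max, &current, x,
             memory_order_acq_rel, memory_order_relaxed)) { /* retry */ }
}

int main(void) {
  static _Atomic(uintptr_t) max;
  for (int i = 0; i < 10; i++) {
    update_max(&max, counter_increment_relaxed() + 1);
  }
  printf("count=%zu max=%zu\n",
         (size_t)atomic_load_explicit(&counter, memory_order_relaxed),
         (size_t)atomic_load_explicit(&max, memory_order_relaxed));
  return 0;
}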

View File

@ -32,7 +32,6 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_decl_cache_align
#endif
// "options.c"
void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...);
@ -64,7 +63,7 @@ void _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free th
size_t _mi_os_good_alloc_size(size_t size);
// memory.c
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld);
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* id, mi_os_tld_t* tld);
void _mi_mem_free(void* p, size_t size, size_t id, bool fully_committed, bool any_reset, mi_os_tld_t* tld);
bool _mi_mem_reset(void* p, size_t size, mi_os_tld_t* tld);
@ -107,7 +106,6 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback fro
size_t _mi_bin_size(uint8_t bin); // for stats
uint8_t _mi_bin(size_t size); // for stats
uint8_t _mi_bsr(uintptr_t x); // bit-scan-right, used on BSD in "os.c"
// "heap.c"
void _mi_heap_destroy_pages(mi_heap_t* heap);
@ -238,23 +236,28 @@ static inline bool mi_malloc_satisfies_alignment(size_t alignment, size_t size)
}
// Overflow detecting multiply
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
#include <limits.h> // UINT_MAX, ULONG_MAX
#if (SIZE_MAX == UINT_MAX)
return __builtin_umul_overflow(count, size, total);
#elif (SIZE_MAX == ULONG_MAX)
return __builtin_umull_overflow(count, size, total);
#else
return __builtin_umulll_overflow(count, size, total);
#include <limits.h> // UINT_MAX, ULONG_MAX
#if defined(_CLOCK_T) // for Illumos
#undef _CLOCK_T
#endif
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#if (SIZE_MAX == UINT_MAX)
return __builtin_umul_overflow(count, size, total);
#elif (SIZE_MAX == ULONG_MAX)
return __builtin_umull_overflow(count, size, total);
#else
return __builtin_umulll_overflow(count, size, total);
#endif
}
#else /* __builtin_umul_overflow is unavailable */
static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
#define MI_MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t))) // sqrt(SIZE_MAX)
*total = count * size;
return ((size >= MI_MUL_NO_OVERFLOW || count >= MI_MUL_NO_OVERFLOW)
&& size > 0 && (SIZE_MAX / size) < count);
#endif
&& size > 0 && (SIZE_MAX / size) < count);
}
#endif
// Safe multiply `count*size` into `total`; return `true` on overflow.
static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* total) {
@ -263,7 +266,7 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
return false;
}
else if (mi_unlikely(mi_mul_overflow(count, size, total))) {
_mi_error_message(EOVERFLOW, "allocation request too large (%zu * %zu bytes)\n", count, size);
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu * %zu bytes)\n", count, size);
*total = SIZE_MAX;
return true;
}
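
The overflow check above uses the __builtin_umul*_overflow intrinsics when the compiler provides them and otherwise falls back to the sqrt(SIZE_MAX) bound. The snippet below is a hedged standalone sketch of that fallback path only (the name `mul_overflow` is ours, not mimalloc's).

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define MUL_NO_OVERFLOW ((size_t)1 << (4*sizeof(size_t)))   /* sqrt(SIZE_MAX) */

/* If both factors are below sqrt(SIZE_MAX) the product cannot overflow;
   otherwise verify with a division. The multiply itself wraps (unsigned). */
static bool mul_overflow(size_t count, size_t size, size_t* total) {
  *total = count * size;
  return ((size >= MUL_NO_OVERFLOW || count >= MUL_NO_OVERFLOW)
          && size > 0 && (SIZE_MAX / size) < count);
}

int main(void) {
  size_t total;
  printf("%d\n", mul_overflow(SIZE_MAX/2, 3, &total));  /* 1: overflows */
  printf("%d\n", mul_overflow(1000, 16, &total));       /* 0: total == 16000 */
  return 0;
}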
@ -282,7 +285,7 @@ We try to circumvent this in an efficient way:
- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
loader itself calls `malloc` even before the modules are initialized.
- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
- DragonFly: not yet working.
- DragonFly: the uniqueid use is buggy but kept for reference.
------------------------------------------------------------------------------------------- */
extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap
@ -300,7 +303,7 @@ mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing hea
#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24)
#elif defined(__DragonFly__)
#warning "mimalloc is not working correctly on DragonFly yet."
#define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
//#define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
#endif
#endif
@ -312,7 +315,7 @@ static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
pthread_t self = pthread_self();
#if defined(__DragonFly__)
if (self==NULL) {
static mi_heap_t* pheap_main = _mi_heap_main_get();
mi_heap_t* pheap_main = _mi_heap_main_get();
return &pheap_main;
}
#endif
@ -443,21 +446,21 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
// Thread free access
static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3);
return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
}
static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
return (mi_delayed_t)(mi_atomic_read_relaxed(&page->xthread_free) & 3);
return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
}
// Heap access
static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
return (mi_heap_t*)(mi_atomic_read_relaxed(&page->xheap));
return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
}
static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
mi_atomic_write(&page->xheap,(uintptr_t)heap);
mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
}
// Thread free flag helpers
@ -569,11 +572,11 @@ static inline bool mi_is_in_same_page(const void* p, const void* q) {
static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) {
shift %= MI_INTPTR_BITS;
return ((x << shift) | (x >> (MI_INTPTR_BITS - shift)));
return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift))));
}
static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
shift %= MI_INTPTR_BITS;
return ((x >> shift) | (x << (MI_INTPTR_BITS - shift)));
return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift))));
}
static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
@ -694,15 +697,21 @@ static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
__asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // 32-bit always uses GS
#elif defined(__MACH__) && defined(__x86_64__)
__asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
__asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI
#elif defined(__x86_64__)
__asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS
#elif defined(__arm__)
void** tcb; UNUSED(ofs);
asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
res = tcb[slot];
#elif defined(__aarch64__)
void** tcb; UNUSED(ofs);
asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
#if defined(__APPLE__) // issue #343
__asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tcb));
#else
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
#endif
res = tcb[slot];
#endif
return res;
@ -715,15 +724,21 @@ static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
__asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS
#elif defined(__MACH__) && defined(__x86_64__)
__asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOSX uses GS
#elif defined(__x86_64__) && (MI_INTPTR_SIZE==4)
__asm__("movl %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI
#elif defined(__x86_64__)
__asm__("movq %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS
#elif defined(__arm__)
void** tcb; UNUSED(ofs);
asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
__asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
tcb[slot] = value;
#elif defined(__aarch64__)
void** tcb; UNUSED(ofs);
asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
#if defined(__APPLE__) // issue #343
__asm__ volatile ("mrs %0, tpidrro_el0" : "=r" (tcb));
#else
__asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
#endif
tcb[slot] = value;
#endif
}
@ -739,5 +754,108 @@ static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
}
#endif
// -----------------------------------------------------------------------
// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero)
// -----------------------------------------------------------------------
#if defined(__GNUC__)
#include <limits.h> // LONG_MAX
#define MI_HAVE_FAST_BITSCAN
static inline size_t mi_clz(uintptr_t x) {
if (x==0) return MI_INTPTR_BITS;
#if (INTPTR_MAX == LONG_MAX)
return __builtin_clzl(x);
#else
return __builtin_clzll(x);
#endif
}
static inline size_t mi_ctz(uintptr_t x) {
if (x==0) return MI_INTPTR_BITS;
#if (INTPTR_MAX == LONG_MAX)
return __builtin_ctzl(x);
#else
return __builtin_ctzll(x);
#endif
}
#elif defined(_MSC_VER)
#include <limits.h> // LONG_MAX
#define MI_HAVE_FAST_BITSCAN
static inline size_t mi_clz(uintptr_t x) {
if (x==0) return MI_INTPTR_BITS;
unsigned long idx;
#if (INTPTR_MAX == LONG_MAX)
_BitScanReverse(&idx, x);
#else
_BitScanReverse64(&idx, x);
#endif
return ((MI_INTPTR_BITS - 1) - idx);
}
static inline size_t mi_ctz(uintptr_t x) {
if (x==0) return MI_INTPTR_BITS;
unsigned long idx;
#if (INTPTR_MAX == LONG_MAX)
_BitScanForward(&idx, x);
#else
_BitScanForward64(&idx, x);
#endif
return idx;
}
#else
static inline size_t mi_ctz32(uint32_t x) {
// de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
static const unsigned char debruijn[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
};
if (x==0) return 32;
return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27];
}
static inline size_t mi_clz32(uint32_t x) {
// de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
static const uint8_t debruijn[32] = {
31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1,
23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0
};
if (x==0) return 32;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27];
}
static inline size_t mi_clz(uintptr_t x) {
if (x==0) return MI_INTPTR_BITS;
#if (MI_INTPTR_BITS <= 32)
return mi_clz32((uint32_t)x);
#else
size_t count = mi_clz32((uint32_t)(x >> 32));
if (count < 32) return count;
return (32 + mi_clz32((uint32_t)x));
#endif
}
static inline size_t mi_ctz(uintptr_t x) {
if (x==0) return MI_INTPTR_BITS;
#if (MI_INTPTR_BITS <= 32)
return mi_ctz32((uint32_t)x);
#else
size_t count = mi_ctz32((uint32_t)x);
if (count < 32) return count;
return (32 + mi_ctz32((uint32_t)(x>>32)));
#endif
}
#endif
// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero)
static inline size_t mi_bsr(uintptr_t x) {
return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x));
}
#endif
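
The portable fallback above counts trailing zeros with a de Bruijn multiplication so no compiler intrinsic is needed. The self-checking sketch below is illustrative (not mimalloc code) and keeps the multiply in 32-bit arithmetic so it wraps modulo 2^32, then compares the result against a naive shift loop.

#include <stdint.h>
#include <stdio.h>

/* de Bruijn count-trailing-zeros, mirroring the fallback above. */
static unsigned ctz32_debruijn(uint32_t x) {
  static const unsigned char debruijn[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
  };
  if (x == 0) return 32;
  /* x & -x isolates the lowest set bit; the multiply must wrap mod 2^32. */
  return debruijn[((x & (0u - x)) * 0x077CB531u) >> 27];
}

/* Reference implementation: shift until the low bit is set. */
static unsigned ctz32_naive(uint32_t x) {
  if (x == 0) return 32;
  unsigned n = 0;
  while ((x & 1u) == 0) { x >>= 1; n++; }
  return n;
}

int main(void) {
  for (unsigned i = 0; i < 32; i++) {
    uint32_t x = ((uint32_t)1 << i) | ((uint32_t)1 << 31);  /* lowest set bit at i */
    if (ctz32_debruijn(x) != i || ctz32_debruijn(x) != ctz32_naive(x)) {
      printf("mismatch at bit %u\n", i);
      return 1;
    }
  }
  printf("de Bruijn ctz agrees with the naive scan\n");
  return 0;
}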

View File

@ -24,7 +24,7 @@ not accidentally mix pointers from different allocators).
#define free(p) mi_free(p)
#define strdup(s) mi_strdup(s)
#define strndup(s) mi_strndup(s)
#define strndup(s,n) mi_strndup(s,n)
#define realpath(f,n) mi_realpath(f,n)
// Microsoft extensions
@ -33,7 +33,7 @@ not accidentally mix pointers from different allocators).
#define _recalloc(p,n,c) mi_recalloc(p,n,c)
#define _strdup(s) mi_strdup(s)
#define _strndup(s) mi_strndup(s)
#define _strndup(s,n) mi_strndup(s,n)
#define _wcsdup(s) (wchar_t*)mi_wcsdup((const unsigned short*)(s))
#define _mbsdup(s) mi_mbsdup(s)
#define _dupenv_s(b,n,v) mi_dupenv_s(b,n,v)
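
The macro change above matters because strndup takes two arguments: with the old one-parameter form, a call such as strndup(s, n) would not even preprocess (wrong macro arity). A hedged usage sketch, assuming the program includes the override header and links against mimalloc:

#include <mimalloc.h>
#include <mimalloc-override.h>   /* maps strndup/free/... onto the mi_ versions */
#include <stdio.h>

int main(void) {
  char* s = strndup("mimalloc override", 8);  /* expands to mi_strndup(s, 8) */
  printf("%s\n", s);                          /* prints "mimalloc" */
  free(s);                                    /* expands to mi_free(s) */
  return 0;
}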

View File

@ -12,9 +12,15 @@ terms of the MIT license. A copy of the license can be found in the file
#include <stdint.h> // uintptr_t, uint16_t, etc
#include <mimalloc-atomic.h> // _Atomic
#ifdef _MSC_VER
#pragma warning(disable:4214) // bitfield is not int
#endif
// Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `MI_INTPTR_SIZE`
#ifndef MI_MAX_ALIGN_SIZE
#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t)
#endif
// ------------------------------------------------------
// Variants
@ -155,6 +161,7 @@ typedef enum mi_delayed_e {
// The `in_full` and `has_aligned` page flags are put in a union to efficiently
// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
#if !MI_TSAN
typedef union mi_page_flags_s {
uint8_t full_aligned;
struct {
@ -162,6 +169,16 @@ typedef union mi_page_flags_s {
uint8_t has_aligned : 1;
} x;
} mi_page_flags_t;
#else
// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
typedef union mi_page_flags_s {
uint16_t full_aligned;
struct {
uint8_t in_full;
uint8_t has_aligned;
} x;
} mi_page_flags_t;
#endif
// Thread free list.
// We use the bottom 2 bits of the pointer for mi_delayed_t flags
@ -222,8 +239,8 @@ typedef struct mi_page_s {
uint32_t xblock_size; // size available in each block (always `>0`)
mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`)
volatile _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
volatile _Atomic(uintptr_t) xheap;
_Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads
_Atomic(uintptr_t) xheap;
struct mi_page_s* next; // next page owned by this thread with the same `block_size`
struct mi_page_s* prev; // previous page owned by this thread with the same `block_size`
@ -243,28 +260,29 @@ typedef enum mi_page_kind_e {
// contain blocks.
typedef struct mi_segment_s {
// memory fields
size_t memid; // id for the os-level memory manager
bool mem_is_fixed; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)
bool mem_is_committed; // `true` if the whole segment is eagerly committed
size_t memid; // id for the os-level memory manager
bool mem_is_pinned; // `true` if we cannot decommit/reset/protect in this memory (i.e. when allocated using large OS pages)
bool mem_is_committed; // `true` if the whole segment is eagerly committed
// segment fields
struct mi_segment_s* next; // must be the first segment field -- see `segment.c:segment_alloc`
_Atomic(struct mi_segment_s*) abandoned_next;
struct mi_segment_s* next; // must be the first segment field after abandoned_next -- see `segment.c:segment_init`
struct mi_segment_s* prev;
struct mi_segment_s* abandoned_next;
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim if it is too long)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_size;// for huge pages this may be different from `MI_SEGMENT_SIZE`
size_t segment_info_size; // space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim if it is too long)
size_t used; // count of pages in use (`used <= capacity`)
size_t capacity; // count of available pages (`#free + used`)
size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE`
size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages.
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
// layout like this to optimize access in `mi_free`
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
volatile _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment
mi_page_kind_t page_kind; // kind of pages: small, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
_Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment
mi_page_kind_t page_kind; // kind of pages: small, large, or huge
mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
} mi_segment_t;
@ -322,7 +340,7 @@ struct mi_heap_s {
mi_tld_t* tld;
mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin")
volatile _Atomic(mi_block_t*) thread_delayed_free;
_Atomic(mi_block_t*) thread_delayed_free;
uintptr_t thread_id; // thread this heap belongs too
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list
@ -398,6 +416,7 @@ typedef struct mi_stats_s {
mi_stat_count_t segments_abandoned;
mi_stat_count_t pages_abandoned;
mi_stat_count_t threads;
mi_stat_count_t normal;
mi_stat_count_t huge;
mi_stat_count_t giant;
mi_stat_count_t malloc;
@ -407,10 +426,11 @@ typedef struct mi_stats_s {
mi_stat_counter_t commit_calls;
mi_stat_counter_t page_no_retire;
mi_stat_counter_t searches;
mi_stat_counter_t normal_count;
mi_stat_counter_t huge_count;
mi_stat_counter_t giant_count;
#if MI_STAT>1
mi_stat_count_t normal[MI_BIN_HUGE+1];
mi_stat_count_t normal_bins[MI_BIN_HUGE+1];
#endif
} mi_stats_t;
@ -429,6 +449,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount);
#define mi_stat_counter_increase(stat,amount) (void)0
#endif
#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount)
#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount)

View File

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018, Microsoft Research, Daan Leijen
Copyright (c) 2018-2020, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 163 // major + 2 digits minor
#define MI_MALLOC_VERSION 167 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
@ -24,7 +24,7 @@ terms of the MIT license. A copy of the license can be found in the file
#define mi_attr_noexcept
#endif
#if (__cplusplus >= 201703)
#if defined(__cplusplus) && (__cplusplus >= 201703)
#define mi_decl_nodiscard [[nodiscard]]
#elif (__GNUC__ >= 4) || defined(__clang__) // includes clang, icc, and clang-cl
#define mi_decl_nodiscard __attribute__((warn_unused_result))
@ -153,6 +153,9 @@ mi_decl_export void mi_thread_init(void) mi_attr_noexcept;
mi_decl_export void mi_thread_done(void) mi_attr_noexcept;
mi_decl_export void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept;
mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs,
size_t* current_rss, size_t* peak_rss,
size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept;
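
As a usage illustration of the `mi_process_info` declaration above (a sketch only; the exact units of the returned values are assumed to be milliseconds and bytes):

```c
#include <stdio.h>
#include <mimalloc.h>

static void print_process_stats(void) {
  size_t elapsed, user, sys, rss, peak_rss, commit, peak_commit, faults;
  mi_process_info(&elapsed, &user, &sys,
                  &rss, &peak_rss,
                  &commit, &peak_commit, &faults);
  printf("elapsed: %zu ms, peak rss: %zu, page faults: %zu\n",
         elapsed, peak_rss, faults);
}
```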
// -------------------------------------------------------------------------------------
// Aligned allocation
@ -192,7 +195,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_
mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2);
mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);;
mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);
mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3);
mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc;
@ -256,11 +259,15 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b
// Experimental
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
mi_decl_nodiscard mi_decl_export bool mi_is_redirected() mi_attr_noexcept;
mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept;
mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept;
mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept;
// deprecated
mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept;
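
A hedged usage sketch of the experimental reservation call declared above (the 256 MiB size and the flag values are arbitrary example choices):

```c
#include <stdio.h>
#include <mimalloc.h>

static void reserve_example(void) {
  // reserve 256 MiB of OS memory up front, committed, allowing large OS pages
  int err = mi_reserve_os_memory(256 * 1024 * 1024, /*commit*/ true, /*allow_large*/ true);
  if (err != 0) {
    fprintf(stderr, "mi_reserve_os_memory failed with error %d\n", err);
  }
}
```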
@ -299,6 +306,7 @@ typedef enum mi_option_e {
mi_option_reset_decommits,
mi_option_large_os_pages, // implies eager commit
mi_option_reserve_huge_os_pages,
mi_option_reserve_os_memory,
mi_option_segment_cache,
mi_option_page_reset,
mi_option_abandoned_page_reset,
@ -306,8 +314,10 @@ typedef enum mi_option_e {
mi_option_eager_commit_delay,
mi_option_reset_delay,
mi_option_use_numa_nodes,
mi_option_limit_os_alloc,
mi_option_os_tag,
mi_option_max_errors,
mi_option_max_warnings,
_mi_option_last
} mi_option_t;

View File

@ -11,26 +11,34 @@ mimalloc (pronounced "me-malloc")
is a general purpose allocator with excellent [performance](#performance) characteristics.
Initially developed by Daan Leijen for the run-time systems of the
[Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages.
Latest release:`v1.6.3` (2020-05-05).
Latest release:`v1.6.7` (2020-09-24).
It is a drop-in replacement for `malloc` and can be used in other programs
without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
```
> LD_PRELOAD=/usr/bin/libmimalloc.so myprogram
```
It also has an easy way to override the allocator in [Windows](#override_on_windows). Notable aspects of the design include:
It also has an easy way to override the default allocator in [Windows](#override_on_windows). Notable aspects of the design include:
- __small and consistent__: the library is about 6k LOC using simple and
- __small and consistent__: the library is about 8k LOC using simple and
consistent data structures. This makes it very suitable
to integrate and adapt in other projects. For runtime systems it
provides hooks for a monotonic _heartbeat_ and deferred freeing (for
bounded worst-case times with reference counting).
- __free list sharding__: the big idea: instead of one big free list (per size class) we have
many smaller lists per memory "page" which both reduces fragmentation
and increases locality --
- __free list sharding__: instead of one big free list (per size class) we have
many smaller lists per "mimalloc page" which reduces fragmentation and
increases locality --
things that are allocated close in time get allocated close in memory.
(A memory "page" in _mimalloc_ contains blocks of one size class and is
usually 64KiB on a 64-bit system).
(A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).
- __free list multi-sharding__: the big idea! Not only do we shard the free list
per mimalloc page, but for each page we have multiple free lists. In particular, there
is one list for thread-local `free` operations, and another one for concurrent `free`
operations. Free-ing from another thread can now be a single CAS without needing
sophisticated coordination between threads. Since there will be
thousands of separate free lists, contention is naturally distributed over the heap,
and the chance of contending on a single location will be low -- this is quite
similar to randomized algorithms like skip lists where adding
a random oracle removes the need for a more complex algorithm (a minimal sketch of such a cross-thread free follows after this list).
- __eager page reset__: when a "page" becomes empty (with increased chance
due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged")
reducing (real) memory pressure and fragmentation, especially in long running
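
Below is the cross-thread free sketch referred to in the multi-sharding point above: a rough, self-contained illustration (not mimalloc's actual code) of how freeing from another thread can be a single compare-and-swap push onto a page-local concurrent list.

```c
#include <stdatomic.h>
#include <stddef.h>

typedef struct block_s { struct block_s* next; } block_t;

typedef struct page_s {
  _Atomic(block_t*) thread_free;  // list for frees coming from other threads
  block_t*          local_free;   // list for frees by the owning thread
} page_t;

// push a block freed by a non-owning thread: one CAS, no locks
static void free_from_other_thread(page_t* page, block_t* block) {
  block_t* head = atomic_load_explicit(&page->thread_free, memory_order_relaxed);
  do {
    block->next = head;  // link the block in front of the current head
  } while (!atomic_compare_exchange_weak_explicit(
               &page->thread_free, &head, block,
               memory_order_release, memory_order_relaxed));
}
```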
@ -55,8 +63,20 @@ You can read more on the design of _mimalloc_ in the [technical report](https://
Enjoy!
### Branches
* `master`: latest stable release.
* `dev`: latest development branch.
* `dev-slice`: experimental branch with a different way of managing mimalloc pages that tends
to use less memory than regular mimalloc with similar performance. Give it a try and please
report any significant performance improvement or degradation.
### Releases
* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support.
* 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS,
build PIC static libraries by default, add option to abort on out-of-memory, line buffered statistics.
* 2020-04-20, `v1.6.2`: stable release 1.6: fix compilation on Android, MingW, Raspberry, and Conda,
@ -82,9 +102,26 @@ free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af
Special thanks to:
* Jason Gibson (@jasongibson) for exhaustive testing on large workloads and server environments and finding complex bugs in (early versions of) `mimalloc`.
* [David Carlier](https://devnexen.blogspot.com/) (@devnexen) for his many contributions, and making
mimalloc work better on many less common operating systems, like Haiku, Dragonfly, etc.
* Mary Feofanova (@mary3000), Evgeniy Moiseenko, and Manuel Pöter (@mpoeter) for making mimalloc TSAN checkable, and finding
memory model bugs using the [genMC] model checker.
* Weipeng Liu (@pongba), Zhuowei Li, Junhua Wang, and Jakub Szymanski, for their early support of mimalloc and deployment
at large scale services, leading to many improvements in the mimalloc algorithms for large workloads.
* Jason Gibson (@jasongibson) for exhaustive testing on large scale workloads and server environments, and finding complex bugs
in (early versions of) `mimalloc`.
* Manuel Pöter (@mpoeter) and Sam Gross (@colesbury) for finding an ABA concurrency issue in abandoned segment reclamation.
[genMC]: https://plv.mpi-sws.org/genmc/
### Usage
mimalloc is used in various large scale low-latency services and programs, for example:
<a href="https://www.bing.com"><img align="left" height="50" src="https://upload.wikimedia.org/wikipedia/commons/e/e9/Bing_logo.svg"></a>
<a href="https://azure.microsoft.com/"><img align="left" height="50" src="https://upload.wikimedia.org/wikipedia/commons/a/a8/Microsoft_Azure_Logo.svg"></a>
<a href="https://deathstrandingpc.505games.com"><img height="100" src="doc/ds-logo.jpg" style="border-radius=1ex;vertical-align:center"></a>
# Building
## Windows
@ -213,7 +250,7 @@ completely and redirect all calls to the _mimalloc_ library instead .
## Environment Options
You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)),
or via environment variables.
or via environment variables:
- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
- `MIMALLOC_VERBOSE=1`: show verbose messages.
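
The same options can also be set from code; a small sketch (assuming the `mi_option_enable`/`mi_option_set` calls from `mimalloc.h`):

```c
#include <mimalloc.h>

static void configure_mimalloc(void) {
  mi_option_enable(mi_option_show_stats);  // equivalent to MIMALLOC_SHOW_STATS=1
  mi_option_set(mi_option_verbose, 1);     // equivalent to MIMALLOC_VERBOSE=1
}
```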
@ -263,11 +300,11 @@ _mimalloc_ can be build in secure mode by using the `-DMI_SECURE=ON` flags in `c
to make mimalloc more robust against exploits. In particular:
- All internal mimalloc pages are surrounded by guard pages and the heap metadata is behind a guard page as well (so a buffer overflow
exploit cannot reach into the metadata),
exploit cannot reach into the metadata).
- All free list pointers are
[encoded](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396)
with per-page keys which is used both to prevent overwrites with a known pointer, as well as to detect heap corruption,
- Double free's are detected (and ignored),
with per-page keys, which are used both to prevent overwrites with a known pointer and to detect heap corruption.
- Double free's are detected (and ignored).
- The free lists are initialized in a random order and allocation randomly chooses between extension and reuse within a page to
mitigate against attacks that rely on a predictable allocation order. Similarly, the larger heap blocks allocated by mimalloc
from the OS are also address randomized.
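
To convey the idea of the encoded free lists, here is a deliberately simplified sketch (mimalloc's real encoding also rotates the value; the function names are illustrative only):

```c
#include <stdint.h>

// encode the next-pointer with two per-page random keys so an attacker
// cannot overwrite it with a plain pointer without knowing the keys
static inline uintptr_t encode_next(uintptr_t next, uintptr_t key1, uintptr_t key2) {
  return (next ^ key1) + key2;
}

// decoding a corrupted value yields a bogus pointer that can be detected
static inline uintptr_t decode_next(uintptr_t enc, uintptr_t key1, uintptr_t key2) {
  return (enc - key2) ^ key1;
}
```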
@ -409,7 +446,7 @@ as [mimalloc-bench](https://github.com/daanx/mimalloc-bench).
Testing on a big Amazon EC2 compute instance
([c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized))
consisting of a 72 processor Intel Xeon at 3GHz
with 144GiB ECC memory, running Ubuntu 18.04.1 with LibC 2.27 and GCC 7.4.0.
with 144GiB ECC memory, running Ubuntu 18.04.1 with glibc 2.27 and GCC 7.4.0.
The measured allocators are _mimalloc_ (xmi, tag:v1.4.0, page reset enabled)
and its secure build as _smi_,
Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (tc, tag:gperftools-2.7) used in Chrome,
@ -419,7 +456,7 @@ the Intel thread building blocks [allocator](https://github.com/intel/tbb) (tbb,
the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (tag:3.13) allocator by Emery Berger \[1],
the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:51222e7) allocator by
Bobby Powers _et al_ \[8],
and finally the default system allocator (glibc, 2.7.0) (based on _PtMalloc2_).
and finally the default system allocator (glibc, 2.27) (based on _PtMalloc2_).
<img width="90%" src="doc/bench-c5-18xlarge-2020-01-20-a.svg"/>
<img width="90%" src="doc/bench-c5-18xlarge-2020-01-20-b.svg"/>

View File

@ -17,8 +17,7 @@ terms of the MIT license. A copy of the license can be found in the file
static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t size, const size_t alignment, const size_t offset, const bool zero) mi_attr_noexcept {
// note: we don't require `size > offset`, we just guarantee that
// the address at offset is aligned regardless of the allocated size.
mi_assert(alignment > 0 && alignment % sizeof(void*) == 0);
mi_assert(alignment > 0);
if (mi_unlikely(size > PTRDIFF_MAX)) return NULL; // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
if (mi_unlikely(alignment==0 || !_mi_is_power_of_two(alignment))) return NULL; // require power-of-two (see <https://en.cppreference.com/w/c/memory/aligned_alloc>)
const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)`
@ -54,7 +53,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t
// .. and align within the allocation
uintptr_t adjust = alignment - (((uintptr_t)p + offset) & align_mask);
mi_assert_internal(adjust % sizeof(uintptr_t) == 0);
mi_assert_internal(adjust <= alignment);
void* aligned_p = (adjust == alignment ? p : (void*)((uintptr_t)p + adjust));
if (aligned_p != p) mi_page_set_has_aligned(_mi_ptr_page(p), true);
mi_assert_internal(((uintptr_t)aligned_p + offset) % alignment == 0);
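
A standalone worked example of the adjustment arithmetic above (values chosen for illustration): for `p = 0x1008` and `alignment = 16`, `adjust` becomes 8 and the returned pointer is `0x1010`.

```c
#include <assert.h>
#include <stdint.h>

int main(void) {
  const uintptr_t p = 0x1008, offset = 0, alignment = 16;
  const uintptr_t align_mask = alignment - 1;            // (x & align_mask) == (x % alignment)
  const uintptr_t adjust = alignment - ((p + offset) & align_mask);
  const uintptr_t aligned_p = (adjust == alignment ? p : p + adjust);
  assert(((aligned_p + offset) % alignment) == 0);       // 0x1010 is 16-byte aligned
  return 0;
}
```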

View File

@ -35,7 +35,6 @@ terms of the MIT license. A copy of the license can be found in the file
extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_import));
#endif
/* ------------------------------------------------------
malloc zone members
------------------------------------------------------ */
@ -44,7 +43,7 @@ static size_t zone_size(malloc_zone_t* zone, const void* p) {
UNUSED(zone);
if (!mi_is_in_heap_region(p))
return 0; // not our pointer, bail out
return mi_usable_size(p);
}
@ -190,63 +189,85 @@ static malloc_zone_t* mi_get_default_zone()
}
}
static void __attribute__((constructor)) _mi_macos_override_malloc()
{
static malloc_introspection_t intro;
memset(&intro, 0, sizeof(intro));
static malloc_introspection_t mi_introspect = {
.enumerator = &intro_enumerator,
.good_size = &intro_good_size,
.check = &intro_check,
.print = &intro_print,
.log = &intro_log,
.force_lock = &intro_force_lock,
.force_unlock = &intro_force_unlock,
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
.zone_locked = &intro_zone_locked,
.statistics = &intro_statistics,
#endif
};
intro.enumerator = &intro_enumerator;
intro.good_size = &intro_good_size;
intro.check = &intro_check;
intro.print = &intro_print;
intro.log = &intro_log;
intro.force_lock = &intro_force_lock;
intro.force_unlock = &intro_force_unlock;
static malloc_zone_t mi_malloc_zone = {
.size = &zone_size,
.zone_name = "mimalloc",
.introspect = &mi_introspect,
.malloc = &zone_malloc,
.calloc = &zone_calloc,
.valloc = &zone_valloc,
.free = &zone_free,
.realloc = &zone_realloc,
.destroy = &zone_destroy,
.batch_malloc = &zone_batch_malloc,
.batch_free = &zone_batch_free,
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
// switch to version 9 on OSX 10.6 to support memalign.
.version = 9,
.memalign = &zone_memalign,
.free_definite_size = &zone_free_definite_size,
.pressure_relief = &zone_pressure_relief,
#else
.version = 4,
#endif
};
static malloc_zone_t zone;
memset(&zone, 0, sizeof(zone));
zone.version = 4;
zone.zone_name = "mimalloc";
zone.size = &zone_size;
zone.introspect = &intro;
zone.malloc = &zone_malloc;
zone.calloc = &zone_calloc;
zone.valloc = &zone_valloc;
zone.free = &zone_free;
zone.realloc = &zone_realloc;
zone.destroy = &zone_destroy;
zone.batch_malloc = &zone_batch_malloc;
zone.batch_free = &zone_batch_free;
#if defined(MI_SHARED_LIB_EXPORT) && defined(MI_INTERPOSE)
static malloc_zone_t *mi_malloc_default_zone(void) {
return &mi_malloc_zone;
}
// TODO: should use the macros in alloc-override but they aren't available here.
__attribute__((used)) static struct {
const void *replacement;
const void *target;
} replace_malloc_default_zone[] __attribute__((section("__DATA, __interpose"))) = {
{ (const void*)mi_malloc_default_zone, (const void*)malloc_default_zone },
};
#endif
static void __attribute__((constructor(0))) _mi_macos_override_malloc() {
malloc_zone_t* purgeable_zone = NULL;
#if defined(MAC_OS_X_VERSION_10_6) && \
MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6
// switch to version 9 on OSX 10.6 to support memalign.
zone.version = 9;
zone.memalign = &zone_memalign;
zone.free_definite_size = &zone_free_definite_size;
zone.pressure_relief = &zone_pressure_relief;
intro.zone_locked = &intro_zone_locked;
intro.statistics = &intro_statistics;
// force the purgeable zone to exist to avoid strange bugs
if (malloc_default_purgeable_zone) {
purgeable_zone = malloc_default_purgeable_zone();
}
#endif
// Register our zone
malloc_zone_register(&zone);
// Register our zone.
// thomcc: I think this is still needed to put us in the zone list.
malloc_zone_register(&mi_malloc_zone);
// Unregister the default zone, this makes our zone the new default
// as that was the last registered.
malloc_zone_t *default_zone = mi_get_default_zone();
malloc_zone_unregister(default_zone);
// thomcc: Unsure if the next test is *always* false or just false in the
// cases I've tried. I'm also unsure if the code inside is needed at all.
if (default_zone != &mi_malloc_zone) {
malloc_zone_unregister(default_zone);
// Reregister the default zone so free and realloc in that zone keep working.
malloc_zone_register(default_zone);
// Reregister the default zone so free and realloc in that zone keep working.
malloc_zone_register(default_zone);
}
// Unregister, and re-register the purgeable_zone to avoid bugs if it occurs
// earlier than the default zone.
@ -257,4 +278,4 @@ static void __attribute__((constructor)) _mi_macos_override_malloc()
}
#endif // MI_MALLOC_OVERRIDE
#endif // MI_MALLOC_OVERRIDE

View File

@ -60,6 +60,13 @@ terms of the MIT license. A copy of the license can be found in the file
MI_INTERPOSE_MI(posix_memalign),
MI_INTERPOSE_MI(reallocf),
MI_INTERPOSE_MI(valloc),
#ifndef MI_OSX_ZONE
// some code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
#else
// We interpose malloc_default_zone in alloc-override-osx.c
MI_INTERPOSE_MI(free),
#endif
// some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity <https://github.com/nneonneo/osx-10.9-opensource/blob/master/objc4-551.1/runtime/hashtable2.mm>)
MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us
};
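
The idea behind interposing a checked free (as the `mi_cfree` entry above suggests) can be sketched as follows; this is illustrative only, and in the real interposed setting the fallback path reaches the original system `free` rather than the override:

```c
#include <stdlib.h>
#include <mimalloc.h>

// only hand the pointer to mimalloc when it actually came from a mimalloc heap
static void checked_free(void* p) {
  if (p == NULL) return;
  if (mi_is_in_heap_region(p)) {
    mi_free(p);   // allocated by mimalloc
  }
  else {
    free(p);      // foreign pointer: let the system allocator handle it
  }
}
```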
@ -183,7 +190,8 @@ void* _aligned_malloc(size_t alignment, size_t size) { return mi_aligne
// on some glibc `aligned_alloc` is declared `static inline` so we cannot override it (e.g. Conda). This happens
// when _GLIBCXX_HAVE_ALIGNED_ALLOC is not defined. However, in those cases it will use `memalign`, `posix_memalign`,
// or `_aligned_malloc` and we can avoid overriding it ourselves.
#if _GLIBCXX_HAVE_ALIGNED_ALLOC
// We should always override if using C compilation. (issue #276)
#if _GLIBCXX_HAVE_ALIGNED_ALLOC || !defined(__cplusplus)
void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
#endif

View File

@ -20,6 +20,10 @@ terms of the MIT license. A copy of the license can be found in the file
#include <string.h> // memcpy
#include <stdlib.h> // getenv
#ifdef _MSC_VER
#pragma warning(disable:4996) // getenv _wgetenv
#endif
#ifndef EINVAL
#define EINVAL 22
#endif
@ -111,8 +115,7 @@ mi_decl_restrict unsigned char* mi_mbsdup(const unsigned char* s) mi_attr_noexc
int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept {
if (buf==NULL || name==NULL) return EINVAL;
if (size != NULL) *size = 0;
#pragma warning(suppress:4996)
char* p = getenv(name);
char* p = getenv(name); // msvc warning 4996
if (p==NULL) {
*buf = NULL;
}
@ -132,8 +135,7 @@ int mi_wdupenv_s(unsigned short** buf, size_t* size, const unsigned short* name)
*buf = NULL;
return EINVAL;
#else
#pragma warning(suppress:4996)
unsigned short* p = (unsigned short*)_wgetenv((const wchar_t*)name);
unsigned short* p = (unsigned short*)_wgetenv((const wchar_t*)name); // msvc warning 4996
if (p==NULL) {
*buf = NULL;
}

View File

@ -23,27 +23,34 @@ terms of the MIT license. A copy of the license can be found in the file
// Fall back to generic allocation only if the list is empty.
extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept {
mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
mi_block_t* block = page->free;
mi_block_t* const block = page->free;
if (mi_unlikely(block == NULL)) {
return _mi_malloc_generic(heap, size);
}
mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
// pop from the free list
page->free = mi_block_next(page, block);
page->used++;
page->free = mi_block_next(page, block);
mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
#if (MI_DEBUG>0)
if (!page->is_zero) { memset(block, MI_DEBUG_UNINIT, size); }
#elif (MI_SECURE!=0)
block->next = 0; // don't leak internal data
#endif
#if (MI_STAT>1)
#if (MI_STAT>0)
const size_t bsize = mi_page_usable_block_size(page);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_increase(heap, normal, bsize);
mi_heap_stat_counter_increase(heap, normal_count, 1);
#if (MI_STAT>1)
const size_t bin = _mi_bin(bsize);
mi_heap_stat_increase(heap, normal[bin], 1);
mi_heap_stat_increase(heap, normal_bins[bin], 1);
#endif
}
#endif
#if (MI_PADDING > 0) && defined(MI_ENCODE_FREELIST)
mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
@ -54,6 +61,7 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
#endif
return block;
}
@ -282,6 +290,49 @@ static void mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, co
}
#endif
// only maintain stats for smaller objects if requested
#if (MI_STAT>0)
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
#if (MI_STAT < 2)
UNUSED(block);
#endif
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
#if (MI_STAT>1)
const size_t usize = mi_page_usable_size_of(page, block);
mi_heap_stat_decrease(heap, malloc, usize);
#endif
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, normal, bsize);
#if (MI_STAT > 1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1);
#endif
}
}
#else
static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) {
UNUSED(page); UNUSED(block);
}
#endif
#if (MI_STAT>0)
// maintain stats for huge objects
static void mi_stat_huge_free(const mi_page_t* page) {
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_block_size(page); // to match stats in `page.c:mi_page_huge_alloc`
if (bsize <= MI_HUGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, huge, bsize);
}
else {
mi_heap_stat_decrease(heap, giant, bsize);
}
}
#else
static void mi_stat_huge_free(const mi_page_t* page) {
UNUSED(page);
}
#endif
// ------------------------------------------------------
// Free
// ------------------------------------------------------
@ -300,16 +351,16 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
// huge page segments are always abandoned and can be freed immediately
mi_segment_t* const segment = _mi_page_segment(page);
if (segment->page_kind==MI_PAGE_HUGE) {
mi_stat_huge_free(page);
_mi_segment_huge_page_free(segment, page, block);
return;
}
// Try to put the block on either the page-local thread free list, or the heap delayed free list.
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
bool use_delayed;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
tfree = mi_atomic_read_relaxed(&page->xthread_free);
use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE);
if (mi_unlikely(use_delayed)) {
// unlikely: this only happens on the first concurrent free in a page that is in the full list
@ -320,31 +371,30 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
mi_block_set_next(page, block, mi_tf_block(tfree));
tfreex = mi_tf_set_block(tfree,block);
}
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
if (mi_unlikely(use_delayed)) {
// racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`)
mi_heap_t* const heap = mi_page_heap(page);
mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page);
mi_assert_internal(heap != NULL);
if (heap != NULL) {
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
mi_block_set_nextx(heap,block,dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
// and reset the MI_DELAYED_FREEING flag
tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
tfreex = tfree = mi_atomic_read_relaxed(&page->xthread_free);
tfreex = tfree;
mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING);
tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE);
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
} while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
}
}
// regular free
static inline void _mi_free_block(mi_page_t* page, bool local, mi_block_t* block)
{
@ -384,62 +434,65 @@ mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* p
static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, bool local, void* p) {
mi_page_t* const page = _mi_segment_page_of(segment, p);
mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
mi_stat_free(page, block);
_mi_free_block(page, local, block);
}
// Get the segment data belonging to a pointer
// This is just a single `and` in assembly but does further checks in debug mode
// (and secure mode) if this was a valid pointer.
static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg)
{
UNUSED(msg);
#if (MI_DEBUG>0)
if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
_mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p);
return NULL;
}
#endif
mi_segment_t* const segment = _mi_ptr_segment(p);
if (mi_unlikely(segment == NULL)) return NULL; // checks also for (p==NULL)
#if (MI_DEBUG>0)
if (mi_unlikely(!mi_is_in_heap_region(p))) {
_mi_warning_message("%s: pointer might not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", msg, p);
if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
}
#endif
#if (MI_DEBUG>0 || MI_SECURE>=4)
if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
_mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", p);
}
#endif
return segment;
}
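
The "single `and`" mentioned above boils down to masking the low bits of the pointer, since segments are aligned to the segment size. A minimal sketch (the 4 MiB size is an assumed example value, not necessarily mimalloc's actual segment size):

```c
#include <stdint.h>

#define EXAMPLE_SEGMENT_SIZE ((uintptr_t)1 << 22)   // 4 MiB, illustrative only

// map any interior pointer to the start of its (aligned) segment
static inline void* ptr_to_segment(const void* p) {
  return (void*)((uintptr_t)p & ~(EXAMPLE_SEGMENT_SIZE - 1));
}
```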
// Free a block
void mi_free(void* p) mi_attr_noexcept
{
#if (MI_DEBUG>0)
if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) {
_mi_error_message(EINVAL, "trying to free an invalid (unaligned) pointer: %p\n", p);
return;
}
#endif
const mi_segment_t* const segment = _mi_ptr_segment(p);
if (mi_unlikely(segment == NULL)) return; // checks for (p==NULL)
#if (MI_DEBUG!=0)
if (mi_unlikely(!mi_is_in_heap_region(p))) {
_mi_warning_message("possibly trying to free a pointer that does not point to a valid heap region: %p\n"
"(this may still be a valid very large allocation (over 64MiB))\n", p);
if (mi_likely(_mi_ptr_cookie(segment) == segment->cookie)) {
_mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p);
}
}
#endif
#if (MI_DEBUG!=0 || MI_SECURE>=4)
if (mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie)) {
_mi_error_message(EINVAL, "trying to free a pointer that does not point to a valid heap space: %p\n", p);
return;
}
#endif
const mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free");
if (mi_unlikely(segment == NULL)) return;
const uintptr_t tid = _mi_thread_id();
mi_page_t* const page = _mi_segment_page_of(segment, p);
mi_block_t* const block = (mi_block_t*)p;
#if (MI_STAT>1)
mi_heap_t* const heap = mi_heap_get_default();
const size_t bsize = mi_page_usable_block_size(page);
mi_heap_stat_decrease(heap, malloc, bsize);
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { // huge page stats are accounted for in `_mi_page_retire`
mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], 1);
}
#endif
if (mi_likely(tid == segment->thread_id && page->flags.full_aligned == 0)) { // the thread id matches and it is not a full page, nor has aligned blocks
// local, and not full or aligned
if (mi_unlikely(mi_check_is_double_free(page,block))) return;
mi_check_padding(page, block);
mi_stat_free(page, block);
#if (MI_DEBUG!=0)
memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
#endif
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
if (mi_unlikely(mi_page_all_free(page))) {
if (mi_unlikely(--page->used == 0)) { // using this expression generates better code than: page->used--; if (mi_page_all_free(page))
_mi_page_retire(page);
}
}
@ -473,9 +526,9 @@ bool _mi_free_delayed_block(mi_block_t* block) {
}
// Bytes available in a block
size_t mi_usable_size(const void* p) mi_attr_noexcept {
if (p==NULL) return 0;
const mi_segment_t* const segment = _mi_ptr_segment(p);
static size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept {
const mi_segment_t* const segment = mi_checked_ptr_segment(p,msg);
if (segment==NULL) return 0;
const mi_page_t* const page = _mi_segment_page_of(segment, p);
const mi_block_t* block = (const mi_block_t*)p;
if (mi_unlikely(mi_page_has_aligned(page))) {
@ -490,6 +543,10 @@ size_t mi_usable_size(const void* p) mi_attr_noexcept {
}
}
size_t mi_usable_size(const void* p) mi_attr_noexcept {
return _mi_usable_size(p, "mi_usable_size");
}
// ------------------------------------------------------
// ensure explicit external inline definitions are emitted!
@ -513,7 +570,7 @@ void* _mi_externs[] = {
void mi_free_size(void* p, size_t size) mi_attr_noexcept {
UNUSED_RELEASE(size);
mi_assert(p == NULL || size <= mi_usable_size(p));
mi_assert(p == NULL || size <= _mi_usable_size(p,"mi_free_size"));
mi_free(p);
}
@ -553,14 +610,14 @@ mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept {
// Expand in place or fail
void* mi_expand(void* p, size_t newsize) mi_attr_noexcept {
if (p == NULL) return NULL;
size_t size = mi_usable_size(p);
size_t size = _mi_usable_size(p,"mi_expand");
if (newsize > size) return NULL;
return p; // it fits
}
void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) {
if (p == NULL) return _mi_heap_malloc_zero(heap,newsize,zero);
size_t size = mi_usable_size(p);
size_t size = _mi_usable_size(p,"mi_realloc");
if (newsize <= size && newsize >= (size / 2)) {
return p; // reallocation still fits and not more than 50% waste
}
@ -669,7 +726,7 @@ mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept {
#ifndef PATH_MAX
#define PATH_MAX MAX_PATH
#endif
#include <windows.h>
#include <Windows.h>
mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept {
// todo: use GetFullPathNameW to allow longer file names
char buf[PATH_MAX];
@ -733,7 +790,12 @@ but we call `exit` instead (i.e. not returning).
#ifdef __cplusplus
#include <new>
static bool mi_try_new_handler(bool nothrow) {
std::new_handler h = std::get_new_handler();
#if defined(_MSC_VER) || (__cplusplus >= 201103L)
std::new_handler h = std::get_new_handler();
#else
std::new_handler h = std::set_new_handler(NULL);  // C++98 has no get_new_handler: swap with NULL ..
std::set_new_handler(h);                          // .. and immediately restore the original handler
#endif
if (h==NULL) {
if (!nothrow) throw std::bad_alloc();
return false;
@ -750,12 +812,12 @@ typedef void (*std_new_handler_t)();
std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv() {
return NULL;
}
std_new_handler_t mi_get_new_handler() {
static std_new_handler_t mi_get_new_handler() {
return _ZSt15get_new_handlerv();
}
#else
// note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`.
std_new_handler_t mi_get_new_handler() {
static std_new_handler_t mi_get_new_handler() {
return NULL;
}
#endif

View File

@ -30,12 +30,13 @@ of 256MiB in practice.
#include "mimalloc-atomic.h"
#include <string.h> // memset
#include <errno.h> // ENOMEM
#include "bitmap.inc.c" // atomic bitmap
#include "bitmap.h" // atomic bitmap
// os.c
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld);
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* stats);
void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats);
void _mi_os_free(void* p, size_t size, mi_stats_t* stats);
@ -43,14 +44,14 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_sec
void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats);
bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats);
/* -----------------------------------------------------------
Arena allocation
----------------------------------------------------------- */
#define MI_SEGMENT_ALIGN MI_SEGMENT_SIZE
#define MI_ARENA_BLOCK_SIZE (8*MI_SEGMENT_ALIGN) // 32MiB
#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_FIELD_BITS * MI_ARENA_BLOCK_SIZE) // 2GiB
#define MI_ARENA_BLOCK_SIZE (4*MI_SEGMENT_ALIGN) // 32MiB
#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 16MiB
#define MI_MAX_ARENAS (64) // not more than 256 (since we use 8 bits in the memid)
@ -61,12 +62,12 @@ typedef struct mi_arena_s {
size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
int numa_node; // associated NUMA node
bool is_zero_init; // is the arena zero initialized?
bool is_committed; // is the memory committed
bool is_large; // large OS page allocated
volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
bool is_committed; // is the memory fully committed? (if so, block_committed == NULL)
bool is_large; // large- or huge OS pages (always committed)
_Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks
mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero?
mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed?
mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`)
} mi_arena_t;
@ -104,16 +105,11 @@ static size_t mi_block_count_of_size(size_t size) {
----------------------------------------------------------- */
static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx)
{
const size_t fcount = arena->field_count;
size_t idx = mi_atomic_read(&arena->search_idx); // start from last search
for (size_t visited = 0; visited < fcount; visited++, idx++) {
if (idx >= fcount) idx = 0; // wrap around
// try to atomically claim a range of bits
if (mi_bitmap_try_find_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) {
mi_atomic_write(&arena->search_idx, idx); // start search from here next time
return true;
}
}
size_t idx = mi_atomic_load_acquire(&arena->search_idx); // start from last search
if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx)) {
mi_atomic_store_release(&arena->search_idx, idx); // start search from here next time
return true;
};
return false;
}
@ -123,16 +119,17 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t*
----------------------------------------------------------- */
static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
mi_bitmap_index_t bitmap_index;
if (!mi_arena_alloc(arena, needed_bcount, &bitmap_index)) return NULL;
// claimed it! set the dirty bits (todo: no need for an atomic op here?)
void* p = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
*memid = mi_arena_id_create(arena_index, bitmap_index);
*is_zero = mi_bitmap_claim(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
*large = arena->is_large;
void* p = arena->start + (mi_bitmap_index_bit(bitmap_index)*MI_ARENA_BLOCK_SIZE);
*memid = mi_arena_id_create(arena_index, bitmap_index);
*is_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
*large = arena->is_large;
*is_pinned = (arena->is_large || arena->is_committed);
if (arena->is_committed) {
// always committed
*commit = true;
@ -140,7 +137,7 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n
else if (*commit) {
// arena not committed as a whole, but commit requested: ensure commit now
bool any_uncommitted;
mi_bitmap_claim(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
bool commit_zero;
_mi_os_commit(p, needed_bcount * MI_ARENA_BLOCK_SIZE, &commit_zero, tld->stats);
@ -149,25 +146,25 @@ static void* mi_arena_alloc_from(mi_arena_t* arena, size_t arena_index, size_t n
}
else {
// no need to commit, but check if already fully committed
*commit = mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
*commit = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
}
return p;
}
void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
bool* commit, bool* large, bool* is_zero,
void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero,
size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(commit != NULL && large != NULL && is_zero != NULL && memid != NULL && tld != NULL);
mi_assert_internal(commit != NULL && is_pinned != NULL && is_zero != NULL && memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = MI_MEMID_OS;
*is_zero = false;
*is_pinned = false;
// try to allocate in an arena if the alignment is small enough
// and the object is not too large or too small.
if (alignment <= MI_SEGMENT_ALIGN &&
size <= MI_ARENA_MAX_OBJ_SIZE &&
size >= MI_ARENA_MIN_OBJ_SIZE)
size >= MI_ARENA_MIN_OBJ_SIZE &&
mi_atomic_load_relaxed(&mi_arena_count) > 0)
{
const size_t bcount = mi_block_count_of_size(size);
const int numa_node = _mi_os_numa_node(tld); // current numa node
@ -175,24 +172,24 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
// try numa affine allocation
for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
if (arena==NULL) break; // end reached
if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local?
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
{
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld);
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld);
mi_assert_internal((uintptr_t)p % alignment == 0);
if (p != NULL) return p;
}
}
// try from another numa node instead..
for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
if (arena==NULL) break; // end reached
if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local!
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
{
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_zero, memid, tld);
void* p = mi_arena_alloc_from(arena, i, bcount, commit, large, is_pinned, is_zero, memid, tld);
mi_assert_internal((uintptr_t)p % alignment == 0);
if (p != NULL) return p;
}
@ -200,14 +197,17 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
}
// finally, fall back to the OS
if (mi_option_is_enabled(mi_option_limit_os_alloc)) return NULL;
*is_zero = true;
*memid = MI_MEMID_OS;
return _mi_os_alloc_aligned(size, alignment, *commit, large, tld);
*memid = MI_MEMID_OS;
void* p = _mi_os_alloc_aligned(size, alignment, *commit, large, tld->stats);
if (p != NULL) *is_pinned = *large;
return p;
}
void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_zero, memid, tld);
return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, commit, large, is_pinned, is_zero, memid, tld);
}
/* -----------------------------------------------------------
@ -228,8 +228,10 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_s
size_t bitmap_idx;
mi_arena_id_indices(memid, &arena_idx, &bitmap_idx);
mi_assert_internal(arena_idx < MI_MAX_ARENAS);
mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
mi_assert_internal(arena != NULL);
const size_t blocks = mi_block_count_of_size(size);
// checks
if (arena == NULL) {
_mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
@ -239,9 +241,18 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_s
_mi_error_message(EINVAL, "trying to free from non-existent arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
return;
}
const size_t blocks = mi_block_count_of_size(size);
bool ones = mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
if (!ones) {
// potentially decommit
if (arena->is_committed) {
mi_assert_internal(all_committed);
}
else {
mi_assert_internal(arena->blocks_committed != NULL);
_mi_os_decommit(p, blocks * MI_ARENA_BLOCK_SIZE, stats); // ok if this fails
_mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
}
// and make it available to others again
bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx);
if (!all_inuse) {
_mi_error_message(EAGAIN, "trying to free an already freed block: %p, size %zu\n", p, size);
return;
};
@ -254,24 +265,76 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_s
static bool mi_arena_add(mi_arena_t* arena) {
mi_assert_internal(arena != NULL);
mi_assert_internal((uintptr_t)mi_atomic_read_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
mi_assert_internal(arena->block_count > 0);
uintptr_t i = mi_atomic_increment(&mi_arena_count);
uintptr_t i = mi_atomic_increment_acq_rel(&mi_arena_count);
if (i >= MI_MAX_ARENAS) {
mi_atomic_decrement(&mi_arena_count);
mi_atomic_decrement_acq_rel(&mi_arena_count);
return false;
}
mi_atomic_write_ptr(mi_arena_t,&mi_arenas[i], arena);
mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena);
return true;
}
bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept
{
if (is_large) {
mi_assert_internal(is_committed);
is_committed = true;
}
const size_t bcount = mi_block_count_of_size(size);
const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
const size_t bitmaps = (is_committed ? 2 : 3);
const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t));
mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
if (arena == NULL) return false;
arena->block_count = bcount;
arena->field_count = fields;
arena->start = (uint8_t*)start;
arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
arena->is_large = is_large;
arena->is_zero_init = is_zero;
arena->is_committed = is_committed;
arena->search_idx = 0;
arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
arena->blocks_committed = (is_committed ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
// the bitmaps are already zero initialized due to os_alloc
// just claim leftover blocks if needed
ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
mi_assert_internal(post >= 0);
if (post > 0) {
// don't use leftover bits at the end
mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
_mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
}
mi_arena_add(arena);
return true;
}
// Reserve a range of regular OS memory
int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept
{
size = _mi_os_good_alloc_size(size);
bool large = allow_large;
void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, &large, &_mi_stats_main);
if (start==NULL) return ENOMEM;
if (!mi_manage_os_memory(start, size, (large || commit), large, true, -1)) {
_mi_os_free_ex(start, size, commit, &_mi_stats_main);
_mi_verbose_message("failed to reserve %zu k memory\n", _mi_divide_up(size,1024));
return ENOMEM;
}
_mi_verbose_message("reserved %zu kb memory%s\n", _mi_divide_up(size,1024), large ? " (in large os pages)" : "");
return 0;
}
/* -----------------------------------------------------------
Reserve a huge page arena.
----------------------------------------------------------- */
#include <errno.h> // ENOMEM
// reserve at a specific numa node
int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept {
if (pages==0) return 0;
@ -286,35 +349,10 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
}
_mi_verbose_message("numa node %i: reserved %zu gb huge pages (of the %zu gb requested)\n", numa_node, pages_reserved, pages);
size_t bcount = mi_block_count_of_size(hsize);
size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
size_t asize = sizeof(mi_arena_t) + (2*fields*sizeof(mi_bitmap_field_t));
mi_arena_t* arena = (mi_arena_t*)_mi_os_alloc(asize, &_mi_stats_main); // TODO: can we avoid allocating from the OS?
if (arena == NULL) {
if (!mi_manage_os_memory(p, hsize, true, true, true, numa_node)) {
_mi_os_free_huge_pages(p, hsize, &_mi_stats_main);
return ENOMEM;
}
arena->block_count = bcount;
arena->field_count = fields;
arena->start = (uint8_t*)p;
arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1)
arena->is_large = true;
arena->is_zero_init = true;
arena->is_committed = true;
arena->search_idx = 0;
arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap
arena->blocks_committed = NULL;
// the bitmaps are already zero initialized due to os_alloc
// just claim leftover blocks if needed
ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount;
mi_assert_internal(post >= 0);
if (post > 0) {
// don't use leftover bits at the end
mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post);
mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL);
}
mi_arena_add(arena);
return 0;
}

src/bitmap.c (new file, 395 lines)
View File

@ -0,0 +1,395 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019,2020 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`uintptr_t`).
There are two APIs; the standard one cannot have sequences that cross
between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS).
(this is used in region allocation)
The `_across` postfixed functions do allow sequences that can cross over
between the fields. (This is used in arena allocation)
---------------------------------------------------------------------------- */
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "bitmap.h"
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
// The bit mask for a given number of blocks at a specified bit index.
static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
mi_assert_internal(count > 0);
if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
if (count == 0) return 0;
return ((((uintptr_t)1 << count) - 1) << bitidx);
}
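
A standalone check of the mask computation above: `count = 3`, `bitidx = 5` selects bits 5..7, i.e. `0xE0`.

```c
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
  const size_t count = 3, bitidx = 5;
  const uintptr_t mask = (((uintptr_t)1 << count) - 1) << bitidx;
  assert(mask == 0xE0);   // bits 5, 6 and 7 are set
  return 0;
}
```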
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
mi_assert_internal(count <= MI_BITMAP_FIELD_BITS);
_Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_load_relaxed(field);
if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
// search for 0-bit sequence of length count
const uintptr_t mask = mi_bitmap_mask_(count, 0);
const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
#ifdef MI_HAVE_FAST_BITSCAN
size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible
#else
size_t bitidx = 0; // otherwise start at 0
#endif
uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
// scan linearly for a free range of zero bits
while (bitidx <= bitidx_max) {
const uintptr_t mapm = map & m;
if (mapm == 0) { // are the mask bits free at bitidx?
mi_assert_internal((m >> bitidx) == mask); // no overflow?
const uintptr_t newmap = map | m;
mi_assert_internal((newmap^map) >> bitidx == mask);
if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) { // TODO: use strong cas here?
// no success, another thread claimed concurrently.. keep going (with updated `map`)
continue;
}
else {
// success, we claimed the bits!
*bitmap_idx = mi_bitmap_index_create(idx, bitidx);
return true;
}
}
else {
// on to the next bit range
#ifdef MI_HAVE_FAST_BITSCAN
const size_t shift = (count == 1 ? 1 : mi_bsr(mapm) - bitidx + 1);
mi_assert_internal(shift > 0 && shift <= count);
#else
const size_t shift = 1;
#endif
bitidx += shift;
m <<= shift;
}
}
// no bits found
return false;
}
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
size_t idx = start_field_idx;
for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
if (idx >= bitmap_fields) idx = 0; // wrap
if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
return true;
}
}
return false;
}
/*
// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
bool _mi_bitmap_try_find_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t count, mi_bitmap_index_t* bitmap_idx) {
return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, 0, count, bitmap_idx);
}
*/
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == mask);
uintptr_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask);
return ((prev & mask) == mask);
}
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
//mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
uintptr_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask);
if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
return ((prev & mask) == 0);
}
// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]);
if (any_ones != NULL) *any_ones = ((field & mask) != 0);
return ((field & mask) == mask);
}
bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
}
bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
bool any_ones;
mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
return any_ones;
}
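// A minimal usage sketch of the single-field API (illustrative only; `bm` and `bidx`
// are hypothetical names), assuming a static bitmap of 2 fields (128 bits on 64-bit):
//
//   static mi_bitmap_field_t bm[2];   // zero-initialized: all bits free
//   mi_bitmap_index_t bidx;
//   if (_mi_bitmap_try_find_from_claim(bm, 2, 0, 8, &bidx)) {
//     mi_assert_internal(_mi_bitmap_is_claimed(bm, 2, 8, bidx));  // we own these 8 bits
//     mi_bitmap_unclaim(bm, 2, 8, bidx);                          // and release them again
//   }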
//--------------------------------------------------------------------------
// the `_across` functions work on bitmaps where sequences can cross over
// between the fields. This is used in arena allocation
//--------------------------------------------------------------------------
// Try to atomically claim a sequence of `count` bits starting from the field
// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success.
static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
// check the free (zero) bits at the top end of this field; these form the start of a cross-field sequence
_Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_load_relaxed(field);
const size_t initial = mi_clz(map); // number of free bits at the top of field `idx`
mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS);
if (initial == 0) return false;
if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields
if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries
// scan ahead
size_t found = initial;
uintptr_t mask = 0; // mask bits for the final field
while(found < count) {
field++;
map = mi_atomic_load_relaxed(field);
const uintptr_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found));
mask = mi_bitmap_mask_(mask_bits, 0);
if ((map & mask) != 0) return false;
found += mask_bits;
}
mi_assert_internal(field < &bitmap[bitmap_fields]);
// found range of zeros up to the final field; mask contains mask in the final field
// now claim it atomically
_Atomic(uintptr_t)* const final_field = field;
const uintptr_t final_mask = mask;
_Atomic(uintptr_t)* const initial_field = &bitmap[idx];
const uintptr_t initial_mask = mi_bitmap_mask_(initial, MI_BITMAP_FIELD_BITS - initial);
// initial field
uintptr_t newmap;
field = initial_field;
map = mi_atomic_load_relaxed(field);
do {
newmap = map | initial_mask;
if ((map & initial_mask) != 0) { goto rollback; };
} while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
// intermediate fields
while (++field < final_field) {
newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
map = 0;
if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; }
}
// final field
mi_assert_internal(field == final_field);
map = mi_atomic_load_relaxed(field);
do {
newmap = map | final_mask;
if ((map & final_mask) != 0) { goto rollback; }
} while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
// claimed!
*bitmap_idx = mi_bitmap_index_create(idx, MI_BITMAP_FIELD_BITS - initial);
return true;
rollback:
// roll back intermediate fields
while (--field > initial_field) {
newmap = 0;
map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0);
mi_assert_internal(mi_atomic_load_relaxed(field) == map);
mi_atomic_store_release(field, newmap);
}
if (field == initial_field) {
map = mi_atomic_load_relaxed(field);
do {
mi_assert_internal((map & initial_mask) == initial_mask);
newmap = map & ~initial_mask;
} while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap));
}
// retry? (we make a recursive call instead of goto to be able to use const declarations)
if (retries < 4) {
return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx);
}
else {
return false;
}
}
// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) {
mi_assert_internal(count > 0);
if (count==1) return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx);
size_t idx = start_field_idx;
for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) {
if (idx >= bitmap_fields) idx = 0; // wrap
// try to claim inside the field
if (count <= MI_BITMAP_FIELD_BITS) {
if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
return true;
}
}
// try to claim across fields
if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx)) {
return true;
}
}
return false;
}
// Helper for masks across fields; returns the mid count, post_mask may be 0
static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, uintptr_t* pre_mask, uintptr_t* mid_mask, uintptr_t* post_mask) {
UNUSED_RELEASE(bitmap_fields);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
if (mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS)) {
*pre_mask = mi_bitmap_mask_(count, bitidx);
*mid_mask = 0;
*post_mask = 0;
mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields);
return 0;
}
else {
const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx;
mi_assert_internal(pre_bits < count);
*pre_mask = mi_bitmap_mask_(pre_bits, bitidx);
count -= pre_bits;
const size_t mid_count = (count / MI_BITMAP_FIELD_BITS);
*mid_mask = MI_BITMAP_FIELD_FULL;
count %= MI_BITMAP_FIELD_BITS;
*post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0));
mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields);
return mid_count;
}
}
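// Worked example (a sketch, assuming 64-bit fields; not part of the original source):
// for a bitmap_idx with bit index 60 inside its field and count = 70:
//   pre_bits  = 64 - 60 = 4   -> pre_mask  = mi_bitmap_mask_(4, 60)  (the top 4 bits)
//   mid_count = 66 / 64 = 1   -> mid_mask  = MI_BITMAP_FIELD_FULL    (one full field)
//   66 % 64   = 2             -> post_mask = mi_bitmap_mask_(2, 0)   (the low 2 bits)
// so the range covers the tail of one field, one whole field, and the head of the next.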
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
uintptr_t pre_mask;
uintptr_t mid_mask;
uintptr_t post_mask;
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_one = true;
_Atomic(uintptr_t)*field = &bitmap[idx];
uintptr_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask);
if ((prev & pre_mask) != pre_mask) all_one = false;
while(mid_count-- > 0) {
prev = mi_atomic_and_acq_rel(field++, ~mid_mask);
if ((prev & mid_mask) != mid_mask) all_one = false;
}
if (post_mask!=0) {
prev = mi_atomic_and_acq_rel(field, ~post_mask);
if ((prev & post_mask) != post_mask) all_one = false;
}
return all_one;
}
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
uintptr_t pre_mask;
uintptr_t mid_mask;
uintptr_t post_mask;
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_zero = true;
bool any_zero = false;
_Atomic(uintptr_t)*field = &bitmap[idx];
uintptr_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
if ((prev & pre_mask) != 0) all_zero = false;
if ((prev & pre_mask) != pre_mask) any_zero = true;
while (mid_count-- > 0) {
prev = mi_atomic_or_acq_rel(field++, mid_mask);
if ((prev & mid_mask) != 0) all_zero = false;
if ((prev & mid_mask) != mid_mask) any_zero = true;
}
if (post_mask!=0) {
prev = mi_atomic_or_acq_rel(field, post_mask);
if ((prev & post_mask) != 0) all_zero = false;
if ((prev & post_mask) != post_mask) any_zero = true;
}
if (pany_zero != NULL) *pany_zero = any_zero;
return all_zero;
}
// Returns `true` if all `count` bits were 1.
// `any_ones` is `true` if there was at least one bit set to one.
static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
size_t idx = mi_bitmap_index_field(bitmap_idx);
uintptr_t pre_mask;
uintptr_t mid_mask;
uintptr_t post_mask;
size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
bool all_ones = true;
bool any_ones = false;
_Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t prev = mi_atomic_load_relaxed(field++);
if ((prev & pre_mask) != pre_mask) all_ones = false;
if ((prev & pre_mask) != 0) any_ones = true;
while (mid_count-- > 0) {
prev = mi_atomic_load_relaxed(field++);
if ((prev & mid_mask) != mid_mask) all_ones = false;
if ((prev & mid_mask) != 0) any_ones = true;
}
if (post_mask!=0) {
prev = mi_atomic_load_relaxed(field);
if ((prev & post_mask) != post_mask) all_ones = false;
if ((prev & post_mask) != 0) any_ones = true;
}
if (pany_ones != NULL) *pany_ones = any_ones;
return all_ones;
}
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
}
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
bool any_ones;
mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
return any_ones;
}
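// A minimal usage sketch of the `_across` API (illustrative only; `bm` and `bidx`
// are hypothetical names), assuming a static bitmap of 4 fields (256 bits on 64-bit):
//
//   static mi_bitmap_field_t bm[4];
//   mi_bitmap_index_t bidx;
//   if (_mi_bitmap_try_find_from_claim_across(bm, 4, 0, 100, &bidx)) {
//     // 100 contiguous bits were claimed; the run may span up to three fields
//     mi_assert_internal(_mi_bitmap_is_claimed_across(bm, 4, 100, bidx));
//     _mi_bitmap_unclaim_across(bm, 4, 100, bidx);
//   }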
src/bitmap.h Normal file
@ -0,0 +1,102 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019,2020 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
Concurrent bitmap that can set/reset sequences of bits atomically,
represented as an array of fields where each field is a machine word (`uintptr_t`).
There are two APIs: the standard one cannot have sequences that cross
between bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS);
this is used in region allocation.
The `_across`-postfixed functions do allow sequences that cross over
field boundaries; this is used in arena allocation.
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_H
#define MI_BITMAP_H
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE)
#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set
// An atomic bitmap of `uintptr_t` fields
typedef _Atomic(uintptr_t) mi_bitmap_field_t;
typedef mi_bitmap_field_t* mi_bitmap_t;
// A bitmap index is the index of the bit in a bitmap.
typedef size_t mi_bitmap_index_t;
// Create a bit index.
static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
}
// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx / MI_BITMAP_FIELD_BITS);
}
// Get the bit index in a bitmap field
static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}
// Get the full bit index
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
return bitmap_idx;
}
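// For example (a sketch, assuming 64-bit fields; not part of the original source):
//   mi_bitmap_index_create(2, 5)      == 2*64 + 5 == 133
//   mi_bitmap_index_field(133)        == 2
//   mi_bitmap_index_bit_in_field(133) == 5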
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields.
bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero);
bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
//--------------------------------------------------------------------------
// the `_across` functions work on bitmaps where sequences can cross over
// between the fields. This is used in arena allocation
//--------------------------------------------------------------------------
// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success.
// Starts at idx, and wraps around to search in all `bitmap_fields` fields.
bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx);
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
#endif
@ -1,240 +0,0 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2019, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
/* ----------------------------------------------------------------------------
This file is meant to be included in other files for efficiency.
It implements a bitmap that can set/reset sequences of bits atomically
and is used to concurrently claim memory ranges.
A bitmap is an array of fields where each field is a machine word (`uintptr_t`)
A current limitation is that the bit sequences cannot cross fields
and that the sequence must be smaller or equal to the bits in a field.
---------------------------------------------------------------------------- */
#pragma once
#ifndef MI_BITMAP_C
#define MI_BITMAP_C
#include "mimalloc.h"
#include "mimalloc-internal.h"
/* -----------------------------------------------------------
Bitmap definition
----------------------------------------------------------- */
#define MI_BITMAP_FIELD_BITS (8*MI_INTPTR_SIZE)
#define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set
// An atomic bitmap of `uintptr_t` fields
typedef volatile _Atomic(uintptr_t) mi_bitmap_field_t;
typedef mi_bitmap_field_t* mi_bitmap_t;
// A bitmap index is the index of the bit in a bitmap.
typedef size_t mi_bitmap_index_t;
// Create a bit index.
static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) {
mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS);
return (idx*MI_BITMAP_FIELD_BITS) + bitidx;
}
// Get the field index from a bit index.
static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx / MI_BITMAP_FIELD_BITS);
}
// Get the bit index in a bitmap field
static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) {
return (bitmap_idx % MI_BITMAP_FIELD_BITS);
}
// Get the full bit index
static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) {
return bitmap_idx;
}
// The bit mask for a given number of blocks at a specified bit index.
static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) {
mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS);
if (count == MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL;
return ((((uintptr_t)1 << count) - 1) << bitidx);
}
/* -----------------------------------------------------------
Use bit scan forward/reverse to quickly find the first zero bit if it is available
----------------------------------------------------------- */
#if defined(_MSC_VER)
#define MI_HAVE_BITSCAN
#include <intrin.h>
static inline size_t mi_bsf(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanForward)(&idx, x);
return idx;
}
static inline size_t mi_bsr(uintptr_t x) {
if (x==0) return 8*MI_INTPTR_SIZE;
DWORD idx;
MI_64(_BitScanReverse)(&idx, x);
return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
#include <limits.h> // LONG_MAX
#define MI_HAVE_BITSCAN
#if (INTPTR_MAX == LONG_MAX)
# define MI_L(x) x##l
#else
# define MI_L(x) x##ll
#endif
static inline size_t mi_bsf(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : MI_L(__builtin_ctz)(x));
}
static inline size_t mi_bsr(uintptr_t x) {
return (x==0 ? 8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - MI_L(__builtin_clz)(x));
}
#endif
/* -----------------------------------------------------------
Claim a bit sequence atomically
----------------------------------------------------------- */
// Try to atomically claim a sequence of `count` bits at `bitmap_idx`
// in the bitmap field. Returns `true` on success.
static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_fields, const size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS);
uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]);
if ((field & mask) == 0) { // free?
if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) {
// claimed!
return true;
}
}
return false;
}
// Try to atomically claim a sequence of `count` bits in a single
// field at `idx` in `bitmap`. Returns `true` on success.
static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx)
{
mi_assert_internal(bitmap_idx != NULL);
volatile _Atomic(uintptr_t)* field = &bitmap[idx];
uintptr_t map = mi_atomic_read(field);
if (map==MI_BITMAP_FIELD_FULL) return false; // short cut
// search for 0-bit sequence of length count
const uintptr_t mask = mi_bitmap_mask_(count, 0);
const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count;
#ifdef MI_HAVE_BITSCAN
size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible
#else
size_t bitidx = 0; // otherwise start at 0
#endif
uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx
// scan linearly for a free range of zero bits
while (bitidx <= bitidx_max) {
if ((map & m) == 0) { // are the mask bits free at bitidx?
mi_assert_internal((m >> bitidx) == mask); // no overflow?
const uintptr_t newmap = map | m;
mi_assert_internal((newmap^map) >> bitidx == mask);
if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here?
// no success, another thread claimed concurrently.. keep going
map = mi_atomic_read(field);
continue;
}
else {
// success, we claimed the bits!
*bitmap_idx = mi_bitmap_index_create(idx, bitidx);
return true;
}
}
else {
// on to the next bit range
#ifdef MI_HAVE_BITSCAN
const size_t shift = (count == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
mi_assert_internal(shift > 0 && shift <= count);
#else
const size_t shift = 1;
#endif
bitidx += shift;
m <<= shift;
}
}
// no bits found
return false;
}
// Find `count` bits of 0 and set them to 1 atomically; returns `true` on success.
// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never span fields.
static inline bool mi_bitmap_try_find_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t* bitmap_idx) {
for (size_t idx = 0; idx < bitmap_fields; idx++) {
if (mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) {
return true;
}
}
return false;
}
// Set `count` bits at `bitmap_idx` to 0 atomically
// Returns `true` if all `count` bits were 1 previously.
static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
// mi_assert_internal((bitmap[idx] & mask) == mask);
uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask);
return ((prev & mask) == mask);
}
// Set `count` bits at `bitmap_idx` to 1 atomically
// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
//mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0);
uintptr_t prev = mi_atomic_or(&bitmap[idx], mask);
if (any_zero != NULL) *any_zero = ((prev & mask) != mask);
return ((prev & mask) == 0);
}
// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one.
static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) {
const size_t idx = mi_bitmap_index_field(bitmap_idx);
const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx);
const uintptr_t mask = mi_bitmap_mask_(count, bitidx);
mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields);
uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]);
if (any_ones != NULL) *any_ones = ((field & mask) != 0);
return ((field & mask) == mask);
}
static inline bool mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL);
}
static inline bool mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
bool any_ones;
mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
return any_ones;
}
#endif
@ -11,6 +11,9 @@ terms of the MIT license. A copy of the license can be found in the file
#include <string.h> // memset, memcpy
#if defined(_MSC_VER) && (_MSC_VER < 1920)
#pragma warning(disable:4204) // non-constant aggregate initializer
#endif
/* -----------------------------------------------------------
Helpers
@ -111,7 +114,7 @@ static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq
static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
{
if (!mi_heap_is_initialized(heap)) return;
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
_mi_deferred_free(heap, collect >= MI_FORCE);
// note: never reclaim on collect but leave it to threads that need storage to reclaim
@ -128,7 +131,6 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
_mi_abandoned_reclaim_all(heap, &heap->tld->segments);
}
// if abandoning, mark all pages to no longer add to delayed_free
if (collect == MI_ABANDON) {
mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL);
@ -143,19 +145,17 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
// collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
mi_assert_internal( collect != MI_ABANDON || mi_atomic_read_ptr(mi_block_t,&heap->thread_delayed_free) == NULL );
mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL );
// collect segment caches
if (collect >= MI_FORCE) {
_mi_segment_thread_collect(&heap->tld->segments);
}
#ifndef NDEBUG
// collect regions
// collect regions on program-exit (or shared library unload)
if (collect >= MI_FORCE && _mi_is_main_thread() && mi_heap_is_backing(heap)) {
_mi_mem_collect(&heap->tld->os);
}
#endif
}
void _mi_heap_collect_abandon(mi_heap_t* heap) {
@ -213,6 +213,7 @@ uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
// zero out the page queues
static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
// TODO: copy full empty heap instead?
memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct));
@ -228,6 +229,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
static void mi_heap_free(mi_heap_t* heap) {
mi_assert(heap != NULL);
mi_assert_internal(mi_heap_is_initialized(heap));
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
if (mi_heap_is_backing(heap)) return; // dont free the backing heap
// reset default
@ -272,17 +274,20 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_
const size_t bsize = mi_page_block_size(page);
if (bsize > MI_LARGE_OBJ_SIZE_MAX) {
if (bsize > MI_HUGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&heap->tld->stats.giant, bsize);
mi_heap_stat_decrease(heap, giant, bsize);
}
else {
_mi_stat_decrease(&heap->tld->stats.huge, bsize);
mi_heap_stat_decrease(heap, huge, bsize);
}
}
#if (MI_STAT>1)
#if (MI_STAT)
_mi_page_free_collect(page, false); // update used count
const size_t inuse = page->used;
if (bsize <= MI_LARGE_OBJ_SIZE_MAX) {
mi_heap_stat_decrease(heap, normal[_mi_bin(bsize)], inuse);
mi_heap_stat_decrease(heap, normal, bsize * inuse);
#if (MI_STAT>1)
mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse);
#endif
}
mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks...
#endif
@ -310,7 +315,7 @@ void mi_heap_destroy(mi_heap_t* heap) {
mi_assert(mi_heap_is_initialized(heap));
mi_assert(heap->no_reclaim);
mi_assert_expensive(mi_heap_is_valid(heap));
if (!mi_heap_is_initialized(heap)) return;
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
if (!heap->no_reclaim) {
// don't free in case it may contain reclaimed pages
mi_heap_delete(heap);
@ -354,7 +359,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) {
// turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls
// the regular `_mi_free_delayed_block` which is safe.
_mi_heap_delayed_free(from);
mi_assert_internal(from->thread_delayed_free == NULL);
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL);
// and reset the `from` heap
mi_heap_reset_pages(from);
@ -366,7 +371,7 @@ void mi_heap_delete(mi_heap_t* heap)
mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap));
mi_assert_expensive(mi_heap_is_valid(heap));
if (!mi_heap_is_initialized(heap)) return;
if (heap==NULL || !mi_heap_is_initialized(heap)) return;
if (!mi_heap_is_backing(heap)) {
// transfer still-used pages to the backing heap
@ -381,8 +386,9 @@ void mi_heap_delete(mi_heap_t* heap)
}
mi_heap_t* mi_heap_set_default(mi_heap_t* heap) {
mi_assert(heap != NULL);
mi_assert(mi_heap_is_initialized(heap));
if (!mi_heap_is_initialized(heap)) return NULL;
if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL;
mi_assert_expensive(mi_heap_is_valid(heap));
mi_heap_t* old = mi_get_default_heap();
_mi_heap_set_default_direct(heap);
@ -408,7 +414,7 @@ static mi_heap_t* mi_heap_of_block(const void* p) {
bool mi_heap_contains_block(mi_heap_t* heap, const void* p) {
mi_assert(heap != NULL);
if (!mi_heap_is_initialized(heap)) return false;
if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
return (heap == mi_heap_of_block(p));
}
@ -426,7 +432,7 @@ static bool mi_heap_page_check_owned(mi_heap_t* heap, mi_page_queue_t* pq, mi_pa
bool mi_heap_check_owned(mi_heap_t* heap, const void* p) {
mi_assert(heap != NULL);
if (!mi_heap_is_initialized(heap)) return false;
if (heap==NULL || !mi_heap_is_initialized(heap)) return false;
if (((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0) return false; // only aligned pointers
bool found = false;
mi_heap_visit_pages(heap, &mi_heap_page_check_owned, (void*)p, &found);
@ -73,8 +73,8 @@ const mi_page_t _mi_page_empty = {
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
MI_STAT_COUNT_NULL(), \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, \
MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \
{ 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \
MI_STAT_COUNT_END_NULL()
@ -105,10 +105,6 @@ const mi_heap_t _mi_heap_empty = {
// the thread-local default heap for allocation
mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty;
#define tld_main_stats ((mi_stats_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,stats)))
#define tld_main_os ((mi_os_tld_t*)((uint8_t*)&tld_main + offsetof(mi_tld_t,os)))
extern mi_heap_t _mi_heap_main;
static mi_tld_t tld_main = {
@ -116,9 +112,9 @@ static mi_tld_t tld_main = {
&_mi_heap_main, &_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0},
0, 0, 0, 0, 0, 0, NULL,
tld_main_stats, tld_main_os
&tld_main.stats, &tld_main.os
}, // segments
{ 0, tld_main_stats }, // os
{ 0, &tld_main.stats }, // os
{ MI_STATS_NULL } // stats
};
@ -180,10 +176,15 @@ static bool _mi_heap_init(void) {
}
else {
// use `_mi_os_alloc` to allocate directly from the OS
mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t),&_mi_stats_main); // Todo: more efficient allocation?
mi_thread_data_t* td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main); // Todo: more efficient allocation?
if (td == NULL) {
_mi_error_message(ENOMEM, "failed to allocate thread local heap memory\n");
return false;
// if this fails, try once more. (issue #257)
td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &_mi_stats_main);
if (td == NULL) {
// really out of memory
_mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t));
return false;
}
}
// OS allocated so already zero initialized
mi_tld_t* tld = &td->tld;
@ -200,7 +201,7 @@ static bool _mi_heap_init(void) {
tld->segments.stats = &tld->stats;
tld->segments.os = &tld->os;
tld->os.stats = &tld->stats;
_mi_heap_set_default_direct(heap);
_mi_heap_set_default_direct(heap);
}
return false;
}
@ -234,9 +235,8 @@ static bool _mi_heap_done(mi_heap_t* heap) {
_mi_heap_collect_abandon(heap);
}
// merge stats
_mi_stats_done(&heap->tld->stats);
_mi_stats_done(&heap->tld->stats);
// free if not the main thread
if (heap != &_mi_heap_main) {
@ -284,7 +284,7 @@ static void _mi_thread_done(mi_heap_t* default_heap);
// nothing to do as it is done in DllMain
#elif defined(_WIN32) && !defined(MI_SHARED_LIB)
// use thread local storage keys to detect thread ending
#include <windows.h>
#include <Windows.h>
#include <fibersapi.h>
#if (_WIN32_WINNT < 0x600) // before Windows Vista
WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback );
@ -336,17 +336,13 @@ void mi_thread_init(void) mi_attr_noexcept
{
// ensure our process has started already
mi_process_init();
// initialize the thread local default heap
// (this will call `_mi_heap_set_default_direct` and thus set the
// fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
if (_mi_heap_init()) return; // returns true if already initialized
// don't further initialize for the main thread
if (_mi_is_main_thread()) return;
_mi_stat_increase(&mi_get_default_heap()->tld->stats.threads, 1);
_mi_stat_increase(&_mi_stats_main.threads, 1);
//_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
}
@ -355,14 +351,11 @@ void mi_thread_done(void) mi_attr_noexcept {
}
static void _mi_thread_done(mi_heap_t* heap) {
_mi_stat_decrease(&_mi_stats_main.threads, 1);
// check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
if (heap->thread_id != _mi_thread_id()) return;
// stats
if (!_mi_is_main_thread() && mi_heap_is_initialized(heap)) {
_mi_stat_decrease(&heap->tld->stats.threads, 1);
}
// abandon the thread local heap
if (_mi_heap_done(heap)) return; // returns true if already ran
}
@ -403,11 +396,11 @@ static bool os_preloading = true; // true until this module is initialized
static bool mi_redirected = false; // true if malloc redirects to mi_malloc
// Returns true if this module has not been initialized; Don't use C runtime routines until it returns false.
bool _mi_preloading() {
bool _mi_preloading(void) {
return os_preloading;
}
bool mi_is_redirected() mi_attr_noexcept {
bool mi_is_redirected(void) mi_attr_noexcept {
return mi_redirected;
}
@ -429,7 +422,7 @@ mi_decl_export void _mi_redirect_entry(DWORD reason) {
}
}
__declspec(dllimport) bool mi_allocator_init(const char** message);
__declspec(dllimport) void mi_allocator_done();
__declspec(dllimport) void mi_allocator_done(void);
#ifdef __cplusplus
}
#endif
@ -438,7 +431,7 @@ static bool mi_allocator_init(const char** message) {
if (message != NULL) *message = NULL;
return true;
}
static void mi_allocator_done() {
static void mi_allocator_done(void) {
// nothing to do
}
#endif
@ -485,6 +478,10 @@ void mi_process_init(void) mi_attr_noexcept {
if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
size_t pages = mi_option_get(mi_option_reserve_huge_os_pages);
mi_reserve_huge_os_pages_interleave(pages, 0, pages*500);
}
if (mi_option_is_enabled(mi_option_reserve_os_memory)) {
long ksize = mi_option_get(mi_option_reserve_os_memory);
if (ksize > 0) mi_reserve_os_memory((size_t)ksize*KiB, true, true);
}
}
@ -501,11 +498,15 @@ static void mi_process_done(void) {
FlsSetValue(mi_fls_key, NULL); // don't call main-thread callback
FlsFree(mi_fls_key); // call thread-done on all threads to prevent dangling callback pointer if statically linked with a DLL; Issue #208
#endif
#ifndef NDEBUG
mi_collect(true);
#if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB)
// free all memory if possible on process exit. This is not needed for a stand-alone process
// but should be done if mimalloc is statically linked into another shared library which
// is repeatedly loaded/unloaded, see issue #281.
mi_collect(true /* force */ );
#endif
if (mi_option_is_enabled(mi_option_show_stats) ||
mi_option_is_enabled(mi_option_verbose)) {
if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) {
mi_stats_print(NULL);
}
mi_allocator_done();
@ -14,7 +14,13 @@ terms of the MIT license. A copy of the license can be found in the file
#include <ctype.h> // toupper
#include <stdarg.h>
static uintptr_t mi_max_error_count = 16; // stop outputting errors after this
#ifdef _MSC_VER
#pragma warning(disable:4996) // strncpy, strncat
#endif
static uintptr_t mi_max_error_count = 16; // stop outputting errors after this
static uintptr_t mi_max_warning_count = 16; // stop outputting warnings after this
static void mi_add_stderr_output();
@ -60,7 +66,7 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(verbose) },
// the following options are experimental and not all combinations make sense.
{ 1, UNINIT, MI_OPTION(eager_commit) }, // commit on demand
{ 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`)
#if defined(_WIN32) || (MI_INTPTR_SIZE <= 4) // and other OS's without overcommit?
{ 0, UNINIT, MI_OPTION(eager_region_commit) },
{ 1, UNINIT, MI_OPTION(reset_decommits) }, // reset decommits memory
@ -69,7 +75,8 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(reset_decommits) }, // reset uses MADV_FREE/MADV_DONTNEED
#endif
{ 0, UNINIT, MI_OPTION(large_os_pages) }, // use large OS pages, use only with eager commit to prevent fragmentation of VMA's
{ 0, UNINIT, MI_OPTION(reserve_huge_os_pages) },
{ 0, UNINIT, MI_OPTION(reserve_huge_os_pages) }, // per 1GiB huge pages
{ 0, UNINIT, MI_OPTION(reserve_os_memory) },
{ 0, UNINIT, MI_OPTION(segment_cache) }, // cache N segments per thread
{ 1, UNINIT, MI_OPTION(page_reset) }, // reset page memory on free
{ 0, UNINIT, MI_OPTION(abandoned_page_reset) },// reset free page memory when a thread terminates
@ -77,12 +84,15 @@ static mi_option_desc_t options[_mi_option_last] =
#if defined(__NetBSD__)
{ 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
#else
{ 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed
{ 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand)
#endif
{ 100, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds
{ 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes.
{ 0, UNINIT, MI_OPTION(limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose
{ 16, UNINIT, MI_OPTION(max_errors) } // maximum errors that are output
{ 16, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output
{ 16, UNINIT, MI_OPTION(max_warnings) } // maximum warnings that are output
};
static void mi_option_init(mi_option_desc_t* desc);
@ -100,6 +110,7 @@ void _mi_options_init(void) {
}
}
mi_max_error_count = mi_option_get(mi_option_max_errors);
mi_max_warning_count = mi_option_get(mi_option_max_warnings);
}
long mi_option_get(mi_option_t option) {
@ -165,7 +176,7 @@ static void mi_out_stderr(const char* msg, void* arg) {
// an output function is registered it is called immediately with
// the output up to that point.
#ifndef MI_MAX_DELAY_OUTPUT
#define MI_MAX_DELAY_OUTPUT (32*1024)
#define MI_MAX_DELAY_OUTPUT ((uintptr_t)(32*1024))
#endif
static char out_buf[MI_MAX_DELAY_OUTPUT+1];
static _Atomic(uintptr_t) out_len;
@ -173,11 +184,11 @@ static _Atomic(uintptr_t) out_len;
static void mi_out_buf(const char* msg, void* arg) {
UNUSED(arg);
if (msg==NULL) return;
if (mi_atomic_read_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return;
size_t n = strlen(msg);
if (n==0) return;
// claim space
uintptr_t start = mi_atomic_add(&out_len, n);
uintptr_t start = mi_atomic_add_acq_rel(&out_len, n);
if (start >= MI_MAX_DELAY_OUTPUT) return;
// check bound
if (start+n >= MI_MAX_DELAY_OUTPUT) {
@ -189,7 +200,7 @@ static void mi_out_buf(const char* msg, void* arg) {
static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
if (out==NULL) return;
// claim (if `no_more_buf == true`, no more output will be added after this point)
size_t count = mi_atomic_add(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
// and output the current contents
if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
out_buf[count] = 0;
@ -215,19 +226,18 @@ static void mi_out_buf_stderr(const char* msg, void* arg) {
// Should be atomic but gives errors on many platforms as generally we cannot cast a function pointer to a uintptr_t.
// For now, don't register output from multiple threads.
#pragma warning(suppress:4180)
static mi_output_fun* volatile mi_out_default; // = NULL
static volatile _Atomic(void*) mi_out_arg; // = NULL
static _Atomic(void*) mi_out_arg; // = NULL
static mi_output_fun* mi_out_get_default(void** parg) {
if (parg != NULL) { *parg = mi_atomic_read_ptr(void,&mi_out_arg); }
if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); }
mi_output_fun* out = mi_out_default;
return (out == NULL ? &mi_out_buf : out);
}
void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
mi_atomic_write_ptr(void,&mi_out_arg, arg);
mi_atomic_store_ptr_release(void,&mi_out_arg, arg);
if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now
}
@ -241,14 +251,15 @@ static void mi_add_stderr_output() {
// --------------------------------------------------------
// Messages, all end up calling `_mi_fputs`.
// --------------------------------------------------------
static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings
static _Atomic(uintptr_t) error_count; // = 0; // when >= max_error_count stop emitting errors
static _Atomic(uintptr_t) warning_count; // = 0; // when >= max_warning_count stop emitting warnings
// When overriding malloc, we may recurse into mi_vfprintf if an allocation
// inside the C runtime causes another message.
static mi_decl_thread bool recurse = false;
static bool mi_recurse_enter(void) {
#ifdef MI_TLS_RECURSE_GUARD
#if defined(__MACH__) || defined(MI_TLS_RECURSE_GUARD)
if (_mi_preloading()) return true;
#endif
if (recurse) return false;
@ -257,7 +268,7 @@ static bool mi_recurse_enter(void) {
}
static void mi_recurse_exit(void) {
#ifdef MI_TLS_RECURSE_GUARD
#if defined(__MACH__) || defined(MI_TLS_RECURSE_GUARD)
if (_mi_preloading()) return;
#endif
recurse = false;
@ -313,13 +324,13 @@ void _mi_verbose_message(const char* fmt, ...) {
static void mi_show_error_message(const char* fmt, va_list args) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
if (mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return;
mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args);
}
void _mi_warning_message(const char* fmt, ...) {
if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return;
if (mi_atomic_increment(&error_count) > mi_max_error_count) return;
if (mi_atomic_increment_acq_rel(&warning_count) > mi_max_warning_count) return;
va_list args;
va_start(args,fmt);
mi_vfprintf(NULL, NULL, "mimalloc: warning: ", fmt, args);
@ -339,7 +350,7 @@ void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, co
// --------------------------------------------------------
static mi_error_fun* volatile mi_error_handler; // = NULL
static volatile _Atomic(void*) mi_error_arg; // = NULL
static _Atomic(void*) mi_error_arg; // = NULL
static void mi_error_default(int err) {
UNUSED(err);
@ -365,7 +376,7 @@ static void mi_error_default(int err) {
void mi_register_error(mi_error_fun* fun, void* arg) {
mi_error_handler = fun; // can be NULL
mi_atomic_write_ptr(void,&mi_error_arg, arg);
mi_atomic_store_ptr_release(void,&mi_error_arg, arg);
}
void _mi_error_message(int err, const char* fmt, ...) {
@ -376,7 +387,7 @@ void _mi_error_message(int err, const char* fmt, ...) {
va_end(args);
// and call the error handler which may abort (or return normally)
if (mi_error_handler != NULL) {
mi_error_handler(err, mi_atomic_read_ptr(void,&mi_error_arg));
mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg));
}
else {
mi_error_default(err);
@ -389,33 +400,73 @@ void _mi_error_message(int err, const char* fmt, ...) {
static void mi_strlcpy(char* dest, const char* src, size_t dest_size) {
dest[0] = 0;
#pragma warning(suppress:4996)
strncpy(dest, src, dest_size - 1);
dest[dest_size - 1] = 0;
}
static void mi_strlcat(char* dest, const char* src, size_t dest_size) {
#pragma warning(suppress:4996)
strncat(dest, src, dest_size - 1);
dest[dest_size - 1] = 0;
}
static inline int mi_strnicmp(const char* s, const char* t, size_t n) {
if (n==0) return 0;
for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) {
if (toupper(*s) != toupper(*t)) break;
}
return (n==0 ? 0 : *s - *t);
}
#if defined _WIN32
// On Windows use GetEnvironmentVariable instead of getenv to work
// reliably even when this is invoked before the C runtime is initialized.
// i.e. when `_mi_preloading() == true`.
// Note: on windows, environment names are not case sensitive.
#include <windows.h>
#include <Windows.h>
static bool mi_getenv(const char* name, char* result, size_t result_size) {
result[0] = 0;
size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size);
return (len > 0 && len < result_size);
}
#else
#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0)
// On POSIX systems use `environ` to access environment variables
// even before the C runtime is initialized.
#if defined(__APPLE__) && defined(__has_include) && __has_include(<crt_externs.h>)
#include <crt_externs.h>
static char** mi_get_environ(void) {
return (*_NSGetEnviron());
}
#else
extern char** environ;
static char** mi_get_environ(void) {
return environ;
}
#endif
static bool mi_getenv(const char* name, char* result, size_t result_size) {
if (name==NULL) return false;
const size_t len = strlen(name);
if (len == 0) return false;
char** env = mi_get_environ();
if (env == NULL) return false;
// compare up to 256 entries
for (int i = 0; i < 256 && env[i] != NULL; i++) {
const char* s = env[i];
if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
// found it
mi_strlcpy(result, s + len + 1, result_size);
return true;
}
}
return false;
}
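// Illustrative example (a sketch, not part of the original source): if the process
// environment contains "MIMALLOC_SHOW_STATS=1", then
//   char buf[64+1];
//   mi_getenv("mimalloc_show_stats", buf, sizeof(buf));
// matches case-insensitively, copies "1" into `buf`, and returns `true`.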
#else
// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime
static bool mi_getenv(const char* name, char* result, size_t result_size) {
// cannot call getenv() when still initializing the C runtime.
if (_mi_preloading()) return false;
const char* s = getenv(name);
if (s == NULL) {
// in unix environments we check the upper case name too.
// we check the upper case name too.
char buf[64+1];
size_t len = strlen(name);
if (len >= sizeof(buf)) len = sizeof(buf) - 1;
@ -434,11 +485,8 @@ static bool mi_getenv(const char* name, char* result, size_t result_size) {
}
}
#endif
static void mi_option_init(mi_option_desc_t* desc) {
#ifndef _WIN32
// cannot call getenv() when still initializing the C runtime.
if (_mi_preloading()) return;
#endif
static void mi_option_init(mi_option_desc_t* desc) {
// Read option value from the environment
char buf[64+1];
mi_strlcpy(buf, "mimalloc_", sizeof(buf));
@ -462,6 +510,14 @@ static void mi_option_init(mi_option_desc_t* desc) {
else {
char* end = buf;
long value = strtol(buf, &end, 10);
if (desc->option == mi_option_reserve_os_memory) {
// this option is interpreted in KiB to prevent overflow of `long`
if (*end == 'K') { end++; }
else if (*end == 'M') { value *= KiB; end++; }
else if (*end == 'G') { value *= MiB; end++; }
else { value = (value + KiB - 1) / KiB; }
if (*end == 'B') { end++; }
}
if (*end == 0) {
desc->value = value;
desc->init = INITIALIZED;
@ -471,9 +527,9 @@ static void mi_option_init(mi_option_desc_t* desc) {
desc->init = DEFAULTED;
}
}
mi_assert_internal(desc->init != UNINIT);
}
else {
else if (!_mi_preloading()) {
desc->init = DEFAULTED;
}
mi_assert_internal(desc->init != UNINIT);
}
src/os.c
@ -8,27 +8,51 @@ terms of the MIT license. A copy of the license can be found in the file
#define _DEFAULT_SOURCE // ensure mmap flags are defined
#endif
#if defined(__sun)
// illumos provides the new mman.h API when any of these are defined,
// otherwise the old API based on caddr_t which predates the void-pointer one.
// stock Solaris provides only the former; we chose to discard those
// flags only here rather than project-wide, though.
#undef _XOPEN_SOURCE
#undef _POSIX_C_SOURCE
#endif
#include "mimalloc.h"
#include "mimalloc-internal.h"
#include "mimalloc-atomic.h"
#include <string.h> // strerror
#ifdef _MSC_VER
#pragma warning(disable:4996) // strerror
#endif
#if defined(_WIN32)
#include <windows.h>
#include <Windows.h>
#elif defined(__wasi__)
// stdlib.h is all we need, and has already been included in mimalloc.h
#else
#include <sys/mman.h> // mmap
#include <unistd.h> // sysconf
#if defined(__linux__)
#include <features.h>
#if defined(__GLIBC__)
#include <linux/mman.h> // linux mmap flags
#else
#include <sys/mman.h>
#endif
#endif
#if defined(__APPLE__)
#include <TargetConditionals.h>
#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR
#include <mach/vm_statistics.h>
#endif
#endif
#if defined(__HAIKU__)
#define madvise posix_madvise
#define MADV_DONTNEED POSIX_MADV_DONTNEED
#endif
#endif
/* -----------------------------------------------------------
Initialization.
@ -90,6 +114,7 @@ size_t _mi_os_good_alloc_size(size_t size) {
// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016.
// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility)
// NtAllocateVirtualMemoryEx is used for huge OS page allocation (1GiB)
//
// We hide MEM_EXTENDED_PARAMETER to compile with older SDK's.
#include <winternl.h>
typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG);
@ -97,6 +122,17 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*
static PVirtualAlloc2 pVirtualAlloc2 = NULL;
static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
// Similarly, GetNumaProcessorNodeEx is only supported since Windows 7
#if (_WIN32_WINNT < 0x601) // before Win7
typedef struct _PROCESSOR_NUMBER { WORD Group; BYTE Number; BYTE Reserved; } PROCESSOR_NUMBER, *PPROCESSOR_NUMBER;
#endif
typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(PPROCESSOR_NUMBER ProcNumber);
typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(PPROCESSOR_NUMBER Processor, PUSHORT NodeNumber);
typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask);
static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL;
static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL;
static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL;
static bool mi_win_enable_large_os_pages()
{
if (large_os_page_size > 0) return true;
@ -147,11 +183,20 @@ void _mi_os_init(void) {
if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2");
FreeLibrary(hDll);
}
// NtAllocateVirtualMemoryEx is used for huge page allocation
hDll = LoadLibrary(TEXT("ntdll.dll"));
if (hDll != NULL) {
pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
FreeLibrary(hDll);
}
// Try to use Win7+ numa API
hDll = LoadLibrary(TEXT("kernel32.dll"));
if (hDll != NULL) {
pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx");
pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx");
pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx");
FreeLibrary(hDll);
}
if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
mi_win_enable_large_os_pages();
}
@ -192,7 +237,6 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
if (was_committed) _mi_stat_decrease(&stats->committed, size);
_mi_stat_decrease(&stats->reserved, size);
if (err) {
#pragma warning(suppress:4996)
_mi_warning_message("munmap failed: %s, addr 0x%8li, size %lu\n", strerror(errno), (size_t)addr, size);
return false;
}
@ -236,15 +280,15 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) {
mi_assert_internal(!(large_only && !allow_large));
static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
static _Atomic(uintptr_t) large_page_try_ok; // = 0;
void* p = NULL;
if ((large_only || use_large_os_page(size, try_alignment))
&& allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
if (!large_only && try_ok > 0) {
// if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive.
// therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times.
mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok);
mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
}
else {
// large OS pages must always reserve and commit.
@ -253,7 +297,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
if (large_only) return p;
// fall back to non-large page allocation on error (`p == NULL`).
if (p == NULL) {
mi_atomic_write(&large_page_try_ok,10); // on error, don't try again for the next N allocations
mi_atomic_store_release(&large_page_try_ok,10UL); // on error, don't try again for the next N allocations
}
}
}
@ -262,7 +306,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
p = mi_win_virtual_allocx(addr, size, try_alignment, flags);
}
if (p == NULL) {
_mi_warning_message("unable to allocate memory: error code: %i, addr: %p, size: 0x%x, large only: %d, allow_large: %d\n", GetLastError(), addr, size, large_only, allow_large);
_mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, GetLastError(), addr, large_only, allow_large);
}
return p;
}
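
The change above keeps the backoff that throttles large-page attempts after a failure: a shared atomic counter is set on error and decremented on later calls, so the expensive VirtualAlloc/mmap attempt is skipped for the next N requests (the same pattern recurs in the Unix `mmap` path further down). A minimal stand-alone sketch of that idea, using plain C11 atomics and hypothetical names rather than mimalloc's wrappers:

// backoff_sketch.c -- illustrative only, not mimalloc code
#include <stdatomic.h>
#include <stdlib.h>

static _Atomic(size_t) large_try_backoff;        // 0 = ok to try large pages again

static void* try_alloc_large(size_t size) {      // stand-in for a huge-page VirtualAlloc/mmap
  (void)size; return NULL;                       // pretend large pages always fail here
}

static void* alloc_with_backoff(size_t size) {
  size_t n = atomic_load_explicit(&large_try_backoff, memory_order_acquire);
  if (n > 0) {
    // still backing off: decrement once and skip the expensive attempt
    atomic_compare_exchange_strong(&large_try_backoff, &n, n - 1);
  }
  else {
    void* p = try_alloc_large(size);
    if (p != NULL) return p;
    // failure: skip large pages for the next 10 allocations
    atomic_store_explicit(&large_try_backoff, 10, memory_order_release);
  }
  return malloc(size);                           // fall back to a regular allocation
}
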
@ -314,7 +358,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
int fd = -1;
#if defined(MAP_ALIGNED) // BSD
if (try_alignment > 0) {
size_t n = _mi_bsr(try_alignment);
size_t n = mi_bsr(try_alignment);
if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
flags |= MAP_ALIGNED(n);
}
@ -330,14 +374,14 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
fd = VM_MAKE_TAG(os_tag);
#endif
if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) {
static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
static _Atomic(uintptr_t) large_page_try_ok; // = 0;
uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
if (!large_only && try_ok > 0) {
// If the OS is not configured for large OS pages, or the user does not have
// enough permission, the `mmap` will always fail (but it might also fail for other reasons).
// Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times
// to avoid too many failing calls to mmap.
mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok);
mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
}
else {
int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux
@ -377,7 +421,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
#endif
if (large_only) return p;
if (p == NULL) {
mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations
mi_atomic_store_release(&large_page_try_ok, (uintptr_t)10); // on error, don't try again for the next N allocations
}
}
}
@ -391,13 +435,26 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
// though since properly aligned allocations will already use large pages if available
// in that case -- in particular for our large regions (in `memory.c`).
// However, some systems only allow THP if called with explicit `madvise`, so
// when large OS pages are enabled for mimalloc, we call `madvice` anyways.
// when large OS pages are enabled for mimalloc, we call `madvise` anyways.
if (allow_large && use_large_os_page(size, try_alignment)) {
if (madvise(p, size, MADV_HUGEPAGE) == 0) {
*is_large = true; // possibly
};
}
#endif
#if defined(__sun)
if (allow_large && use_large_os_page(size, try_alignment)) {
struct memcntl_mha cmd = {0};
cmd.mha_pagesize = large_os_page_size;
cmd.mha_cmd = MHA_MAPSIZE_VA;
if (memcntl(p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) {
*is_large = true;
}
}
#endif
}
if (p == NULL) {
_mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large);
}
return p;
}
@ -406,21 +463,22 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
// On 64-bit systems, we can do efficient aligned allocation by using
// the 4TiB to 30TiB area to allocate them.
#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED)))
static volatile mi_decl_cache_align _Atomic(uintptr_t) aligned_base;
static mi_decl_cache_align _Atomic(uintptr_t) aligned_base;
// Return a 4MiB aligned address that is probably available
static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL;
if ((size%MI_SEGMENT_SIZE) != 0) return NULL;
uintptr_t hint = mi_atomic_add(&aligned_base, size);
uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size);
if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages)
uintptr_t init = ((uintptr_t)4 << 40); // start at 4TiB area
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)); // (randomly 20 bits)*4MiB == 0 to 4TiB
#endif
mi_atomic_cas_strong(&aligned_base, init, hint + size);
hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all
uintptr_t expected = hint + size;
mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init);
hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all
}
if (hint%try_alignment != 0) return NULL;
return (void*)hint;
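
For reference, the hint logic above can be read as a small self-contained routine: a shared cursor starts in the 4TiB area, is bumped by the allocation size on every request, and is reset (optionally with a randomized offset) once it wanders past 30TiB. A sketch under the same assumptions (4MiB segment size, hypothetical names, randomization omitted):

// aligned_hint_sketch.c -- illustrative only
#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

#define SEGMENT_SIZE ((size_t)4 << 20)           // 4 MiB, as assumed above

static _Atomic(uintptr_t) aligned_cursor;        // shared hint cursor

static void* aligned_hint(size_t size) {
  if (size % SEGMENT_SIZE != 0) return NULL;     // only for segment-sized requests
  uintptr_t hint = atomic_fetch_add(&aligned_cursor, size);
  if (hint == 0 || hint > ((uintptr_t)30 << 40)) {
    // wrap back to the 4TiB area (the 32TiB+ area is reserved for huge OS pages)
    uintptr_t init = (uintptr_t)4 << 40;
    uintptr_t expected = hint + size;
    atomic_compare_exchange_strong(&aligned_cursor, &expected, init);
    hint = atomic_fetch_add(&aligned_cursor, size);  // may still be off-range; it is only a hint
  }
  return (hint % SEGMENT_SIZE == 0) ? (void*)hint : NULL;
}
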
@ -544,14 +602,18 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit,
OS API: alloc, free, alloc_aligned
----------------------------------------------------------- */
void* _mi_os_alloc(size_t size, mi_stats_t* stats) {
void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (size == 0) return NULL;
size = _mi_os_good_alloc_size(size);
bool is_large = false;
return mi_os_mem_alloc(size, 0, true, false, &is_large, stats);
}
void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* stats) {
void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (size == 0 || p == NULL) return;
size = _mi_os_good_alloc_size(size);
mi_os_mem_free(p, size, was_committed, stats);
@ -561,8 +623,9 @@ void _mi_os_free(void* p, size_t size, mi_stats_t* stats) {
_mi_os_free_ex(p, size, true, stats);
}
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_os_tld_t* tld)
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats)
{
UNUSED(tld_stats);
if (size == 0) return NULL;
size = _mi_os_good_alloc_size(size);
alignment = _mi_align_up(alignment, _mi_os_page_size());
@ -571,7 +634,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* lar
allow_large = *large;
*large = false;
}
return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), tld->stats);
return mi_os_mem_alloc_aligned(size, alignment, commit, allow_large, (large!=NULL?large:&allow_large), &_mi_stats_main /*tld->stats*/ );
}
@ -628,11 +691,11 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr))
int err = 0;
if (commit) {
_mi_stat_increase(&stats->committed, csize);
_mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit
_mi_stat_counter_increase(&stats->commit_calls, 1);
}
else {
_mi_stat_decrease(&stats->committed, csize);
_mi_stat_decrease(&stats->committed, size);
}
#if defined(_WIN32)
@ -658,6 +721,9 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
// for commit, just change the protection
err = mprotect(start, csize, (PROT_READ | PROT_WRITE));
if (err != 0) { err = errno; }
#if defined(MADV_FREE_REUSE)
while ((err = madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; }
#endif
}
#else
err = mprotect(start, csize, (commit ? (PROT_READ | PROT_WRITE) : PROT_NONE));
@ -671,16 +737,20 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
return (err == 0);
}
bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
return mi_os_commitx(addr, size, true, false /* liberal */, is_zero, stats);
}
bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats) {
bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
bool is_zero;
return mi_os_commitx(addr, size, false, true /* conservative */, &is_zero, stats);
}
bool _mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
static bool mi_os_commit_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
return mi_os_commitx(addr, size, true, true /* conservative */, is_zero, stats);
}
@ -715,12 +785,19 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
if (p != start) return false;
#else
#if defined(MADV_FREE)
static int advice = MADV_FREE;
int err = madvise(start, csize, advice);
if (err != 0 && errno == EINVAL && advice == MADV_FREE) {
// if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on
advice = MADV_DONTNEED;
err = madvise(start, csize, advice);
#if defined(MADV_FREE_REUSABLE)
#define KK_MADV_FREE_INITIAL MADV_FREE_REUSABLE
#else
#define KK_MADV_FREE_INITIAL MADV_FREE
#endif
static _Atomic(uintptr_t) advice = ATOMIC_VAR_INIT(KK_MADV_FREE_INITIAL);
int oadvice = (int)mi_atomic_load_relaxed(&advice);
int err;
while ((err = madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; };
if (err != 0 && errno == EINVAL && oadvice == KK_MADV_FREE_INITIAL) {
// if MADV_FREE/MADV_FREE_REUSABLE is not supported, fall back to MADV_DONTNEED from now on
mi_atomic_store_release(&advice, (uintptr_t)MADV_DONTNEED);
err = madvise(start, csize, MADV_DONTNEED);
}
#elif defined(__wasi__)
int err = 0;
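
The advice-selection change above caches the preferred madvise flag in an atomic: it starts at MADV_FREE (or MADV_FREE_REUSABLE, a Darwin-specific variant) and permanently downgrades to MADV_DONTNEED the first time the kernel rejects it with EINVAL. A hedged sketch of that one-way fallback, assuming a Linux-style madvise and hypothetical names:

// reset_advice_sketch.c -- illustrative only
#include <stdatomic.h>
#include <sys/mman.h>
#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

#if !defined(MADV_FREE)
#define MADV_FREE MADV_DONTNEED                 // older headers: collapse to DONTNEED
#endif

static _Atomic(int) reset_advice = MADV_FREE;   // preferred advice, downgraded at most once

static bool os_reset(void* p, size_t size) {
  int advice = atomic_load_explicit(&reset_advice, memory_order_relaxed);
  int err;
  while ((err = madvise(p, size, advice)) != 0 && errno == EAGAIN) { errno = 0; }
  if (err != 0 && errno == EINVAL && advice == MADV_FREE) {
    // MADV_FREE not supported: fall back to MADV_DONTNEED from now on
    atomic_store_explicit(&reset_advice, MADV_DONTNEED, memory_order_release);
    err = madvise(p, size, MADV_DONTNEED);
  }
  return (err == 0);
}
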
@ -740,7 +817,9 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
// but may be used later again. This will release physical memory
// pages and reduce swapping while keeping the memory committed.
// We page align to a conservative area inside the range to reset.
bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (mi_option_is_enabled(mi_option_reset_decommits)) {
return _mi_os_decommit(addr, size, stats);
}
@ -749,9 +828,11 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) {
}
}
bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* stats) {
bool _mi_os_unreset(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) {
UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
if (mi_option_is_enabled(mi_option_reset_decommits)) {
return _mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!)
return mi_os_commit_unreset(addr, size, is_zero, stats); // re-commit it (conservatively!)
}
else {
*is_zero = false;
@ -876,7 +957,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
return VirtualAlloc(addr, size, flags, PAGE_READWRITE);
}
#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8)
#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__)
#include <sys/syscall.h>
#ifndef MPOL_PREFERRED
#define MPOL_PREFERRED 1
@ -926,9 +1007,9 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
uintptr_t start = 0;
uintptr_t end = 0;
uintptr_t expected;
uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start);
do {
start = expected = mi_atomic_read_relaxed(&mi_huge_start);
start = huge_start;
if (start == 0) {
// Initialize the start address after the 32TiB area
start = ((uintptr_t)32 << 40); // 32TiB virtual start address
@ -939,7 +1020,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) {
}
end = start + size;
mi_assert_internal(end % MI_SEGMENT_SIZE == 0);
} while (!mi_atomic_cas_strong(&mi_huge_start, end, expected));
} while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end));
if (total_size != NULL) *total_size = size;
return (uint8_t*)start;
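
The loop above reserves a contiguous slice of virtual address space for huge pages by advancing a shared cursor with a CAS; with the new atomic API a failed compare-exchange writes the observed value back into the expected variable, so the retry picks up the current cursor automatically. A small illustrative version of the claim loop (hypothetical names, 32TiB base and 4MiB alignment as in the code above, randomization omitted):

// huge_claim_sketch.c -- illustrative only
#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

#define SEGMENT_SIZE ((size_t)4 << 20)

static _Atomic(uintptr_t) huge_cursor;           // next free address in the huge-page area

static uint8_t* claim_huge_range(size_t size) {
  size = (size + SEGMENT_SIZE - 1) & ~(SEGMENT_SIZE - 1);    // segment-align the size
  uintptr_t expected = atomic_load_explicit(&huge_cursor, memory_order_relaxed);
  uintptr_t start, end;
  do {
    start = expected;
    if (start == 0) start = (uintptr_t)32 << 40;             // first use: begin at 32TiB
    end = start + size;
    // on failure the CAS stores the current cursor back into `expected` and we retry
  } while (!atomic_compare_exchange_strong(&huge_cursor, &expected, end));
  return (uint8_t*)start;
}
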
@ -1019,24 +1100,50 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
/* ----------------------------------------------------------------------------
Support NUMA aware allocation
-----------------------------------------------------------------------------*/
#ifdef _WIN32
#if (_WIN32_WINNT < 0x601) // before Win7
typedef struct _PROCESSOR_NUMBER { WORD Group; BYTE Number; BYTE Reserved; } PROCESSOR_NUMBER, *PPROCESSOR_NUMBER;
WINBASEAPI VOID WINAPI GetCurrentProcessorNumberEx(_Out_ PPROCESSOR_NUMBER ProcNumber);
WINBASEAPI BOOL WINAPI GetNumaProcessorNodeEx(_In_ PPROCESSOR_NUMBER Processor, _Out_ PUSHORT NodeNumber);
#endif
#ifdef _WIN32
static size_t mi_os_numa_nodex() {
PROCESSOR_NUMBER pnum;
USHORT numa_node = 0;
GetCurrentProcessorNumberEx(&pnum);
GetNumaProcessorNodeEx(&pnum,&numa_node);
if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) {
// Extended API is supported
PROCESSOR_NUMBER pnum;
(*pGetCurrentProcessorNumberEx)(&pnum);
USHORT nnode = 0;
BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode);
if (ok) numa_node = nnode;
}
else {
// Vista or earlier, use older API that is limited to 64 processors. Issue #277
DWORD pnum = GetCurrentProcessorNumber();
UCHAR nnode = 0;
BOOL ok = GetNumaProcessorNode((UCHAR)pnum, &nnode);
if (ok) numa_node = nnode;
}
return numa_node;
}
static size_t mi_os_numa_node_countx(void) {
ULONG numa_max = 0;
GetNumaHighestNodeNumber(&numa_max);
return (numa_max + 1);
// find the highest node number that has actual processors assigned to it. Issue #282
while(numa_max > 0) {
if (pGetNumaNodeProcessorMaskEx != NULL) {
// Extended API is supported
GROUP_AFFINITY affinity;
if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) {
if (affinity.Mask != 0) break; // found the maximum non-empty node
}
}
else {
// Vista or earlier, use older API that is limited to 64 processors.
ULONGLONG mask;
if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) {
if (mask != 0) break; // found the maximum non-empty node
};
}
// max node was invalid or had no processor assigned, try again
numa_max--;
}
return ((size_t)numa_max + 1);
}
#elif defined(__linux__)
#include <sys/syscall.h> // getcpu
@ -49,50 +49,6 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) {
Bins
----------------------------------------------------------- */
// Bit scan reverse: return the index of the highest bit.
static inline uint8_t mi_bsr32(uint32_t x);
#if defined(_MSC_VER)
#include <intrin.h>
static inline uint8_t mi_bsr32(uint32_t x) {
uint32_t idx;
_BitScanReverse((DWORD*)&idx, x);
return (uint8_t)idx;
}
#elif defined(__GNUC__) || defined(__clang__)
static inline uint8_t mi_bsr32(uint32_t x) {
return (31 - __builtin_clz(x));
}
#else
static inline uint8_t mi_bsr32(uint32_t x) {
// de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
static const uint8_t debruijn[32] = {
31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12,
30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13,
};
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x++;
return debruijn[(x*0x076be629) >> 27];
}
#endif
// Bit scan reverse: return the index of the highest bit.
uint8_t _mi_bsr(uintptr_t x) {
if (x == 0) return 0;
#if MI_INTPTR_SIZE==8
uint32_t hi = (x >> 32);
return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi));
#elif MI_INTPTR_SIZE==4
return mi_bsr32(x);
#else
# error "define bsr for non-32 or 64-bit platforms"
#endif
}
// Return the bin for a given field size.
// Returns MI_BIN_HUGE if the size is too large.
// We use `wsize` for the size in "machine word sizes",
@ -125,7 +81,7 @@ extern inline uint8_t _mi_bin(size_t size) {
#endif
wsize--;
// find the highest bit
uint8_t b = mi_bsr32((uint32_t)wsize);
uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we do not round the first 8 sizes
// which each get an exact bin
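
As context for the change above: sizes are first expressed in machine words (`wsize`) and the bin is derived from the position of the highest bit plus the next two bits, which bounds worst-case internal fragmentation to roughly 12.5%. A stand-alone sketch of that computation (8-byte words assumed; `bsr` is a hypothetical portable fallback for the bit-scan shown in the diff):

// size_bin_sketch.c -- illustrative only
#include <stdint.h>
#include <stddef.h>

// Index of the highest set bit (x != 0), portable fallback.
static inline unsigned bsr(size_t x) {
  unsigned i = 0;
  while (x >>= 1) i++;
  return i;
}

// Bin for a block size, mirroring the top-3-bits scheme described above.
static uint8_t size_bin(size_t size) {
  size_t wsize = (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);  // size in machine words
  if (wsize <= 8) return (uint8_t)(wsize <= 1 ? 1 : wsize);           // small sizes: one exact bin each
  wsize--;
  unsigned b = bsr(wsize);                                            // highest set bit (>= 3 here)
  // top bit picks a power-of-two range; the next 2 bits split it into four sub-bins (~12.5% waste)
  return (uint8_t)(((b << 2) + ((wsize >> (b - 2)) & 0x03)) - 3);
}

For example, sizes of 9 and 10 words both land in bin 9, while 11 and 12 words land in bin 10.
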
@ -260,7 +216,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
heap->page_count--;
page->next = NULL;
page->prev = NULL;
// mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
mi_page_set_in_full(page,false);
}
@ -274,7 +230,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
(mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
mi_page_set_in_full(page, mi_page_queue_is_full(queue));
// mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
// mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
page->next = queue->first;
page->prev = NULL;
if (queue->first != NULL) {
@ -341,7 +297,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
for (mi_page_t* page = append->first; page != NULL; page = page->next) {
// inline `mi_page_set_heap` to avoid wrong assertion during absorption;
// in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
mi_atomic_write(&page->xheap, (uintptr_t)heap);
mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
// set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
// side effect that it spins until any DELAYED_FREEING is finished. This ensures
// that after appending only the new heap will be used for delayed free operations.
@ -122,11 +122,11 @@ bool _mi_page_is_valid(mi_page_t* page) {
#endif
void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
mi_delayed_t old_delay;
mi_thread_free_t tfree;
do {
tfree = mi_atomic_read(&page->xthread_free); // note: must acquire as we can break this loop and not do a CAS
tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
tfreex = mi_tf_set_delayed(tfree, delay);
old_delay = mi_tf_delayed(tfree);
if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) {
@ -140,7 +140,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid
break; // leave never-delayed flag set
}
} while ((old_delay == MI_DELAYED_FREEING) ||
!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
}
/* -----------------------------------------------------------
@ -154,13 +154,12 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid
static void _mi_page_thread_free_collect(mi_page_t* page)
{
mi_block_t* head;
mi_thread_free_t tfree;
mi_thread_free_t tfreex;
mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
do {
tfree = mi_atomic_read_relaxed(&page->xthread_free);
head = mi_tf_block(tfree);
tfreex = mi_tf_set_block(tfree,NULL);
} while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
} while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
// return if the list is empty
if (head == NULL) return;
@ -273,11 +272,9 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
(put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
void _mi_heap_delayed_free(mi_heap_t* heap) {
// take over the list (note: no atomic exchange is it is often NULL)
mi_block_t* block;
do {
block = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
} while (block != NULL && !mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, NULL, block));
// take over the list (note: no atomic exchange since it is often NULL)
mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
// and free them all
while(block != NULL) {
@ -286,11 +283,10 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
if (!_mi_free_delayed_block(block)) {
// we might already start delayed freeing while another thread has not yet
// reset the delayed_freeing flag; in that case delay it further by reinserting.
mi_block_t* dfree;
mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
do {
dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
mi_block_set_nextx(heap, block, dfree, heap->keys);
} while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
} while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
}
block = next;
}
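
The rewritten loop above first takes ownership of the whole `thread_delayed_free` list with a single CAS to NULL (skipping the CAS entirely when the list is already empty), and pushes blocks that cannot be freed yet back onto the list with another CAS loop. A generic sketch of that take-over pattern on a Treiber-style list, with a hypothetical node type and C11 atomics:

// delayed_list_sketch.c -- illustrative only
#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s { struct node_s* next; } node_t;

static _Atomic(node_t*) delayed_list;            // list filled by other threads

// Atomically detach the whole list; returns its head (possibly NULL).
static node_t* list_take_all(void) {
  node_t* head = atomic_load_explicit(&delayed_list, memory_order_relaxed);
  while (head != NULL &&
         !atomic_compare_exchange_weak(&delayed_list, &head, NULL)) { /* retry */ }
  return head;
}

// Push a single node back (used when an element must be retried later).
static void list_push(node_t* n) {
  node_t* head = atomic_load_explicit(&delayed_list, memory_order_relaxed);
  do {
    n->next = head;
  } while (!atomic_compare_exchange_weak(&delayed_list, &head, n));
}
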
@ -709,14 +705,17 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
mi_page_queue_t* pq = mi_page_queue(heap,size);
mi_page_t* page = pq->first;
if (page != NULL) {
if ((MI_SECURE >= 3) && page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
// in secure mode, we extend half the time to increase randomness
#if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness
if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
mi_page_extend_free(heap, page, heap->tld);
mi_assert_internal(mi_page_immediate_available(page));
}
else {
else
#endif
{
_mi_page_free_collect(page,false);
}
if (mi_page_immediate_available(page)) {
page->retire_expire = 0;
return page; // fast path
@ -734,20 +733,20 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
----------------------------------------------------------- */
static mi_deferred_free_fun* volatile deferred_free = NULL;
static volatile _Atomic(void*) deferred_arg; // = NULL
static _Atomic(void*) deferred_arg; // = NULL
void _mi_deferred_free(mi_heap_t* heap, bool force) {
heap->tld->heartbeat++;
if (deferred_free != NULL && !heap->tld->recurse) {
heap->tld->recurse = true;
deferred_free(force, heap->tld->heartbeat, mi_atomic_read_ptr_relaxed(void,&deferred_arg));
deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
heap->tld->recurse = false;
}
}
void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
deferred_free = fn;
mi_atomic_write_ptr(void,&deferred_arg, arg);
mi_atomic_store_ptr_release(void,&deferred_arg, arg);
}
@ -792,7 +791,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size) mi_attr_noexcept {
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
if (mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) )) {
if (mi_unlikely(req_size > PTRDIFF_MAX)) { // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", req_size);
_mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size);
return NULL;
}
else {
@ -816,6 +815,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
if (mi_unlikely(!mi_heap_is_initialized(heap))) {
mi_thread_init(); // calls `_mi_heap_init` in turn
heap = mi_get_default_heap();
if (mi_unlikely(!mi_heap_is_initialized(heap))) { return NULL; }
}
mi_assert_internal(mi_heap_is_initialized(heap));
@ -833,7 +833,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
}
if (mi_unlikely(page == NULL)) { // out of memory
_mi_error_message(ENOMEM, "cannot allocate memory (%zu bytes requested)\n", size);
const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size`
_mi_error_message(ENOMEM, "unable to allocate memory (%zu bytes)\n", req_size);
return NULL;
}
@ -155,30 +155,40 @@ uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
/* ----------------------------------------------------------------------------
To initialize a fresh random context we rely on the OS:
- Windows : BCryptGenRandom
- Windows : BCryptGenRandom (or RtlGenRandom)
- osX,bsd,wasi: arc4random_buf
- Linux : getrandom,/dev/urandom
If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
-----------------------------------------------------------------------------*/
#if defined(_WIN32)
#if !defined(MI_USE_RTLGENRANDOM)
// We prefer BCryptGenRandom over RtlGenRandom
#pragma comment (lib,"bcrypt.lib")
#include <bcrypt.h>
static bool os_random_buf(void* buf, size_t buf_len) {
return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
}
/*
#define SystemFunction036 NTAPI SystemFunction036
#include <NTSecAPI.h>
#undef SystemFunction036
static bool os_random_buf(void* buf, size_t buf_len) {
RtlGenRandom(buf, (ULONG)buf_len);
return true;
#else
// Use (unofficial) RtlGenRandom
#pragma comment (lib,"advapi32.lib")
#define RtlGenRandom SystemFunction036
#ifdef __cplusplus
extern "C" {
#endif
BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
#ifdef __cplusplus
}
*/
#endif
static bool os_random_buf(void* buf, size_t buf_len) {
return (RtlGenRandom(buf, (ULONG)buf_len) != 0);
}
#endif
#elif defined(ANDROID) || defined(XP_DARWIN) || defined(__APPLE__) || defined(__DragonFly__) || \
defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
defined(__wasi__)
defined(__sun) || defined(__wasi__)
#include <stdlib.h>
static bool os_random_buf(void* buf, size_t buf_len) {
arc4random_buf(buf, buf_len);
@ -200,12 +210,12 @@ static bool os_random_buf(void* buf, size_t buf_len) {
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK (1)
#endif
static volatile _Atomic(uintptr_t) no_getrandom; // = 0
if (mi_atomic_read(&no_getrandom)==0) {
static _Atomic(uintptr_t) no_getrandom; // = 0
if (mi_atomic_load_acquire(&no_getrandom)==0) {
ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
if (ret >= 0) return (buf_len == (size_t)ret);
if (ret != ENOSYS) return false;
mi_atomic_write(&no_getrandom,1); // don't call again, and fall back to /dev/urandom
mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom
}
#endif
int flags = O_RDONLY;
@ -234,7 +244,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
#endif
#if defined(_WIN32)
#include <windows.h>
#include <Windows.h>
#elif defined(__APPLE__)
#include <mach/mach_time.h>
#else
@ -243,6 +253,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
uintptr_t _os_random_weak(uintptr_t extra_seed) {
uintptr_t x = (uintptr_t)&_os_random_weak ^ extra_seed; // ASLR makes the address random
#if defined(_WIN32)
LARGE_INTEGER pcount;
QueryPerformanceCounter(&pcount);
@ -37,7 +37,7 @@ Possible issues:
#include <string.h> // memset
#include "bitmap.inc.c"
#include "bitmap.h"
// Internal raw OS interface
size_t _mi_os_large_page_size();
@ -50,8 +50,8 @@ bool _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
// arena.c
void _mi_arena_free(void* p, size_t size, size_t memid, bool all_committed, mi_stats_t* stats);
void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
void* _mi_arena_alloc(size_t size, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
void* _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
@ -77,7 +77,8 @@ typedef union mi_region_info_u {
uintptr_t value;
struct {
bool valid; // initialized?
bool is_large; // allocated in fixed large/huge OS pages
bool is_large:1; // allocated in fixed large/huge OS pages
bool is_pinned:1; // pinned memory cannot be decommitted
short numa_node; // the associated NUMA node (where -1 means no associated node)
} x;
} mi_region_info_t;
@ -86,21 +87,21 @@ typedef union mi_region_info_u {
// A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
// a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
typedef struct mem_region_s {
volatile _Atomic(uintptr_t) info; // mi_region_info_t.value
volatile _Atomic(void*) start; // start of the memory area
mi_bitmap_field_t in_use; // bit per in-use block
mi_bitmap_field_t dirty; // track if non-zero per block
mi_bitmap_field_t commit; // track if committed per block
mi_bitmap_field_t reset; // track if reset per block
volatile _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena
uintptr_t padding; // round to 8 fields
_Atomic(uintptr_t) info; // mi_region_info_t.value
_Atomic(void*) start; // start of the memory area
mi_bitmap_field_t in_use; // bit per in-use block
mi_bitmap_field_t dirty; // track if non-zero per block
mi_bitmap_field_t commit; // track if committed per block
mi_bitmap_field_t reset; // track if reset per block
_Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena
uintptr_t padding; // round to 8 fields
} mem_region_t;
// The region map
static mem_region_t regions[MI_REGION_MAX];
// Allocated regions
static volatile _Atomic(uintptr_t) regions_count; // = 0;
static _Atomic(uintptr_t) regions_count; // = 0;
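
As a reading aid for the struct above: on a 64-bit platform each region covers 256MiB split into 64 blocks of MI_SEGMENT_SIZE (4MiB), so every `mi_bitmap_field_t` is a single machine word with one bit per block. A tiny sketch of the arithmetic and of a CAS-based claim of `blocks` consecutive bits at a known index (simplified; the real `_mi_bitmap_try_find_claim_field` also searches for a free run first):

// region_bitmap_sketch.c -- illustrative only
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

#define SEGMENT_SIZE ((size_t)4 << 20)                    // 4 MiB per block
#define REGION_SIZE  ((size_t)256 << 20)                  // 256 MiB per region (64 blocks)

// number of 4MiB blocks needed for an allocation of `size` bytes
static size_t block_count(size_t size) {
  return (size + SEGMENT_SIZE - 1) / SEGMENT_SIZE;
}

// try to claim `blocks` bits starting at `idx` in a 64-bit bitmap field (idx+blocks <= 64)
static bool claim_at(_Atomic(uint64_t)* field, size_t idx, size_t blocks) {
  uint64_t mask = (blocks == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << blocks) - 1)) << idx;
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((old & mask) != 0) return false;                  // some block already in use
  } while (!atomic_compare_exchange_weak(field, &old, old | mask));
  return true;
}
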
/* ----------------------------------------------------------------------------
@ -123,9 +124,9 @@ static size_t mi_good_commit_size(size_t size) {
// Return if a pointer points into a region reserved by us.
bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
if (p==NULL) return false;
size_t count = mi_atomic_read_relaxed(&regions_count);
size_t count = mi_atomic_load_relaxed(&regions_count);
for (size_t i = 0; i < count; i++) {
uint8_t* start = mi_atomic_read_ptr_relaxed(uint8_t,&regions[i].start);
uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, &regions[i].start);
if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
}
return false;
@ -133,7 +134,7 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
uint8_t* start = mi_atomic_read_ptr(uint8_t,&region->start);
uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start);
mi_assert_internal(start != NULL);
return (start + (bit_idx * MI_SEGMENT_SIZE));
}
@ -171,22 +172,23 @@ static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_
static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
{
// not out of regions yet?
if (mi_atomic_read_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
if (mi_atomic_load_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
// try to allocate a fresh region from the OS
bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
bool region_large = (commit && allow_large);
bool is_zero = false;
bool is_pinned = false;
size_t arena_memid = 0;
void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_zero, &arena_memid, tld);
void* const start = _mi_arena_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, &region_commit, &region_large, &is_pinned, &is_zero, &arena_memid, tld);
if (start == NULL) return false;
mi_assert_internal(!(region_large && !allow_large));
mi_assert_internal(!region_large || region_commit);
// claim a fresh slot
const uintptr_t idx = mi_atomic_increment(&regions_count);
const uintptr_t idx = mi_atomic_increment_acq_rel(&regions_count);
if (idx >= MI_REGION_MAX) {
mi_atomic_decrement(&regions_count);
mi_atomic_decrement_acq_rel(&regions_count);
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats);
_mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, GiB));
return false;
@ -195,21 +197,22 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
// allocated, initialize and claim the initial blocks
mem_region_t* r = &regions[idx];
r->arena_memid = arena_memid;
mi_atomic_write(&r->in_use, 0);
mi_atomic_write(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
mi_atomic_write(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
mi_atomic_write(&r->reset, 0);
mi_atomic_store_release(&r->in_use, (uintptr_t)0);
mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
mi_atomic_store_release(&r->reset, (uintptr_t)0);
*bit_idx = 0;
mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
mi_atomic_write_ptr(uint8_t*,&r->start, start);
_mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
mi_atomic_store_ptr_release(void,&r->start, start);
// and share it
mi_region_info_t info;
info.value = 0; // initialize the full union to zero
info.x.valid = true;
info.x.is_large = region_large;
info.x.is_pinned = is_pinned;
info.x.numa_node = (short)_mi_os_numa_node(tld);
mi_atomic_write(&r->info, info.value); // now make it available to others
mi_atomic_store_release(&r->info, info.value); // now make it available to others
*region = r;
return true;
}
@ -221,7 +224,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
// initialized at all?
mi_region_info_t info;
info.value = mi_atomic_read_relaxed(&region->info);
info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info);
if (info.value==0) return false;
// numa correct
@ -240,7 +243,7 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo
static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
{
// try all regions for a free slot
const size_t count = mi_atomic_read(&regions_count);
const size_t count = mi_atomic_load_relaxed(&regions_count); // monotonic, so ok to be relaxed
size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
for (size_t visited = 0; visited < count; visited++, idx++) {
if (idx >= count) idx = 0; // wrap around
@ -248,7 +251,7 @@ static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large,
// if this region suits our demand (numa node matches, large OS page matches)
if (mi_region_is_suitable(r, numa_node, allow_large)) {
// then try to atomically claim a segment(s) in this region
if (mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
if (_mi_bitmap_try_find_claim_field(&r->in_use, 0, blocks, bit_idx)) {
tld->region_idx = idx; // remember the last found position
*region = r;
return true;
@ -259,16 +262,16 @@ static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large,
}
static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(blocks <= MI_BITMAP_FIELD_BITS);
mem_region_t* region;
mi_bitmap_index_t bit_idx;
const int numa_node = (_mi_os_numa_node_count() <= 1 ? -1 : _mi_os_numa_node(tld));
// try to claim in existing regions
if (!mi_region_try_claim(numa_node, blocks, *is_large, &region, &bit_idx, tld)) {
if (!mi_region_try_claim(numa_node, blocks, *large, &region, &bit_idx, tld)) {
// otherwise try to allocate a fresh region and claim in there
if (!mi_region_try_alloc_os(blocks, *commit, *is_large, &region, &bit_idx, tld)) {
if (!mi_region_try_alloc_os(blocks, *commit, *large, &region, &bit_idx, tld)) {
// out of regions or memory
return NULL;
}
@ -277,41 +280,46 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
// ------------------------------------------------
// found a region and claimed `blocks` at `bit_idx`, initialize them now
mi_assert_internal(region != NULL);
mi_assert_internal(mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
mi_assert_internal(_mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
mi_region_info_t info;
info.value = mi_atomic_read(&region->info);
uint8_t* start = mi_atomic_read_ptr(uint8_t,&region->start);
mi_assert_internal(!(info.x.is_large && !*is_large));
info.value = mi_atomic_load_acquire(&region->info);
uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&region->start);
mi_assert_internal(!(info.x.is_large && !*large));
mi_assert_internal(start != NULL);
*is_zero = mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);
*is_large = info.x.is_large;
*memid = mi_memid_create(region, bit_idx);
*is_zero = _mi_bitmap_claim(&region->dirty, 1, blocks, bit_idx, NULL);
*large = info.x.is_large;
*is_pinned = info.x.is_pinned;
*memid = mi_memid_create(region, bit_idx);
void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
// commit
if (*commit) {
// ensure commit
bool any_uncommitted;
mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
_mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, &any_uncommitted);
if (any_uncommitted) {
mi_assert_internal(!info.x.is_large);
bool commit_zero;
_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld);
if (commit_zero) *is_zero = true;
mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
bool commit_zero = false;
if (!_mi_mem_commit(p, blocks * MI_SEGMENT_SIZE, &commit_zero, tld)) {
// failed to commit! unclaim and return
mi_bitmap_unclaim(&region->in_use, 1, blocks, bit_idx);
return NULL;
}
if (commit_zero) *is_zero = true;
}
}
else {
// no need to commit, but check if already fully committed
*commit = mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
*commit = _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx);
}
mi_assert_internal(!*commit || mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
mi_assert_internal(!*commit || _mi_bitmap_is_claimed(&region->commit, 1, blocks, bit_idx));
// unreset reset blocks
if (mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
if (_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx)) {
// some blocks are still reset
mi_assert_internal(!info.x.is_large);
mi_assert_internal(!info.x.is_large && !info.x.is_pinned);
mi_assert_internal(!mi_option_is_enabled(mi_option_eager_commit) || *commit || mi_option_get(mi_option_eager_commit_delay) > 0);
mi_bitmap_unclaim(&region->reset, 1, blocks, bit_idx);
if (*commit || !mi_option_is_enabled(mi_option_reset_decommits)) { // only if needed
@ -320,7 +328,7 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
if (reset_zero) *is_zero = true;
}
}
mi_assert_internal(!mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
mi_assert_internal(!_mi_bitmap_is_any_claimed(&region->reset, 1, blocks, bit_idx));
#if (MI_DEBUG>=2)
if (*commit) { ((uint8_t*)p)[0] = 0; }
@ -338,12 +346,13 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
// Allocate `size` memory aligned at `alignment`. Return non NULL on success, with a given memory `id`.
// (`id` is abstract, but `id = idx*MI_REGION_MAP_BITS + bitidx`)
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_pinned, bool* is_zero, size_t* memid, mi_os_tld_t* tld)
{
mi_assert_internal(memid != NULL && tld != NULL);
mi_assert_internal(size > 0);
*memid = 0;
*is_zero = false;
*is_pinned = false;
bool default_large = false;
if (large==NULL) large = &default_large; // ensure `large != NULL`
if (size == 0) return NULL;
@ -354,14 +363,14 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l
size_t arena_memid;
const size_t blocks = mi_region_block_count(size);
if (blocks <= MI_REGION_MAX_OBJ_BLOCKS && alignment <= MI_SEGMENT_ALIGN) {
p = mi_region_try_alloc(blocks, commit, large, is_zero, memid, tld);
p = mi_region_try_alloc(blocks, commit, large, is_pinned, is_zero, memid, tld);
if (p == NULL) {
_mi_warning_message("unable to allocate from region: size %zu\n", size);
}
}
if (p == NULL) {
// and otherwise fall back to the OS
p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_zero, &arena_memid, tld);
p = _mi_arena_alloc_aligned(size, alignment, commit, large, is_pinned, is_zero, &arena_memid, tld);
*memid = mi_memid_create_from_arena(arena_memid);
}
@ -400,7 +409,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
const size_t blocks = mi_region_block_count(size);
mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
mi_region_info_t info;
info.value = mi_atomic_read(&region->info);
info.value = mi_atomic_load_acquire(&region->info);
mi_assert_internal(info.value != 0);
void* blocks_start = mi_region_blocks_start(region, bit_idx);
mi_assert_internal(blocks_start == p); // not a pointer in our area?
@ -409,21 +418,21 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
// committed?
if (full_commit && (size % MI_SEGMENT_SIZE) == 0) {
mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
_mi_bitmap_claim(&region->commit, 1, blocks, bit_idx, NULL);
}
if (any_reset) {
// set the is_reset bits if any pages were reset
mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
_mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, NULL);
}
// reset the blocks to reduce the working set.
if (!info.x.is_large && mi_option_is_enabled(mi_option_segment_reset)
if (!info.x.is_large && !info.x.is_pinned && mi_option_is_enabled(mi_option_segment_reset)
&& (mi_option_is_enabled(mi_option_eager_commit) ||
mi_option_is_enabled(mi_option_reset_decommits))) // cannot reset halfway committed segments, use only `option_page_reset` instead
{
bool any_unreset;
mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
_mi_bitmap_claim(&region->reset, 1, blocks, bit_idx, &any_unreset);
if (any_unreset) {
_mi_abandoned_await_readers(); // ensure no more pending write (in case reset = decommit)
_mi_mem_reset(p, blocks * MI_SEGMENT_SIZE, tld);
@ -442,23 +451,21 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
-----------------------------------------------------------------------------*/
void _mi_mem_collect(mi_os_tld_t* tld) {
// free every region that has no segments in use.
uintptr_t rcount = mi_atomic_read_relaxed(&regions_count);
uintptr_t rcount = mi_atomic_load_relaxed(&regions_count);
for (size_t i = 0; i < rcount; i++) {
mem_region_t* region = &regions[i];
if (mi_atomic_read_relaxed(&region->info) != 0) {
if (mi_atomic_load_relaxed(&region->info) != 0) {
// if no segments used, try to claim the whole region
uintptr_t m;
do {
m = mi_atomic_read_relaxed(&region->in_use);
} while(m == 0 && !mi_atomic_cas_weak(&region->in_use, MI_BITMAP_FIELD_FULL, 0 ));
uintptr_t m = mi_atomic_load_relaxed(&region->in_use);
while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
if (m == 0) {
// on success, free the whole region
uint8_t* start = mi_atomic_read_ptr(uint8_t,&regions[i].start);
size_t arena_memid = mi_atomic_read_relaxed(&regions[i].arena_memid);
uintptr_t commit = mi_atomic_read_relaxed(&regions[i].commit);
uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
uintptr_t commit = mi_atomic_load_relaxed(&regions[i].commit);
memset(&regions[i], 0, sizeof(mem_region_t));
// and release the whole region
mi_atomic_write(&region->info, 0);
mi_atomic_store_release(&region->info, (uintptr_t)0);
if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
_mi_abandoned_await_readers(); // ensure no pending reads
_mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats);
@ -198,26 +198,32 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t*
// add/remove guard pages
if (MI_SECURE != 0) {
// in secure mode, we set up a protected page in between the segment info and the page data
const size_t os_page_size = _mi_os_page_size();
mi_assert_internal((segment->segment_info_size - os_page_size) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t))));
mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_page_size == 0);
mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_page_size, os_page_size, protect);
const size_t os_psize = _mi_os_page_size();
mi_assert_internal((segment->segment_info_size - os_psize) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t))));
mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_psize == 0);
mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_psize, os_psize, protect);
if (MI_SECURE <= 1 || segment->capacity == 1) {
// and protect the last (or only) page too
mi_assert_internal(MI_SECURE <= 1 || segment->page_kind >= MI_PAGE_LARGE);
uint8_t* start = (uint8_t*)segment + segment->segment_size - os_page_size;
uint8_t* start = (uint8_t*)segment + segment->segment_size - os_psize;
if (protect && !segment->mem_is_committed) {
// ensure secure page is committed
_mi_mem_commit(start, os_page_size, NULL, tld);
if (protect) {
// ensure secure page is committed
if (_mi_mem_commit(start, os_psize, NULL, tld)) { // if this fails that is ok (as it is an inaccessible page)
mi_segment_protect_range(start, os_psize, protect);
}
}
}
else {
mi_segment_protect_range(start, os_psize, protect);
}
mi_segment_protect_range(start, os_page_size, protect);
}
else {
// or protect every page
const size_t page_size = mi_segment_page_size(segment);
for (size_t i = 0; i < segment->capacity; i++) {
if (segment->pages[i].is_committed) {
mi_segment_protect_range((uint8_t*)segment + (i+1)*page_size - os_page_size, os_page_size, protect);
mi_segment_protect_range((uint8_t*)segment + (i+1)*page_size - os_psize, os_psize, protect);
}
}
}
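
In secure mode the guard pages above are simply OS pages whose protection is flipped between inaccessible and read/write. A minimal POSIX sketch of such a helper (hypothetical name; the real code goes through `_mi_mem_protect` and the Windows path uses VirtualProtect instead):

// guard_page_sketch.c -- illustrative only
#include <sys/mman.h>
#include <stdbool.h>
#include <stddef.h>

// Make `size` bytes at `p` inaccessible (guard) or read/write again.
// `p` and `size` are assumed to be OS-page aligned, as in the segment layout above.
static bool protect_range(void* p, size_t size, bool protect) {
  int prot = (protect ? PROT_NONE : (PROT_READ | PROT_WRITE));
  return (mprotect(p, size, prot) == 0);
}
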
@ -231,7 +237,7 @@ static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t*
static void mi_page_reset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld) {
mi_assert_internal(page->is_committed);
if (!mi_option_is_enabled(mi_option_page_reset)) return;
if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return;
if (segment->mem_is_pinned || page->segment_in_use || !page->is_committed || page->is_reset) return;
size_t psize;
void* start = mi_segment_raw_page_start(segment, page, &psize);
page->is_reset = true;
@ -240,19 +246,23 @@ static void mi_page_reset(mi_segment_t* segment, mi_page_t* page, size_t size, m
if (reset_size > 0) _mi_mem_reset(start, reset_size, tld->os);
}
static void mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld)
static bool mi_page_unreset(mi_segment_t* segment, mi_page_t* page, size_t size, mi_segments_tld_t* tld)
{
mi_assert_internal(page->is_reset);
mi_assert_internal(page->is_committed);
mi_assert_internal(!segment->mem_is_fixed);
if (segment->mem_is_fixed || !page->is_committed || !page->is_reset) return;
mi_assert_internal(!segment->mem_is_pinned);
if (segment->mem_is_pinned || !page->is_committed || !page->is_reset) return true;
page->is_reset = false;
size_t psize;
uint8_t* start = mi_segment_raw_page_start(segment, page, &psize);
size_t unreset_size = (size == 0 || size > psize ? psize : size);
bool is_zero = false;
if (unreset_size > 0) _mi_mem_unreset(start, unreset_size, &is_zero, tld->os);
bool ok = true;
if (unreset_size > 0) {
ok = _mi_mem_unreset(start, unreset_size, &is_zero, tld->os);
}
if (is_zero) page->is_zero_init = true;
return ok;
}
@ -280,7 +290,7 @@ static void mi_pages_reset_add(mi_segment_t* segment, mi_page_t* page, mi_segmen
mi_assert_expensive(!mi_pages_reset_contains(page, tld));
mi_assert_internal(_mi_page_segment(page)==segment);
if (!mi_option_is_enabled(mi_option_page_reset)) return;
if (segment->mem_is_fixed || page->segment_in_use || !page->is_committed || page->is_reset) return;
if (segment->mem_is_pinned || page->segment_in_use || !page->is_committed || page->is_reset) return;
if (mi_option_get(mi_option_reset_delay) == 0) {
// reset immediately?
@ -320,7 +330,7 @@ static void mi_pages_reset_remove(mi_page_t* page, mi_segments_tld_t* tld) {
}
static void mi_pages_reset_remove_all_in_segment(mi_segment_t* segment, bool force_reset, mi_segments_tld_t* tld) {
if (segment->mem_is_fixed) return; // never reset in huge OS pages
if (segment->mem_is_pinned) return; // never reset in huge OS pages
for (size_t i = 0; i < segment->capacity; i++) {
mi_page_t* page = &segment->pages[i];
if (!page->segment_in_use && page->is_committed && !page->is_reset) {
@ -375,11 +385,13 @@ static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_
psize -= segment->segment_info_size;
}
if (MI_SECURE > 1 || (MI_SECURE == 1 && page->segment_idx == segment->capacity - 1)) {
// secure == 1: the last page has an os guard page at the end
// secure > 1: every page has an os guard page
#if (MI_SECURE > 1) // every page has an os guard page
psize -= _mi_os_page_size();
#elif (MI_SECURE==1) // the last page has an os guard page at the end
if (page->segment_idx == segment->capacity - 1) {
psize -= _mi_os_page_size();
}
#endif
if (page_size != NULL) *page_size = psize;
mi_assert_internal(page->xblock_size == 0 || _mi_ptr_page(p) == page);
@ -428,7 +440,7 @@ static size_t mi_segment_size(size_t capacity, size_t required, size_t* pre_size
guardsize = page_size;
required = _mi_align_up(required, page_size);
}
;
if (info_size != NULL) *info_size = isize;
if (pre_size != NULL) *pre_size = isize + guardsize;
return (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + 2*guardsize, MI_PAGE_HUGE_ALIGN) );
@ -454,7 +466,7 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se
segment->thread_id = 0;
mi_segments_track_size(-((long)segment_size),tld);
if (MI_SECURE != 0) {
mi_assert_internal(!segment->mem_is_fixed);
mi_assert_internal(!segment->mem_is_pinned);
mi_segment_protect(segment, false, tld->os); // ensure no more guard pages are set
}
@ -468,7 +480,6 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se
if (any_reset && mi_option_is_enabled(mi_option_reset_decommits)) {
fully_committed = false;
}
_mi_mem_free(segment, segment_size, segment->memid, fully_committed, any_reset, tld->os);
}
@ -584,7 +595,7 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
else
{
if (MI_SECURE!=0) {
mi_assert_internal(!segment->mem_is_fixed);
mi_assert_internal(!segment->mem_is_pinned);
mi_segment_protect(segment, false, tld->os); // reset protection if the page kind differs
}
// different page kinds; unreset any reset pages, and unprotect
@ -603,8 +614,11 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
// ensure the initial info is committed
if (segment->capacity < capacity) {
bool commit_zero = false;
_mi_mem_commit(segment, pre_size, &commit_zero, tld->os);
bool ok = _mi_mem_commit(segment, pre_size, &commit_zero, tld->os);
if (commit_zero) is_zero = true;
if (!ok) {
return NULL;
}
}
}
}
@ -612,10 +626,12 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
// Allocate the segment from the OS
size_t memid;
bool mem_large = (!eager_delayed && (MI_SECURE==0)); // only allow large OS pages once we are no longer lazy
segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_zero, &memid, os_tld);
bool is_pinned = false;
segment = (mi_segment_t*)_mi_mem_alloc_aligned(segment_size, MI_SEGMENT_SIZE, &commit, &mem_large, &is_pinned, &is_zero, &memid, os_tld);
if (segment == NULL) return NULL; // failed to allocate
if (!commit) {
// ensure the initial info is committed
mi_assert_internal(!mem_large && !is_pinned);
bool commit_zero = false;
bool ok = _mi_mem_commit(segment, pre_size, &commit_zero, tld->os);
if (commit_zero) is_zero = true;
@ -626,12 +642,13 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
}
}
segment->memid = memid;
segment->mem_is_fixed = mem_large;
segment->mem_is_committed = commit;
segment->mem_is_pinned = (mem_large || is_pinned);
segment->mem_is_committed = commit;
mi_segments_track_size((long)segment_size, tld);
}
mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0);
mi_assert_internal(segment->mem_is_pinned ? segment->mem_is_committed : true);
mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan
if (!pages_still_good) {
// zero the segment info (but not the `mem` fields)
ptrdiff_t ofs = offsetof(mi_segment_t, next);
@ -715,7 +732,7 @@ static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg
mi_pages_reset_remove(page, tld);
// check commit
if (!page->is_committed) {
mi_assert_internal(!segment->mem_is_fixed);
mi_assert_internal(!segment->mem_is_pinned);
mi_assert_internal(!page->is_reset);
size_t psize;
uint8_t* start = mi_segment_raw_page_start(segment, page, &psize);
@ -732,7 +749,13 @@ static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_seg
segment->used++;
// check reset
if (page->is_reset) {
mi_page_unreset(segment, page, 0, tld); // todo: only unreset the part that was reset?
mi_assert_internal(!segment->mem_is_pinned);
bool ok = mi_page_unreset(segment, page, 0, tld);
if (!ok) {
page->segment_in_use = false;
segment->used--;
return false;
}
}
mi_assert_internal(page->segment_in_use);
mi_assert_internal(segment->used <= segment->capacity);
@ -791,7 +814,7 @@ static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, bool a
mi_pages_reset_add(segment, page, tld);
}
page->capacity = 0; // after reset there can be zero'd now
page->capacity = 0; // after reset these can be zero'd now
page->reserved = 0;
}
@ -867,84 +890,97 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se
// This is a list of visited abandoned pages that were full at the time.
// this list migrates to `abandoned` when that becomes NULL. The use of
// this list reduces contention and the rate at which segments are visited.
static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL
static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL
// The abandoned page list (tagged as it supports pop)
static mi_decl_cache_align volatile _Atomic(mi_tagged_segment_t) abandoned; // = NULL
static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL
// Maintain these for debug purposes (these counts may be a bit off)
static mi_decl_cache_align _Atomic(uintptr_t) abandoned_count;
static mi_decl_cache_align _Atomic(uintptr_t) abandoned_visited_count;
// We also maintain a count of current readers of the abandoned list
// in order to prevent resetting/decommitting segment memory if it might
// still be read.
static mi_decl_cache_align volatile _Atomic(uintptr_t) abandoned_readers; // = 0
static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = 0
// Push on the visited list
static void mi_abandoned_visited_push(mi_segment_t* segment) {
mi_assert_internal(segment->thread_id == 0);
mi_assert_internal(segment->abandoned_next == NULL);
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
mi_assert_internal(segment->next == NULL && segment->prev == NULL);
mi_assert_internal(segment->used > 0);
mi_segment_t* anext;
mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
do {
anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited);
segment->abandoned_next = anext;
} while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned_visited, segment, anext));
mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
} while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
mi_atomic_increment_relaxed(&abandoned_visited_count);
}
// Move the visited list to the abandoned list.
static bool mi_abandoned_visited_revisit(void)
{
// quick check if the visited list is empty
if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false;
if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
// grab the whole visited list
mi_segment_t* first = mi_atomic_exchange_ptr(mi_segment_t, &abandoned_visited, NULL);
mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
if (first == NULL) return false;
// first try to swap directly if the abandoned list happens to be NULL
const mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned);
mi_tagged_segment_t afirst;
mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
if (mi_tagged_segment_ptr(ts)==NULL) {
uintptr_t count = mi_atomic_load_relaxed(&abandoned_visited_count);
afirst = mi_tagged_segment(first, ts);
if (mi_atomic_cas_strong(&abandoned, afirst, ts)) return true;
if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) {
mi_atomic_add_relaxed(&abandoned_count, count);
mi_atomic_sub_relaxed(&abandoned_visited_count, count);
return true;
}
}
// find the last element of the visited list: O(n)
mi_segment_t* last = first;
while (last->abandoned_next != NULL) {
last = last->abandoned_next;
mi_segment_t* next;
while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) {
last = next;
}
// and atomically prepend to the abandoned list
// (no need to increase the readers as we don't access the abandoned segments)
mi_tagged_segment_t anext;
mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
uintptr_t count;
do {
anext = mi_atomic_read_relaxed(&abandoned);
last->abandoned_next = mi_tagged_segment_ptr(anext);
count = mi_atomic_load_relaxed(&abandoned_visited_count);
mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
afirst = mi_tagged_segment(first, anext);
} while (!mi_atomic_cas_weak(&abandoned, afirst, anext));
} while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
mi_atomic_add_relaxed(&abandoned_count, count);
mi_atomic_sub_relaxed(&abandoned_visited_count, count);
return true;
}
// Push on the abandoned list.
static void mi_abandoned_push(mi_segment_t* segment) {
mi_assert_internal(segment->thread_id == 0);
mi_assert_internal(segment->abandoned_next == NULL);
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
mi_assert_internal(segment->next == NULL && segment->prev == NULL);
mi_assert_internal(segment->used > 0);
mi_tagged_segment_t ts;
mi_tagged_segment_t next;
mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
do {
ts = mi_atomic_read_relaxed(&abandoned);
segment->abandoned_next = mi_tagged_segment_ptr(ts);
mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
next = mi_tagged_segment(segment, ts);
} while (!mi_atomic_cas_weak(&abandoned, next, ts));
} while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
mi_atomic_increment_relaxed(&abandoned_count);
}
// Wait until there are no more pending reads on segments that used to be in the abandoned list
void _mi_abandoned_await_readers(void) {
uintptr_t n;
do {
n = mi_atomic_read(&abandoned_readers);
n = mi_atomic_load_acquire(&abandoned_readers);
if (n != 0) mi_atomic_yield();
} while (n != 0);
}
@ -953,7 +989,7 @@ void _mi_abandoned_await_readers(void) {
static mi_segment_t* mi_abandoned_pop(void) {
mi_segment_t* segment;
// Check efficiently if it is empty (or if the visited list needs to be moved)
mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned);
mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
segment = mi_tagged_segment_ptr(ts);
if (mi_likely(segment == NULL)) {
if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
@ -964,19 +1000,21 @@ static mi_segment_t* mi_abandoned_pop(void) {
// Do a pop. We use a reader count to prevent
// a segment to be decommitted while a read is still pending,
// and a tagged pointer to prevent A-B-A link corruption.
// (this is called from `memory.c:_mi_mem_free` for example)
mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted
// (this is called from `region.c:_mi_mem_free` for example)
mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted
mi_tagged_segment_t next = 0;
ts = mi_atomic_load_acquire(&abandoned);
do {
ts = mi_atomic_read_relaxed(&abandoned);
segment = mi_tagged_segment_ptr(ts);
if (segment != NULL) {
next = mi_tagged_segment(segment->abandoned_next, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
}
} while (segment != NULL && !mi_atomic_cas_weak(&abandoned, next, ts));
mi_atomic_decrement(&abandoned_readers); // release reader lock
} while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock
if (segment != NULL) {
segment->abandoned_next = NULL;
mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
mi_atomic_decrement_relaxed(&abandoned_count);
}
return segment;
}
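The pop above pairs the tagged CAS with a reader count: any thread about to dereference a segment that may sit on the abandoned list first increments `abandoned_readers`, and the code that resets or decommits segment memory waits for the count to drop to zero. A generic sketch of the same guard, with hypothetical names (`pop_guarded`, `wait_for_readers`) and C11 atomics, POSIX `sched_yield` standing in for `mi_atomic_yield`:

#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>
#include <sched.h>

typedef struct seg_s { struct seg_s* _Atomic next; } seg_t;

static _Atomic(seg_t*)    list_head;   // shared list (the tagged-pointer detail is omitted here)
static _Atomic(uintptr_t) readers;     // threads that may currently be reading a list node

static seg_t* pop_guarded(void) {
  atomic_fetch_add_explicit(&readers, 1, memory_order_relaxed);   // from here on, nodes must stay mapped
  seg_t* s = atomic_load_explicit(&list_head, memory_order_acquire);
  seg_t* next = NULL;
  do {
    if (s == NULL) break;
    next = atomic_load_explicit(&s->next, memory_order_relaxed);  // safe: decommit waits for readers == 0
  } while (!atomic_compare_exchange_weak_explicit(&list_head, &s, next,
                                                  memory_order_acq_rel, memory_order_acquire));
  atomic_fetch_sub_explicit(&readers, 1, memory_order_release);   // drop the "reader lock"
  return s;
}

// call this before resetting/decommitting memory that may back former list nodes
static void wait_for_readers(void) {
  while (atomic_load_explicit(&readers, memory_order_acquire) != 0) {
    sched_yield();
  }
}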
@ -988,7 +1026,7 @@ static mi_segment_t* mi_abandoned_pop(void) {
static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
mi_assert_internal(segment->used == segment->abandoned);
mi_assert_internal(segment->used > 0);
mi_assert_internal(segment->abandoned_next == NULL);
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
mi_assert_expensive(mi_segment_is_valid(segment, tld));
// remove the segment from the free page queue if needed
@ -1001,8 +1039,8 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
_mi_stat_increase(&tld->stats->segments_abandoned, 1);
mi_segments_track_size(-((long)segment->segment_size), tld);
segment->thread_id = 0;
segment->abandoned_next = NULL;
segment->abandoned_visits = 0;
mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
mi_abandoned_push(segment);
}
@ -1066,7 +1104,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool
// Reclaim a segment; returns NULL if the segment was freed
// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full.
static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) {
mi_assert_internal(segment->abandoned_next == NULL);
mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; }
segment->thread_id = _mi_thread_id();
@ -1283,28 +1321,27 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block
// huge page segments are always abandoned and can be freed immediately by any thread
mi_assert_internal(segment->page_kind==MI_PAGE_HUGE);
mi_assert_internal(segment == _mi_page_segment(page));
mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);
// claim it and free
mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
// paranoia: if this is the last reference, the CAS should always succeed
if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) {
uintptr_t expected_tid = 0;
if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
mi_block_set_next(page, block, page->free);
page->free = block;
page->used--;
page->is_zero = false;
mi_assert(page->used == 0);
mi_tld_t* tld = heap->tld;
const size_t bsize = mi_page_usable_block_size(page);
if (bsize > MI_HUGE_OBJ_SIZE_MAX) {
_mi_stat_decrease(&tld->stats.giant, bsize);
}
else {
_mi_stat_decrease(&tld->stats.huge, bsize);
}
mi_segments_track_size((long)segment->segment_size, &tld->segments);
_mi_segment_page_free(page, true, &tld->segments);
}
#if (MI_DEBUG!=0)
else {
mi_assert_internal(false);
}
#endif
}
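Claiming the abandoned huge-page segment above is a one-shot ownership hand-over: a strong CAS flips `thread_id` from 0 (unowned) to the caller's id, and only the winner proceeds to free. A generic sketch of the pattern, with hypothetical names and plain C11 atomics:

#include <stdatomic.h>
#include <stdint.h>
#include <stdbool.h>

typedef struct resource_s {
  _Atomic(uintptr_t) owner_tid;   // 0 means "abandoned / unowned"
  // ... payload ...
} resource_t;

// returns true if we won ownership and may now release or reuse the resource
static bool try_claim(resource_t* r, uintptr_t my_tid) {
  uintptr_t expected = 0;         // only an unowned resource can be claimed
  return atomic_compare_exchange_strong_explicit(&r->owner_tid, &expected, my_tid,
                                                 memory_order_acq_rel, memory_order_acquire);
}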
/* -----------------------------------------------------------

View File

@ -4,7 +4,14 @@ This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/
#ifndef _DEFAULT_SOURCE
#define _DEFAULT_SOURCE
#endif
#if defined(__sun)
// same remarks as os.c for the static's context.
#undef _XOPEN_SOURCE
#undef _POSIX_C_SOURCE
#endif
#include "mimalloc.h"
#include "mimalloc-internal.h"
@ -16,6 +23,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include "stats.c"
#include "random.c"
#include "os.c"
#include "bitmap.c"
#include "arena.c"
#include "region.c"
#include "segment.c"

View File

@ -11,6 +11,9 @@ terms of the MIT license. A copy of the license can be found in the file
#include <stdio.h> // fputs, stderr
#include <string.h> // memset
#if defined(_MSC_VER) && (_MSC_VER < 1920)
#pragma warning(disable:4204) // non-constant aggregate initializer
#endif
/* -----------------------------------------------------------
Statistics operations
@ -26,13 +29,13 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
if (mi_is_in_main(stat))
{
// add atomically (for abandoned pages)
mi_atomic_addi64(&stat->current,amount);
if (stat->current > stat->peak) stat->peak = stat->current; // racing.. it's ok
int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
if (amount > 0) {
mi_atomic_addi64(&stat->allocated,amount);
mi_atomic_addi64_relaxed(&stat->allocated,amount);
}
else {
mi_atomic_addi64(&stat->freed, -amount);
mi_atomic_addi64_relaxed(&stat->freed, -amount);
}
}
else {
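The peak update above replaces a racy read-compare-store with an atomic maximum. `mi_atomic_maxi64_relaxed` is mimalloc's own helper; in plain C11 the same monotone maximum can be written as a compare-exchange loop, sketched here with a hypothetical `atomic_max_i64`:

#include <stdatomic.h>
#include <stdint.h>

// raise *p to at least `value`; never lowers it, even under contention
static void atomic_max_i64(_Atomic(int64_t)* p, int64_t value) {
  int64_t cur = atomic_load_explicit(p, memory_order_relaxed);
  while (cur < value &&
         !atomic_compare_exchange_weak_explicit(p, &cur, value,
                                                memory_order_relaxed, memory_order_relaxed)) {
    // on failure `cur` holds the latest value; retry only while it is still smaller
  }
}

// usage matching the stat update above (names are illustrative):
//   int64_t old = atomic_fetch_add_explicit(&stat_current, amount, memory_order_relaxed);
//   atomic_max_i64(&stat_peak, old + amount);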
@ -50,8 +53,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
if (mi_is_in_main(stat)) {
mi_atomic_addi64( &stat->count, 1 );
mi_atomic_addi64( &stat->total, (int64_t)amount );
mi_atomic_addi64_relaxed( &stat->count, 1 );
mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
}
else {
stat->count++;
@ -70,17 +73,18 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
// must be thread safe as it is called from stats_merge
static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
if (stat==src) return;
mi_atomic_addi64( &stat->allocated, src->allocated * unit);
mi_atomic_addi64( &stat->current, src->current * unit);
mi_atomic_addi64( &stat->freed, src->freed * unit);
// peak scores do not work across threads..
mi_atomic_addi64( &stat->peak, src->peak * unit);
if (src->allocated==0 && src->freed==0) return;
mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit);
mi_atomic_addi64_relaxed( &stat->current, src->current * unit);
mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit);
// peak scores do not work across threads..
mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit);
}
static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
if (stat==src) return;
mi_atomic_addi64( &stat->total, src->total * unit);
mi_atomic_addi64( &stat->count, src->count * unit);
mi_atomic_addi64_relaxed( &stat->total, src->total * unit);
mi_atomic_addi64_relaxed( &stat->count, src->count * unit);
}
// must be thread safe as it is called from stats_merge
@ -99,6 +103,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_add(&stats->malloc, &src->malloc, 1);
mi_stat_add(&stats->segments_cache, &src->segments_cache, 1);
mi_stat_add(&stats->normal, &src->normal, 1);
mi_stat_add(&stats->huge, &src->huge, 1);
mi_stat_add(&stats->giant, &src->giant, 1);
@ -108,12 +113,13 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1);
mi_stat_counter_add(&stats->searches, &src->searches, 1);
mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1);
mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1);
mi_stat_counter_add(&stats->giant_count, &src->giant_count, 1);
#if MI_STAT>1
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
if (src->normal[i].allocated > 0 || src->normal[i].freed > 0) {
mi_stat_add(&stats->normal[i], &src->normal[i], 1);
if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) {
mi_stat_add(&stats->normal_bins[i], &src->normal_bins[i], 1);
}
}
#endif
@ -145,7 +151,7 @@ static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void*
const int64_t tens = (n / (divider/10));
const long whole = (long)(tens/10);
const long frac1 = (long)(tens%10);
snprintf(buf, len, "%ld.%ld %s%s", whole, frac1, magnitude, suffix);
snprintf(buf, len, "%ld.%ld %s%s", whole, (frac1 < 0 ? -frac1 : frac1), magnitude, suffix);
}
_mi_fprintf(out, arg, (fmt==NULL ? "%11s" : fmt), buf);
}
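The `(frac1 < 0 ? -frac1 : frac1)` matters for negative amounts, because the C `%` operator keeps the sign of the dividend. A small worked example, assuming a divider of 1024 for KiB-scaled output:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t n = -1536, divider = 1024;     // i.e. minus one and a half units
  const int64_t tens  = n / (divider/10);      // -1536 / 102 == -15
  const long    whole = (long)(tens/10);       // -1
  const long    frac1 = (long)(tens%10);       // -5
  printf("%ld.%ld\n", whole, frac1);                            // prints "-1.-5"
  printf("%ld.%ld\n", whole, (frac1 < 0 ? -frac1 : frac1));     // prints "-1.5"
  return 0;
}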
@ -166,6 +172,7 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t
mi_print_amount(stat->peak, unit, out, arg);
mi_print_amount(stat->allocated, unit, out, arg);
mi_print_amount(stat->freed, unit, out, arg);
mi_print_amount(stat->current, unit, out, arg);
mi_print_amount(unit, 1, out, arg);
mi_print_count(stat->allocated, unit, out, arg);
if (stat->allocated > stat->freed)
@ -177,6 +184,7 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t
mi_print_amount(stat->peak, -1, out, arg);
mi_print_amount(stat->allocated, -1, out, arg);
mi_print_amount(stat->freed, -1, out, arg);
mi_print_amount(stat->current, -1, out, arg);
if (unit==-1) {
_mi_fprintf(out, arg, "%22s", "");
}
@ -192,6 +200,8 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t
else {
mi_print_amount(stat->peak, 1, out, arg);
mi_print_amount(stat->allocated, 1, out, arg);
_mi_fprintf(out, arg, "%11s", " "); // no freed
mi_print_amount(stat->current, 1, out, arg);
_mi_fprintf(out, arg, "\n");
}
}
@ -211,24 +221,21 @@ static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char*
static void mi_print_header(mi_output_fun* out, void* arg ) {
_mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "unit ", "count ");
_mi_fprintf(out, arg, "%10s: %10s %10s %10s %10s %10s %10s\n", "heap stats", "peak ", "total ", "freed ", "current ", "unit ", "count ");
}
#if MI_STAT>1
static void mi_stats_print_bins(mi_stat_count_t* all, const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
static void mi_stats_print_bins(const mi_stat_count_t* bins, size_t max, const char* fmt, mi_output_fun* out, void* arg) {
bool found = false;
char buf[64];
for (size_t i = 0; i <= max; i++) {
if (bins[i].allocated > 0) {
found = true;
int64_t unit = _mi_bin_size((uint8_t)i);
snprintf(buf, 64, "%s %3zu", fmt, i);
mi_stat_add(all, &bins[i], unit);
snprintf(buf, 64, "%s %3lu", fmt, (long)i);
mi_stat_print(&bins[i], buf, unit, out, arg);
}
}
//snprintf(buf, 64, "%s all", fmt);
//mi_stat_print(all, buf, 1);
if (found) {
_mi_fprintf(out, arg, "\n");
mi_print_header(out, arg);
@ -272,31 +279,34 @@ static void mi_buffered_out(const char* msg, void* arg) {
// Print statistics
//------------------------------------------------------------
static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit);
static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept {
// wrap the output function to be line buffered
char buf[256];
buffered_t buffer = { out0, arg0, buf, 0, 255 };
buffered_t buffer = { out0, arg0, NULL, 0, 255 };
buffer.buf = buf;
mi_output_fun* out = &mi_buffered_out;
void* arg = &buffer;
// and print using that
mi_print_header(out,arg);
#if MI_STAT>1
mi_stat_count_t normal = { 0,0,0,0 };
mi_stats_print_bins(&normal, stats->normal, MI_BIN_HUGE, "normal",out,arg);
mi_stat_print(&normal, "normal", 1, out, arg);
mi_stats_print_bins(stats->normal_bins, MI_BIN_HUGE, "normal",out,arg);
#endif
#if MI_STAT
mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg);
mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg);
mi_stat_print(&stats->giant, "giant", (stats->giant_count.count == 0 ? 1 : -(stats->giant.allocated / stats->giant_count.count)), out, arg);
mi_stat_count_t total = { 0,0,0,0 };
mi_stat_add(&total, &normal, 1);
mi_stat_add(&total, &stats->normal, 1);
mi_stat_add(&total, &stats->huge, 1);
mi_stat_add(&total, &stats->giant, 1);
mi_stat_print(&total, "total", 1, out, arg);
_mi_fprintf(out, arg, "malloc requested: ");
mi_print_amount(stats->malloc.allocated, 1, out, arg);
_mi_fprintf(out, arg, "\n\n");
#endif
#if MI_STAT>1
mi_stat_print(&stats->malloc, "malloc req", 1, out, arg);
_mi_fprintf(out, arg, "\n");
#endif
mi_stat_print(&stats->reserved, "reserved", 1, out, arg);
mi_stat_print(&stats->committed, "committed", 1, out, arg);
@ -314,25 +324,28 @@ static void _mi_stats_print(mi_stats_t* stats, mi_msecs_t elapsed, mi_output_fun
mi_stat_print(&stats->threads, "threads", -1, out, arg);
mi_stat_counter_print_avg(&stats->searches, "searches", out, arg);
_mi_fprintf(out, arg, "%10s: %7i\n", "numa nodes", _mi_os_numa_node_count());
if (elapsed > 0) _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
mi_msecs_t elapsed;
mi_msecs_t user_time;
mi_msecs_t sys_time;
size_t current_rss;
size_t peak_rss;
size_t page_faults;
size_t page_reclaim;
size_t current_commit;
size_t peak_commit;
mi_process_info(&user_time, &sys_time, &peak_rss, &page_faults, &page_reclaim, &peak_commit);
_mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, reclaims: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults, (unsigned long)page_reclaim );
size_t page_faults;
mi_stat_process_info(&elapsed, &user_time, &sys_time, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
_mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000);
_mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process",
user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults );
mi_printf_amount((int64_t)peak_rss, 1, out, arg, "%s");
if (peak_commit > 0) {
_mi_fprintf(out, arg, ", commit charge: ");
_mi_fprintf(out, arg, ", commit: ");
mi_printf_amount((int64_t)peak_commit, 1, out, arg, "%s");
}
_mi_fprintf(out, arg, "\n");
}
static mi_msecs_t mi_time_start; // = 0
static mi_msecs_t mi_process_start; // = 0
static mi_stats_t* mi_stats_get_default(void) {
mi_heap_t* heap = mi_heap_get_default();
@ -350,7 +363,7 @@ void mi_stats_reset(void) mi_attr_noexcept {
mi_stats_t* stats = mi_stats_get_default();
if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
mi_time_start = _mi_clock_start();
if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
}
void mi_stats_merge(void) mi_attr_noexcept {
@ -362,9 +375,8 @@ void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done`
}
void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
mi_stats_merge_from(mi_stats_get_default());
_mi_stats_print(&_mi_stats_main, elapsed, out, arg);
_mi_stats_print(&_mi_stats_main, out, arg);
}
void mi_stats_print(void* out) mi_attr_noexcept {
@ -373,8 +385,7 @@ void mi_stats_print(void* out) mi_attr_noexcept {
}
void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_msecs_t elapsed = _mi_clock_end(mi_time_start);
_mi_stats_print(mi_stats_get_default(), elapsed, out, arg);
_mi_stats_print(mi_stats_get_default(), out, arg);
}
@ -382,7 +393,7 @@ void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
// Basic timer for convenience; use milli-seconds to avoid doubles
// ----------------------------------------------------------------
#ifdef _WIN32
#include <windows.h>
#include <Windows.h>
static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
static LARGE_INTEGER mfreq; // = 0
if (mfreq.QuadPart == 0LL) {
@ -437,7 +448,7 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
// --------------------------------------------------------
#if defined(_WIN32)
#include <windows.h>
#include <Windows.h>
#include <psapi.h>
#pragma comment(lib,"psapi.lib")
@ -448,7 +459,10 @@ static mi_msecs_t filetime_msecs(const FILETIME* ftime) {
mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds
return msecs;
}
static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
{
*elapsed = _mi_clock_end(mi_process_start);
FILETIME ct;
FILETIME ut;
FILETIME st;
@ -456,16 +470,16 @@ static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_r
GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut);
*utime = filetime_msecs(&ut);
*stime = filetime_msecs(&st);
PROCESS_MEMORY_COUNTERS info;
GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
*peak_rss = (size_t)info.PeakWorkingSetSize;
*page_faults = (size_t)info.PageFaultCount;
*peak_commit = (size_t)info.PeakPagefileUsage;
*page_reclaim = 0;
*current_rss = (size_t)info.WorkingSetSize;
*peak_rss = (size_t)info.PeakWorkingSetSize;
*current_commit = (size_t)info.PagefileUsage;
*peak_commit = (size_t)info.PeakPagefileUsage;
*page_faults = (size_t)info.PageFaultCount;
}
#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__))
#elif defined(__unix__) || defined(__unix) || defined(unix) || (defined(__APPLE__) && defined(__MACH__)) || defined(__HAIKU__)
#include <stdio.h>
#include <unistd.h>
#include <sys/resource.h>
@ -474,23 +488,48 @@ static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_r
#include <mach/mach.h>
#endif
#if defined(__HAIKU__)
#include <kernel/OS.h>
#endif
static mi_msecs_t timeval_secs(const struct timeval* tv) {
return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L);
}
static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
{
*elapsed = _mi_clock_end(mi_process_start);
struct rusage rusage;
getrusage(RUSAGE_SELF, &rusage);
#if defined(__APPLE__) && defined(__MACH__)
*peak_rss = rusage.ru_maxrss;
#else
*peak_rss = rusage.ru_maxrss * 1024;
#endif
*page_faults = rusage.ru_majflt;
*page_reclaim = rusage.ru_minflt;
*peak_commit = 0;
*utime = timeval_secs(&rusage.ru_utime);

*stime = timeval_secs(&rusage.ru_stime);
#if !defined(__HAIKU__)
*page_faults = rusage.ru_majflt;
#endif
// estimate commit using our stats
*peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
*current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
*current_rss = *current_commit; // estimate
#if defined(__HAIKU__)
// Haiku does not (yet?) have a way to
// get these stats per process
thread_info tid;
area_info mem;
ssize_t c;
get_thread_info(find_thread(0), &tid);
while (get_next_area_info(tid.team, &c, &mem) == B_OK) {
*peak_rss += mem.ram_size;
}
#elif defined(__APPLE__) && defined(__MACH__)
*peak_rss = rusage.ru_maxrss; // BSD reports in bytes
struct mach_task_basic_info info;
mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) {
*current_rss = (size_t)info.resident_size;
}
#else
*peak_rss = rusage.ru_maxrss * 1024; // Linux reports in KiB
#endif
}
#else
@ -499,12 +538,38 @@ static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_r
#pragma message("define a way to get process info")
#endif
static void mi_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* peak_rss, size_t* page_faults, size_t* page_reclaim, size_t* peak_commit) {
*peak_rss = 0;
static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults)
{
*elapsed = _mi_clock_end(mi_process_start);
*peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak));
*current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current));
*peak_rss = *peak_commit;
*current_rss = *current_commit;
*page_faults = 0;
*page_reclaim = 0;
*peak_commit = 0;
*utime = 0;
*stime = 0;
}
#endif
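The unit difference in the Unix branch above (ru_maxrss is bytes on macOS, kilobytes on Linux) is easy to get wrong; a standalone sketch of a peak-RSS query along the same lines, assuming only POSIX getrusage:

#include <sys/resource.h>
#include <stddef.h>

// peak resident set size in bytes (0 if unavailable)
static size_t peak_rss_bytes(void) {
  struct rusage ru;
  if (getrusage(RUSAGE_SELF, &ru) != 0) return 0;
#if defined(__APPLE__) && defined(__MACH__)
  return (size_t)ru.ru_maxrss;            // macOS reports ru_maxrss in bytes
#else
  return (size_t)ru.ru_maxrss * 1024;     // Linux (and most other systems) report KiB
#endif
}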
mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept
{
mi_msecs_t elapsed = 0;
mi_msecs_t utime = 0;
mi_msecs_t stime = 0;
size_t current_rss0 = 0;
size_t peak_rss0 = 0;
size_t current_commit0 = 0;
size_t peak_commit0 = 0;
size_t page_faults0 = 0;
mi_stat_process_info(&elapsed,&utime, &stime, &current_rss0, &peak_rss0, &current_commit0, &peak_commit0, &page_faults0);
if (elapsed_msecs!=NULL) *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX));
if (user_msecs!=NULL) *user_msecs = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX));
if (system_msecs!=NULL) *system_msecs = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX));
if (current_rss!=NULL) *current_rss = current_rss0;
if (peak_rss!=NULL) *peak_rss = peak_rss0;
if (current_commit!=NULL) *current_commit = current_commit0;
if (peak_commit!=NULL) *peak_commit = peak_commit0;
if (page_faults!=NULL) *page_faults = page_faults0;
}
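Because every out-parameter is checked against NULL, callers can request only the fields they care about. A minimal usage sketch, assuming the function is declared in mimalloc.h as in the test program below:

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  size_t peak_rss = 0, peak_commit = 0;
  // pass NULL for every field we do not need
  mi_process_info(NULL, NULL, NULL, NULL, &peak_rss, NULL, &peak_commit, NULL);
  printf("peak rss: %zu bytes, peak commit: %zu bytes\n", peak_rss, peak_commit);
  return 0;
}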

View File

@ -11,38 +11,54 @@ static void double_free1();
static void double_free2();
static void corrupt_free();
static void block_overflow1();
static void invalid_free();
static void test_aslr(void);
static void test_process_info(void);
static void test_reserved(void);
static void negative_stat(void);
int main() {
mi_version();
mi_stats_reset();
// detect double frees and heap corruption
// double_free1();
// double_free2();
// corrupt_free();
block_overflow1();
// block_overflow1();
// test_aslr();
// invalid_free();
// test_reserved();
// negative_stat();
void* p1 = malloc(78);
void* p2 = malloc(24);
free(p1);
p1 = mi_malloc(8);
//char* s = strdup("hello\n");
char* s = strdup("hello\n");
free(p2);
p2 = malloc(16);
p1 = realloc(p1, 32);
free(p1);
free(p2);
//free(s);
//mi_collect(true);
free(s);
/* now test if override worked by allocating/freeing across the api's*/
//p1 = mi_malloc(32);
//free(p1);
//p2 = malloc(32);
//mi_free(p2);
mi_collect(true);
mi_stats_print(NULL);
// test_process_info();
return 0;
}
static void invalid_free() {
free((void*)0xBADBEEF);
realloc((void*)0xBADBEEF,10);
}
static void block_overflow1() {
uint8_t* p = (uint8_t*)mi_malloc(17);
p[18] = 0;
@ -114,3 +130,53 @@ static void corrupt_free() {
malloc(SZ);
}
}
static void test_aslr(void) {
void* p[256];
p[0] = malloc(378200);
p[1] = malloc(1134626);
printf("p1: %p, p2: %p\n", p[0], p[1]);
}
static void test_process_info(void) {
size_t elapsed = 0;
size_t user_msecs = 0;
size_t system_msecs = 0;
size_t current_rss = 0;
size_t peak_rss = 0;
size_t current_commit = 0;
size_t peak_commit = 0;
size_t page_faults = 0;
for (int i = 0; i < 100000; i++) {
void* p = calloc(100,10);
free(p);
}
mi_process_info(&elapsed, &user_msecs, &system_msecs, &current_rss, &peak_rss, &current_commit, &peak_commit, &page_faults);
printf("\n\n*** process info: elapsed %3zd.%03zd s, user: %3zd.%03zd s, rss: %zd b, commit: %zd b\n\n", elapsed/1000, elapsed%1000, user_msecs/1000, user_msecs%1000, peak_rss, peak_commit);
}
static void test_reserved(void) {
#define KiB 1024ULL
#define MiB (KiB*KiB)
#define GiB (MiB*KiB)
mi_reserve_os_memory(4*GiB, false, true);
void* p1 = malloc(100);
void* p2 = malloc(100000);
void* p3 = malloc(2*GiB);
void* p4 = malloc(1*GiB + 100000);
free(p1);
free(p2);
free(p3);
p3 = malloc(1*GiB);
free(p4);
}
static void negative_stat(void) {
int* p = mi_malloc(60000);
mi_stats_print_out(NULL, NULL);
*p = 100;
mi_free(p);
mi_stats_print_out(NULL, NULL);
}

View File

@ -14,10 +14,12 @@
#include <mimalloc.h>
#include <assert.h>
#ifdef _WIN32
#include <mimalloc-new-delete.h>
#endif
#ifdef _WIN32
#include <windows.h>
#include <Windows.h>
static void msleep(unsigned long msecs) { Sleep(msecs); }
#else
#include <unistd.h>

View File

@ -20,7 +20,6 @@ terms of the MIT license.
#include <stdint.h>
#include <stdbool.h>
#include <string.h>
#include <mimalloc.h>
// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
//
@ -43,6 +42,7 @@ static size_t use_one_size = 0; // use single object size of `N * s
#define custom_realloc(p,s) realloc(p,s)
#define custom_free(p) free(p)
#else
#include <mimalloc.h>
#define custom_calloc(n,s) mi_calloc(n,s)
#define custom_realloc(p,s) mi_realloc(p,s)
#define custom_free(p) mi_free(p)
@ -123,7 +123,7 @@ static void free_items(void* p) {
static void stress(intptr_t tid) {
//bench_start_thread();
uintptr_t r = (tid * 43); // rand();
uintptr_t r = ((tid + 1) * 43); // rand();
const size_t max_item_shift = 5; // 128
const size_t max_item_retained_shift = max_item_shift + 2;
size_t allocs = 100 * ((size_t)SCALE) * (tid % 8 + 1); // some threads do more
@ -189,7 +189,7 @@ static void test_stress(void) {
}
}
// mi_collect(false);
#ifndef NDEBUG
#if !defined(NDEBUG) || defined(MI_TSAN)
if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
#endif
}
@ -217,7 +217,7 @@ static void test_leak(void) {
}
#endif
int main(int argc, char** argv) {
int main(int argc, char** argv) {
// > mimalloc-test-stress [THREADS] [SCALE] [ITER]
if (argc >= 2) {
char* end;
@ -235,6 +235,7 @@ int main(int argc, char** argv) {
if (n > 0) ITER = n;
}
printf("Using %d threads with a %d%% load-per-thread and %d iterations\n", THREADS, SCALE, ITER);
//mi_reserve_os_memory(1024*1024*1024ULL, false, true);
//int res = mi_reserve_huge_os_pages(4,1);
//printf("(reserve huge: %i\n)", res);
@ -250,7 +251,9 @@ int main(int argc, char** argv) {
#endif
// mi_collect(true);
#ifndef USE_STD_MALLOC
mi_stats_print(NULL);
#endif
//bench_end_program();
return 0;
}
@ -260,7 +263,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress;
#ifdef _WIN32
#include <windows.h>
#include <Windows.h>
static DWORD WINAPI thread_entry(LPVOID param) {
thread_entry_fun((intptr_t)param);
@ -272,7 +275,7 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {
DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD));
HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE));
for (uintptr_t i = 0; i < nthreads; i++) {
thandles[i] = CreateThread(0, 4096, &thread_entry, (void*)(i), 0, &tids[i]);
thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]);
}
for (size_t i = 0; i < nthreads; i++) {
WaitForSingleObject(thandles[i], INFINITE);
@ -305,7 +308,7 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) {
pthread_t* threads = (pthread_t*)custom_calloc(nthreads,sizeof(pthread_t));
memset(threads, 0, sizeof(pthread_t) * nthreads);
//pthread_setconcurrency(nthreads);
for (uintptr_t i = 0; i < nthreads; i++) {
for (size_t i = 0; i < nthreads; i++) {
pthread_create(&threads[i], NULL, &thread_entry, (void*)i);
}
for (size_t i = 0; i < nthreads; i++) {