From 32ee13f41eafb7de541438279100c28ed9df437f Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 2 Nov 2021 21:54:44 -0700 Subject: [PATCH] improve macOS M1 performance; use interpose in combination with zone's; add -fno-builtin-malloc flag in building with MI_OVERRIDE --- CMakeLists.txt | 14 ++- src/alloc-override-osx.c | 234 ++++++++++++++++++++++++++++++++------- src/alloc-override.c | 25 +++-- 3 files changed, 223 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c46ffdac..f07d892f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -12,7 +12,7 @@ option(MI_XMALLOC "Enable abort() call on memory allocation failure by option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) -option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" OFF) +option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_LOCAL_DYNAMIC_TLS "Use slightly slower, dlopen-compatible TLS mechanism (Unix)" OFF) option(MI_BUILD_SHARED "Build shared library" ON) @@ -78,12 +78,17 @@ if(MI_OVERRIDE) message(STATUS " Use malloc zone to override malloc (MI_OSX_ZONE=ON)") list(APPEND mi_sources src/alloc-override-osx.c) list(APPEND mi_defines MI_OSX_ZONE=1) + if (NOT MI_OSX_INTERPOSE) + message(STATUS " WARNING: zone overriding usually also needs interpose (use -DMI_OSX_INTERPOSE=ON)") + endif() endif() if(MI_OSX_INTERPOSE) # use interpose on macOS message(STATUS " Use interpose to override malloc (MI_OSX_INTERPOSE=ON)") - message(STATUS " WARNING: interpose does not seem to work reliably on the M1; use -DMI_OSX_ZONE=ON instead") list(APPEND mi_defines MI_OSX_INTERPOSE) + if (NOT MI_OSX_ZONE) + message(STATUS " WARNING: interpose usually also needs zone overriding (use -DMI_OSX_INTERPOSE=ON)") + endif() endif() endif() endif() @@ -188,6 +193,9 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM else() list(APPEND mi_cflags -ftls-model=initial-exec) endif() + if(MI_OVERRIDE) + list(APPEND -fno-builtin-malloc) + endif() endif() if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) @@ -364,7 +372,7 @@ if (MI_BUILD_TESTS) target_compile_definitions(mimalloc-test-api PRIVATE ${mi_defines}) target_compile_options(mimalloc-test-api PRIVATE ${mi_cflags}) target_include_directories(mimalloc-test-api PRIVATE include) - target_link_libraries(mimalloc-test-api PRIVATE mimalloc-static ${mi_libraries}) + target_link_libraries(mimalloc-test-api PRIVATE mimalloc ${mi_libraries}) add_executable(mimalloc-test-stress test/test-stress.c) target_compile_definitions(mimalloc-test-stress PRIVATE ${mi_defines}) diff --git a/src/alloc-override-osx.c b/src/alloc-override-osx.c index 723ef226..0759f785 100644 --- a/src/alloc-override-osx.c +++ b/src/alloc-override-osx.c @@ -17,17 +17,20 @@ terms of the MIT license. A copy of the license can be found in the file /* ------------------------------------------------------ Override system malloc on macOS This is done through the malloc zone interface. - It seems we also need to interpose (see `alloc-override.c`) - or otherwise we get zone errors as there are usually - already allocations done by the time we take over the - zone. Unfortunately, that means we need to replace - the `free` with a checked free (`cfree`) impacting - performance. + It seems to be most robust in combination with interposing + though or otherwise we may get zone errors as there are could + be allocations done by the time we take over the + zone. ------------------------------------------------------ */ #include #include #include // memset +#include + +#ifdef __cplusplus +extern "C" { +#endif #if defined(MAC_OS_X_VERSION_10_6) && \ MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 @@ -41,9 +44,7 @@ extern malloc_zone_t* malloc_default_purgeable_zone(void) __attribute__((weak_im static size_t zone_size(malloc_zone_t* zone, const void* p) { UNUSED(zone); - if (!mi_is_in_heap_region(p)) - return 0; // not our pointer, bail out - + //if (!mi_is_in_heap_region(p)){ return 0; } // not our pointer, bail out return mi_usable_size(p); } @@ -109,6 +110,11 @@ static void zone_free_definite_size(malloc_zone_t* zone, void* p, size_t size) { zone_free(zone,p); } +static boolean_t zone_claimed_address(malloc_zone_t* zone, void* p) { + UNUSED(zone); + return mi_is_in_heap_region(p); +} + /* ------------------------------------------------------ Introspection members @@ -174,21 +180,6 @@ static boolean_t intro_zone_locked(malloc_zone_t* zone) { At process start, override the default allocator ------------------------------------------------------ */ -static malloc_zone_t* mi_get_default_zone() -{ - // The first returned zone is the real default - malloc_zone_t** zones = NULL; - unsigned count = 0; - kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count); - if (ret == KERN_SUCCESS && count > 0) { - return zones[0]; - } - else { - // fallback - return malloc_default_zone(); - } -} - #if defined(__GNUC__) && !defined(__clang__) #pragma GCC diagnostic ignored "-Wmissing-field-initializers" #endif @@ -220,33 +211,197 @@ static malloc_zone_t mi_malloc_zone = { .batch_malloc = &zone_batch_malloc, .batch_free = &zone_batch_free, .introspect = &mi_introspect, -#if defined(MAC_OS_X_VERSION_10_6) && \ - MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 - // switch to version 9 on OSX 10.6 to support memalign. - .version = 9, +#if defined(MAC_OS_X_VERSION_10_6) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6 + // switch to version 9+ on OSX 10.6 to support memalign. .memalign = &zone_memalign, .free_definite_size = &zone_free_definite_size, .pressure_relief = &zone_pressure_relief, + #if defined(MAC_OS_X_VERSION_10_7) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_7 + .claimed_address = &zone_claimed_address, + .version = 10 + #else + .version = 9 + #endif #else - .version = 4, + .version = 4 #endif }; +#ifdef __cplusplus +} +#endif -#if defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) -static malloc_zone_t *mi_malloc_default_zone(void) { +#if defined(MI_OSX_INTERPOSE) + +// ------------------------------------------------------ +// Override malloc_xxx and zone_xxx api's to use only +// our mimalloc zone. Since even the loader uses malloc +// on macOS, this ensures that all allocations go through +// mimalloc (as all calls are interposed). +// ------------------------------------------------------ + +static inline malloc_zone_t* mi_get_default_zone(void) +{ + static bool init; + if (mi_unlikely(!init)) { + init = true; + malloc_zone_register(&mi_malloc_zone); // by calling register we avoid a zone error on free (see ) + } return &mi_malloc_zone; } -// TODO: should use the macros in alloc-override but they aren't available here. -__attribute__((used)) static struct { - const void *replacement; - const void *target; -} replace_malloc_default_zone[] __attribute__((section("__DATA, __interpose"))) = { - { (const void*)mi_malloc_default_zone, (const void*)malloc_default_zone }, -}; -#endif +mi_decl_externc int malloc_jumpstart(uintptr_t cookie); +mi_decl_externc void _malloc_fork_prepare(void); +mi_decl_externc void _malloc_fork_parent(void); +mi_decl_externc void _malloc_fork_child(void); + + +static malloc_zone_t* mi_malloc_create_zone(vm_size_t size, unsigned flags) { + UNUSED(size); UNUSED(flags); + return mi_get_default_zone(); +} + +static malloc_zone_t* mi_malloc_default_zone (void) { + return mi_get_default_zone(); +} + +static malloc_zone_t* mi_malloc_default_purgeable_zone(void) { + return mi_get_default_zone(); +} + +static void mi_malloc_destroy_zone(malloc_zone_t* zone) { + UNUSED(zone); + // nothing. +} + +static kern_return_t mi_malloc_get_all_zones (task_t task, memory_reader_t mr, vm_address_t** addresses, unsigned* count) { + UNUSED(task); UNUSED(mr); + if (addresses != NULL) *addresses = NULL; + if (count != NULL) *count = 0; + return KERN_SUCCESS; +} + +static const char* mi_malloc_get_zone_name(malloc_zone_t* zone) { + return (zone == NULL ? mi_malloc_zone.zone_name : zone->zone_name); +} + +static void mi_malloc_set_zone_name(malloc_zone_t* zone, const char* name) { + UNUSED(zone); UNUSED(name); +} + +static int mi_malloc_jumpstart(uintptr_t cookie) { + UNUSED(cookie); + return 1; // or 0 for no error? +} + +static void mi__malloc_fork_prepare(void) { + // nothing +} +static void mi__malloc_fork_parent(void) { + // nothing +} +static void mi__malloc_fork_child(void) { + // nothing +} + +static void mi_malloc_printf(const char* fmt, ...) { + UNUSED(fmt); +} + +static bool zone_check(malloc_zone_t* zone) { + UNUSED(zone); + return true; +} + +static malloc_zone_t* zone_from_ptr(const void* p) { + UNUSED(p); + return mi_get_default_zone(); +} + +static void zone_log(malloc_zone_t* zone, void* p) { + UNUSED(zone); UNUSED(p); +} + +static void zone_print(malloc_zone_t* zone, bool b) { + UNUSED(zone); UNUSED(b); +} + +static void zone_print_ptr_info(void* p) { + UNUSED(p); +} + +static void zone_register(malloc_zone_t* zone) { + UNUSED(zone); +} + +static void zone_unregister(malloc_zone_t* zone) { + UNUSED(zone); +} + +// use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` +// See: +struct mi_interpose_s { + const void* replacement; + const void* target; +}; +#define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } +#define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) +#define MI_INTERPOSE_ZONE(fun) MI_INTERPOSE_FUN(malloc_##fun,fun) +__attribute__((used)) static const struct mi_interpose_s _mi_zone_interposes[] __attribute__((section("__DATA, __interpose"))) = +{ + + MI_INTERPOSE_MI(malloc_create_zone), + MI_INTERPOSE_MI(malloc_default_purgeable_zone), + MI_INTERPOSE_MI(malloc_default_zone), + MI_INTERPOSE_MI(malloc_destroy_zone), + MI_INTERPOSE_MI(malloc_get_all_zones), + MI_INTERPOSE_MI(malloc_get_zone_name), + MI_INTERPOSE_MI(malloc_jumpstart), + MI_INTERPOSE_MI(malloc_printf), + MI_INTERPOSE_MI(malloc_set_zone_name), + MI_INTERPOSE_MI(_malloc_fork_child), + MI_INTERPOSE_MI(_malloc_fork_parent), + MI_INTERPOSE_MI(_malloc_fork_prepare), + + MI_INTERPOSE_ZONE(zone_batch_free), + MI_INTERPOSE_ZONE(zone_batch_malloc), + MI_INTERPOSE_ZONE(zone_calloc), + MI_INTERPOSE_ZONE(zone_check), + MI_INTERPOSE_ZONE(zone_free), + MI_INTERPOSE_ZONE(zone_from_ptr), + MI_INTERPOSE_ZONE(zone_log), + MI_INTERPOSE_ZONE(zone_malloc), + MI_INTERPOSE_ZONE(zone_memalign), + MI_INTERPOSE_ZONE(zone_print), + MI_INTERPOSE_ZONE(zone_print_ptr_info), + MI_INTERPOSE_ZONE(zone_realloc), + MI_INTERPOSE_ZONE(zone_register), + MI_INTERPOSE_ZONE(zone_unregister), + MI_INTERPOSE_ZONE(zone_valloc) +}; + + +#else + +// ------------------------------------------------------ +// hook into the zone api's without interposing +// ------------------------------------------------------ + +static inline malloc_zone_t* mi_get_default_zone(void) +{ + // The first returned zone is the real default + malloc_zone_t** zones = NULL; + unsigned count = 0; + kern_return_t ret = malloc_get_all_zones(0, NULL, (vm_address_t**)&zones, &count); + if (ret == KERN_SUCCESS && count > 0) { + return zones[0]; + } + else { + // fallback + return malloc_default_zone(); + } +} #if defined(__clang__) __attribute__((constructor(0))) @@ -287,5 +442,6 @@ static void _mi_macos_override_malloc() { } } +#endif // MI_OSX_INTERPOSE #endif // MI_MALLOC_OVERRIDE diff --git a/src/alloc-override.c b/src/alloc-override.c index 6c42ae00..123f5e07 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -13,7 +13,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif -#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // || (defined(__APPLE__) && !defined(MI_OSX_INTERPOSE))) +#if defined(MI_MALLOC_OVERRIDE) && !(defined(_WIN32)) // ------------------------------------------------------ // Override system malloc @@ -41,7 +41,12 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_FORWARD02(fun,x,y) { fun(x,y); } #endif -#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) +#if defined(__APPLE__) && defined(MI_SHARED_LIB_EXPORT) && defined(MI_OSX_INTERPOSE) + #include + mi_decl_externc void vfree(void* p); + mi_decl_externc size_t malloc_size(const void* p); + mi_decl_externc size_t malloc_good_size(size_t size); + // use interposing so `DYLD_INSERT_LIBRARIES` works without `DYLD_FORCE_FLAT_NAMESPACE=1` // See: struct mi_interpose_s { @@ -50,6 +55,7 @@ terms of the MIT license. A copy of the license can be found in the file }; #define MI_INTERPOSE_FUN(oldfun,newfun) { (const void*)&newfun, (const void*)&oldfun } #define MI_INTERPOSE_MI(fun) MI_INTERPOSE_FUN(fun,mi_##fun) + __attribute__((used)) static struct mi_interpose_s _mi_interposes[] __attribute__((section("__DATA, __interpose"))) = { MI_INTERPOSE_MI(malloc), @@ -61,21 +67,24 @@ terms of the MIT license. A copy of the license can be found in the file MI_INTERPOSE_MI(posix_memalign), MI_INTERPOSE_MI(reallocf), MI_INTERPOSE_MI(valloc), + MI_INTERPOSE_MI(malloc_size), + MI_INTERPOSE_MI(malloc_good_size), + MI_INTERPOSE_MI(aligned_alloc), #ifndef MI_OSX_ZONE - // some code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) + // sometimes code allocates from default zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us + MI_INTERPOSE_FUN(vfree,mi_cfree), #else - // We interpose malloc_default_zone in alloc-override-osx.c + // we interpose malloc_default_zone in alloc-override-osx.c so we can use mi_free safely MI_INTERPOSE_MI(free), + MI_INTERPOSE_FUN(vfree,mi_free), #endif - // some code allocates from a zone but deallocates using plain free :-( (like NxHashResizeToCapacity ) - MI_INTERPOSE_FUN(free,mi_cfree), // use safe free that checks if pointers are from us }; #elif defined(_MSC_VER) // cannot override malloc unless using a dll. // we just override new/delete which does work in a static library. #else - // On all other systems forward to our API + // On all other systems forward to our API void* malloc(size_t size) MI_FORWARD1(mi_malloc, size) void* calloc(size_t size, size_t n) MI_FORWARD2(mi_calloc, size, n) void* realloc(void* p, size_t newsize) MI_FORWARD2(mi_realloc, p, newsize) @@ -123,7 +132,7 @@ terms of the MIT license. A copy of the license can be found in the file void* operator new[](std::size_t n, std::align_val_t al, const std::nothrow_t&) noexcept { return mi_new_aligned_nothrow(n, static_cast(al)); } #endif -#elif (defined(__GNUC__) || defined(__clang__)) +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(MI_OSX_ZONE) // ------------------------------------------------------ // Override by defining the mangled C++ names of the operators (as // used by GCC and CLang).