diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 574e5bde..06d31dbd 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -180,21 +180,6 @@ bool _mi_page_is_valid(mi_page_t* page);
 #endif
 
-// -----------------------------------------------------------------------------------
-// On windows x86/x64 with msvc/clang-cl, use `rep movsb` for `memcpy` (issue #201)
-// -----------------------------------------------------------------------------------
-
-#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
-static inline void _mi_memcpy_rep_movsb(void* d, const void* s, size_t n) {
-  __movsb((unsigned char*)d, (const unsigned char*)s, n);
-}
-#define _mi_memcpy(d,s,n) _mi_memcpy_rep_movsb(d,s,n)
-#else
-#define _mi_memcpy(d,s,n) memcpy(d,s,n)
-#endif
-
-
 /* -----------------------------------------------------------
   Inlined definitions
 ----------------------------------------------------------- */
 
@@ -997,4 +982,55 @@ static inline size_t mi_bsr(uintptr_t x) {
 }
 
 
+// ---------------------------------------------------------------------------------
+// Provide our own `_mi_memcpy` for potential performance optimizations.
+//
+// For now, only on Windows with msvc/clang-cl do we optimize to `rep movsb` when
+// running on x86/x64 CPUs that have "fast short rep movsb" (FSRM) support
+// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)). See also issue #201 and PR #253.
+// ---------------------------------------------------------------------------------
+
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+#include <string.h>
+extern bool _mi_cpu_has_fsrm;
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  if (_mi_cpu_has_fsrm) {
+    __movsb((unsigned char*)dst, (const unsigned char*)src, n);
+  }
+  else {
+    memcpy(dst, src, n); // todo: use noinline?
+  }
+}
+#else
+#include <string.h>
+static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
+  memcpy(dst, src, n);
+}
+#endif
+
+
+// -------------------------------------------------------------------------------
+// The `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned.
+// This is used for example in `mi_realloc`.
+// -------------------------------------------------------------------------------
+
+#if (__GNUC__ >= 4) || defined(__clang__)
+// On GCC/Clang we provide a hint that the pointers are word aligned.
+#include <string.h>
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
+  const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
+  memcpy(adst, asrc, n);
+}
+#else
+// Default: fall back on `_mi_memcpy`.
+static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
+  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
+  _mi_memcpy(dst, src, n);
+}
+#endif
+
+
 #endif
diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c
index 10f40355..4be651d4 100644
--- a/src/alloc-aligned.c
+++ b/src/alloc-aligned.c
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc.h"
 #include "mimalloc-internal.h"
 
-#include <string.h>  // memset, memcpy
+#include <string.h>  // memset
 
 // ------------------------------------------------------
 // Aligned Allocation
@@ -137,7 +137,7 @@ static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne
         memset((uint8_t*)newp + start, 0, newsize - start);
       }
     }
-    _mi_memcpy(newp, p, (newsize > size ? size : newsize));
+    _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
     mi_free(p); // only free if successful
   }
   return newp;
diff --git a/src/alloc-posix.c b/src/alloc-posix.c
index 85cef389..eef70ab5 100644
--- a/src/alloc-posix.c
+++ b/src/alloc-posix.c
@@ -17,7 +17,7 @@ terms of the MIT license. A copy of the license can be found in the file
 // ------------------------------------------------------
 
 #include <errno.h>
-#include <string.h>  // memcpy
+#include <string.h>  // memset
 #include <stdlib.h>  // getenv
 
 #ifdef _MSC_VER
diff --git a/src/alloc.c b/src/alloc.c
index 370bf271..1d973ca4 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc-internal.h"
 #include "mimalloc-atomic.h"
 
-#include <string.h>  // memset, memcpy, strlen
+#include <string.h>  // memset, strlen
 #include <stdlib.h>  // malloc, exit
 
 #define MI_IN_ALLOC_C
@@ -628,7 +628,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
       size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
       memset((uint8_t*)newp + start, 0, newsize - start);
     }
-    _mi_memcpy(newp, p, (newsize > size ? size : newsize));
+    _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
     mi_free(p); // only free if successful
   }
   return newp;
diff --git a/src/heap.c b/src/heap.c
index c7130a90..c5195491 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -193,7 +193,7 @@ mi_heap_t* mi_heap_new(void) {
   mi_heap_t* bheap = mi_heap_get_backing();
   mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t);  // todo: OS allocate in secure mode?
   if (heap==NULL) return NULL;
-  _mi_memcpy(heap, &_mi_heap_empty, sizeof(mi_heap_t));
+  _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
   heap->tld = bheap->tld;
   heap->thread_id = _mi_thread_id();
   _mi_random_split(&bheap->random, &heap->random);
@@ -220,7 +220,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
 #ifdef MI_MEDIUM_DIRECT
   memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
 #endif
-  _mi_memcpy(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
+  _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
   heap->thread_delayed_free = NULL;
   heap->page_count = 0;
 }
diff --git a/src/init.c b/src/init.c
index eec0f1ba..2161fdd8 100644
--- a/src/init.c
+++ b/src/init.c
@@ -102,7 +102,7 @@ const mi_page_t _mi_page_empty = {
 // may lead to allocation itself on some platforms)
 // --------------------------------------------------------
 
-const mi_heap_t _mi_heap_empty = {
+mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   NULL,
   MI_SMALL_PAGES_EMPTY,
   MI_PAGE_QUEUES_EMPTY,
@@ -120,7 +120,7 @@ const mi_heap_t _mi_heap_empty = {
 #define tld_empty_stats  ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
 #define tld_empty_os     ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
 
-static const mi_tld_t tld_empty = {
+mi_decl_cache_align static const mi_tld_t tld_empty = {
   0,
   false,
   NULL, NULL,
@@ -213,8 +213,8 @@ static bool _mi_heap_init(void) {
     // OS allocated so already zero initialized
     mi_tld_t*  tld = &td->tld;
     mi_heap_t* heap = &td->heap;
-    _mi_memcpy(tld, &tld_empty, sizeof(*tld));
-    _mi_memcpy(heap, &_mi_heap_empty, sizeof(*heap));
+    _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
+    _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
     heap->thread_id = _mi_thread_id();
     _mi_random_init(&heap->random);
     heap->cookie = _mi_heap_random_next(heap) | 1;
@@ -483,6 +483,22 @@ static void mi_process_load(void) {
   }
 }
 
+#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
+#include <intrin.h>
+mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
+
+static void mi_detect_cpu_features(void) {
+  // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
+  int32_t cpu_info[4];
+  __cpuid(cpu_info, 7);
+  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0);  // bit 4 of EDX denotes FSRM support
+}
+#else
+static void mi_detect_cpu_features(void) {
+  // nothing
+}
+#endif
+
 // Initialize the process; called by thread_init or the process loader
 void mi_process_init(void) mi_attr_noexcept {
   // ensure we are called once
@@ -491,6 +507,7 @@ void mi_process_init(void) mi_attr_noexcept {
   mi_process_setup_auto_thread_done();
 
   _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
+  mi_detect_cpu_features();
   _mi_os_init();
   mi_heap_main_init();
   #if (MI_DEBUG)
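
For reference, the two techniques this patch combines can be seen in a minimal standalone sketch, assuming GCC/Clang on x86/x64 and using `__get_cpuid_count` and `__builtin_assume_aligned` instead of the MSVC intrinsics above. The names `has_fsrm` and `copy_words` are illustrative only and not part of mimalloc:

// Standalone sketch (not part of the patch): runtime FSRM detection via
// CPUID leaf 7, plus a word-aligned copy with an alignment hint.
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <cpuid.h>   // __get_cpuid_count (GCC/Clang on x86/x64)

#define WORD_SIZE  sizeof(uintptr_t)

static bool has_fsrm(void) {
  unsigned int eax, ebx, ecx, edx;
  // CPUID leaf 7, sub-leaf 0: bit 4 of EDX reports "fast short rep movsb"
  if (__get_cpuid_count(7, 0, &eax, &ebx, &ecx, &edx) == 0) return false;
  return (edx & (1u << 4)) != 0;
}

// Word-aligned copy: the __builtin_assume_aligned hint lets the compiler
// omit alignment fix-up code when it inlines or vectorizes the memcpy.
static inline void copy_words(void* dst, const void* src, size_t n) {
  void* adst = __builtin_assume_aligned(dst, WORD_SIZE);
  const void* asrc = __builtin_assume_aligned(src, WORD_SIZE);
  memcpy(adst, asrc, n);
}

int main(void) {
  printf("fast short rep movsb: %s\n", has_fsrm() ? "yes" : "no");
  uintptr_t a[4] = { 1, 2, 3, 4 }, b[4];
  copy_words(b, a, sizeof(a));   // both arrays are word aligned
  printf("b[3] = %zu\n", (size_t)b[3]);
  return 0;
}

On FSRM-capable parts, `rep movsb` is competitive with a vectorized `memcpy` even for short copies; that is why the patch gates `__movsb` on the CPUID bit at process init, rather than using it unconditionally as the removed `_mi_memcpy_rep_movsb` did.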