merge from dev

This commit is contained in:
Daan Leijen 2021-01-30 16:37:38 -08:00
commit 36b7a3cb03
6 changed files with 79 additions and 26 deletions

View File

@ -180,21 +180,6 @@ bool _mi_page_is_valid(mi_page_t* page);
#endif #endif
// -----------------------------------------------------------------------------------
// On windows x86/x64 with msvc/clang-cl, use `rep movsb` for `memcpy` (issue #201)
// -----------------------------------------------------------------------------------
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
// Copy `n` bytes from `s` to `d` using the `__movsb` intrinsic
// (the `rep movsb` based copy referred to in issue #201).
static inline void _mi_memcpy_rep_movsb(void* d, const void* s, size_t n) {
  unsigned char* const out = (unsigned char*)d;
  const unsigned char* const in = (const unsigned char*)s;
  __movsb(out, in, n);
}
#define _mi_memcpy(d,s,n) _mi_memcpy_rep_movsb(d,s,n)
#else
#define _mi_memcpy(d,s,n) memcpy(d,s,n)
#endif
/* ----------------------------------------------------------- /* -----------------------------------------------------------
Inlined definitions Inlined definitions
----------------------------------------------------------- */ ----------------------------------------------------------- */
@ -997,4 +982,55 @@ static inline size_t mi_bsr(uintptr_t x) {
} }
// ---------------------------------------------------------------------------------
// Provide our own `_mi_memcpy` for potential performance optimizations.
//
// For now, only on Windows with msvc/clang-cl we optimize to `rep movsb` if
// we happen to run on x86/x64 CPUs that have "fast short rep movsb" (FSRM) support
// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)). See also issue #201 and pr #253.
// ---------------------------------------------------------------------------------
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
#include <string.h>
extern bool _mi_cpu_has_fsrm;
// Copy `n` bytes from `src` to `dst`.
// Uses the `__movsb` intrinsic when `_mi_cpu_has_fsrm` is set, and the
// regular `memcpy` otherwise.
static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
  if (!_mi_cpu_has_fsrm) {
    // no "fast short rep movsb" detected: stay with the C library copy
    memcpy(dst, src, n); // todo: use noinline?
    return;
  }
  unsigned char* const out = (unsigned char*)dst;
  const unsigned char* const in = (const unsigned char*)src;
  __movsb(out, in, n);
}
#else
#include <string.h>
// Copy `n` bytes from `src` to `dst`; on this configuration it simply
// forwards to the C library `memcpy` (so the regions must not overlap).
static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
  (void)memcpy(dst, src, n);  // the returned `dst` pointer is not needed
}
#endif
// -------------------------------------------------------------------------------
// `_mi_memcpy_aligned` can be used if the pointers are machine-word aligned.
// This is used for example in `mi_realloc`.
// -------------------------------------------------------------------------------
#if (__GNUC__ >= 4) || defined(__clang__)
// On GCC/Clang we provide a hint that the pointers are word aligned.
#include <string.h>
// Copy `n` bytes from `src` to `dst`, where both pointers must be aligned
// to `MI_INTPTR_SIZE` (this is checked with `mi_assert_internal`).
//   dst : destination, a multiple of MI_INTPTR_SIZE
//   src : source, a multiple of MI_INTPTR_SIZE
//   n   : number of bytes to copy
static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
  mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
  // Tell the compiler about the alignment so it may pick wider moves.
  void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE);
  const void* asrc = __builtin_assume_aligned(src, MI_INTPTR_SIZE);
  // Go through `_mi_memcpy` (like the non-GCC fallback does) instead of calling
  // `memcpy` directly, so that any platform specific copy that `_mi_memcpy`
  // selects (e.g. `rep movsb` under clang-cl) is used for aligned copies too.
  _mi_memcpy(adst, asrc, n);
}
#else
// Default fallback on `_mi_memcpy`
// Copy `n` bytes from `src` to `dst` where both pointers are expected to be
// multiples of `MI_INTPTR_SIZE`. Without a compiler alignment hint available
// this just checks the precondition and forwards to `_mi_memcpy`.
static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) {
// precondition: both addresses are a multiple of MI_INTPTR_SIZE
mi_assert_internal(((uintptr_t)dst % MI_INTPTR_SIZE == 0) && ((uintptr_t)src % MI_INTPTR_SIZE == 0));
_mi_memcpy(dst, src, n);
}
#endif
#endif #endif

View File

@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc.h" #include "mimalloc.h"
#include "mimalloc-internal.h" #include "mimalloc-internal.h"
#include <string.h> // memset, memcpy #include <string.h> // memset
// ------------------------------------------------------ // ------------------------------------------------------
// Aligned Allocation // Aligned Allocation
@ -137,7 +137,7 @@ static void* mi_heap_realloc_zero_aligned_at(mi_heap_t* heap, void* p, size_t ne
memset((uint8_t*)newp + start, 0, newsize - start); memset((uint8_t*)newp + start, 0, newsize - start);
} }
} }
_mi_memcpy(newp, p, (newsize > size ? size : newsize)); _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
mi_free(p); // only free if successful mi_free(p); // only free if successful
} }
return newp; return newp;

View File

@ -17,7 +17,7 @@ terms of the MIT license. A copy of the license can be found in the file
// ------------------------------------------------------ // ------------------------------------------------------
#include <errno.h> #include <errno.h>
#include <string.h> // memcpy #include <string.h> // memset
#include <stdlib.h> // getenv #include <stdlib.h> // getenv
#ifdef _MSC_VER #ifdef _MSC_VER

View File

@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#include "mimalloc-internal.h" #include "mimalloc-internal.h"
#include "mimalloc-atomic.h" #include "mimalloc-atomic.h"
#include <string.h> // memset, memcpy, strlen #include <string.h> // memset, strlen
#include <stdlib.h> // malloc, exit #include <stdlib.h> // malloc, exit
#define MI_IN_ALLOC_C #define MI_IN_ALLOC_C
@ -628,7 +628,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero)
size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0); size_t start = (size >= sizeof(intptr_t) ? size - sizeof(intptr_t) : 0);
memset((uint8_t*)newp + start, 0, newsize - start); memset((uint8_t*)newp + start, 0, newsize - start);
} }
_mi_memcpy(newp, p, (newsize > size ? size : newsize)); _mi_memcpy_aligned(newp, p, (newsize > size ? size : newsize));
mi_free(p); // only free if successful mi_free(p); // only free if successful
} }
return newp; return newp;

View File

@ -193,7 +193,7 @@ mi_heap_t* mi_heap_new(void) {
mi_heap_t* bheap = mi_heap_get_backing(); mi_heap_t* bheap = mi_heap_get_backing();
mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode?
if (heap==NULL) return NULL; if (heap==NULL) return NULL;
_mi_memcpy(heap, &_mi_heap_empty, sizeof(mi_heap_t)); _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->tld = bheap->tld; heap->tld = bheap->tld;
heap->thread_id = _mi_thread_id(); heap->thread_id = _mi_thread_id();
_mi_random_split(&bheap->random, &heap->random); _mi_random_split(&bheap->random, &heap->random);
@ -220,7 +220,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) {
#ifdef MI_MEDIUM_DIRECT #ifdef MI_MEDIUM_DIRECT
memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium)); memset(&heap->pages_free_medium, 0, sizeof(heap->pages_free_medium));
#endif #endif
_mi_memcpy(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages));
heap->thread_delayed_free = NULL; heap->thread_delayed_free = NULL;
heap->page_count = 0; heap->page_count = 0;
} }

View File

@ -102,7 +102,7 @@ const mi_page_t _mi_page_empty = {
// may lead to allocation itself on some platforms) // may lead to allocation itself on some platforms)
// -------------------------------------------------------- // --------------------------------------------------------
const mi_heap_t _mi_heap_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL, NULL,
MI_SMALL_PAGES_EMPTY, MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY, MI_PAGE_QUEUES_EMPTY,
@ -120,7 +120,7 @@ const mi_heap_t _mi_heap_empty = {
#define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats))) #define tld_empty_stats ((mi_stats_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,stats)))
#define tld_empty_os ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os))) #define tld_empty_os ((mi_os_tld_t*)((uint8_t*)&tld_empty + offsetof(mi_tld_t,os)))
static const mi_tld_t tld_empty = { mi_decl_cache_align static const mi_tld_t tld_empty = {
0, 0,
false, false,
NULL, NULL, NULL, NULL,
@ -213,8 +213,8 @@ static bool _mi_heap_init(void) {
// OS allocated so already zero initialized // OS allocated so already zero initialized
mi_tld_t* tld = &td->tld; mi_tld_t* tld = &td->tld;
mi_heap_t* heap = &td->heap; mi_heap_t* heap = &td->heap;
_mi_memcpy(tld, &tld_empty, sizeof(*tld)); _mi_memcpy_aligned(tld, &tld_empty, sizeof(*tld));
_mi_memcpy(heap, &_mi_heap_empty, sizeof(*heap)); _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
heap->thread_id = _mi_thread_id(); heap->thread_id = _mi_thread_id();
_mi_random_init(&heap->random); _mi_random_init(&heap->random);
heap->cookie = _mi_heap_random_next(heap) | 1; heap->cookie = _mi_heap_random_next(heap) | 1;
@ -483,6 +483,22 @@ static void mi_process_load(void) {
} }
} }
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
// Detect CPU features at process initialization.
// Currently this only sets `_mi_cpu_has_fsrm` when the processor reports
// "fast short rep movsb" (FSRM) support (AMD Zen3+ or Intel Ice Lake+).
static void mi_detect_cpu_features(void) {
  int32_t cpu_info[4];
  // Leaf 0 returns the highest supported basic leaf in EAX. Querying a leaf
  // above that maximum does not return data for the requested leaf, so bit 4
  // of EDX could be a false positive on older processors.
  __cpuid(cpu_info, 0);
  if (cpu_info[0] < 7) {
    _mi_cpu_has_fsrm = false;
    return;
  }
  // Leaf 7 is indexed by a subleaf in ECX; request subleaf 0 explicitly.
  __cpuidex(cpu_info, 7, 0);
  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
}
#else
// No CPU feature detection is performed on this configuration.
static void mi_detect_cpu_features(void) {
  /* intentionally empty */
}
#endif
// Initialize the process; called by thread_init or the process loader // Initialize the process; called by thread_init or the process loader
void mi_process_init(void) mi_attr_noexcept { void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once // ensure we are called once
@ -491,6 +507,7 @@ void mi_process_init(void) mi_attr_noexcept {
mi_process_setup_auto_thread_done(); mi_process_setup_auto_thread_done();
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
mi_detect_cpu_features();
_mi_os_init(); _mi_os_init();
mi_heap_main_init(); mi_heap_main_init();
#if (MI_DEBUG) #if (MI_DEBUG)