avoid allocation at numa node detection on linux

daan 2019-11-12 10:16:59 -08:00
parent 165ee45845
commit ef179a6377
2 changed files with 56 additions and 46 deletions

@@ -17,18 +17,18 @@ terms of the MIT license. A copy of the license can be found in the file
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
 #define mi_trace_message(...)
 #endif

 #if defined(_MSC_VER)
 #define mi_decl_noinline   __declspec(noinline)
 #define mi_attr_noreturn
 #elif defined(__GNUC__) || defined(__clang__)
 #define mi_decl_noinline   __attribute__((noinline))
 #define mi_attr_noreturn   __attribute__((noreturn))
 #else
 #define mi_decl_noinline
 #define mi_attr_noreturn
 #endif
@@ -56,8 +56,6 @@ void _mi_os_init(void); // called fro
 void* _mi_os_alloc(size_t size, mi_stats_t* stats);           // to allocate thread local data
 void  _mi_os_free(void* p, size_t size, mi_stats_t* stats);   // to free thread local data
 size_t _mi_os_good_alloc_size(size_t size);
-int    _mi_os_numa_node(mi_os_tld_t* tld);
-int    _mi_os_numa_node_count(void);

 // memory.c
 void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* id, mi_os_tld_t* tld);
@@ -146,8 +144,8 @@ bool _mi_page_is_valid(mi_page_t* page);
 Inlined definitions
 ----------------------------------------------------------- */
 #define UNUSED(x)     (void)(x)
 #if (MI_DEBUG>0)
 #define UNUSED_RELEASE(x)
 #else
 #define UNUSED_RELEASE(x)  UNUSED(x)
 #endif
@@ -398,7 +396,7 @@ static inline mi_block_t* mi_block_nextx( uintptr_t cookie, const mi_block_t* bl
 #endif
 }

 static inline void mi_block_set_nextx(uintptr_t cookie, mi_block_t* block, const mi_block_t* next) {
 #ifdef MI_ENCODE_FREELIST
   block->next = (mi_encoded_t)next ^ cookie;
 #else
@@ -411,12 +409,12 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t*
 #ifdef MI_ENCODE_FREELIST
   mi_block_t* next = mi_block_nextx(page->cookie,block);
   // check for free list corruption: is `next` at least in our segment range?
   // TODO: it is better to check if it is actually inside our page but that is more expensive
   //       to calculate. Perhaps with a relative free list this becomes feasible?
   if (next!=NULL && !mi_is_in_same_segment(block, next)) {
     _mi_fatal_error("corrupted free list entry of size %zub at %p: value 0x%zx\n", page->block_size, block, (uintptr_t)next);
     next = NULL;
   }
   return next;
 #else
   UNUSED(page);
@@ -433,6 +431,25 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c
 #endif
 }

+// -------------------------------------------------------------------
+// Optimize numa node access for the common case (= one node)
+// -------------------------------------------------------------------
+int    _mi_os_numa_node_get(mi_os_tld_t* tld);
+int    _mi_os_numa_node_count_get(void);
+
+extern int _mi_numa_node_count;
+static inline int _mi_os_numa_node(mi_os_tld_t* tld) {
+  if (mi_likely(_mi_numa_node_count == 1)) return 0;
+  else return _mi_os_numa_node_get(tld);
+}
+static inline int _mi_os_numa_node_count(void) {
+  if (mi_likely(_mi_numa_node_count > 0)) return _mi_numa_node_count;
+  else return _mi_os_numa_node_count_get();
+}
+
 // -------------------------------------------------------------------
 // Getting the thread id should be performant
 // as it is called in the fast path of `_mi_free`,
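The hunk above is the heart of the change: the node count is cached in a plain global (`_mi_numa_node_count`), so the inline wrappers can resolve the common single-node case with one load and one compare, and only fall back to the out-of-line `_get` functions when the count is still unknown or larger than one. A minimal standalone sketch of the same lazy-cache pattern (not mimalloc code; `node_count_detect` is a hypothetical stand-in for the real probing):

    #include <stdio.h>

    static int node_count = 0;                 // 0 means "not detected yet"

    static int node_count_detect(void) {       // cold path: runs once, may be slow
      node_count = 1;                          // stand-in for real OS probing
      return node_count;
    }

    static inline int node_count_get(void) {   // hot path: one load plus one compare
      if (node_count > 0) return node_count;
      return node_count_detect();
    }

    int main(void) {
      printf("numa nodes: %d\n", node_count_get());
      return 0;
    }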

@@ -786,9 +786,9 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
   const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE;
   mi_win_enable_large_os_pages();

   #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
   MEM_EXTENDED_PARAMETER params[3] = { {0,0},{0,0},{0,0} };
   // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
   static bool mi_huge_pages_available = true;
   if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) {
@@ -818,7 +818,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
   // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation
   if (pVirtualAlloc2 != NULL && numa_node >= 0) {
     params[0].Type = MemExtendedParameterNumaNode;
     params[0].ULong = (unsigned)numa_node;
     return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1);
   }
   #endif
@@ -838,7 +838,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
   #ifdef MI_HAS_NUMA
   if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
     uintptr_t numa_mask = (1UL << numa_node);
     // TODO: does `mbind` work correctly for huge OS pages? should we
     //       use `set_mempolicy` before calling mmap instead?
     //       see: <https://lkml.org/lkml/2017/2/9/875>
     long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
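The `mbind` call shown in the context above asks the kernel to prefer the given node for the freshly mapped huge-page region. A hedged standalone sketch of that kind of binding (assumes libnuma's `<numaif.h>` and linking with `-lnuma`; `bind_preferred` is a hypothetical helper, not mimalloc's API):

    #include <stddef.h>    // size_t
    #include <stdint.h>    // uintptr_t
    #include <numaif.h>    // mbind, MPOL_PREFERRED

    static long bind_preferred(void* p, size_t size, int numa_node) {
      // one bit per node; `maxnode` tells mbind how many bits of the mask to read
      if (numa_node < 0 || numa_node >= (int)(8*sizeof(uintptr_t))) return 0;  // at most 64 nodes
      unsigned long numa_mask = (1UL << numa_node);
      return mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*sizeof(uintptr_t), 0);
    }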
@@ -857,7 +857,7 @@ static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node)
 }
 #endif

 #if (MI_INTPTR_SIZE >= 8)
 // To ensure proper alignment, use our own area for huge OS pages
 static _Atomic(uintptr_t) mi_huge_start; // = 0
@@ -900,7 +900,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   size_t size = 0;
   uint8_t* start = mi_os_claim_huge_pages(pages, &size);
   if (start == NULL) return NULL; // or 32-bit systems

   // Allocate one page at the time but try to place them contiguously
   // We allocate one page at the time to be able to abort if it takes too long
   // or to at least allocate as many as available on the system.
@@ -920,11 +920,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
       }
       break;
     }

     // success, record it
     _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
     _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);

     // check for timeout
     if (max_msecs > 0) {
       mi_msecs_t elapsed = _mi_clock_end(start_t);
@@ -958,7 +958,7 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) {
 }

 /* ----------------------------------------------------------------------------
 Support NUMA aware allocation
 -----------------------------------------------------------------------------*/
 #ifdef WIN32
 static int mi_os_numa_nodex() {
@@ -975,9 +975,8 @@ static int mi_os_numa_node_countx(void) {
   return (int)(numa_max + 1);
 }
 #elif defined(__linux__)
-#include <dirent.h>
-#include <stdlib.h>
-#include <sys/syscall.h>
+#include <sys/syscall.h>  // getcpu
+#include <stdio.h>        // access

 static int mi_os_numa_nodex(void) {
 #ifdef SYS_getcpu
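The include swap above is what removes the allocation dependency on Linux: `<dirent.h>`/`<stdlib.h>` (directory walking and `strtol`) give way to the raw `getcpu` syscall plus `access`. For reference, a standalone sketch of querying the current NUMA node the way the `SYS_getcpu` branch does (hedged: `current_numa_node` is a hypothetical name, and the wrapper assumes a Linux kernel that defines `SYS_getcpu`):

    #define _GNU_SOURCE        // for syscall()
    #include <stdio.h>
    #include <unistd.h>        // syscall
    #include <sys/syscall.h>   // SYS_getcpu

    static int current_numa_node(void) {
    #ifdef SYS_getcpu
      unsigned cpu  = 0;
      unsigned node = 0;
      long err = syscall(SYS_getcpu, &cpu, &node, NULL);  // third arg (cache) is unused
      if (err != 0) return 0;                              // on failure, fall back to node 0
      return (int)node;
    #else
      return 0;
    #endif
    }

    int main(void) {
      printf("running on numa node %d\n", current_numa_node());
      return 0;
    }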
@@ -990,22 +989,15 @@ static int mi_os_numa_nodex(void) {
   return 0;
 #endif
 }

 static int mi_os_numa_node_countx(void) {
-  DIR* d = opendir("/sys/devices/system/node");
-  if (d==NULL) return 1;
-  struct dirent* de;
-  int max_node_num = 0;
-  while ((de = readdir(d)) != NULL) {
-    int node_num;
-    if (strncmp(de->d_name, "node", 4) == 0) {
-      node_num = (int)strtol(de->d_name+4, NULL, 0);
-      if (max_node_num < node_num) max_node_num = node_num;
-    }
-  }
-  closedir(d);
-  return (max_node_num + 1);
+  char buf[128];
+  int max_node = mi_option_get(mi_option_max_numa_node);
+  int node = 0;
+  for(node = 0; node < max_node; node++) {
+    snprintf(buf, 127, "/sys/devices/system/node/node%i", node + 1);
+    if (access(buf,R_OK) != 0) break;
+  }
+  return (node+1);
 }
 #else
 static int mi_os_numa_nodex(void) {
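The rewritten `mi_os_numa_node_countx` in the hunk above is the allocation-free replacement: instead of `opendir`/`readdir` (which allocate a `DIR` handle), it formats `/sys/devices/system/node/node<N>` into a stack buffer and probes with `access()` until a node is missing, capped by the `mi_option_max_numa_node` option. A standalone sketch of the same probing (hedged: `numa_node_count_probe` is a hypothetical name; note that `access` itself is declared in `<unistd.h>`):

    #include <stdio.h>     // snprintf
    #include <unistd.h>    // access

    static int numa_node_count_probe(int max_node) {
      char buf[128];
      int node;
      // node0 is present on a NUMA-enabled kernel; stop at the first missing node<N+1>
      for (node = 0; node < max_node; node++) {
        snprintf(buf, sizeof(buf), "/sys/devices/system/node/node%i", node + 1);
        if (access(buf, R_OK) != 0) break;
      }
      return node + 1;     // number of nodes found (at least 1)
    }

    int main(void) {
      printf("detected %d numa node(s)\n", numa_node_count_probe(256));
      return 0;
    }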
@@ -1016,29 +1008,30 @@ static int mi_os_numa_node_countx(void) {
 }
 #endif

-int _mi_os_numa_node_count(void) {
-  static int numa_node_count = 0; // cache the node count
-  if (mi_unlikely(numa_node_count <= 0)) {
-    int ncount = mi_os_numa_node_countx();
+int _mi_numa_node_count = 0;   // cache the node count
+
+int _mi_os_numa_node_count_get(void) {
+  if (mi_unlikely(_mi_numa_node_count <= 0)) {
+    int ncount = mi_os_numa_node_countx();
     int ncount0 = ncount;
     // never more than max numa node and at least 1
     int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node);
     if (ncount > nmax) ncount = nmax;
     if (ncount <= 0) ncount = 1;
-    numa_node_count = ncount;
-    _mi_verbose_message("using %i numa regions (%i nodes detected)\n", numa_node_count, ncount0);
+    _mi_numa_node_count = ncount;
+    _mi_verbose_message("using %i numa regions (%i nodes detected)\n", _mi_numa_node_count, ncount0);
   }
-  mi_assert_internal(numa_node_count >= 1);
-  return numa_node_count;
+  mi_assert_internal(_mi_numa_node_count >= 1);
+  return _mi_numa_node_count;
 }

-int _mi_os_numa_node(mi_os_tld_t* tld) {
+int _mi_os_numa_node_get(mi_os_tld_t* tld) {
   UNUSED(tld);
   int numa_count = _mi_os_numa_node_count();
   if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
   // never more than the node count and >= 0
   int numa_node = mi_os_numa_nodex();
   if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
   if (numa_node < 0) numa_node = 0;
   return numa_node;
 }
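At the bottom of the hunk, `_mi_os_numa_node_get` never returns a node outside the detected range: whatever `mi_os_numa_nodex` reports is folded back into `[0, numa_count)`, and failures collapse to node 0. A minimal sketch of that clamping (hedged: `clamp_numa_node` is a hypothetical helper used only for illustration):

    static int clamp_numa_node(int numa_node, int numa_count) {
      if (numa_count <= 1) return 0;                         // single-node systems: always node 0
      if (numa_node >= numa_count) numa_node %= numa_count;  // fold an out-of-range node back in
      if (numa_node < 0) numa_node = 0;                      // treat detection failures as node 0
      return numa_node;
    }

Keeping this clamp in the out-of-line slow path, while the inline `_mi_os_numa_node` wrapper short-circuits the single-node case, is what lets the common configuration avoid any OS call at all.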