commit 8b2194c160: merge from master

src/memory.c (76 lines changed)
@@ -11,24 +11,24 @@ and the segment and huge object allocation by mimalloc. There may be multiple
implementations of this (one could be the identity going directly to the OS,
another could be a simple cache, etc.), but the current one uses large "regions".
In contrast to the rest of mimalloc, the "regions" are shared between threads and
need to be accessed using atomic operations.
We need this memory layer between the raw OS calls because of:
1. on `sbrk`-like systems (like WebAssembly) we need our own memory maps in order
   to reuse memory effectively.
2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
   an OS allocation/free is still (much) too expensive relative to the accesses in that
   object :-( (`malloc-large` tests this). This means we need a cheaper way to
   reuse memory.
3. This layer can help with a NUMA-aware allocation in the future.

Possible issues:
- (2) can potentially be addressed too with a small cache per thread which is much
  simpler. Generally though that requires shrinking of huge pages, and may overuse
  memory per thread (and is not compatible with `sbrk`).
- Since the current regions are per-process, we need atomic operations to
  claim blocks, which may be contended.
- In the worst case, we need to search the whole region map (16KiB for 256GiB)
  linearly. At what point will direct OS calls be faster? Is there a way to
  do this better without adding too much complexity?
-----------------------------------------------------------------------------*/
#include "mimalloc.h"
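// Orientation sketch (an assumption inferred from the accesses in this diff:
// `region->map`, `region->start`, `regions[idx]`, `regions_count`): one region
// covers MI_REGION_SIZE of virtual memory and tracks one map bit per
// MI_SEGMENT_SIZE block inside it. `MI_REGION_MAX` is an assumed name for the
// table bound, not taken from this diff.
typedef struct mem_region_s {
  volatile uintptr_t map;   // in-use bit per block, claimed via atomic CAS
  volatile void*     start; // NULL, ALLOCATING (sentinel), or the OS allocation
} mem_region_t;

static mem_region_t    regions[MI_REGION_MAX]; // shared per-process region table
static volatile size_t regions_count = 0;      // one past the highest region index in use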
@@ -100,7 +100,7 @@ static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) {
// Return a rounded commit/reset size such that we don't fragment large OS pages into small ones.
static size_t mi_good_commit_size(size_t size) {
  if (size > (SIZE_MAX - _mi_os_large_page_size())) return size;
  return _mi_align_up(size, _mi_os_large_page_size());
}

// Return if a pointer points into a region reserved by us.
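// Sketch (an assumption, consistent with the assertions later in this diff
// such as `(m >> bitidx) == mask`): the mask helper named in the hunk header
// above returns a run of `blocks` one-bits starting at `bitidx`.
static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) {
  mi_assert_internal(blocks + bitidx <= MI_REGION_MAP_BITS);
  return ((((uintptr_t)1 << blocks) - 1) << bitidx);
}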
@@ -121,11 +121,11 @@ Commit from a region

#define ALLOCATING ((void*)1)

// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
  size_t mask = mi_region_block_mask(blocks,bitidx);
  mi_assert_internal(mask != 0);
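// Pattern sketch (an assumption about the lines elided between this hunk and
// the next, shown only to make the wait loop below readable): the first thread
// to swing `start` from NULL to the ALLOCATING sentinel wins the right to call
// the OS; later readers see either the sentinel (and wait, as in the next
// hunk) or the published pointer.
void* start = mi_atomic_read_ptr(&region->start);
if (start == NULL && mi_atomic_compare_exchange_ptr(&region->start, ALLOCATING, NULL)) {
  start = ALLOCATING;  // we own the allocation; publish the real pointer below
}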
@@ -142,21 +142,21 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
      // another thread is already allocating.. wait it out
      // note: the wait here is not great (but should not happen often). Another
      // strategy might be to just allocate another region in parallel. This tends
      // to be bad for benchmarks though as these often start many threads at the
      // same time leading to the allocation of too many regions. (Still, this might
      // be the most performant and it's ok on 64-bit virtual memory with over-commit.)
      mi_atomic_yield();
      continue;
    }
  } while( start == ALLOCATING && !mi_atomic_compare_exchange_ptr(&region->start, ALLOCATING, NULL) );
  mi_assert_internal(start != NULL);

  // allocate the region if needed
  if (start == ALLOCATING) {
    start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);
    // set the new allocation (or NULL on failure) -- this releases any waiting threads.
    mi_atomic_write_ptr(&region->start, start);

    if (start == NULL) {
      // failure to allocate from the OS! unclaim the blocks and fail
      size_t map;
@@ -167,7 +167,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
    }

    // update the region count if this is a new max idx.
    mi_atomic_compare_exchange(&regions_count, idx+1, idx);
  }
  mi_assert_internal(start != NULL && start != ALLOCATING);
  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
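// Sketch (an assumption): the semantics of the atomic helpers used throughout
// this file, expressed with C11 atomics. The argument order is inferred from
// call sites such as `mi_atomic_compare_exchange(&regions_count, idx+1, idx)`:
// (pointer, exchange value, expected value), returning true when the swap
// happened. The real helpers likely use volatile fields and platform intrinsics.
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static inline bool mi_atomic_compare_exchange(_Atomic(uintptr_t)* p, uintptr_t exchange, uintptr_t compare) {
  return atomic_compare_exchange_strong(p, &compare, exchange);
}
static inline uintptr_t mi_atomic_read(_Atomic(uintptr_t)* p) {
  return atomic_load(p);
}
static inline void mi_atomic_write(_Atomic(uintptr_t)* p, uintptr_t x) {
  atomic_store(p, x);
}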
@@ -218,15 +218,15 @@ static inline size_t mi_bsr(uintptr_t x) {
}
#endif

// Allocate `blocks` in a `region` at `idx` of a given `size`.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
  mi_assert_internal(p != NULL && id != NULL);
  mi_assert_internal(blocks < MI_REGION_MAP_BITS);

  const uintptr_t mask = mi_region_block_mask(blocks, 0);
  const size_t bitidx_max = MI_REGION_MAP_BITS - blocks;
  uintptr_t map = mi_atomic_read(&region->map);
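// Portable fallback sketch (an assumption; the hunk header above names
// `mi_bsr`, and real builds likely use an intrinsic such as __builtin_clzl or
// _BitScanReverse): the index of the highest set bit of a non-zero word, used
// below to skip past an occupied range in one step.
static inline size_t mi_bsr(uintptr_t x) {
  mi_assert_internal(x != 0);
  size_t idx = 0;
  while (x >>= 1) idx++;
  return idx;
}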
@@ -237,16 +237,16 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
  size_t bitidx = 0; // otherwise start at 0
#endif
  uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx

  // scan linearly for a free range of zero bits
  while(bitidx <= bitidx_max) {
    if ((map & m) == 0) { // are the mask bits free at bitidx?
      mi_assert_internal((m >> bitidx) == mask); // no overflow?
      uintptr_t newmap = map | m;
      mi_assert_internal((newmap^map) >> bitidx == mask);
      if (!mi_atomic_compare_exchange(&region->map, newmap, map)) {
        // no success, another thread claimed concurrently.. keep going
        map = mi_atomic_read(&region->map);
        continue;
      }
      else {
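// Standalone worked example (illustrative values of my choosing, not from the
// source): the bitmap arithmetic used by the scan above, claiming 3 blocks at
// bit index 5.
#include <assert.h>
#include <stdint.h>

int main(void) {
  uintptr_t mask = ((uintptr_t)1 << 3) - 1;    // 3 blocks -> 0b111
  size_t bitidx = 5;
  uintptr_t m = mask << bitidx;                // 0xE0: the candidate range
  uintptr_t map = 0x0F;                        // low 4 blocks already in use
  assert((map & m) == 0);                      // candidate range is free
  uintptr_t newmap = map | m;                  // claim it: 0xEF
  assert(((newmap ^ map) >> bitidx) == mask);  // exactly those bits were set
  return 0;
}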
@@ -261,19 +261,19 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
      size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1);
      mi_assert_internal(shift > 0 && shift <= blocks);
#else
      size_t shift = 1;
#endif
      bitidx += shift;
      m <<= shift;
    }
  }
  // no error, but also no bits found
  return true;
}

// Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim.
// Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call.
// (not being able to claim is not considered an error so check for `p != NULL` afterwards).
static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld)
{
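  // Sketch (an assumption) of the elided body and its "quick check": read the
  // map once and only attempt the CAS-based claim when the region is not full.
  mem_region_t* region = &regions[idx];
  uintptr_t m = mi_atomic_read(&region->map);
  if (m != UINTPTR_MAX) {  // some blocks still free? (all-ones means full)
    return mi_region_alloc_blocks(region, idx, blocks, size, commit, p, id, tld);
  }
  return true;  // region full: no error, but no claim either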
@@ -321,7 +321,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t*
  for (size_t visited = 0; visited < count; visited++, idx++) {
    if (idx >= count) idx = 0; // wrap around
    if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error
    if (p != NULL) break;
  }

  if (p == NULL) {
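// Hypothetical caller sketch (`tld` and `stats` are assumed to be in scope,
// and the trailing parameters of `_mi_mem_alloc_aligned` are assumed to be
// `id` and `tld` as used above): a segment-sized allocation round-trips
// through this layer, with `memid` recording whether the memory came from a
// region or directly from the OS (SIZE_MAX).
size_t memid = SIZE_MAX;
void* seg = _mi_mem_alloc_aligned(MI_SEGMENT_SIZE, MI_SEGMENT_ALIGN, true /*commit*/, &memid, tld);
if (seg != NULL) {
  // ... use the segment ...
  _mi_mem_free(seg, MI_SEGMENT_SIZE, memid, stats);
}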
@@ -361,10 +361,10 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
  if (size==0) return;
  if (id == SIZE_MAX) {
    // was a direct OS allocation, pass through
    _mi_os_free(p, size, stats);
  }
  else {
    // allocated in a region
    mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return;
    // we can align the size up to page size (as we allocate that way too)
    // this ensures we fully commit/decommit/reset
@@ -377,29 +377,29 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
    mem_region_t* region = &regions[idx];
    mi_assert_internal((mi_atomic_read(&region->map) & mask) == mask ); // claimed?
    void* start = mi_atomic_read_ptr(&region->start);
    mi_assert_internal(start != NULL);
    void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
    mi_assert_internal(blocks_start == p); // not a pointer in our area?
    mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS);
    if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`?

    // decommit (or reset) the blocks to reduce the working set.
    // TODO: implement delayed decommit/reset as these calls are too expensive
    // if the memory is reused soon.
    // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
    if (!mi_option_is_enabled(mi_option_large_os_pages)) {
      if (mi_option_is_enabled(mi_option_eager_region_commit)) {
        //_mi_os_reset(p, size, stats);
      }
      else {
        //_mi_os_decommit(p, size, stats);
      }
    }

    // TODO: should we free empty regions? currently only done in _mi_mem_collect.
    // this frees up virtual address space which
    // might be useful on 32-bit systems?

    // and unclaim
    uintptr_t map;
    uintptr_t newmap;
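    // Sketch (an assumption) of the unclaim loop these declarations set up:
    // clear exactly our mask bits with a CAS retry loop, since other threads
    // mutate the shared map concurrently.
    do {
      map = mi_atomic_read(&region->map);
      newmap = map & ~mask;  // drop the bits we had claimed
    } while (!mi_atomic_compare_exchange(&region->map, newmap, map));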
@@ -418,7 +418,7 @@ void _mi_mem_collect(mi_stats_t* stats) {
  // free every region that has no segments in use.
  for (size_t i = 0; i < regions_count; i++) {
    mem_region_t* region = &regions[i];
    if (mi_atomic_read(&region->map) == 0 && region->start != NULL) {
      // if no segments used, try to claim the whole region
      uintptr_t m;
      do {
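      // Sketch (an assumption) of the elided loop body and condition: re-read
      // the map and try to claim the entire region by CAS-ing 0 to all-ones,
      // so that exactly one thread proceeds to free it:
      //   m = mi_atomic_read(&region->map);
      // } while (m == 0 && !mi_atomic_compare_exchange(&region->map, UINTPTR_MAX, 0));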
@@ -427,7 +427,7 @@ void _mi_mem_collect(mi_stats_t* stats) {
      if (m == 0) {
        // on success, free the whole region
        if (region->start != NULL) _mi_os_free((void*)region->start, MI_REGION_SIZE, stats);
        // and release
        region->start = 0;
        mi_atomic_write(&region->map,0);
      }