rpmalloc: Import and set as default.

Hoard, the LGPL-licensed locking thread-caching allocator that we have
used by default since libroot's introduction, is showing its age.
It is a "pseudo-sbrk-based" allocator (it predates our actual sbrk,
so instead it uses a single Be area), which has serious limitations:
as we cannot ever move the area, we can only resize it "in place",
and so once we hit the end of the ~1.5GB reserved part of the address
space for the heap, we are usually out of luck if we need more memory.

On 32-bit, userspace has only 2GB of address space anyway, but on
64-bit where address space is not a resource worth worrying about,
this can be a serious problem for applications that want to use a
lot of RAM. As more and more large applications get ported to Haiku,
the time for a mmap-based allocator has come.

For posterity's sake, here are all the possible options there were,
and why this one was selected rather than one of them, beginning
with the immediate rejects:

 * nedmalloc. Unmaintained since 2014 and all benchmarks show it
   underperforming vs. virtually all other allocators.
 * bmalloc. Significantly worse-performing vs. other options on
   this list with no apparent advantages.
 * hoard3. Now GPL only, which is obviously not acceptable for
   use as a system library.
 * ptmalloc2. glibc's default allocator; underperforms vs.
   even most of the options listed above.

And now on to the honorable mentions:

 * tcmalloc. This is Google's allocator; it's designed for server
   and other high-performance workloads. As a result, it almost
   never unmaps memory unless ordered to do so in a very explicit
   way, which obviously is unacceptable behavior for a general-purpose
   allocator.

 * jemalloc. This is FreeBSD and NetBSD's default allocator as well
   as finding use in Firefox and Rust. It is again designed for
   performance, with significantly higher memory overhead than
   other allocators, especially for small heaps; which is of course
   a problem for us, as we want to retain our light footprint.

Finally this brings us to rpmalloc. It's not as well-travelled as
tcmalloc or jemalloc, but by benchmarks done by itself [0] and
by developers of other allocators [1], it seems to typically hit
the "sweet spot" of very good performance with lower (if not the lowest)
memory use out of all the other allocators it's tested against;
even beating jemalloc in certain benchmarks for speed, too.

You can see a description of the allocator's design in its README [2].

[0]: https://github.com/rampantpixels/rpmalloc/blob/master/BENCHMARKS.md
[1]: https://github.com/ezrosent/allocators-rs/blob/master/info/elfmalloc-performance.md
[2]: https://github.com/rampantpixels/rpmalloc#rpmalloc---rampant-pixels-memory-allocator

In general testing thus far on Haiku, it appears to be a consistent
5-10% performance boost (1m28s real -> 1m23s real) when doing
the "HaikuDepot compile" benchmark. Memory usage by most apps
after a cold boot changed negligibly (launch_daemon: 444K -> 476K,
app_server: 15.86MB -> 15.49MB, Tracker: 6.19MB -> 4.49MB.)

The only adverse effect I have observed so far is that a certain few
WebKit double-frees cause crashes/asserts faster than they did before
(e.g. Google Maps crashes after less than a minute instead of a few
minutes.)

That being said, any new or strange behaviors, please report
immediately. Backing out this change should be as easy as
reverting the changes to the libroot/posix Jamfile. If nothing
else comes up in a few weeks, then I'll remove Hoard from
the repository.

Fixes #13554.

Change-Id: Id2871601b1e99dcf022fbef2c53008ee6c3f233b
This commit is contained in:
Augustin Cavalier 2018-12-22 00:45:13 -05:00
parent c5f96e5b4a
commit 7132b79eaf
5 changed files with 2450 additions and 1 deletions

View File

@ -49,7 +49,7 @@ for arch in $(TARGET_ARCHS) {
SubInclude HAIKU_TOP src system libroot posix crypt ;
SubInclude HAIKU_TOP src system libroot posix locale ;
SubInclude HAIKU_TOP src system libroot posix malloc ;
SubInclude HAIKU_TOP src system libroot posix rpmalloc ;
SubInclude HAIKU_TOP src system libroot posix malloc_debug ;
SubInclude HAIKU_TOP src system libroot posix pthread ;
SubInclude HAIKU_TOP src system libroot posix signal ;

View File

@ -0,0 +1,17 @@
# Build rules for the rpmalloc-based POSIX heap implementation.
SubDir HAIKU_TOP src system libroot posix rpmalloc ;

UsePrivateHeaders libroot shared ;

local architectureObject ;
for architectureObject in [ MultiArchSubDirSetup ] {
	on $(architectureObject) {
		local architecture = $(TARGET_PACKAGING_ARCH) ;

		UsePrivateSystemHeaders ;

		# Built as <arch>posix_malloc.o so the libroot Jamfile links it in
		# place of the previous allocator's object of the same name.
		MergeObject <$(architecture)>posix_malloc.o :
			rpmalloc.cpp
			wrapper.cpp
			;
	}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,193 @@
/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels
*
* This library provides a cross-platform lock free thread caching malloc implementation in C11.
* The latest source code is always available at
*
* https://github.com/rampantpixels/rpmalloc
*
* This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
*
*/
#pragma once

#include <stddef.h>

// On Haiku the whole rpmalloc API is wrapped in BPrivate::rpmalloc so that
// these symbols stay out of the global namespace of libroot.
#ifdef __HAIKU__
namespace BPrivate {
namespace rpmalloc {
#endif

// Per-compiler shims for function attributes and calling convention.
#if defined(__clang__) || defined(__GNUC__)
# define RPMALLOC_ATTRIBUTE __attribute__((__malloc__))
# define RPMALLOC_RESTRICT
# define RPMALLOC_CDECL
#elif defined(_MSC_VER)
# define RPMALLOC_ATTRIBUTE
# define RPMALLOC_RESTRICT __declspec(restrict)
# define RPMALLOC_CDECL __cdecl
#else
# define RPMALLOC_ATTRIBUTE
# define RPMALLOC_RESTRICT
# define RPMALLOC_CDECL
#endif

//! Flag to rpaligned_realloc to not preserve content in reallocation
#define RPMALLOC_NO_PRESERVE 1
//! Process-global allocator statistics, filled in by rpmalloc_global_statistics().
typedef struct rpmalloc_global_statistics_t {
	//! Current amount of virtual memory mapped (only if ENABLE_STATISTICS=1)
	size_t mapped;
	//! Current amount of memory in global caches for small and medium sizes (<64KiB)
	size_t cached;
	//! Total amount of memory mapped (only if ENABLE_STATISTICS=1)
	size_t mapped_total;
	//! Total amount of memory unmapped (only if ENABLE_STATISTICS=1)
	size_t unmapped_total;
} rpmalloc_global_statistics_t;
//! Per-thread allocator statistics, filled in by rpmalloc_thread_statistics()
//  for the calling thread.
typedef struct rpmalloc_thread_statistics_t {
	//! Current number of bytes available for allocation from active spans
	size_t active;
	//! Current number of bytes available in thread size class caches
	size_t sizecache;
	//! Current number of bytes available in thread span caches
	size_t spancache;
	//! Current number of bytes in pending deferred deallocations
	size_t deferred;
	//! Total number of bytes transitioned from thread cache to global cache
	size_t thread_to_global;
	//! Total number of bytes transitioned from global cache to thread cache
	size_t global_to_thread;
} rpmalloc_thread_statistics_t;
//! Allocator configuration, passed to rpmalloc_initialize_config().
//  Fields left zero select the built-in defaults.
typedef struct rpmalloc_config_t {
	//! Map memory pages for the given number of bytes. The returned address MUST be
	//  aligned to the rpmalloc span size, which will always be a power of two.
	//  Optionally the function can store an alignment offset in the offset variable
	//  in case it performs alignment and the returned pointer is offset from the
	//  actual start of the memory region due to this alignment. The alignment offset
	//  will be passed to the memory unmap function. The alignment offset MUST NOT be
	//  larger than 65535 (storable in an uint16_t), if it is you must use natural
	//  alignment to shift it into 16 bits. If you set a memory_map function, you
	//  must also set a memory_unmap function or else the default implementation will
	//  be used for both.
	void* (*memory_map)(size_t size, size_t* offset);
	//! Unmap the memory pages starting at address and spanning the given number of bytes.
	//  If release is set to non-zero, the unmap is for an entire span range as returned by
	//  a previous call to memory_map and that the entire range should be released. The
	//  release argument holds the size of the entire span range. If release is set to 0,
	//  the unmap is a partial decommit of a subset of the mapped memory range.
	//  If you set a memory_unmap function, you must also set a memory_map function or
	//  else the default implementation will be used for both.
	void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release);
	//! Size of memory pages. The page size MUST be a power of two. All memory mapping
	//  requests to memory_map will be made with size set to a multiple of the page size.
	size_t page_size;
	//! Size of a span of memory blocks. MUST be a power of two, and in [4096,262144]
	//  range (unless 0 - set to 0 to use the default span size).
	size_t span_size;
	//! Number of spans to map at each request to map new virtual memory blocks. This can
	//  be used to minimize the system call overhead at the cost of virtual memory address
	//  space. The extra mapped pages will not be written until actually used, so physical
	//  committed memory should not be affected in the default implementation. Will be
	//  aligned to a multiple of spans that match memory page size in case of huge pages.
	size_t span_map_count;
	//! Enable use of large/huge pages. If this flag is set to non-zero and page size is
	//  zero, the allocator will try to enable huge pages and auto detect the configuration.
	//  If this is set to non-zero and page_size is also non-zero, the allocator will
	//  assume huge pages have been configured and enabled prior to initializing the
	//  allocator.
	//  For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support
	//  For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
	int enable_huge_pages;
} rpmalloc_config_t;
// Allocator lifecycle and statistics interface. Process-wide initialization
// must happen before use, and each thread must be initialized separately
// (see the __heap_thread_init hook in wrapper.cpp).

//! Initialize allocator with default configuration
extern int
rpmalloc_initialize(void);

//! Initialize allocator with given configuration
extern int
rpmalloc_initialize_config(const rpmalloc_config_t* config);

//! Get allocator configuration
extern const rpmalloc_config_t*
rpmalloc_config(void);

//! Finalize allocator
extern void
rpmalloc_finalize(void);

//! Initialize allocator for calling thread
extern void
rpmalloc_thread_initialize(void);

//! Finalize allocator for calling thread
extern void
rpmalloc_thread_finalize(void);

//! Perform deferred deallocations pending for the calling thread heap
extern void
rpmalloc_thread_collect(void);

//! Query if allocator is initialized for calling thread
extern int
rpmalloc_is_thread_initialized(void);

//! Get per-thread statistics
extern void
rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats);

//! Get global statistics
extern void
rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats);
//! Allocate a memory block of at least the given size
extern RPMALLOC_RESTRICT void*
rpmalloc(size_t size) RPMALLOC_ATTRIBUTE;

//! Free the given memory block
extern void
rpfree(void* ptr);

//! Allocate a memory block of at least the given size and zero initialize it
extern RPMALLOC_RESTRICT void*
rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIBUTE;

//! Reallocate the given block to at least the given size
extern void*
rprealloc(void* ptr, size_t size);

//! Reallocate the given block to at least the given size and alignment,
//  with optional control flags (see RPMALLOC_NO_PRESERVE).
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
extern void*
rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags);

//! Allocate a memory block of at least the given size and alignment.
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
extern RPMALLOC_RESTRICT void*
rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE;

//! Allocate a memory block of at least the given size and alignment.
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
//  NOTE(review): the contract appears identical to rpaligned_alloc() --
//  confirm against the implementation.
extern RPMALLOC_RESTRICT void*
rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE;

//! posix_memalign()-style allocation: stores the new block in *memptr.
//  Returns 0 on success or a non-zero error code (NOTE(review): presumably
//  errno-compatible, matching POSIX -- confirm against the implementation).
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
extern int
rpposix_memalign(void **memptr, size_t alignment, size_t size);

//! Query the usable size of the given memory block (from given pointer to the end of block)
extern size_t
rpmalloc_usable_size(void* ptr);
#ifdef __HAIKU__
} // namespace rpmalloc
} // namespace BPrivate
#endif

View File

@ -0,0 +1,210 @@
/*
* Copyright 2018, Haiku Inc. All rights reserved.
* Distributed under the terms of the MIT License.
*
* Authors:
* Augustin Cavalier <waddlesplash>
*/
#include <errno.h>
#include <string.h>
#include <errno_private.h>
#include <user_thread.h>
#include "rpmalloc.h"
using namespace BPrivate::rpmalloc;
// Tracing hooks: when libroot is built with USER_MALLOC_TRACING, every
// allocator call is logged via ktrace_printf(); otherwise the KTRACE()
// calls compile away to nothing.
#if USER_MALLOC_TRACING
#	define KTRACE(format...)	ktrace_printf(format)
#else
#	define KTRACE(format...)	do {} while (false)
#endif
// #pragma mark - internal functions


// Process-wide allocator setup hook, called by libroot before any
// allocation can happen.
// NOTE(review): rpmalloc_initialize() returns an int status that is ignored
// here; this hook has no way to report failure.
extern "C" void
__init_heap()
{
	rpmalloc_initialize();
}
// Heap teardown hook (presumably invoked during process termination --
// confirm against libroot's init/term code); rpmalloc needs no work here.
extern "C" void
__heap_terminate_after()
{
	// nothing to do
}
// Fork hooks: a locking allocator must serialize around fork() so the child
// does not inherit a heap lock held by another thread. rpmalloc is lock-free
// (see rpmalloc.h), so none of these hooks have any work to do.
extern "C" void
__heap_before_fork()
{
	// rpmalloc is lock-free; nothing to do
}


extern "C" void
__heap_after_fork_child()
{
	// rpmalloc is lock-free; nothing to do
}


extern "C" void
__heap_after_fork_parent()
{
	// rpmalloc is lock-free; nothing to do
}
// Per-thread hooks: rpmalloc is a thread-caching allocator, so each thread
// must initialize its local state on start and flush/release it on exit.
extern "C" void
__heap_thread_init()
{
	rpmalloc_thread_initialize();
}


extern "C" void
__heap_thread_exit()
{
	rpmalloc_thread_finalize();
}
// #pragma mark - public functions


// malloc(): forward to rpmalloc(), translating an allocation failure into
// errno = B_NO_MEMORY as the C library contract requires.
extern "C" void *
malloc(size_t size)
{
	void* ptr = rpmalloc(size);
	if (ptr != NULL) {
		KTRACE("malloc(%lu) -> %p", size, ptr);
		return ptr;
	}

	__set_errno(B_NO_MEMORY);
	KTRACE("malloc(%lu) -> NULL", size);
	return NULL;
}
// calloc(): zero-initialized array allocation via rpcalloc(); sets errno to
// B_NO_MEMORY on failure.
extern "C" void *
calloc(size_t nelem, size_t elsize)
{
	void* ptr = rpcalloc(nelem, elsize);
	if (ptr != NULL) {
		KTRACE("calloc(%lu, %lu) -> %p", nelem, elsize, ptr);
		return ptr;
	}

	__set_errno(B_NO_MEMORY);
	KTRACE("calloc(%lu, %lu) -> NULL", nelem, elsize);
	return NULL;
}
// free(): release a block obtained from one of the allocation wrappers.
// NOTE(review): free(NULL) must be a no-op per the C standard -- this relies
// on rpfree() tolerating NULL; confirm in rpmalloc.cpp.
extern "C" void
free(void *ptr)
{
	// Trace before freeing so the pointer value is logged while still valid.
	KTRACE("free(%p)", ptr);
	rpfree(ptr);
}
// memalign(): aligned allocation via rpmemalign(); sets errno to B_NO_MEMORY
// on failure.
extern "C" void*
memalign(size_t alignment, size_t size)
{
	void* ptr = rpmemalign(alignment, size);
	if (ptr != NULL) {
		KTRACE("memalign(%lu, %lu) -> %p", alignment, size, ptr);
		return ptr;
	}

	__set_errno(B_NO_MEMORY);
	KTRACE("memalign(%lu, %lu) -> NULL", alignment, size);
	return NULL;
}
// posix_memalign(): store an aligned allocation in *_pointer and return 0,
// or return an error code on failure (errno is not set, per POSIX).
extern "C" int
posix_memalign(void** _pointer, size_t alignment, size_t size)
{
	if (_pointer == NULL)
		return B_BAD_VALUE;

	status_t status = rpposix_memalign(_pointer, alignment, size);

	// Only dereference *_pointer on success: on failure rpposix_memalign()
	// may leave it untouched, and tracing it would read an indeterminate
	// value (the previous code did exactly that when tracing was enabled).
	KTRACE("posix_memalign(%p, %lu, %lu) -> %s, %p", _pointer, alignment,
		size, strerror(status), status == 0 ? *_pointer : NULL);
	return status;
}
// valloc(): allocate size bytes aligned to the system page size; errno
// handling comes from the memalign() wrapper.
extern "C" void*
valloc(size_t size)
{
	return memalign(B_PAGE_SIZE, size);
}
// realloc(): standard semantics -- realloc(NULL, size) behaves like
// malloc(size), and realloc(ptr, 0) frees the block and returns NULL.
// Sets errno to B_NO_MEMORY when the resize fails (the original block
// then remains valid).
extern "C" void*
realloc(void* ptr, size_t size)
{
	if (ptr == NULL)
		return malloc(size);

	if (size == 0) {
		free(ptr);
		return NULL;
	}

	void* addr = rprealloc(ptr, size);
	if (addr == NULL) {
		__set_errno(B_NO_MEMORY);
		KTRACE("realloc(%p, %lu) -> NULL", ptr, size);
		return NULL;
	}

	// Log under the public name "realloc": the success path previously said
	// "rprealloc", inconsistent with the failure path and the other wrappers.
	KTRACE("realloc(%p, %lu) -> %p", ptr, size, addr);
	return addr;
}
// malloc_usable_size(): number of usable bytes in the block, or 0 for NULL.
extern "C" size_t
malloc_usable_size(void *ptr)
{
	// NULL owns no bytes; otherwise ask the allocator.
	return ptr == NULL ? 0 : rpmalloc_usable_size(ptr);
}
// #pragma mark - BeOS specific extensions


// Legacy BeOS heap-statistics interface. Not wired up to rpmalloc yet;
// every field is always reported as zero.
struct mstats {
	size_t bytes_total;
	size_t chunks_used;
	size_t bytes_used;
	size_t chunks_free;
	size_t bytes_free;
};

extern "C" struct mstats mstats(void);

extern "C" struct mstats
mstats(void)
{
	// TODO: fill these in from rpmalloc_global_statistics().
	struct mstats stats = {0, 0, 0, 0, 0};
	return stats;
}