rpmalloc: Import and set as default.

Hoard, the LGPL-licensed locking thread-caching allocator that we have
used by default since libroot's introduction, is showing its age.
It is a "pseudo-sbrk-based" allocator (it predates our actual sbrk,
so instead it uses a single Be area), which has serious limitations:
as we cannot ever move the area, we can only resize it "in place",
and so once we hit the end of the ~1.5GB reserved part of the address
space for the heap, we are usually out of luck if we need more memory.

On 32-bit, userspace has only 2GB of address space anyway, but on
64-bit where address space is not a resource worth worrying about,
this can be a serious problem for applications that want to use a
lot of RAM. As more and more large applications get ported to Haiku,
the time for a mmap-based allocator has come.

For posterity's sake, here are all the possible options there were,
and why this one was selected rather than one of them, beginning
with the immediate rejects:

 * nedmalloc. Unmaintained since 2014 and all benchmarks show it
   underperforming vs. virtually all other allocators.
 * bmalloc. Significantly worse-performing vs. other options on
   this list with no apparent advantages.
 * hoard3. Now GPL only, which is obviously not acceptable for
   use as a system library.
 * ptmalloc2. glibc's default allocator; underperforms vs.
   even most of the options listed above.

And now on to the honorable mentions:

 * tcmalloc. This is Google's allocator; it's designed for server
   and other high-performance workloads. As a result, it almost
   never unmaps memory unless ordered to do so in a very explicit
   way, which obviously is unacceptable behavior for a general-purpose
   allocator.

 * jemalloc. This is FreeBSD and NetBSD's default allocator as well
   as finding use in Firefox and Rust. It is again designed for
   performance, with significantly higher memory overhead than
   other allocators, especially for small heaps; which is of course
   a problem for us, as we want to retain our light footprint.

Finally this brings us to rpmalloc. It's not as well-travelled as
tcmalloc or jemalloc, but by benchmarks done by itself [0] and
by developers of other allocators [1], it seems to typically hit
the "sweet spot" of very good performance with lower (if not the lowest)
memory use out of all the other allocators it's tested against;
even beating jemalloc in certain benchmarks for speed, too.

You can see a description of the allocator's design in its README [2].

[0]: https://github.com/rampantpixels/rpmalloc/blob/master/BENCHMARKS.md
[1]: https://github.com/ezrosent/allocators-rs/blob/master/info/elfmalloc-performance.md
[2]: https://github.com/rampantpixels/rpmalloc#rpmalloc---rampant-pixels-memory-allocator

In general testing thus far on Haiku, it appears to be a consistent
5-10% performance boost (1m28s real -> 1m23s real) when doing
the "HaikuDepot compile" benchmark. Memory usage by most apps
after a cold boot changed negligibly (launch_daemon: 444K -> 476K,
app_server: 15.86MB -> 15.49MB, Tracker: 6.19MB -> 4.49MB.)

The only adverse effect I have observed so far is that a certain few
WebKit double-frees cause crashes/asserts faster than they did before
(e.g. Google Maps crashes after less than a minute instead of a few
minutes.)

That being said, any new or strange behaviors, please report
immediately. Backing out this change should be as easy as
reverting the changes to the libroot/posix Jamfile. If nothing
else comes up in a few weeks, then I'll remove Hoard from
the repository.

Fixes #13554.

Change-Id: Id2871601b1e99dcf022fbef2c53008ee6c3f233b
This commit is contained in:
Augustin Cavalier 2018-12-22 00:45:13 -05:00
parent c5f96e5b4a
commit 7132b79eaf
5 changed files with 2450 additions and 1 deletions

View File

@ -49,7 +49,7 @@ for arch in $(TARGET_ARCHS) {
SubInclude HAIKU_TOP src system libroot posix crypt ;
SubInclude HAIKU_TOP src system libroot posix locale ;
SubInclude HAIKU_TOP src system libroot posix malloc ;
SubInclude HAIKU_TOP src system libroot posix rpmalloc ;
SubInclude HAIKU_TOP src system libroot posix malloc_debug ;
SubInclude HAIKU_TOP src system libroot posix pthread ;
SubInclude HAIKU_TOP src system libroot posix signal ;

View File

@ -0,0 +1,17 @@
# Build rules for the rpmalloc-based POSIX heap implementation.
SubDir HAIKU_TOP src system libroot posix rpmalloc ;

UsePrivateHeaders libroot shared ;

local architectureObject ;
for architectureObject in [ MultiArchSubDirSetup ] {
	on $(architectureObject) {
		local architecture = $(TARGET_PACKAGING_ARCH) ;

		UsePrivateSystemHeaders ;

		# Built as <arch>posix_malloc.o so the libroot Jamfile links it in
		# place of the previous allocator's object of the same name.
		MergeObject <$(architecture)>posix_malloc.o :
			rpmalloc.cpp
			wrapper.cpp
			;
	}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,193 @@
/* rpmalloc.h - Memory allocator - Public Domain - 2016 Mattias Jansson / Rampant Pixels
*
* This library provides a cross-platform lock free thread caching malloc implementation in C11.
* The latest source code is always available at
*
* https://github.com/rampantpixels/rpmalloc
*
* This library is put in the public domain; you can redistribute it and/or modify it without any restrictions.
*
*/
#pragma once

#include <stddef.h>

// On Haiku the whole rpmalloc API is wrapped in BPrivate::rpmalloc so that
// these symbols stay out of the global namespace of libroot.
#ifdef __HAIKU__
namespace BPrivate {
namespace rpmalloc {
#endif

// Per-compiler shims for function attributes and calling convention.
#if defined(__clang__) || defined(__GNUC__)
# define RPMALLOC_ATTRIBUTE __attribute__((__malloc__))
# define RPMALLOC_RESTRICT
# define RPMALLOC_CDECL
#elif defined(_MSC_VER)
# define RPMALLOC_ATTRIBUTE
# define RPMALLOC_RESTRICT __declspec(restrict)
# define RPMALLOC_CDECL __cdecl
#else
# define RPMALLOC_ATTRIBUTE
# define RPMALLOC_RESTRICT
# define RPMALLOC_CDECL
#endif

//! Flag to rpaligned_realloc to not preserve content in reallocation
#define RPMALLOC_NO_PRESERVE 1
//! Process-global allocator statistics, filled in by rpmalloc_global_statistics().
typedef struct rpmalloc_global_statistics_t {
	//! Current amount of virtual memory mapped (only if ENABLE_STATISTICS=1)
	size_t mapped;
	//! Current amount of memory in global caches for small and medium sizes (<64KiB)
	size_t cached;
	//! Total amount of memory mapped (only if ENABLE_STATISTICS=1)
	size_t mapped_total;
	//! Total amount of memory unmapped (only if ENABLE_STATISTICS=1)
	size_t unmapped_total;
} rpmalloc_global_statistics_t;
//! Per-thread allocator statistics, filled in by rpmalloc_thread_statistics()
//  for the calling thread.
typedef struct rpmalloc_thread_statistics_t {
	//! Current number of bytes available for allocation from active spans
	size_t active;
	//! Current number of bytes available in thread size class caches
	size_t sizecache;
	//! Current number of bytes available in thread span caches
	size_t spancache;
	//! Current number of bytes in pending deferred deallocations
	size_t deferred;
	//! Total number of bytes transitioned from thread cache to global cache
	size_t thread_to_global;
	//! Total number of bytes transitioned from global cache to thread cache
	size_t global_to_thread;
} rpmalloc_thread_statistics_t;
//! Allocator configuration, passed to rpmalloc_initialize_config().
//  Fields left zero select the built-in defaults.
typedef struct rpmalloc_config_t {
	//! Map memory pages for the given number of bytes. The returned address MUST be
	//  aligned to the rpmalloc span size, which will always be a power of two.
	//  Optionally the function can store an alignment offset in the offset variable
	//  in case it performs alignment and the returned pointer is offset from the
	//  actual start of the memory region due to this alignment. The alignment offset
	//  will be passed to the memory unmap function. The alignment offset MUST NOT be
	//  larger than 65535 (storable in an uint16_t), if it is you must use natural
	//  alignment to shift it into 16 bits. If you set a memory_map function, you
	//  must also set a memory_unmap function or else the default implementation will
	//  be used for both.
	void* (*memory_map)(size_t size, size_t* offset);
	//! Unmap the memory pages starting at address and spanning the given number of bytes.
	//  If release is set to non-zero, the unmap is for an entire span range as returned by
	//  a previous call to memory_map and that the entire range should be released. The
	//  release argument holds the size of the entire span range. If release is set to 0,
	//  the unmap is a partial decommit of a subset of the mapped memory range.
	//  If you set a memory_unmap function, you must also set a memory_map function or
	//  else the default implementation will be used for both.
	void (*memory_unmap)(void* address, size_t size, size_t offset, size_t release);
	//! Size of memory pages. The page size MUST be a power of two. All memory mapping
	//  requests to memory_map will be made with size set to a multiple of the page size.
	size_t page_size;
	//! Size of a span of memory blocks. MUST be a power of two, and in [4096,262144]
	//  range (unless 0 - set to 0 to use the default span size).
	size_t span_size;
	//! Number of spans to map at each request to map new virtual memory blocks. This can
	//  be used to minimize the system call overhead at the cost of virtual memory address
	//  space. The extra mapped pages will not be written until actually used, so physical
	//  committed memory should not be affected in the default implementation. Will be
	//  aligned to a multiple of spans that match memory page size in case of huge pages.
	size_t span_map_count;
	//! Enable use of large/huge pages. If this flag is set to non-zero and page size is
	//  zero, the allocator will try to enable huge pages and auto detect the configuration.
	//  If this is set to non-zero and page_size is also non-zero, the allocator will
	//  assume huge pages have been configured and enabled prior to initializing the
	//  allocator.
	//  For Windows, see https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support
	//  For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt
	int enable_huge_pages;
} rpmalloc_config_t;
// Allocator lifecycle and statistics interface. Process-wide initialization
// must happen before use, and each thread must be initialized separately
// (see the __heap_thread_init hook in wrapper.cpp).

//! Initialize allocator with default configuration
extern int
rpmalloc_initialize(void);

//! Initialize allocator with given configuration
extern int
rpmalloc_initialize_config(const rpmalloc_config_t* config);

//! Get allocator configuration
extern const rpmalloc_config_t*
rpmalloc_config(void);

//! Finalize allocator
extern void
rpmalloc_finalize(void);

//! Initialize allocator for calling thread
extern void
rpmalloc_thread_initialize(void);

//! Finalize allocator for calling thread
extern void
rpmalloc_thread_finalize(void);

//! Perform deferred deallocations pending for the calling thread heap
extern void
rpmalloc_thread_collect(void);

//! Query if allocator is initialized for calling thread
extern int
rpmalloc_is_thread_initialized(void);

//! Get per-thread statistics
extern void
rpmalloc_thread_statistics(rpmalloc_thread_statistics_t* stats);

//! Get global statistics
extern void
rpmalloc_global_statistics(rpmalloc_global_statistics_t* stats);
//! Allocate a memory block of at least the given size
extern RPMALLOC_RESTRICT void*
rpmalloc(size_t size) RPMALLOC_ATTRIBUTE;

//! Free the given memory block
extern void
rpfree(void* ptr);

//! Allocate a memory block of at least the given size and zero initialize it
extern RPMALLOC_RESTRICT void*
rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIBUTE;

//! Reallocate the given block to at least the given size
extern void*
rprealloc(void* ptr, size_t size);

//! Reallocate the given block to at least the given size and alignment,
//  with optional control flags (see RPMALLOC_NO_PRESERVE).
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
extern void*
rpaligned_realloc(void* ptr, size_t alignment, size_t size, size_t oldsize, unsigned int flags);

//! Allocate a memory block of at least the given size and alignment.
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
extern RPMALLOC_RESTRICT void*
rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE;

//! Allocate a memory block of at least the given size and alignment.
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
//  NOTE(review): the contract appears identical to rpaligned_alloc() --
//  confirm against the implementation.
extern RPMALLOC_RESTRICT void*
rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIBUTE;

//! posix_memalign()-style allocation: stores the new block in *memptr.
//  Returns 0 on success or a non-zero error code (NOTE(review): presumably
//  errno-compatible, matching POSIX -- confirm against the implementation).
//  Alignment must be a power of two and a multiple of sizeof(void*),
//  and should ideally be less than memory page size
extern int
rpposix_memalign(void **memptr, size_t alignment, size_t size);

//! Query the usable size of the given memory block (from given pointer to the end of block)
extern size_t
rpmalloc_usable_size(void* ptr);
#ifdef __HAIKU__
} // namespace rpmalloc
} // namespace BPrivate
#endif

View File

@ -0,0 +1,210 @@
/*
* Copyright 2018, Haiku Inc. All rights reserved.
* Distributed under the terms of the MIT License.
*
* Authors:
* Augustin Cavalier <waddlesplash>
*/
#include <errno.h>
#include <string.h>
#include <errno_private.h>
#include <user_thread.h>
#include "rpmalloc.h"
using namespace BPrivate::rpmalloc;
// Tracing hooks: when libroot is built with USER_MALLOC_TRACING, every
// allocator call is logged via ktrace_printf(); otherwise the KTRACE()
// calls compile away to nothing.
#if USER_MALLOC_TRACING
#	define KTRACE(format...)	ktrace_printf(format)
#else
#	define KTRACE(format...)	do {} while (false)
#endif
// #pragma mark - internal functions


// Process-wide allocator setup hook, called by libroot before any
// allocation can happen.
// NOTE(review): rpmalloc_initialize() returns an int status that is ignored
// here; this hook has no way to report failure.
extern "C" void
__init_heap()
{
	rpmalloc_initialize();
}
// Heap teardown hook (presumably invoked during process termination --
// confirm against libroot's init/term code); rpmalloc needs no work here.
extern "C" void
__heap_terminate_after()
{
	// nothing to do
}
// Fork hooks: a locking allocator must serialize around fork() so the child
// does not inherit a heap lock held by another thread. rpmalloc is lock-free
// (see rpmalloc.h), so none of these hooks have any work to do.
extern "C" void
__heap_before_fork()
{
	// rpmalloc is lock-free; nothing to do
}


extern "C" void
__heap_after_fork_child()
{
	// rpmalloc is lock-free; nothing to do
}


extern "C" void
__heap_after_fork_parent()
{
	// rpmalloc is lock-free; nothing to do
}
// Per-thread hooks: rpmalloc is a thread-caching allocator, so each thread
// must initialize its local state on start and flush/release it on exit.
extern "C" void
__heap_thread_init()
{
	rpmalloc_thread_initialize();
}


extern "C" void
__heap_thread_exit()
{
	rpmalloc_thread_finalize();
}
// #pragma mark - public functions


// malloc(): forward to rpmalloc(), translating an allocation failure into
// errno = B_NO_MEMORY as the C library contract requires.
extern "C" void *
malloc(size_t size)
{
	void* ptr = rpmalloc(size);
	if (ptr != NULL) {
		KTRACE("malloc(%lu) -> %p", size, ptr);
		return ptr;
	}

	__set_errno(B_NO_MEMORY);
	KTRACE("malloc(%lu) -> NULL", size);
	return NULL;
}
// calloc(): zero-initialized array allocation via rpcalloc(); sets errno to
// B_NO_MEMORY on failure.
extern "C" void *
calloc(size_t nelem, size_t elsize)
{
	void* ptr = rpcalloc(nelem, elsize);
	if (ptr != NULL) {
		KTRACE("calloc(%lu, %lu) -> %p", nelem, elsize, ptr);
		return ptr;
	}

	__set_errno(B_NO_MEMORY);
	KTRACE("calloc(%lu, %lu) -> NULL", nelem, elsize);
	return NULL;
}
// free(): release a block obtained from one of the allocation wrappers.
// NOTE(review): free(NULL) must be a no-op per the C standard -- this relies
// on rpfree() tolerating NULL; confirm in rpmalloc.cpp.
extern "C" void
free(void *ptr)
{
	// Trace before freeing so the pointer value is logged while still valid.
	KTRACE("free(%p)", ptr);
	rpfree(ptr);
}
// memalign(): aligned allocation via rpmemalign(); sets errno to B_NO_MEMORY
// on failure.
extern "C" void*
memalign(size_t alignment, size_t size)
{
	void* ptr = rpmemalign(alignment, size);
	if (ptr != NULL) {
		KTRACE("memalign(%lu, %lu) -> %p", alignment, size, ptr);
		return ptr;
	}

	__set_errno(B_NO_MEMORY);
	KTRACE("memalign(%lu, %lu) -> NULL", alignment, size);
	return NULL;
}
// posix_memalign(): store an aligned allocation in *_pointer and return 0,
// or return an error code on failure (errno is not set, per POSIX).
extern "C" int
posix_memalign(void** _pointer, size_t alignment, size_t size)
{
	if (_pointer == NULL)
		return B_BAD_VALUE;

	status_t status = rpposix_memalign(_pointer, alignment, size);

	// Only dereference *_pointer on success: on failure rpposix_memalign()
	// may leave it untouched, and tracing it would read an indeterminate
	// value (the previous code did exactly that when tracing was enabled).
	KTRACE("posix_memalign(%p, %lu, %lu) -> %s, %p", _pointer, alignment,
		size, strerror(status), status == 0 ? *_pointer : NULL);
	return status;
}
// valloc(): allocate size bytes aligned to the system page size; errno
// handling comes from the memalign() wrapper.
extern "C" void*
valloc(size_t size)
{
	return memalign(B_PAGE_SIZE, size);
}
// realloc(): standard semantics -- realloc(NULL, size) behaves like
// malloc(size), and realloc(ptr, 0) frees the block and returns NULL.
// Sets errno to B_NO_MEMORY when the resize fails (the original block
// then remains valid).
extern "C" void*
realloc(void* ptr, size_t size)
{
	if (ptr == NULL)
		return malloc(size);

	if (size == 0) {
		free(ptr);
		return NULL;
	}

	void* addr = rprealloc(ptr, size);
	if (addr == NULL) {
		__set_errno(B_NO_MEMORY);
		KTRACE("realloc(%p, %lu) -> NULL", ptr, size);
		return NULL;
	}

	// Log under the public name "realloc": the success path previously said
	// "rprealloc", inconsistent with the failure path and the other wrappers.
	KTRACE("realloc(%p, %lu) -> %p", ptr, size, addr);
	return addr;
}
// malloc_usable_size(): number of usable bytes in the block, or 0 for NULL.
extern "C" size_t
malloc_usable_size(void *ptr)
{
	// NULL owns no bytes; otherwise ask the allocator.
	return ptr == NULL ? 0 : rpmalloc_usable_size(ptr);
}
// #pragma mark - BeOS specific extensions


// Legacy BeOS heap-statistics interface. Not wired up to rpmalloc yet;
// every field is always reported as zero.
struct mstats {
	size_t bytes_total;
	size_t chunks_used;
	size_t bytes_used;
	size_t chunks_free;
	size_t bytes_free;
};

extern "C" struct mstats mstats(void);

extern "C" struct mstats
mstats(void)
{
	// TODO: fill these in from rpmalloc_global_statistics().
	struct mstats stats = {0, 0, 0, 0, 0};
	return stats;
}