util/oslib-posix: Support MADV_POPULATE_WRITE for os_mem_prealloc()

Let's sense support and use it for preallocation. MADV_POPULATE_WRITE
does not require a SIGBUS handler, doesn't actually touch page content,
and avoids context switches; it is, therefore, faster and easier to handle
than our current approach.

While MADV_POPULATE_WRITE is, in general, faster than manual
prefaulting, and especially faster with 4k pages, there is still value in
prefaulting using multiple threads to speed up preallocation.

More details on MADV_POPULATE_WRITE can be found in the Linux commits
4ca9b3859dac ("mm/madvise: introduce MADV_POPULATE_(READ|WRITE) to prefault
page tables") and eb2faa513c24 ("mm/madvise: report SIGBUS as -EFAULT for
MADV_POPULATE_(READ|WRITE)"), and in the man page proposal [1].

This resolves the TODO in do_touch_pages().

In the future, we might want to look into using fallocate(), possibly
combined with MADV_POPULATE_READ, when dealing with shared file/fd
mappings and not caring about memory bindings.

[1] https://lkml.kernel.org/r/20210816081922.5155-1-david@redhat.com

Reviewed-by: Pankaj Gupta <pankaj.gupta@ionos.com>
Reviewed-by: Daniel P. Berrangé <berrange@redhat.com>
Reviewed-by: Michal Privoznik <mprivozn@redhat.com>
Signed-off-by: David Hildenbrand <david@redhat.com>
Message-Id: <20211217134611.31172-3-david@redhat.com>
Reviewed-by: Michael S. Tsirkin <mst@redhat.com>
Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
This commit is contained in:
David Hildenbrand 2021-12-17 14:46:05 +01:00 committed by Michael S. Tsirkin
parent 6c427ab926
commit a384bfa32e
2 changed files with 68 additions and 20 deletions

View File

@ -471,6 +471,11 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#else #else
#define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED #define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED
#endif #endif
/*
 * Map QEMU_MADV_POPULATE_WRITE to the host's MADV_POPULATE_WRITE when the
 * system headers define it; otherwise fall back to QEMU_MADV_INVALID.
 */
#ifdef MADV_POPULATE_WRITE
#define QEMU_MADV_POPULATE_WRITE MADV_POPULATE_WRITE
#else
#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#endif
#elif defined(CONFIG_POSIX_MADVISE) #elif defined(CONFIG_POSIX_MADVISE)
@ -484,6 +489,7 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED #define QEMU_MADV_REMOVE QEMU_MADV_DONTNEED
#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#else /* no-op */ #else /* no-op */
@ -497,6 +503,7 @@ static inline void qemu_cleanup_generic_vfree(void *p)
#define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID #define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
#define QEMU_MADV_REMOVE QEMU_MADV_INVALID #define QEMU_MADV_REMOVE QEMU_MADV_INVALID
#define QEMU_MADV_POPULATE_WRITE QEMU_MADV_INVALID
#endif #endif

View File

@ -484,10 +484,6 @@ static void *do_touch_pages(void *arg)
* *
* 'volatile' to stop compiler optimizing this away * 'volatile' to stop compiler optimizing this away
* to a no-op * to a no-op
*
* TODO: get a better solution from kernel so we
* don't need to write at all so we don't cause
* wear on the storage backing the region...
*/ */
*(volatile char *)addr = *addr; *(volatile char *)addr = *addr;
addr += hpagesize; addr += hpagesize;
@ -497,6 +493,26 @@ static void *do_touch_pages(void *arg)
return (void *)(uintptr_t)ret; return (void *)(uintptr_t)ret;
} }
/*
 * Thread worker that preallocates one slice of memory by issuing a single
 * QEMU_MADV_POPULATE_WRITE advice over it.
 *
 * @arg: pointer to the MemsetThread describing the slice (start address,
 *       number of pages and page size).
 *
 * Returns 0 on success or a negative errno value, packed into the returned
 * pointer via uintptr_t.
 */
static void *do_madv_populate_write_pages(void *arg)
{
    MemsetThread *thread = arg;
    char * const start = thread->addr;
    const size_t len = thread->numpages * thread->hpagesize;
    int err = 0;

    /* See do_touch_pages(). */
    qemu_mutex_lock(&page_mutex);
    while (!threads_created_flag) {
        qemu_cond_wait(&page_cond, &page_mutex);
    }
    qemu_mutex_unlock(&page_mutex);

    /* A slice without any pages needs no madvise call at all. */
    if (len != 0) {
        if (qemu_madvise(start, len, QEMU_MADV_POPULATE_WRITE) != 0) {
            err = -errno;
        }
    }

    return (void *)(uintptr_t)err;
}
static inline int get_memset_num_threads(int smp_cpus) static inline int get_memset_num_threads(int smp_cpus)
{ {
long host_procs = sysconf(_SC_NPROCESSORS_ONLN); long host_procs = sysconf(_SC_NPROCESSORS_ONLN);
@ -510,10 +526,11 @@ static inline int get_memset_num_threads(int smp_cpus)
} }
static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
int smp_cpus) int smp_cpus, bool use_madv_populate_write)
{ {
static gsize initialized = 0; static gsize initialized = 0;
size_t numpages_per_thread, leftover; size_t numpages_per_thread, leftover;
void *(*touch_fn)(void *);
int ret = 0, i = 0; int ret = 0, i = 0;
char *addr = area; char *addr = area;
@ -523,6 +540,12 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
g_once_init_leave(&initialized, 1); g_once_init_leave(&initialized, 1);
} }
if (use_madv_populate_write) {
touch_fn = do_madv_populate_write_pages;
} else {
touch_fn = do_touch_pages;
}
threads_created_flag = false; threads_created_flag = false;
memset_num_threads = get_memset_num_threads(smp_cpus); memset_num_threads = get_memset_num_threads(smp_cpus);
memset_thread = g_new0(MemsetThread, memset_num_threads); memset_thread = g_new0(MemsetThread, memset_num_threads);
@ -533,7 +556,7 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
memset_thread[i].numpages = numpages_per_thread + (i < leftover); memset_thread[i].numpages = numpages_per_thread + (i < leftover);
memset_thread[i].hpagesize = hpagesize; memset_thread[i].hpagesize = hpagesize;
qemu_thread_create(&memset_thread[i].pgthread, "touch_pages", qemu_thread_create(&memset_thread[i].pgthread, "touch_pages",
do_touch_pages, &memset_thread[i], touch_fn, &memset_thread[i],
QEMU_THREAD_JOINABLE); QEMU_THREAD_JOINABLE);
addr += memset_thread[i].numpages * hpagesize; addr += memset_thread[i].numpages * hpagesize;
} }
@ -556,6 +579,12 @@ static int touch_all_pages(char *area, size_t hpagesize, size_t numpages,
return ret; return ret;
} }
/*
 * Probe whether QEMU_MADV_POPULATE_WRITE can be used on @area by applying
 * it to the first @pagesize bytes.
 *
 * Returns true if the advice succeeded, or if it failed with any errno
 * other than EINVAL; returns false only when the failure is EINVAL.
 */
static bool madv_populate_write_possible(char *area, size_t pagesize)
{
    if (qemu_madvise(area, pagesize, QEMU_MADV_POPULATE_WRITE) == 0) {
        return true;
    }

    /* Only EINVAL is treated as "not usable here". */
    return errno != EINVAL;
}
void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
Error **errp) Error **errp)
{ {
@ -563,30 +592,42 @@ void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus,
struct sigaction act, oldact; struct sigaction act, oldact;
size_t hpagesize = qemu_fd_getpagesize(fd); size_t hpagesize = qemu_fd_getpagesize(fd);
size_t numpages = DIV_ROUND_UP(memory, hpagesize); size_t numpages = DIV_ROUND_UP(memory, hpagesize);
bool use_madv_populate_write;
memset(&act, 0, sizeof(act)); /*
act.sa_handler = &sigbus_handler; * Sense on every invocation, as MADV_POPULATE_WRITE cannot be used for
act.sa_flags = 0; * some special mappings, such as mapping /dev/mem.
*/
use_madv_populate_write = madv_populate_write_possible(area, hpagesize);
ret = sigaction(SIGBUS, &act, &oldact); if (!use_madv_populate_write) {
if (ret) { memset(&act, 0, sizeof(act));
error_setg_errno(errp, errno, act.sa_handler = &sigbus_handler;
"os_mem_prealloc: failed to install signal handler"); act.sa_flags = 0;
return;
ret = sigaction(SIGBUS, &act, &oldact);
if (ret) {
error_setg_errno(errp, errno,
"os_mem_prealloc: failed to install signal handler");
return;
}
} }
/* touch pages simultaneously */ /* touch pages simultaneously */
ret = touch_all_pages(area, hpagesize, numpages, smp_cpus); ret = touch_all_pages(area, hpagesize, numpages, smp_cpus,
use_madv_populate_write);
if (ret) { if (ret) {
error_setg_errno(errp, -ret, error_setg_errno(errp, -ret,
"os_mem_prealloc: preallocating memory failed"); "os_mem_prealloc: preallocating memory failed");
} }
ret = sigaction(SIGBUS, &oldact, NULL); if (!use_madv_populate_write) {
if (ret) { ret = sigaction(SIGBUS, &oldact, NULL);
/* Terminate QEMU since it can't recover from error */ if (ret) {
perror("os_mem_prealloc: failed to reinstall signal handler"); /* Terminate QEMU since it can't recover from error */
exit(1); perror("os_mem_prealloc: failed to reinstall signal handler");
exit(1);
}
} }
} }