* Michal Privoznik (mpriv...@redhat.com) wrote: > When allocating large amounts of memory the task is offloaded > onto threads. These threads then use various techniques to > allocate the memory fully (madvise(), writing into the memory). > However, these threads are free to run on any CPU, which becomes > problematic on NUMA machines because it may happen that a thread > is running on a distant node. > > Ideally, this is something that a management application would > resolve, but we are not anywhere close to that, Firstly, memory > allocation happens before monitor socket is even available. But > okay, that's what -preconfig is for. But then the problem is that > 'object-add' would not return until all memory is preallocated. > > Long story short, management application has no way of learning > TIDs of allocator threads so it can't make them run NUMA aware. > > But what we can do is to propagate the 'host-nodes' attribute of > MemoryBackend object down to where preallocation threads are > created and set their affinity according to the attribute.
Joe (cc'd) sent me some numbers for this which emphasise how useful it is: | On systems with 4 physical numa nodes and 2-6 Tb of memory, this numa-aware |preallocation provided about a 25% speedup in touching the pages. |The speedup gets larger as the numa node count and memory sizes grow. .... | In a simple parallel 1Gb page-zeroing test on a very large system (32-numa | nodes and 47Tb of memory), the numa-aware preallocation was 2.3X faster | than letting the threads float wherever. | We're working with someone whose large guest normally takes 4.5 hours to | boot. With Michal P's initial patch to parallelize the preallocation, that | time dropped to about 1 hour. Including this numa-aware preallocation | would reduce the guest boot time to less than 1/2 hour. so chopping *half an hour* off the startup time seems a worthy optimisation (even if most of us aren't fortunate enough to have 47T of ram). Dave > Resolves: https://bugzilla.redhat.com/show_bug.cgi?id=2074000 > Signed-off-by: Michal Privoznik <mpriv...@redhat.com> > --- > backends/hostmem.c | 6 ++-- > hw/virtio/virtio-mem.c | 2 +- > include/qemu/osdep.h | 2 ++ > util/meson.build | 2 +- > util/oslib-posix.c | 74 ++++++++++++++++++++++++++++++++++++++++-- > util/oslib-win32.c | 2 ++ > 6 files changed, 82 insertions(+), 6 deletions(-) > > diff --git a/backends/hostmem.c b/backends/hostmem.c > index a7bae3d713..7373472c7e 100644 > --- a/backends/hostmem.c > +++ b/backends/hostmem.c > @@ -232,7 +232,8 @@ static void host_memory_backend_set_prealloc(Object *obj, > bool value, > void *ptr = memory_region_get_ram_ptr(&backend->mr); > uint64_t sz = memory_region_size(&backend->mr); > > - os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, &local_err); > + os_mem_prealloc(fd, ptr, sz, backend->prealloc_threads, > + backend->host_nodes, MAX_NODES, &local_err); > if (local_err) { > error_propagate(errp, local_err); > return; > @@ -394,7 +395,8 @@ host_memory_backend_memory_complete(UserCreatable *uc, > Error **errp) > */ > if (backend->prealloc) { > os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz, > - backend->prealloc_threads, &local_err); > + backend->prealloc_threads, backend->host_nodes, > + MAX_NODES, &local_err); > if (local_err) { > goto out; > } > diff --git a/hw/virtio/virtio-mem.c b/hw/virtio/virtio-mem.c > index 5aca408726..48b104cdf6 100644 > --- a/hw/virtio/virtio-mem.c > +++ b/hw/virtio/virtio-mem.c > @@ -467,7 +467,7 @@ static int virtio_mem_set_block_state(VirtIOMEM *vmem, > uint64_t start_gpa, > int fd = memory_region_get_fd(&vmem->memdev->mr); > Error *local_err = NULL; > > - os_mem_prealloc(fd, area, size, 1, &local_err); > + os_mem_prealloc(fd, area, size, 1, NULL, 0, &local_err); > if (local_err) { > static bool warned; > > diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h > index 1c1e7eca98..474cbf3b86 100644 > --- a/include/qemu/osdep.h > +++ b/include/qemu/osdep.h > @@ -577,6 +577,8 @@ unsigned long qemu_getauxval(unsigned long type); > void qemu_set_tty_echo(int fd, bool echo); > > void os_mem_prealloc(int fd, char *area, size_t sz, int smp_cpus, > + const unsigned long *host_nodes, > + unsigned long max_node, > Error **errp); > > /** > diff --git a/util/meson.build b/util/meson.build > index 8f16018cd4..393ff74570 100644 > --- a/util/meson.build > +++ b/util/meson.build > @@ -15,7 +15,7 @@ freebsd_dep = [] > if targetos == 'freebsd' > freebsd_dep = util > endif > -util_ss.add(when: 'CONFIG_POSIX', if_true: [files('oslib-posix.c'), > freebsd_dep]) > +util_ss.add(when: 'CONFIG_POSIX', if_true: [files('oslib-posix.c'), > freebsd_dep, numa]) > util_ss.add(when: 'CONFIG_POSIX', if_true: files('qemu-thread-posix.c')) > util_ss.add(when: 'CONFIG_POSIX', if_true: files('memfd.c')) > util_ss.add(when: 'CONFIG_WIN32', if_true: files('aio-win32.c')) > diff --git a/util/oslib-posix.c b/util/oslib-posix.c > index 477990f39b..1572b9b178 100644 > --- a/util/oslib-posix.c > +++ b/util/oslib-posix.c > @@ -73,6 +73,10 @@ > #include "qemu/error-report.h" > #endif > > +#ifdef CONFIG_NUMA > +#include <numa.h> > +#endif > + > #define MAX_MEM_PREALLOC_THREAD_COUNT 16 > > struct MemsetThread; > @@ -82,6 +86,9 @@ typedef struct MemsetContext { > bool any_thread_failed; > struct MemsetThread *threads; > int num_threads; > +#ifdef CONFIG_NUMA > + struct bitmask *nodemask; > +#endif > } MemsetContext; > > struct MemsetThread { > @@ -420,6 +427,12 @@ static void *do_touch_pages(void *arg) > } > qemu_mutex_unlock(&page_mutex); > > +#ifdef CONFIG_NUMA > + if (memset_args->context->nodemask) { > + numa_run_on_node_mask(memset_args->context->nodemask); > + } > +#endif > + > /* unblock SIGBUS */ > sigemptyset(&set); > sigaddset(&set, SIGBUS); > @@ -463,6 +476,12 @@ static void *do_madv_populate_write_pages(void *arg) > } > qemu_mutex_unlock(&page_mutex); > > +#ifdef CONFIG_NUMA > + if (memset_args->context->nodemask) { > + numa_run_on_node_mask(memset_args->context->nodemask); > + } > +#endif > + > if (size && qemu_madvise(addr, size, QEMU_MADV_POPULATE_WRITE)) { > ret = -errno; > } > @@ -489,7 +508,9 @@ static inline int get_memset_num_threads(size_t > hpagesize, size_t numpages, > } > > static int touch_all_pages(char *area, size_t hpagesize, size_t numpages, > - int smp_cpus, bool use_madv_populate_write) > + int smp_cpus, const unsigned long *host_nodes, > + unsigned long max_node, > + bool use_madv_populate_write) > { > static gsize initialized = 0; > MemsetContext context = { > @@ -499,6 +520,7 @@ static int touch_all_pages(char *area, size_t hpagesize, > size_t numpages, > void *(*touch_fn)(void *); > int ret = 0, i = 0; > char *addr = area; > + unsigned long value = max_node; > > if (g_once_init_enter(&initialized)) { > qemu_mutex_init(&page_mutex); > @@ -520,6 +542,48 @@ static int touch_all_pages(char *area, size_t hpagesize, > size_t numpages, > touch_fn = do_touch_pages; > } > > + if (host_nodes) { > + value = find_first_bit(host_nodes, max_node); > + } > + if (value != max_node) { > +#ifdef CONFIG_NUMA > + struct bitmask *cpus = numa_allocate_cpumask(); > + g_autofree unsigned long *zerocpumask; > + size_t zerocpumasklen; > + g_autofree unsigned long *zeronodemask; > + size_t zeronodemasklen; > + > + context.nodemask = numa_bitmask_alloc(max_node); > + > + zerocpumasklen = cpus->size / sizeof(unsigned long); > + zerocpumask = g_new0(unsigned long, zerocpumasklen); > + > + for (; value != max_node; > + value = find_next_bit(host_nodes, max_node, value + 1)) { > + if (numa_node_to_cpus(value, cpus) || > + memcmp(cpus->maskp, zerocpumask, zerocpumasklen) == 0) > + continue; > + > + /* If given NUMA node has CPUs run threads on them. */ > + numa_bitmask_setbit(context.nodemask, value); > + } > + > + numa_bitmask_free(cpus); > + > + zeronodemasklen = max_node / sizeof(unsigned long); > + zeronodemask = g_new0(unsigned long, zeronodemasklen); > + > + if (memcmp(context.nodemask->maskp, > + zeronodemask, zeronodemasklen) == 0) { > + /* If no NUMA has a CPU available, then don't pin threads. */ > + g_clear_pointer(&context.nodemask, numa_bitmask_free); > + } > +#else > + errno = -EINVAL; > + return -1; > +#endif > + } > + > context.threads = g_new0(MemsetThread, context.num_threads); > numpages_per_thread = numpages / context.num_threads; > leftover = numpages % context.num_threads; > @@ -554,6 +618,10 @@ static int touch_all_pages(char *area, size_t hpagesize, > size_t numpages, > if (!use_madv_populate_write) { > sigbus_memset_context = NULL; > } > + > +#ifdef CONFIG_NUMA > + g_clear_pointer(&context.nodemask, numa_bitmask_free); > +#endif > g_free(context.threads); > > return ret; > @@ -566,6 +634,8 @@ static bool madv_populate_write_possible(char *area, > size_t pagesize) > } > > void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, > + const unsigned long *host_nodes, > + unsigned long max_node, > Error **errp) > { > static gsize initialized; > @@ -608,7 +678,7 @@ void os_mem_prealloc(int fd, char *area, size_t memory, > int smp_cpus, > > /* touch pages simultaneously */ > ret = touch_all_pages(area, hpagesize, numpages, smp_cpus, > - use_madv_populate_write); > + host_nodes, max_node, use_madv_populate_write); > if (ret) { > error_setg_errno(errp, -ret, > "os_mem_prealloc: preallocating memory failed"); > diff --git a/util/oslib-win32.c b/util/oslib-win32.c > index dafef4f157..6efd912355 100644 > --- a/util/oslib-win32.c > +++ b/util/oslib-win32.c > @@ -314,6 +314,8 @@ int getpagesize(void) > } > > void os_mem_prealloc(int fd, char *area, size_t memory, int smp_cpus, > + const unsigned long *host_nodes, > + unsigned long max_node, > Error **errp) > { > int i; > -- > 2.35.1 > > -- Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK