On Mon, May 19, 2014 at 08:34:54PM -0300, Eduardo Habkost wrote: > On Tue, May 06, 2014 at 05:27:46PM +0800, Hu Tao wrote: > [...] > > @@ -203,6 +296,20 @@ host_memory_backend_memory_init(UserCreatable *uc, > > Error **errp) > > if (backend->prealloc) { > > os_mem_prealloc(memory_region_get_fd(&backend->mr), ptr, sz); > > } > > + > > +#ifdef CONFIG_NUMA > > + unsigned long maxnode = find_last_bit(backend->host_nodes, MAX_NODES); > > + > > + /* This is a workaround for a long standing bug in Linux' > > + * mbind implementation, which cuts off the last specified > > + * node. > > + */ > > What if the bug is fixed? mbind() documentation says "nodemask points to
No it won't, otherwise softwares depend on mbind() will break. > a bit mask of nodes containing up to maxnode bits", so we must ensure > backend->host_nodes has the one extra bit. Yes. > > Also, if no bit is set, we can pass nodemask=NULL or maxnode=0 as > argument. > > We could address both issues, and do this: > > struct HostMemoryBackend { [...] > DECLARE_BITMAP(host_nodes, MAX_NODES + 1); > [...] > lastbit = find_last_bit(backend->host_nodes, MAX_NODES); > /* lastbit == MAX_NODES means maxnode=0 */ > maxnode = (lastbit + 1) % (MAX_NODES + 1); > /* We can have up to MAX_NODES nodes, but we need to pass maxnode+1 > * as argument to mbind() due to an old Linux bug (feature?) which > * cuts off the last specified node. This means backend->host_nodes > * must have MAX_NODES+1 bits available. > */ > assert(sizeof(backend->host_nodes) >= BITS_TO_LONGS(MAX_NODES + 1) * > sizeof(unsigned long)); > assert(maxnode <= MAX_NODES); I think we can just omit these two asserts since they are guaranteed to be true. > mbind(ptr, sz, policy, maxnode ? backend->host_nodes : NULL, maxnode + 1, > flags); > > > (I am starting to wonder if it was worth dropping the libnuma > requirement and implementing our own mbind()-calling code.) > > > + if (mbind(ptr, sz, backend->policy, backend->host_nodes, maxnode + 2, > > 0)) { > > + error_setg_errno(errp, errno, > > + "cannot bind memory to host NUMA nodes"); > > Don't we want to set flags to MPOL_MF_STRICT here? I believe we > shouldn't have any pages preallocated at this point, but in case we do, > I would expect them to be moved instead of ignoring the policy set by > the user. MPOL_MF_STRICT | MPOL_MF_MOVE to move. Actually in this version the preallocation happens before mbind, which is fixed in v3.2. > > > + return; > > + } > > +#endif > > } > > > > MemoryRegion * > > diff --git a/include/sysemu/hostmem.h b/include/sysemu/hostmem.h > > index 819b72d..4e96298 100644 > > --- a/include/sysemu/hostmem.h > > +++ b/include/sysemu/hostmem.h > > @@ -12,8 +12,10 @@ > > #ifndef QEMU_HOSTMEM_H > > #define QEMU_HOSTMEM_H > > > > +#include "sysemu/sysemu.h" /* for MAX_NODES */ > > #include "qom/object.h" > > #include "exec/memory.h" > > +#include "qemu/bitmap.h" > > > > #define TYPE_MEMORY_BACKEND "memory" > > #define MEMORY_BACKEND(obj) \ > > @@ -52,6 +54,8 @@ struct HostMemoryBackend { > > uint64_t size; > > bool merge, dump; > > bool prealloc, force_prealloc; > > + DECLARE_BITMAP(host_nodes, MAX_NODES); > > + HostMemPolicy policy; > > > > MemoryRegion mr; > > }; > > diff --git a/qapi-schema.json b/qapi-schema.json > > index 5dd30eb..bea3476 100644 > > --- a/qapi-schema.json > > +++ b/qapi-schema.json > > @@ -4732,3 +4732,23 @@ > > '*cpus': ['uint16'], > > '*mem': 'size', > > '*memdev': 'str' }} > > + > > +## > > +# @HostMemPolicy > > +# > > +# Host memory policy types > > +# > > +# @default: restore default policy, remove any nondefault policy > > +# > > +# @preferred: set the preferred host nodes for allocation > > +# > > +# @bind: a strict policy that restricts memory allocation to the > > +# host nodes specified > > +# > > +# @interleave: memory allocations are interleaved across the set > > +# of host nodes specified > > +# > > +# Since 2.1 > > +## > > +{ 'enum': 'HostMemPolicy', > > + 'data': [ 'default', 'preferred', 'bind', 'interleave' ] } > > -- > > 1.8.5.2.229.g4448466 > > > > > > -- > Eduardo