On Tue, 24 Jun 2014 10:40:38 -0700 Nishanth Aravamudan <n...@linux.vnet.ibm.com> wrote:
> Sparse node numbering occurs on powerpc in practice under PowerVM. In > order to emulate the same NUMA topology under qemu, the assumption that > NUMA nodes are linearly ordered has to be removed. qemu actually already > supports (inasmuch as it doesn't throw an error) sparse node numbering > by the end-user, but the information is effectively ignored and the > nodes are populated linearly starting at 0. This means a user's node ID > requests are not honored in the Linux kernel, and that the topology may > not be as requested (in terms of the CPU/memory placement). > > Add a present field to NodeInfo which indicates if a given nodeid was > present on the command-line or not. Current code relies on a memory > value being passed for a node to indicate presence, which is > insufficient in the presence of memoryless nodes or sparse numbering. > Adjust the iteration of various NUMA loops to use the maximum known NUMA > ID rather than the number of NUMA nodes. > > numa.c::set_numa_nodes() has become a bit more convoluted for > round-robin'ing the CPUs over known nodes when not specified by the > user. > > Note that architecture-specific code still possibly needs changes > (forthcoming) for both sparse node numbering and memoryless nodes. This > only puts in the generic infrastructure to support that. 
> > Examples: > > (1) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=3 -numa > node,nodeid=2 -smp 16 > > Before: > > node 0 cpus: 0 2 4 6 8 10 12 14 > node 0 size: 2048 MB > node 1 cpus: 1 3 5 7 9 11 13 15 > node 1 size: 2048 MB > > After: > > node 2 cpus: 0 2 4 6 8 10 12 14 > | > node 2 size: 2048 MB > | > node 3 cpus: 1 3 5 7 9 11 13 15 > | > node 3 size: 2048 MB > > (2) qemu-system-x86_64 -enable-kvm -m 4096 -numa node,nodeid=0 -numa > node,nodeid=1 -numa node,nodeid=2 -numa node,nodeid=3 -smp 16 > > node 0 cpus: 0 4 8 12 > | > node 0 size: 1024 MB > | > node 1 cpus: 1 5 9 13 > | > node 1 size: 1024 MB > | > node 2 cpus: 2 6 10 14 > | > node 2 size: 1024 MB > | > node 3 cpus: 3 7 11 15 > | > node 3 size: 1024 MB > > (qemu) info numa > 4 nodes > node 0 cpus: 0 4 8 12 > node 0 size: 1024 MB > node 1 cpus: 1 5 9 13 > node 1 size: 1024 MB > node 2 cpus: 2 6 10 14 > node 2 size: 1024 MB > node 3 cpus: 3 7 11 15 > node 3 size: 1024 MB > > Signed-off-by: Nishanth Aravamudan <n...@linux.vnet.ibm.com> > Tested-by: Hu Tao <hu...@cn.fujitsu.com> > > --- > Based off mst's for_upstream tag, which has the NodeInfo changes. > > v1 -> v2: > Modify set_numa_nodes loop for round-robin'ing CPUs. > > v2 -> v3: > Updated changelog to indicate problem being solved. > Updated memory_region_allocate_system_memory based upon feedback from > Hu. > Updated set_numa_nodes loop again to be simpler based upon feedback > from Hu. > Fixed bug with a mix of nodes with nodeid specified and without, where > the same nodeid would be used by the explicit specification and the > auto-numbering code. 
> > diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h > index 277230d..b90bf66 100644 > --- a/include/sysemu/sysemu.h > +++ b/include/sysemu/sysemu.h > @@ -145,11 +145,13 @@ extern int mem_prealloc; > */ > #define MAX_CPUMASK_BITS 255 > > -extern int nb_numa_nodes; > +extern int nb_numa_nodes; /* Number of NUMA nodes */ > +extern int max_numa_node; /* Highest specified NUMA node ID */ > typedef struct node_info { > uint64_t node_mem; > DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); > struct HostMemoryBackend *node_memdev; > + bool present; How about dropping 'present' and replacing the array with a list of only the present nodes? That way it will be one step closer to converting the numa infrastructure to a set of QOM objects. > } NodeInfo; > extern NodeInfo numa_info[MAX_NODES]; > void set_numa_nodes(void); > diff --git a/monitor.c b/monitor.c > index c7f8797..4721996 100644 > --- a/monitor.c > +++ b/monitor.c > @@ -2003,7 +2003,10 @@ static void do_info_numa(Monitor *mon, const QDict > *qdict) > CPUState *cpu; > > monitor_printf(mon, "%d nodes\n", nb_numa_nodes); > - for (i = 0; i < nb_numa_nodes; i++) { > + for (i = 0; i < max_numa_node; i++) { > + if (!numa_info[i].present) { > + continue; > + } > monitor_printf(mon, "node %d cpus:", i); > CPU_FOREACH(cpu) { > if (cpu->numa_node == i) { > diff --git a/numa.c b/numa.c > index e471afe..d6b86ab 100644 > --- a/numa.c > +++ b/numa.c > @@ -53,7 +53,10 @@ static void numa_node_parse(NumaNodeOptions *node, > QemuOpts *opts, Error **errp) > if (node->has_nodeid) { > nodenr = node->nodeid; > } else { > - nodenr = nb_numa_nodes; > + nodenr = 0; > + while (numa_info[nodenr].present) { > + nodenr++; > + } > } > > if (nodenr >= MAX_NODES) { > @@ -106,6 +109,10 @@ static void numa_node_parse(NumaNodeOptions *node, > QemuOpts *opts, Error **errp) > numa_info[nodenr].node_mem = object_property_get_int(o, "size", > NULL); > numa_info[nodenr].node_memdev = MEMORY_BACKEND(o); > } > + numa_info[nodenr].present = true; > + if (nodenr >= max_numa_node) { > + max_numa_node = nodenr + 1; > + } > }
> > int numa_init_func(QemuOpts *opts, void *opaque) > @@ -155,7 +162,7 @@ void set_numa_nodes(void) > { > if (nb_numa_nodes > 0) { > uint64_t numa_total; > - int i; > + int i, j = -1; > > if (nb_numa_nodes > MAX_NODES) { > nb_numa_nodes = MAX_NODES; > @@ -164,27 +171,29 @@ void set_numa_nodes(void) > /* If no memory size if given for any node, assume the default case > * and distribute the available memory equally across all nodes > */ > - for (i = 0; i < nb_numa_nodes; i++) { > - if (numa_info[i].node_mem != 0) { > + for (i = 0; i < max_numa_node; i++) { > + if (numa_info[i].present && numa_info[i].node_mem != 0) { > break; > } > } > - if (i == nb_numa_nodes) { > + if (i == max_numa_node) { > uint64_t usedmem = 0; > > /* On Linux, the each node's border has to be 8MB aligned, > * the final node gets the rest. > */ > - for (i = 0; i < nb_numa_nodes - 1; i++) { > - numa_info[i].node_mem = (ram_size / nb_numa_nodes) & > - ~((1 << 23UL) - 1); > - usedmem += numa_info[i].node_mem; > + for (i = 0; i < max_numa_node - 1; i++) { > + if (numa_info[i].present) { > + numa_info[i].node_mem = (ram_size / nb_numa_nodes) & > + ~((1 << 23UL) - 1); > + usedmem += numa_info[i].node_mem; > + } > } > numa_info[i].node_mem = ram_size - usedmem; > } > > numa_total = 0; > - for (i = 0; i < nb_numa_nodes; i++) { > + for (i = 0; i < max_numa_node; i++) { > numa_total += numa_info[i].node_mem; > } > if (numa_total != ram_size) { > @@ -194,8 +203,9 @@ void set_numa_nodes(void) > exit(1); > } > > - for (i = 0; i < nb_numa_nodes; i++) { > - if (!bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) { > + for (i = 0; i < max_numa_node; i++) { > + if (numa_info[i].present && > + !bitmap_empty(numa_info[i].node_cpu, MAX_CPUMASK_BITS)) { > break; > } > } > @@ -203,9 +213,12 @@ void set_numa_nodes(void) > * must cope with this anyway, because there are BIOSes out there in > * real machines which also use this scheme. 
> */ > - if (i == nb_numa_nodes) { > + if (i == max_numa_node) { > for (i = 0; i < max_cpus; i++) { > - set_bit(i, numa_info[i % nb_numa_nodes].node_cpu); > + do { > + j = (j + 1) % max_numa_node; > + } while (!numa_info[j].present); > + set_bit(i, numa_info[j].node_cpu); > } > } > } > @@ -217,8 +230,9 @@ void set_numa_modes(void) > int i; > > CPU_FOREACH(cpu) { > - for (i = 0; i < nb_numa_nodes; i++) { > - if (test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { > + for (i = 0; i < max_numa_node; i++) { > + if (numa_info[i].present && > + test_bit(cpu->cpu_index, numa_info[i].node_cpu)) { > cpu->numa_node = i; > } > } > @@ -266,10 +280,13 @@ void memory_region_allocate_system_memory(MemoryRegion > *mr, Object *owner, > } > > memory_region_init(mr, owner, name, ram_size); > - for (i = 0; i < MAX_NODES; i++) { > + for (i = 0; i < max_numa_node; i++) { > Error *local_err = NULL; > uint64_t size = numa_info[i].node_mem; > HostMemoryBackend *backend = numa_info[i].node_memdev; > + if (!numa_info[i].present) { > + continue; > + } > if (!backend) { > continue; > } > diff --git a/vl.c b/vl.c > index 54b4627..e1a6ab8 100644 > --- a/vl.c > +++ b/vl.c > @@ -195,6 +195,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = > QTAILQ_HEAD_INITIALIZER(fw_boot_order); > > int nb_numa_nodes; > +int max_numa_node; > NodeInfo numa_info[MAX_NODES]; > > uint8_t qemu_uuid[16]; > @@ -2967,10 +2968,12 @@ int main(int argc, char **argv, char **envp) > > for (i = 0; i < MAX_NODES; i++) { > numa_info[i].node_mem = 0; > + numa_info[i].present = false; > bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); > } > > nb_numa_nodes = 0; > + max_numa_node = -1; > nb_nics = 0; > > bdrv_init_with_whitelist(); > >