Re: [Qemu-devel] [PATCH V17 00/11] Add support for binding guest numa nodes to host numa nodes
On 12/06/2013 05:06 PM, Paolo Bonzini wrote: > I think patches 1-4 and 7 are fine. For the rest, I'd rather wait for > Igor's patches and try to integrate with Igor's memory hotplug patches. So, how about applying them first, and then I can help Igor rebase my remaining patches for him? Thanks, Wanlong Gao > > Paolo >
[Qemu-devel] [PATCH V17 11/11] NUMA: convert hmp command info_numa to use qmp command query_numa
Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- hmp.c | 57 + hmp.h | 1 + monitor.c | 21 + 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..d6dedd2 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,10 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1564,3 +1568,56 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMANodeList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMANodeList(node_list); +} diff --git a/hmp.h b/hmp.h index 54cf71f..4f8d39b 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index b97b7d3..f747a48 100644 --- a/monitor.c +++ b/monitor.c @@ -1989,25 +1989,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -CPU_FOREACH(cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2775,7 +2756,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.5
[Qemu-devel] [PATCH V17 05/11] NUMA: introduce NumaMemOptions
Signed-off-by: Wanlong Gao --- qapi-schema.json | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index db539b6..1043e57 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4223,7 +4223,8 @@ ## { 'union': 'NumaOptions', 'data': { -'node': 'NumaNodeOptions' }} +'node': 'NumaNodeOptions', +'mem' : 'NumaMemOptions' }} ## # @NumaNodeOptions @@ -4243,3 +4244,19 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.5
[Qemu-devel] [PATCH V17 08/11] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +++ numa.c | 18 ++ qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 807619e..82f1447 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -139,6 +139,9 @@ extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index c676c5e..da4dbbd 100644 --- a/numa.c +++ b/numa.c @@ -78,6 +78,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -96,6 +97,23 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +fprintf(stderr, "qemu: node number %" PRIu16 " is bigger than %d\n", +nodes->value, MAX_NODES); +continue; +} +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index 1043e57..c0dad81 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4246,6 +4246,26 @@ '*mem':'str' }} ## 
+# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -4254,9 +4274,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index 064b821..95d03f5 100644 --- a/vl.c +++ b/vl.c @@ -2815,6 +2815,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.5
[Qemu-devel] [PATCH V17 03/11] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 12 include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 12c436e..74c1f16 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -670,14 +670,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * @@ -1072,8 +1072,12 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, guest_info->apic_id_limit = pc_apic_id_limit(max_cpus); guest_info->apic_xrupt_override = kvm_allows_irq0_override(); guest_info->numa_nodes = nb_numa_nodes; -guest_info->node_mem = g_memdup(node_mem, guest_info->numa_nodes * +guest_info->node_mem = g_malloc0(guest_info->numa_nodes * sizeof *guest_info->node_mem); +for (i = 0; i < nb_numa_nodes; i++) { +guest_info->node_mem[i] = numa_info[i].node_mem; +} + guest_info->node_cpu = g_malloc0(guest_info->apic_id_limit * sizeof *guest_info->node_cpu); @@ -1081,7 +1085,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < guest_info->apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { guest_info->node_cpu[apic_id] = j; break; 
} diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 2509649..d873b42 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -134,8 +135,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 845f608..b97b7d3 100644 --- a/monitor.c +++ b/monitor.c @@ -2004,7 +2004,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index beda80e..1bc0fad 100644 --- a/numa.c +++ b/numa.c @@ -61,7 +61,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } -bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -101,7 +101,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { -node_mem[nodenr] = 0; +numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -109,7 +109,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } -node_mem[nodenr] = sval; +numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -134,7 +134,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */
[Qemu-devel] [PATCH V17 04/11] NUMA: convert -numa option to use OptsVisitor
Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +- numa.c | 148 +++- qapi-schema.json| 30 ++ vl.c| 11 +++- 4 files changed, 114 insertions(+), 78 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index d873b42..20b05a3 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -140,9 +140,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index 1bc0fad..c4fa665 100644 --- a/numa.c +++ b/numa.c @@ -24,101 +24,97 @@ */ #include "sysemu/sysemu.h" - -static void numa_node_parse_cpus(int nodenr, const char *cpus) +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) { -char *endptr; -unsigned long long value, endvalue; - -/* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. 
- */ -if (!*cpus) { -return; -} +uint16_t nodenr; +uint16List *cpus = NULL; -if (parse_uint(cpus, &value, &endptr, 10) < 0) { -goto error; -} -if (*endptr == '-') { -if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { -goto error; -} -} else if (*endptr == '\0') { -endvalue = value; +if (opts->has_nodeid) { +nodenr = opts->nodeid; } else { -goto error; +nodenr = nb_numa_nodes; } -if (endvalue >= MAX_CPUMASK_BITS) { -endvalue = MAX_CPUMASK_BITS - 1; -fprintf(stderr, -"qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; } -if (endvalue < value) { -goto error; +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +if (cpus->value > MAX_CPUMASK_BITS) { +fprintf(stderr, "qemu: cpu number %" PRIu16 " is bigger than %d", +cpus->value, MAX_CPUMASK_BITS); +continue; +} +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } -bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); -return; +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +numa_info[nodenr].node_mem = mem_size; +} -error: -fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); -exit(1); +return 0; } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { -char option[128]; -char *endptr; -unsigned long long nodenr; - -optarg = get_opt_name(option, 128, optarg, ','); -if (*optarg == ',') { -optarg++; +NumaOptions *object = NULL; +Error *err = NULL; +int ret = 0; + +{ +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); +opts_visitor_cleanup(ov); } -if (!strcmp(option, "node")) { - -if (nb_numa_nodes >= MAX_NODES) { -fprintf(stderr, "qemu: too many NUMA nodes\n"); -exit(1); -} -if (get_param_value(option, 128, 
"nodeid", optarg) == 0) { -nodenr = nb_numa_nodes; -} else { -if (parse_uint_full(option, &nodenr, 10) < 0) { -fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); -exit(1); -} -} - -if (nodenr >= MAX_NODES) { -fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); -exit(1); -} +if (error_is_set(&err)) { +fprintf(stderr, "qemu: %s\n", error_get_pretty(err)); +error_free(err); +ret = -1; +goto er
[Qemu-devel] [PATCH V17 07/11] NUMA: expand MAX_NODES from 64 to 128
libnuma chose 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 291aa6a..807619e 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -132,7 +132,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -- 1.8.5
[Qemu-devel] [PATCH V17 06/11] NUMA: add "-numa mem," options
Add "-numa mem," option like following as Paolo suggested: -numa mem,nodeid=0,size=1G This new option will make later coming memory hotplug better. We will use the new options to specify nodes memory info, and just remain "-numa node,mem=xx" as legacy. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 1 + numa.c | 36 qemu-options.hx | 6 -- vl.c| 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 20b05a3..291aa6a 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -135,6 +135,7 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); diff --git a/numa.c b/numa.c index c4fa665..c676c5e 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,31 @@ static int numa_node_parse(NumaNodeOptions *opts) return 0; } +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +} else { +nodenr = nb_numa_mem_nodes; +} + +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} + +if (opts->has_size) { +mem_size = opts->size; +numa_info[nodenr].node_mem = mem_size; +} + +return 0; +} + int numa_init_func(QemuOpts *opts, void *opaque) { NumaOptions *object = NULL; @@ -101,6 +126,13 @@ int numa_init_func(QemuOpts *opts, void *opaque) } nb_numa_nodes++; break; +case NUMA_OPTIONS_KIND_MEM: +ret = numa_mem_parse(object->mem); +if (ret) { +goto error; +} +nb_numa_mem_nodes++; +break; default: fprintf(stderr, "qemu: Invalid NUMA options type.\n"); ret = -1; @@ -119,6 +151,10 @@ error: void set_numa_nodes(void) { +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + if (nb_numa_nodes > 0) { int i; diff --git a/qemu-options.hx 
b/qemu-options.hx index 8b94264..e6afb6f 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,11 +95,13 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, -"-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) +"-numa node[,nodeid=node][,cpus=cpu[-cpu]]\n" +"-numa mem[,nodeid=node][,size=size]\n" +, QEMU_ARCH_ALL) STEXI @item -numa @var{opts} @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources +Simulate a multi node NUMA system. If @var{size} and @var{cpus} are omitted, resources are split equally. ETEXI diff --git a/vl.c b/vl.c index e67f34a..064b821 100644 --- a/vl.c +++ b/vl.c @@ -250,6 +250,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; +int nb_numa_mem_nodes; NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; @@ -2817,6 +2818,7 @@ int main(int argc, char **argv, char **envp) } nb_numa_nodes = 0; +nb_numa_mem_nodes = 0; nb_nics = 0; bdrv_init_with_whitelist(); -- 1.8.5
[Qemu-devel] [PATCH V17 09/11] NUMA: set guest numa nodes memory policy
Set the guest numa nodes memory policies using the mbind(2) system call node by node. After this patch, we are able to set guest nodes memory policies through the QEMU options; this aims to solve the guest cross nodes memory access performance issue. And as you all know, if PCI-passthrough is used, direct-attached-device uses DMA transfer between device and qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with direct-attached-device, the page count of every guest page will be +1 and any page migration will not work. AutoNUMA won't work either. So, we should set the guest nodes memory allocation policies before the pages are really mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c | 9 + include/exec/memory.h | 15 numa.c| 99 +++ 3 files changed, 123 insertions(+) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 74c1f16..07553f2 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1178,6 +1178,10 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, memory_region_init_alias(ram_below_4g, NULL, "ram-below-4g", ram, 0, below_4g_mem_size); memory_region_add_subregion(system_memory, 0, ram_below_4g); +if (memory_region_set_mem_policy(ram_below_4g, 0, below_4g_mem_size, 0)) { +fprintf(stderr, "qemu: set below 4g memory policy failed\n"); +exit(1); +} e820_add_entry(0, below_4g_mem_size, E820_RAM); if (above_4g_mem_size > 0) { ram_above_4g = g_malloc(sizeof(*ram_above_4g)); @@ -1185,6 +1189,11 @@ FWCfgState *pc_memory_init(MemoryRegion *system_memory, below_4g_mem_size, above_4g_mem_size); memory_region_add_subregion(system_memory, 0x1ULL, ram_above_4g); +if (memory_region_set_mem_policy(ram_above_4g, 0, above_4g_mem_size, + below_4g_mem_size)) { +fprintf(stderr, "qemu: set above 4g memory policy failed\n"); +exit(1); +} e820_add_entry(0x1ULL, above_4g_mem_size, E820_RAM); } diff 
--git a/include/exec/memory.h b/include/exec/memory.h index 480dfbf..33de50a 100644 --- a/include/exec/memory.h +++ b/include/exec/memory.h @@ -905,6 +905,21 @@ void memory_region_transaction_begin(void); void memory_region_transaction_commit(void); /** + * memory_region_set_mem_policy: Set memory policy + * + * Set the memory policy for the specified area. + * + * @mr: a MemoryRegion we are setting memory policy for + * @start: the start offset of the specific region in this MemoryRegion + * @length: the specific memory area length + * @offset: the start offset of the specific area in NUMA setting + */ +int memory_region_set_mem_policy(MemoryRegion *mr, + ram_addr_t start, + ram_addr_t length, + ram_addr_t offset); + +/** * memory_listener_register: register callbacks to be called when memory * sections are mapped or unmapped into an address * space diff --git a/numa.c b/numa.c index da4dbbd..43bba42 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,95 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ? 
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} + +static int node_set_mem_policy(void *ram_ptr, ram_addr_t length, int nodeid) +{ +int bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To s
[Qemu-devel] [PATCH V17 00/11] Add support for binding guest numa nodes to host numa nodes
1->V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) V12->V13: rebase to current master thanks for Luiz's review (Luiz) doc hmp command set-mem-policy (Luiz) rename: NUMAInfo -> NUMANode (Luiz) V13->V14: remove "set-mem-policy" qmp and hmp commands (Marcelo, Paolo) V14->V15: rebase to the current master V15->V16: rebase to current master add more test log V16->V17: use MemoryRegion to set policy instead of using "pc.ram" (Paolo) Wanlong Gao (11): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add "-numa mem," options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp.c | 57 +++ hmp.h | 1 + hw/i386/pc.c| 21 ++- include/exec/memory.h | 15 ++ include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 ++- monitor.c | 21 +-- numa.c | 408 qapi-schema.json| 112 + qemu-options.hx | 6 +- qmp-commands.hx | 49 ++ vl.c| 160 +++ 14 files changed, 698 insertions(+), 187 deletions(-) create mode 100644 numa.c -- 1.8.5
[Qemu-devel] [PATCH V17 02/11] NUMA: check if the total numa memory size is equal to ram_size
If the total memory assigned to the numa nodes is not equal to the assigned ram size, QEMU will write wrong data to the ACPI table; the guest will then ignore the invalid ACPI table and assign all memory to one node. This is a bug, so we should check the total to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- numa.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index ce7736a..beda80e 100644 --- a/numa.c +++ b/numa.c @@ -150,6 +150,16 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.5
[Qemu-devel] [PATCH V17 10/11] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- numa.c | 66 qapi-schema.json | 36 +++ qmp-commands.hx | 49 + 3 files changed, 151 insertions(+) diff --git a/numa.c b/numa.c index 43bba42..2954709 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -340,3 +341,68 @@ void set_numa_modes(void) } } } + +NUMANodeList *qmp_query_numa(Error **errp) +{ +NUMANodeList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMANodeList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +CPU_FOREACH(cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json 
b/qapi-schema.json index c0dad81..af947e2 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4289,3 +4289,39 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @NUMANode: +# +# Information of guest NUMA node +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMANode', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMANode for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMANode'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..c2bc508 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3295,3 +3295,52 @@ Example (2): <- { "return": {} } EQMP + +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-arry of contained VCPUs +- "memory": amount of memory in each node in Byte (json-int) +- "policy": memory policy of this node (json-string) +- "relative": if host nodes is relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "excute": "query
[Qemu-devel] [PATCH V17 01/11] NUMA: move numa related code to new file numa.c
Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- cpus.c | 14 include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 182 vl.c| 139 +--- 6 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index af6ac7e..0197c17 100644 --- a/Makefile.target +++ b/Makefile.target @@ -109,7 +109,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index 01d128d..53360b0 100644 --- a/cpus.c +++ b/cpus.c @@ -1297,20 +1297,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -CPU_FOREACH(cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 495dae8..2509649 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -136,6 +136,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 
000..ce7736a --- /dev/null +++ b/numa.c @@ -0,0 +1,182 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. + * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ +char *endptr; +unsigned long long value, endvalue; + +/* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. 
+ */ +if (!*cpus) { +return; +} + +if (parse_uint(cpus, &value, &endptr, 10) < 0) { +goto error; +} +if (*endptr == '-') { +if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { +goto error; +} +} else if (*endptr == '\0') { +endvalue = value; +} else { +goto error; +} + +if (endvalue >= MAX_CPUMASK_BITS) { +endvalue = MAX_CPUMASK_BITS - 1; +fprintf(stderr, +"qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); +} + +if (endvalue < value) { +goto error; +} + +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +return; + +error: +fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); +exit(1); +} + +void numa_add(const char *optarg) +{ +char option[128]; +char *endptr; +unsigned long long nodenr; + +optarg = get_opt_name(option, 128, optarg, ','); +if (*optarg == ',') { +
Re: [Qemu-devel] [PATCH V16 09/11] NUMA: set guest numa nodes memory policy
On 11/27/2013 10:35 PM, Paolo Bonzini wrote: > Il 22/11/2013 03:08, Wanlong Gao ha scritto: >> +static int set_node_mem_policy(int nodeid) >> +{ >> +#ifdef __linux__ >> +void *ram_ptr; >> +RAMBlock *block; >> +ram_addr_t len, ram_offset = 0; >> +int bind_mode; >> +int i; >> + >> +QTAILQ_FOREACH(block, &ram_list.blocks, next) { >> +if (!strcmp(block->mr->name, "pc.ram")) { > > This is not acceptable, "pc.ram" is a board-specific name. > > I think instead set_node_mem_policy could be something like > > int memory_region_set_mem_policy(MemoryRegion *mr, > uint64_t start, uint64_t length, > uint64_t offset); > > that applies the NUMA policies specified for [offset, offset+length) to > the host physical address [ptr+start, ptr+start+length), where ptr is > memory_region_get_ram_ptr(mr). > > Each board then can call the function after it adds RAM with > memory_region_add_subregion. Got it, thank you. Thanks, Wanlong Gao > > Paolo > >> +break; >> +} >> +} >> + >> +if (block->host == NULL) { >> +return -1; >> +} >> + >> +ram_ptr = block->host; >> +for (i = 0; i < nodeid; i++) { >> +len = numa_info[i].node_mem; >> +ram_offset += len; >> +} >> + >> +len = numa_info[nodeid].node_mem; >> +bind_mode = node_parse_bind_mode(nodeid); >> +unsigned long *nodes = numa_info[nodeid].host_mem; >> + >> +/* This is a workaround for a long standing bug in Linux' >> + * mbind implementation, which cuts off the last specified >> + * node. To stay compatible should this bug be fixed, we >> + * specify one more node and zero this one out. >> + */ >> +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); >> +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, >> +nodes, maxnode + 2, 0)) { >> +perror("mbind"); >> +return -1; >> +} > > Also, it's still not clear to me why we're not using libnuma. 
> >> +#endif >> + >> +return 0; >> +} >> + >> void set_numa_modes(void) >> { >> CPUState *cpu; >> @@ -240,4 +319,11 @@ void set_numa_modes(void) >> } >> } >> } >> + >> +for (i = 0; i < nb_numa_nodes; i++) { >> +if (set_node_mem_policy(i) == -1) { >> +fprintf(stderr, >> +"qemu: can not set host memory policy for node%d\n", i); >> +} >> +} >> } >> > >
[Qemu-devel] [PATCH V16 07/11] NUMA: expand MAX_NODES from 64 to 128
libnuma chose 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 291aa6a..807619e 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -132,7 +132,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -- 1.8.5.rc3
[Qemu-devel] [PATCH V16 09/11] NUMA: set guest numa nodes memory policy
Set the guest numa nodes memory policies using the mbind(2) system call node by node. After this patch, we are able to set guest nodes memory policies through the QEMU options, this aims to solve the guest cross nodes memory access performance issue. And as you all know, if PCI-passthrough is used, direct-attached-device uses DMA transfer between device and qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with direct-attached-device, all guest page's page count will be +1 and any page migration will not work. AutoNUMA won't work either. So, we should set the guest nodes memory allocation policies before the pages are really mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- numa.c | 86 ++ 1 file changed, 86 insertions(+) diff --git a/numa.c b/numa.c index da4dbbd..915a67a 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ? 
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ +void *ram_ptr; +RAMBlock *block; +ram_addr_t len, ram_offset = 0; +int bind_mode; +int i; + +QTAILQ_FOREACH(block, &ram_list.blocks, next) { +if (!strcmp(block->mr->name, "pc.ram")) { +break; +} +} + +if (block->host == NULL) { +return -1; +} + +ram_ptr = block->host; +for (i = 0; i < nodeid; i++) { +len = numa_info[i].node_mem; +ram_offset += len; +} + +len = numa_info[nodeid].node_mem; +bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, +nodes, maxnode + 2, 0)) { +perror("mbind"); +return -1; +} +#endif + +return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -240,4 +319,11 @@ void set_numa_modes(void) } } } + +for (i = 0; i < nb_numa_nodes; i++) { +if (set_node_mem_policy(i) == -1) { +fprintf(stderr, +"qemu: can not set host memory policy for node%d\n", i); +} +} } -- 1.8.5.rc3
[Qemu-devel] [PATCH] fixup
Signed-off-by: Wanlong Gao --- hw/i386/pc.c | 8 ++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 50ed4cc..74c1f16 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -1072,8 +1072,12 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, guest_info->apic_id_limit = pc_apic_id_limit(max_cpus); guest_info->apic_xrupt_override = kvm_allows_irq0_override(); guest_info->numa_nodes = nb_numa_nodes; -guest_info->node_mem = g_memdup(node_mem, guest_info->numa_nodes * +guest_info->node_mem = g_malloc0(guest_info->numa_nodes * sizeof *guest_info->node_mem); +for (i = 0; i < nb_numa_nodes; i++) { +guest_info->node_mem[i] = numa_info[i].node_mem; +} + guest_info->node_cpu = g_malloc0(guest_info->apic_id_limit * sizeof *guest_info->node_cpu); @@ -1081,7 +1085,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < guest_info->apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { guest_info->node_cpu[apic_id] = j; break; } -- 1.8.5.rc0.44.gf26f72d
[Qemu-devel] [PATCH V16 01/11] NUMA: move numa related code to new file numa.c
Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- cpus.c | 14 include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 182 vl.c| 139 +--- 6 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index af6ac7e..0197c17 100644 --- a/Makefile.target +++ b/Makefile.target @@ -109,7 +109,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index 01d128d..53360b0 100644 --- a/cpus.c +++ b/cpus.c @@ -1297,20 +1297,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -CPU_FOREACH(cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 495dae8..2509649 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -136,6 +136,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 
000..ce7736a --- /dev/null +++ b/numa.c @@ -0,0 +1,182 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. + * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ +char *endptr; +unsigned long long value, endvalue; + +/* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. 
+ */ +if (!*cpus) { +return; +} + +if (parse_uint(cpus, &value, &endptr, 10) < 0) { +goto error; +} +if (*endptr == '-') { +if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { +goto error; +} +} else if (*endptr == '\0') { +endvalue = value; +} else { +goto error; +} + +if (endvalue >= MAX_CPUMASK_BITS) { +endvalue = MAX_CPUMASK_BITS - 1; +fprintf(stderr, +"qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); +} + +if (endvalue < value) { +goto error; +} + +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +return; + +error: +fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); +exit(1); +} + +void numa_add(const char *optarg) +{ +char option[128]; +char *endptr; +unsigned long long nodenr; + +optarg = get_opt_name(option, 128, optarg, ','); +if (*optarg == ',') { +
[Qemu-devel] [PATCH V16 11/11] NUMA: convert hmp command info_numa to use qmp command query_numa
Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- hmp.c | 57 + hmp.h | 1 + monitor.c | 21 + 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..d6dedd2 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,10 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1564,3 +1568,56 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMANodeList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMANodeList(node_list); +} diff --git a/hmp.h b/hmp.h index 54cf71f..4f8d39b 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index b97b7d3..f747a48 100644 --- a/monitor.c +++ b/monitor.c @@ -1989,25 +1989,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -CPU_FOREACH(cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2775,7 +2756,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.5.rc3
Re: [Qemu-devel] [PATCH] fixup
Sorry, please ignore this patch. Thanks, Wanlong Gao > Signed-off-by: Wanlong Gao > --- > hw/i386/pc.c | 8 ++-- > 1 file changed, 6 insertions(+), 2 deletions(-) > > diff --git a/hw/i386/pc.c b/hw/i386/pc.c > index 50ed4cc..74c1f16 100644 > --- a/hw/i386/pc.c > +++ b/hw/i386/pc.c > @@ -1072,8 +1072,12 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t > below_4g_mem_size, > guest_info->apic_id_limit = pc_apic_id_limit(max_cpus); > guest_info->apic_xrupt_override = kvm_allows_irq0_override(); > guest_info->numa_nodes = nb_numa_nodes; > -guest_info->node_mem = g_memdup(node_mem, guest_info->numa_nodes * > +guest_info->node_mem = g_malloc0(guest_info->numa_nodes * > sizeof *guest_info->node_mem); > +for (i = 0; i < nb_numa_nodes; i++) { > +guest_info->node_mem[i] = numa_info[i].node_mem; > +} > + > guest_info->node_cpu = g_malloc0(guest_info->apic_id_limit * > sizeof *guest_info->node_cpu); > > @@ -1081,7 +1085,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t > below_4g_mem_size, > unsigned int apic_id = x86_cpu_apic_id_from_index(i); > assert(apic_id < guest_info->apic_id_limit); > for (j = 0; j < nb_numa_nodes; j++) { > -if (test_bit(i, node_cpumask[j])) { > +if (test_bit(i, numa_info[j].node_cpu)) { > guest_info->node_cpu[apic_id] = j; > break; > } >
[Qemu-devel] [PATCH V16 02/11] NUMA: check if the total numa memory size is equal to ram_size
If the total memory assigned to the numa nodes is not equal to the assigned ram size, QEMU will write wrong data to the ACPI table; the guest will then ignore the wrong ACPI table and assign all memory to one node. This is a bug, so we should check it to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- numa.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index ce7736a..beda80e 100644 --- a/numa.c +++ b/numa.c @@ -150,6 +150,16 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.5.rc3
[Qemu-devel] [PATCH V16 10/11] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- numa.c | 66 qapi-schema.json | 36 +++ qmp-commands.hx | 49 + 3 files changed, 151 insertions(+) diff --git a/numa.c b/numa.c index 915a67a..b392190 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -327,3 +328,68 @@ void set_numa_modes(void) } } } + +NUMANodeList *qmp_query_numa(Error **errp) +{ +NUMANodeList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMANodeList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +CPU_FOREACH(cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json 
b/qapi-schema.json index c0dad81..af947e2 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4289,3 +4289,39 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @NUMANode: +# +# Information of guest NUMA node +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMANode', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMANode for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMANode'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..c2bc508 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3295,3 +3295,52 @@ Example (2): <- { "return": {} } EQMP + +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-arry of contained VCPUs +- "memory": amount of memory in each node in Byte (json-int) +- "policy": memory policy of this node (json-string) +- "relative": if host nodes is relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "excute": "query
[Qemu-devel] [PATCH V16 04/11] NUMA: convert -numa option to use OptsVisitor
Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +- numa.c | 148 +++- qapi-schema.json| 30 ++ vl.c| 11 +++- 4 files changed, 114 insertions(+), 78 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index d873b42..20b05a3 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -140,9 +140,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index 1bc0fad..c4fa665 100644 --- a/numa.c +++ b/numa.c @@ -24,101 +24,97 @@ */ #include "sysemu/sysemu.h" - -static void numa_node_parse_cpus(int nodenr, const char *cpus) +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) { -char *endptr; -unsigned long long value, endvalue; - -/* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. 
- */ -if (!*cpus) { -return; -} +uint16_t nodenr; +uint16List *cpus = NULL; -if (parse_uint(cpus, &value, &endptr, 10) < 0) { -goto error; -} -if (*endptr == '-') { -if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { -goto error; -} -} else if (*endptr == '\0') { -endvalue = value; +if (opts->has_nodeid) { +nodenr = opts->nodeid; } else { -goto error; +nodenr = nb_numa_nodes; } -if (endvalue >= MAX_CPUMASK_BITS) { -endvalue = MAX_CPUMASK_BITS - 1; -fprintf(stderr, -"qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; } -if (endvalue < value) { -goto error; +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +if (cpus->value > MAX_CPUMASK_BITS) { +fprintf(stderr, "qemu: cpu number %" PRIu16 " is bigger than %d", +cpus->value, MAX_CPUMASK_BITS); +continue; +} +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } -bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); -return; +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +numa_info[nodenr].node_mem = mem_size; +} -error: -fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); -exit(1); +return 0; } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { -char option[128]; -char *endptr; -unsigned long long nodenr; - -optarg = get_opt_name(option, 128, optarg, ','); -if (*optarg == ',') { -optarg++; +NumaOptions *object = NULL; +Error *err = NULL; +int ret = 0; + +{ +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); +opts_visitor_cleanup(ov); } -if (!strcmp(option, "node")) { - -if (nb_numa_nodes >= MAX_NODES) { -fprintf(stderr, "qemu: too many NUMA nodes\n"); -exit(1); -} -if (get_param_value(option, 128, 
"nodeid", optarg) == 0) { -nodenr = nb_numa_nodes; -} else { -if (parse_uint_full(option, &nodenr, 10) < 0) { -fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); -exit(1); -} -} - -if (nodenr >= MAX_NODES) { -fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); -exit(1); -} +if (error_is_set(&err)) { +fprintf(stderr, "qemu: %s\n", error_get_pretty(err)); +error_free(err); +ret = -1; +goto er
[Qemu-devel] [PATCH V16 03/11] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 12 include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 12c436e..74c1f16 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -670,14 +670,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * @@ -1072,8 +1072,12 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, guest_info->apic_id_limit = pc_apic_id_limit(max_cpus); guest_info->apic_xrupt_override = kvm_allows_irq0_override(); guest_info->numa_nodes = nb_numa_nodes; -guest_info->node_mem = g_memdup(node_mem, guest_info->numa_nodes * +guest_info->node_mem = g_malloc0(guest_info->numa_nodes * sizeof *guest_info->node_mem); +for (i = 0; i < nb_numa_nodes; i++) { +guest_info->node_mem[i] = numa_info[i].node_mem; +} + guest_info->node_cpu = g_malloc0(guest_info->apic_id_limit * sizeof *guest_info->node_cpu); @@ -1081,7 +1085,7 @@ PcGuestInfo *pc_guest_info_init(ram_addr_t below_4g_mem_size, unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < guest_info->apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { guest_info->node_cpu[apic_id] = j; break; 
} diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 2509649..d873b42 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -134,8 +135,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 845f608..b97b7d3 100644 --- a/monitor.c +++ b/monitor.c @@ -2004,7 +2004,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index beda80e..1bc0fad 100644 --- a/numa.c +++ b/numa.c @@ -61,7 +61,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } -bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -101,7 +101,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { -node_mem[nodenr] = 0; +numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -109,7 +109,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } -node_mem[nodenr] = sval; +numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -134,7 +134,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */
[Qemu-devel] [PATCH V16 06/11] NUMA: add "-numa mem," options
Add "-numa mem," option like following as Paolo suggested: -numa mem,nodeid=0,size=1G This new option will make later coming memory hotplug better. We will use the new options to specify nodes memory info, and just remain "-numa node,mem=xx" as legacy. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 1 + numa.c | 36 qemu-options.hx | 6 -- vl.c| 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 20b05a3..291aa6a 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -135,6 +135,7 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); diff --git a/numa.c b/numa.c index c4fa665..c676c5e 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,31 @@ static int numa_node_parse(NumaNodeOptions *opts) return 0; } +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +} else { +nodenr = nb_numa_mem_nodes; +} + +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} + +if (opts->has_size) { +mem_size = opts->size; +numa_info[nodenr].node_mem = mem_size; +} + +return 0; +} + int numa_init_func(QemuOpts *opts, void *opaque) { NumaOptions *object = NULL; @@ -101,6 +126,13 @@ int numa_init_func(QemuOpts *opts, void *opaque) } nb_numa_nodes++; break; +case NUMA_OPTIONS_KIND_MEM: +ret = numa_mem_parse(object->mem); +if (ret) { +goto error; +} +nb_numa_mem_nodes++; +break; default: fprintf(stderr, "qemu: Invalid NUMA options type.\n"); ret = -1; @@ -119,6 +151,10 @@ error: void set_numa_nodes(void) { +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + if (nb_numa_nodes > 0) { int i; diff --git a/qemu-options.hx 
b/qemu-options.hx index 8b94264..e6afb6f 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,11 +95,13 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, -"-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) +"-numa node[,nodeid=node][,cpus=cpu[-cpu]]\n" +"-numa mem[,nodeid=node][,size=size]\n" +, QEMU_ARCH_ALL) STEXI @item -numa @var{opts} @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources +Simulate a multi node NUMA system. If @var{size} and @var{cpus} are omitted, resources are split equally. ETEXI diff --git a/vl.c b/vl.c index e67f34a..064b821 100644 --- a/vl.c +++ b/vl.c @@ -250,6 +250,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; +int nb_numa_mem_nodes; NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; @@ -2817,6 +2818,7 @@ int main(int argc, char **argv, char **envp) } nb_numa_nodes = 0; +nb_numa_mem_nodes = 0; nb_nics = 0; bdrv_init_with_whitelist(); -- 1.8.5.rc3
[Qemu-devel] [PATCH V16 00/11] Add support for binding guest numa nodes to host numa nodes
V11->V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) V12->V13: rebase to current master thanks for Luiz's review (Luiz) doc hmp command set-mem-policy (Luiz) rename: NUMAInfo -> NUMANode (Luiz) V13->V14: remove "set-mem-policy" qmp and hmp commands (Marcelo, Paolo) V14->V15: rebase to the current master V15->V16: rebase to current master add more test log Wanlong Gao (11): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add "-numa mem," options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp.c | 57 +++ hmp.h | 1 + hw/i386/pc.c| 12 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 ++- monitor.c | 21 +-- numa.c | 395 qapi-schema.json| 112 ++ qemu-options.hx | 6 +- qmp-commands.hx | 49 ++ vl.c| 160 +++- 13 files changed, 661 insertions(+), 187 deletions(-) create mode 100644 numa.c -- 1.8.5.rc3
[Qemu-devel] [PATCH V16 05/11] NUMA: introduce NumaMemOptions
Signed-off-by: Wanlong Gao --- qapi-schema.json | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index db539b6..1043e57 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4223,7 +4223,8 @@ ## { 'union': 'NumaOptions', 'data': { -'node': 'NumaNodeOptions' }} +'node': 'NumaNodeOptions', +'mem' : 'NumaMemOptions' }} ## # @NumaNodeOptions @@ -4243,3 +4244,19 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.5.rc3
[Qemu-devel] [PATCH V16 08/11] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +++ numa.c | 18 ++ qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 807619e..82f1447 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -139,6 +139,9 @@ extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index c676c5e..da4dbbd 100644 --- a/numa.c +++ b/numa.c @@ -78,6 +78,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -96,6 +97,23 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +fprintf(stderr, "qemu: node number %" PRIu16 " is bigger than %d\n", +nodes->value, MAX_NODES); +continue; +} +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index 1043e57..c0dad81 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4246,6 +4246,26 @@ '*mem':'str' }} ## 
+# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -4254,9 +4274,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index 064b821..95d03f5 100644 --- a/vl.c +++ b/vl.c @@ -2815,6 +2815,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.5.rc3
[Qemu-devel] [PATCH V15 05/11] NUMA: introduce NumaMemOptions
Signed-off-by: Wanlong Gao --- qapi-schema.json | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index b7f0b15..a19e453 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4198,7 +4198,8 @@ ## { 'union': 'NumaOptions', 'data': { -'node': 'NumaNodeOptions' }} +'node': 'NumaNodeOptions', +'mem' : 'NumaMemOptions' }} ## # @NumaNodeOptions @@ -4218,3 +4219,19 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 11/11] NUMA: convert hmp command info_numa to use qmp command query_numa
Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- hmp.c | 57 + hmp.h | 1 + monitor.c | 21 + 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index 32ee285..d6dedd2 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,10 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1564,3 +1568,56 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMANodeList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMANodeList(node_list); +} diff --git a/hmp.h b/hmp.h index 54cf71f..4f8d39b 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index be34488..ce9dfe7 100644 --- a/monitor.c +++ b/monitor.c @@ -1995,25 +1995,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -CPU_FOREACH(cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2781,7 +2762,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 07/11] NUMA: expand MAX_NODES from 64 to 128
libnuma chose 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 861cd77..995cf3b 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -132,7 +132,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 09/11] NUMA: set guest numa nodes memory policy
Set the guest numa nodes memory policies using the mbind(2) system call node by node. After this patch, we are able to set guest nodes memory policies through the QEMU options, which aims to solve the guest cross nodes memory access performance issue. And as you all know, if PCI-passthrough is used, direct-attached-device uses DMA transfer between device and qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with direct-attached-device, all guest page's page count will be +1 and any page migration will not work. AutoNUMA won't work either. So, we should set the guest nodes memory allocation policies before the pages are really mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- numa.c | 86 ++ 1 file changed, 86 insertions(+) diff --git a/numa.c b/numa.c index da4dbbd..915a67a 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ? 
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ +void *ram_ptr; +RAMBlock *block; +ram_addr_t len, ram_offset = 0; +int bind_mode; +int i; + +QTAILQ_FOREACH(block, &ram_list.blocks, next) { +if (!strcmp(block->mr->name, "pc.ram")) { +break; +} +} + +if (block->host == NULL) { +return -1; +} + +ram_ptr = block->host; +for (i = 0; i < nodeid; i++) { +len = numa_info[i].node_mem; +ram_offset += len; +} + +len = numa_info[nodeid].node_mem; +bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, +nodes, maxnode + 2, 0)) { +perror("mbind"); +return -1; +} +#endif + +return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -240,4 +319,11 @@ void set_numa_modes(void) } } } + +for (i = 0; i < nb_numa_nodes; i++) { +if (set_node_mem_policy(i) == -1) { +fprintf(stderr, +"qemu: can not set host memory policy for node%d\n", i); +} +} } -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 10/11] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- numa.c | 66 qapi-schema.json | 36 +++ qmp-commands.hx | 49 + 3 files changed, 151 insertions(+) diff --git a/numa.c b/numa.c index 915a67a..b392190 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -327,3 +328,68 @@ void set_numa_modes(void) } } } + +NUMANodeList *qmp_query_numa(Error **errp) +{ +NUMANodeList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMANodeList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +CPU_FOREACH(cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json 
b/qapi-schema.json index 804d44a..86e001e 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4264,3 +4264,39 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @NUMANode: +# +# Information of guest NUMA node +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMANode', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMANode for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMANode'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index fba15cd..c2bc508 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3295,3 +3295,52 @@ Example (2): <- { "return": {} } EQMP + +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-arry of contained VCPUs +- "memory": amount of memory in each node in Byte (json-int) +- "policy": memory policy of this node (json-string) +- "relative": if host nodes is relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "excute": "query
[Qemu-devel] [PATCH V15 06/11] NUMA: add "-numa mem," options
Add "-numa mem," option like following as Paolo suggested: -numa mem,nodeid=0,size=1G This new option will make later coming memory hotplug better. We will use the new options to specify nodes memory info, and just remain "-numa node,mem=xx" as legacy. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 1 + numa.c | 36 qemu-options.hx | 6 -- vl.c| 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index c9fb2c7..861cd77 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -135,6 +135,7 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); diff --git a/numa.c b/numa.c index c4fa665..c676c5e 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,31 @@ static int numa_node_parse(NumaNodeOptions *opts) return 0; } +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +} else { +nodenr = nb_numa_mem_nodes; +} + +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} + +if (opts->has_size) { +mem_size = opts->size; +numa_info[nodenr].node_mem = mem_size; +} + +return 0; +} + int numa_init_func(QemuOpts *opts, void *opaque) { NumaOptions *object = NULL; @@ -101,6 +126,13 @@ int numa_init_func(QemuOpts *opts, void *opaque) } nb_numa_nodes++; break; +case NUMA_OPTIONS_KIND_MEM: +ret = numa_mem_parse(object->mem); +if (ret) { +goto error; +} +nb_numa_mem_nodes++; +break; default: fprintf(stderr, "qemu: Invalid NUMA options type.\n"); ret = -1; @@ -119,6 +151,10 @@ error: void set_numa_nodes(void) { +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + if (nb_numa_nodes > 0) { int i; diff --git a/qemu-options.hx 
b/qemu-options.hx index 5dc8b75..98fa25d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,11 +95,13 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, -"-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) +"-numa node[,nodeid=node][,cpus=cpu[-cpu]]\n" +"-numa mem[,nodeid=node][,size=size]\n" +, QEMU_ARCH_ALL) STEXI @item -numa @var{opts} @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources +Simulate a multi node NUMA system. If @var{size} and @var{cpus} are omitted, resources are split equally. ETEXI diff --git a/vl.c b/vl.c index d55d717..190d5d7 100644 --- a/vl.c +++ b/vl.c @@ -250,6 +250,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; +int nb_numa_mem_nodes; NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; @@ -2816,6 +2817,7 @@ int main(int argc, char **argv, char **envp) } nb_numa_nodes = 0; +nb_numa_mem_nodes = 0; nb_nics = 0; bdrv_init_with_whitelist(); -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 04/11] NUMA: convert -numa option to use OptsVisitor
Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +- numa.c | 148 +++- qapi-schema.json| 30 ++ vl.c| 11 +++- 4 files changed, 114 insertions(+), 78 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 3f3764d..c9fb2c7 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -140,9 +140,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index 1bc0fad..c4fa665 100644 --- a/numa.c +++ b/numa.c @@ -24,101 +24,97 @@ */ #include "sysemu/sysemu.h" - -static void numa_node_parse_cpus(int nodenr, const char *cpus) +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) { -char *endptr; -unsigned long long value, endvalue; - -/* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. 
- */ -if (!*cpus) { -return; -} +uint16_t nodenr; +uint16List *cpus = NULL; -if (parse_uint(cpus, &value, &endptr, 10) < 0) { -goto error; -} -if (*endptr == '-') { -if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { -goto error; -} -} else if (*endptr == '\0') { -endvalue = value; +if (opts->has_nodeid) { +nodenr = opts->nodeid; } else { -goto error; +nodenr = nb_numa_nodes; } -if (endvalue >= MAX_CPUMASK_BITS) { -endvalue = MAX_CPUMASK_BITS - 1; -fprintf(stderr, -"qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; } -if (endvalue < value) { -goto error; +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +if (cpus->value > MAX_CPUMASK_BITS) { +fprintf(stderr, "qemu: cpu number %" PRIu16 " is bigger than %d", +cpus->value, MAX_CPUMASK_BITS); +continue; +} +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } -bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); -return; +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +numa_info[nodenr].node_mem = mem_size; +} -error: -fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); -exit(1); +return 0; } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { -char option[128]; -char *endptr; -unsigned long long nodenr; - -optarg = get_opt_name(option, 128, optarg, ','); -if (*optarg == ',') { -optarg++; +NumaOptions *object = NULL; +Error *err = NULL; +int ret = 0; + +{ +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); +opts_visitor_cleanup(ov); } -if (!strcmp(option, "node")) { - -if (nb_numa_nodes >= MAX_NODES) { -fprintf(stderr, "qemu: too many NUMA nodes\n"); -exit(1); -} -if (get_param_value(option, 128, 
"nodeid", optarg) == 0) { -nodenr = nb_numa_nodes; -} else { -if (parse_uint_full(option, &nodenr, 10) < 0) { -fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); -exit(1); -} -} - -if (nodenr >= MAX_NODES) { -fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); -exit(1); -} +if (error_is_set(&err)) { +fprintf(stderr, "qemu: %s\n", error_get_pretty(err)); +error_free(err); +ret = -1; +goto er
[Qemu-devel] [PATCH V15 03/11] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 4 ++-- include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0c313fe..b0fddd0 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -652,14 +652,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index e58ef3f..3f3764d 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -134,8 +135,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 74f3f1b..be34488 100644 --- a/monitor.c +++ b/monitor.c @@ -2010,7 +2010,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } 
monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index beda80e..1bc0fad 100644 --- a/numa.c +++ b/numa.c @@ -61,7 +61,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } -bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -101,7 +101,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { -node_mem[nodenr] = 0; +numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -109,7 +109,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } -node_mem[nodenr] = sval; +numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -134,7 +134,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */ for (i = 0; i < nb_numa_nodes; i++) { -if (node_mem[i] != 0) +if (numa_info[i].node_mem != 0) break; } if (i == nb_numa_nodes) { @@ -144,15 +144,16 @@ void set_numa_nodes(void) * the final node gets the rest. */ for (i = 0; i < nb_numa_nodes - 1; i++) { -node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); -usedmem += node_mem[i]; +numa_info[i].node_mem = (ram_size / nb_numa_nodes) & +~((1 << 23UL) - 1); +usedmem += numa_info[i].node_mem; } -node_mem[i] = ram_size - usedmem; +numa_info[i].node_mem = ram_size - usedmem; } uint64_t numa_total = 0; for (i = 0; i < nb_numa_nodes; i++) { -numa_total += node_mem[i]; +numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { fprintf(stderr, "qemu: numa nodes total memory size " @@ -161,7 +162,7 @@ void set_numa_nodes(void) } for (i = 0; i
[Qemu-devel] [PATCH V15 00/11] Add support for binding guest numa nodes to host numa nodes
As you know, QEMU currently cannot direct its memory allocation, which may cause a performance regression for guest cross-node memory access. Worse, if PCI passthrough is used, the direct-attached device uses DMA transfers between the device and the qemu process, so all pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page is incremented by one and page migration will not work; AutoNUMA will not work either. Therefore, we should set the guest nodes' memory allocation policy before the pages are actually mapped. With this patch set, we are able to set the guest nodes' memory policy like the following: -numa node,nodeid=0,cpus=0 \ -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ -numa node,nodeid=1,cpus=1 \ -numa mem,size=1024M,policy=interleave,host-nodes=1 The policy is specified in a "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N" like format. This series also adds a QMP command "query-numa" to show NUMA information, and converts the "info numa" monitor command to use this QMP command "query-numa". This version temporarily removes the "set-mem-policy" QMP and HMP commands, as Marcelo and Paolo suggested. 
V1->V2: change to use QemuOpts in numa options (Paolo) handle Error in mpol parser (Paolo) change qmp command format to mem-policy=membind,mem-hostnode=0-1 like (Paolo) V2->V3: also handle Error in cpus parser (5/10) split out common parser from cpus and hostnode parser (Bandan 6/10) V3-V4: rebase to request for comments V4->V5: use OptVisitor and split -numa option (Paolo) - s/set-mpol/set-mem-policy (Andreas) - s/mem-policy/policy - s/mem-hostnode/host-nodes fix hmp command process after error (Luiz) add qmp command query-numa and convert info numa to it (Luiz) V5->V6: remove tabs in json file (Laszlo, Paolo) add back "-numa node,mem=xxx" as legacy (Paolo) change cpus and host-nodes to array (Laszlo, Eric) change "nodeid" to "uint16" add NumaMemPolicy enum type (Eric) rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for repeating options" patch set, thanks for Laszlo's help V6-V7: change UInt16 to uint16 (Laszlo) fix a typo in adding qmp command set-mem-policy V7-V8: rebase to current master with Laszlo's V2 of OptsVisitor patch set fix an adding white space line error V8->V9: rebase to current master check if total numa memory size is equal to ram_size (Paolo) add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) replace the use of numa_num_configured_nodes() (Andrew) avoid abusing the fact i==nodeid (Andrew) V9->V10: rebase to current master remove libnuma (Andrew) MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) remove a useless clear_bit() operation (Andrew) V10->V11: rebase to current master fix "maxnode" argument of mbind(2) V11->V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) V12->V13: rebase to current master thanks for Luiz's review (Luiz) doc hmp command set-mem-policy (Luiz) rename: NUMAInfo -> NUMANode (Luiz) V13->V14: remove 
"set-mem-policy" qmp and hmp commands (Marcelo, Paolo) V14->V15: rebase to the current master *I hope this can catch up the train of 1.7.* Wanlong Gao (11): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add "-numa mem," options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp.c | 57 +++ hmp.h | 1 + hw/i386/pc.c| 4 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 ++- monitor.c | 21 +-- numa.c | 395 qapi-schema.json| 112 ++ qemu-options.hx | 6 +- qmp-commands.hx | 49 ++ vl.c| 160 +++- 13 files changed, 655 insertions(+), 185 deletions(-) create mode 100644 numa.c -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 01/11] NUMA: move numa related code to new file numa.c
Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- cpus.c | 14 include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 182 vl.c| 139 +--- 6 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index af6ac7e..0197c17 100644 --- a/Makefile.target +++ b/Makefile.target @@ -109,7 +109,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index 398229e..473f655 100644 --- a/cpus.c +++ b/cpus.c @@ -1295,20 +1295,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -CPU_FOREACH(cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index cd5791e..e58ef3f 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -136,6 +136,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 
000..ce7736a --- /dev/null +++ b/numa.c @@ -0,0 +1,182 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. + * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ +char *endptr; +unsigned long long value, endvalue; + +/* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. 
+ */ +if (!*cpus) { +return; +} + +if (parse_uint(cpus, &value, &endptr, 10) < 0) { +goto error; +} +if (*endptr == '-') { +if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { +goto error; +} +} else if (*endptr == '\0') { +endvalue = value; +} else { +goto error; +} + +if (endvalue >= MAX_CPUMASK_BITS) { +endvalue = MAX_CPUMASK_BITS - 1; +fprintf(stderr, +"qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); +} + +if (endvalue < value) { +goto error; +} + +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +return; + +error: +fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); +exit(1); +} + +void numa_add(const char *optarg) +{ +char option[128]; +char *endptr; +unsigned long long nodenr; + +optarg = get_opt_name(option, 128, optarg, ','); +if (*optarg == ',') { +
[Qemu-devel] [PATCH V15 08/11] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +++ numa.c | 18 ++ qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 995cf3b..9707195 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -139,6 +139,9 @@ extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index c676c5e..da4dbbd 100644 --- a/numa.c +++ b/numa.c @@ -78,6 +78,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -96,6 +97,23 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +fprintf(stderr, "qemu: node number %" PRIu16 " is bigger than %d\n", +nodes->value, MAX_NODES); +continue; +} +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index a19e453..804d44a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -4221,6 +4221,26 @@ '*mem':'str' }} ## 
+# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -4229,9 +4249,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index 190d5d7..0d7a663 100644 --- a/vl.c +++ b/vl.c @@ -2814,6 +2814,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.4.1.600.g3d092bf
[Qemu-devel] [PATCH V15 02/11] NUMA: check if the total numa memory size is equal to ram_size
If the total memory assigned to the NUMA nodes is not equal to the assigned RAM size, wrong data will be written to the ACPI table; the guest will then ignore the wrong ACPI table and assign all memory to one node. This is a bug, so we should check the total to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- numa.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index ce7736a..beda80e 100644 --- a/numa.c +++ b/numa.c @@ -150,6 +150,16 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.4.1.600.g3d092bf
Re: [Qemu-devel] About the migration_set_speed in the qemu monitor
On 10/28/2013 01:30 PM, Yaodong Yang wrote: > Hi all, > > When we migrate a vm from one host to another, we set the " migrate_set_speed > 200" inside the qemu monitor. What does the 200 means? Is it the maximum > migration speed is 200MB/s or something else? > This means the bandwidth limit is 200 bytes. Thanks, Wanlong Gao > Thanks! >
Re: [Qemu-devel] [PATCH V14 00/11] Add support for binding guest numa nodes to host numa nodes
Hi folks, Any more comments? Thanks, Wanlong Gao > As you know, QEMU can't direct it's memory allocation now, this may cause > guest cross node access performance regression. > And, the worse thing is that if PCI-passthrough is used, > direct-attached-device uses DMA transfer between device and qemu process. > All pages of the guest will be pinned by get_user_pages(). > > KVM_ASSIGN_PCI_DEVICE ioctl > kvm_vm_ioctl_assign_device() > =>kvm_assign_device() > => kvm_iommu_map_memslots() > => kvm_iommu_map_pages() >=> kvm_pin_pages() > > So, with direct-attached-device, all guest page's page count will be +1 and > any page migration will not work. AutoNUMA won't too. > > So, we should set the guest nodes memory allocation policy before > the pages are really mapped. > > According to this patch set, we are able to set guest nodes memory policy > like following: > > -numa node,nodeid=0,cpus=0, \ > -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ > -numa node,nodeid=1,cpus=1 \ > -numa mem,size=1024M,policy=interleave,host-nodes=1 > > This supports > "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N" > like format. > > And add a QMP command "query-numa" to show numa info through > this API. > > And convert the "info numa" monitor command to use this > QMP command "query-numa". > > This version removes "set-mem-policy" qmp and hmp commands temporarily > as Marcelo and Paolo suggested. 
> > V1->V2: > change to use QemuOpts in numa options (Paolo) > handle Error in mpol parser (Paolo) > change qmp command format to mem-policy=membind,mem-hostnode=0-1 like > (Paolo) > V2->V3: > also handle Error in cpus parser (5/10) > split out common parser from cpus and hostnode parser (Bandan 6/10) > V3-V4: > rebase to request for comments > V4->V5: > use OptVisitor and split -numa option (Paolo) > - s/set-mpol/set-mem-policy (Andreas) > - s/mem-policy/policy > - s/mem-hostnode/host-nodes > fix hmp command process after error (Luiz) > add qmp command query-numa and convert info numa to it (Luiz) > V5->V6: > remove tabs in json file (Laszlo, Paolo) > add back "-numa node,mem=xxx" as legacy (Paolo) > change cpus and host-nodes to array (Laszlo, Eric) > change "nodeid" to "uint16" > add NumaMemPolicy enum type (Eric) > rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for > repeating options" patch set, thanks for Laszlo's help > V6-V7: > change UInt16 to uint16 (Laszlo) > fix a typo in adding qmp command set-mem-policy > V7-V8: > rebase to current master with Laszlo's V2 of OptsVisitor patch set > fix an adding white space line error > V8->V9: > rebase to current master > check if total numa memory size is equal to ram_size (Paolo) > add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) > replace the use of numa_num_configured_nodes() (Andrew) > avoid abusing the fact i==nodeid (Andrew) > V9->V10: > rebase to current master > remove libnuma (Andrew) > MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) > use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) > remove a useless clear_bit() operation (Andrew) > V10->V11: > rebase to current master > fix "maxnode" argument of mbind(2) > V11->V12: > rebase to current master > split patch 02/11 of V11 (Eduardo) > add some max value check (Eduardo) > split MAX_NODES change patch (Eduardo) > V12->V13: > rebase to current master > thanks for Luiz's review 
(Luiz) > doc hmp command set-mem-policy (Luiz) > rename: NUMAInfo -> NUMANode (Luiz) > V13->V14: > remove "set-mem-policy" qmp and hmp commands (Marcelo, Paolo) > > > *I hope this can catch up the train of 1.7.* > > Thanks, > Wanlong Gao > > Wanlong Gao (11): > NUMA: move numa related code to new file numa.c > NUMA: check if the total numa memory size is equal to ram_size > NUMA: Add numa_info structure to contain numa nodes info > NUMA: convert -numa option to use OptsVisitor > NUMA: introduce NumaMemOptions > NUMA: add "-numa mem," options > NUMA: expand MAX_NODES from 64 to 128 > NUMA: parse guest numa nodes memory policy > NUMA: set guest numa nodes memory policy > NUMA: add qmp command query-numa > NUMA: convert hmp command info_numa to use q
Re: [Qemu-devel] [PATCH V14 00/11] Add support for binding guest numa nodes to host numa nodes
Hi folks, Settled another week, who can pick? Thanks, Wanlong Gao > As you know, QEMU can't direct it's memory allocation now, this may cause > guest cross node access performance regression. > And, the worse thing is that if PCI-passthrough is used, > direct-attached-device uses DMA transfer between device and qemu process. > All pages of the guest will be pinned by get_user_pages(). > > KVM_ASSIGN_PCI_DEVICE ioctl > kvm_vm_ioctl_assign_device() > =>kvm_assign_device() > => kvm_iommu_map_memslots() > => kvm_iommu_map_pages() >=> kvm_pin_pages() > > So, with direct-attached-device, all guest page's page count will be +1 and > any page migration will not work. AutoNUMA won't too. > > So, we should set the guest nodes memory allocation policy before > the pages are really mapped. > > According to this patch set, we are able to set guest nodes memory policy > like following: > > -numa node,nodeid=0,cpus=0, \ > -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ > -numa node,nodeid=1,cpus=1 \ > -numa mem,size=1024M,policy=interleave,host-nodes=1 > > This supports > "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N" > like format. > > And add a QMP command "query-numa" to show numa info through > this API. > > And convert the "info numa" monitor command to use this > QMP command "query-numa". > > This version removes "set-mem-policy" qmp and hmp commands temporarily > as Marcelo and Paolo suggested. 
> > V1->V2: > change to use QemuOpts in numa options (Paolo) > handle Error in mpol parser (Paolo) > change qmp command format to mem-policy=membind,mem-hostnode=0-1 like > (Paolo) > V2->V3: > also handle Error in cpus parser (5/10) > split out common parser from cpus and hostnode parser (Bandan 6/10) > V3-V4: > rebase to request for comments > V4->V5: > use OptVisitor and split -numa option (Paolo) > - s/set-mpol/set-mem-policy (Andreas) > - s/mem-policy/policy > - s/mem-hostnode/host-nodes > fix hmp command process after error (Luiz) > add qmp command query-numa and convert info numa to it (Luiz) > V5->V6: > remove tabs in json file (Laszlo, Paolo) > add back "-numa node,mem=xxx" as legacy (Paolo) > change cpus and host-nodes to array (Laszlo, Eric) > change "nodeid" to "uint16" > add NumaMemPolicy enum type (Eric) > rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for > repeating options" patch set, thanks for Laszlo's help > V6-V7: > change UInt16 to uint16 (Laszlo) > fix a typo in adding qmp command set-mem-policy > V7-V8: > rebase to current master with Laszlo's V2 of OptsVisitor patch set > fix an adding white space line error > V8->V9: > rebase to current master > check if total numa memory size is equal to ram_size (Paolo) > add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) > replace the use of numa_num_configured_nodes() (Andrew) > avoid abusing the fact i==nodeid (Andrew) > V9->V10: > rebase to current master > remove libnuma (Andrew) > MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) > use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) > remove a useless clear_bit() operation (Andrew) > V10->V11: > rebase to current master > fix "maxnode" argument of mbind(2) > V11->V12: > rebase to current master > split patch 02/11 of V11 (Eduardo) > add some max value check (Eduardo) > split MAX_NODES change patch (Eduardo) > V12->V13: > rebase to current master > thanks for Luiz's review 
(Luiz) > doc hmp command set-mem-policy (Luiz) > rename: NUMAInfo -> NUMANode (Luiz) > V13->V14: > remove "set-mem-policy" qmp and hmp commands (Marcelo, Paolo) > > > *I hope this can catch up the train of 1.7.* > > Thanks, > Wanlong Gao > > Wanlong Gao (11): > NUMA: move numa related code to new file numa.c > NUMA: check if the total numa memory size is equal to ram_size > NUMA: Add numa_info structure to contain numa nodes info > NUMA: convert -numa option to use OptsVisitor > NUMA: introduce NumaMemOptions > NUMA: add "-numa mem," options > NUMA: expand MAX_NODES from 64 to 128 > NUMA: parse guest numa nodes memory policy > NUMA: set guest numa nodes memory policy > NUMA: add qmp command query-numa > NUMA: convert hmp c
[Qemu-devel] [PATCH V14 01/11] NUMA: move numa related code to new file numa.c
Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- cpus.c | 14 include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 182 vl.c| 139 +--- 6 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index 9a49852..7e1fddf 100644 --- a/Makefile.target +++ b/Makefile.target @@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index e566297..2ca0cd9 100644 --- a/cpus.c +++ b/cpus.c @@ -1225,20 +1225,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -CPU_FOREACH(cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index cd5791e..e58ef3f 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -136,6 +136,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 
000..ce7736a --- /dev/null +++ b/numa.c @@ -0,0 +1,182 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. + * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ +char *endptr; +unsigned long long value, endvalue; + +/* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. 
+ */ +if (!*cpus) { +return; +} + +if (parse_uint(cpus, &value, &endptr, 10) < 0) { +goto error; +} +if (*endptr == '-') { +if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { +goto error; +} +} else if (*endptr == '\0') { +endvalue = value; +} else { +goto error; +} + +if (endvalue >= MAX_CPUMASK_BITS) { +endvalue = MAX_CPUMASK_BITS - 1; +fprintf(stderr, +"qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); +} + +if (endvalue < value) { +goto error; +} + +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +return; + +error: +fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); +exit(1); +} + +void numa_add(const char *optarg) +{ +char option[128]; +char *endptr; +unsigned long long nodenr; + +optarg = get_opt_name(option, 128, optarg, ','); +if (*optarg == ',') { +
[Qemu-devel] [PATCH V14 10/11] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- numa.c | 66 qapi-schema.json | 36 +++ qmp-commands.hx | 48 + 3 files changed, 150 insertions(+) diff --git a/numa.c b/numa.c index 915a67a..b392190 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -327,3 +328,68 @@ void set_numa_modes(void) } } } + +NUMANodeList *qmp_query_numa(Error **errp) +{ +NUMANodeList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMANodeList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +CPU_FOREACH(cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json 
b/qapi-schema.json index 3f5a97a..7375241 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3978,3 +3978,39 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @NUMANode: +# +# Information of guest NUMA node +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMANode', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMANode for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMANode'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index b17c46e..980257b 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3240,3 +3240,51 @@ Example: } EQMP +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. 
Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-array of contained VCPUs +- "memory": amount of memory in each node in bytes (json-int) +- "policy": memory policy of this node (json-string) +- "relative": whether host nodes are relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "execute": "query-numa" } +<- { "return":[ +{ +"nodeid": 0, +"cpus": [0, 1], +"memory": 536870912, +"policy": "membind", +"relative": false, +"host-nodes": [0, 1] +}, +{ +"nodeid": 1, +"cpus": [2, 3], +"memory": 536870912, +"policy": "interleave", +"relative": false, +"host-nodes": [1] +} + ] + } + +EQMP -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 04/11] NUMA: convert -numa option to use OptsVisitor
Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +- numa.c | 148 +++- qapi-schema.json| 30 ++ vl.c| 11 +++- 4 files changed, 114 insertions(+), 78 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 3f3764d..c9fb2c7 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -140,9 +140,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index 1bc0fad..c4fa665 100644 --- a/numa.c +++ b/numa.c @@ -24,101 +24,97 @@ */ #include "sysemu/sysemu.h" - -static void numa_node_parse_cpus(int nodenr, const char *cpus) +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) { -char *endptr; -unsigned long long value, endvalue; - -/* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. 
- */ -if (!*cpus) { -return; -} +uint16_t nodenr; +uint16List *cpus = NULL; -if (parse_uint(cpus, &value, &endptr, 10) < 0) { -goto error; -} -if (*endptr == '-') { -if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { -goto error; -} -} else if (*endptr == '\0') { -endvalue = value; +if (opts->has_nodeid) { +nodenr = opts->nodeid; } else { -goto error; +nodenr = nb_numa_nodes; } -if (endvalue >= MAX_CPUMASK_BITS) { -endvalue = MAX_CPUMASK_BITS - 1; -fprintf(stderr, -"qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; } -if (endvalue < value) { -goto error; +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +if (cpus->value > MAX_CPUMASK_BITS) { +fprintf(stderr, "qemu: cpu number %" PRIu16 " is bigger than %d", +cpus->value, MAX_CPUMASK_BITS); +continue; +} +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } -bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); -return; +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +numa_info[nodenr].node_mem = mem_size; +} -error: -fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); -exit(1); +return 0; } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { -char option[128]; -char *endptr; -unsigned long long nodenr; - -optarg = get_opt_name(option, 128, optarg, ','); -if (*optarg == ',') { -optarg++; +NumaOptions *object = NULL; +Error *err = NULL; +int ret = 0; + +{ +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); +opts_visitor_cleanup(ov); } -if (!strcmp(option, "node")) { - -if (nb_numa_nodes >= MAX_NODES) { -fprintf(stderr, "qemu: too many NUMA nodes\n"); -exit(1); -} -if (get_param_value(option, 128, 
"nodeid", optarg) == 0) { -nodenr = nb_numa_nodes; -} else { -if (parse_uint_full(option, &nodenr, 10) < 0) { -fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); -exit(1); -} -} - -if (nodenr >= MAX_NODES) { -fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); -exit(1); -} +if (error_is_set(&err)) { +fprintf(stderr, "qemu: %s\n", error_get_pretty(err)); +error_free(err); +ret = -1; +goto er
[Qemu-devel] [PATCH V14 00/11] Add support for binding guest numa nodes to host numa nodes
As you know, QEMU currently cannot direct its memory allocation, which may cause a performance regression for guest cross-node memory access. Worse, if PCI passthrough is used, the direct-attached device uses DMA transfers between the device and the QEMU process, so all pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page is incremented by one and page migration will not work. AutoNUMA will not work either. So, we should set the guest nodes' memory allocation policy before the pages are really mapped. With this patch set, we are able to set the guest nodes' memory policy as follows: -numa node,nodeid=0,cpus=0, \ -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ -numa node,nodeid=1,cpus=1 \ -numa mem,size=1024M,policy=interleave,host-nodes=1 This supports a format like "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N". It also adds a QMP command "query-numa" to show NUMA information, and converts the "info numa" monitor command to use this QMP command "query-numa". This version removes the "set-mem-policy" qmp and hmp commands temporarily, as Marcelo and Paolo suggested. 
V1->V2: change to use QemuOpts in numa options (Paolo) handle Error in mpol parser (Paolo) change qmp command format to mem-policy=membind,mem-hostnode=0-1 like (Paolo) V2->V3: also handle Error in cpus parser (5/10) split out common parser from cpus and hostnode parser (Bandan 6/10) V3-V4: rebase to request for comments V4->V5: use OptVisitor and split -numa option (Paolo) - s/set-mpol/set-mem-policy (Andreas) - s/mem-policy/policy - s/mem-hostnode/host-nodes fix hmp command process after error (Luiz) add qmp command query-numa and convert info numa to it (Luiz) V5->V6: remove tabs in json file (Laszlo, Paolo) add back "-numa node,mem=xxx" as legacy (Paolo) change cpus and host-nodes to array (Laszlo, Eric) change "nodeid" to "uint16" add NumaMemPolicy enum type (Eric) rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for repeating options" patch set, thanks for Laszlo's help V6-V7: change UInt16 to uint16 (Laszlo) fix a typo in adding qmp command set-mem-policy V7-V8: rebase to current master with Laszlo's V2 of OptsVisitor patch set fix an adding white space line error V8->V9: rebase to current master check if total numa memory size is equal to ram_size (Paolo) add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) replace the use of numa_num_configured_nodes() (Andrew) avoid abusing the fact i==nodeid (Andrew) V9->V10: rebase to current master remove libnuma (Andrew) MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) remove a useless clear_bit() operation (Andrew) V10->V11: rebase to current master fix "maxnode" argument of mbind(2) V11->V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) V12->V13: rebase to current master thanks for Luiz's review (Luiz) doc hmp command set-mem-policy (Luiz) rename: NUMAInfo -> NUMANode (Luiz) V13->V14: remove 
"set-mem-policy" qmp and hmp commands (Marcelo, Paolo) *I hope this can catch up the train of 1.7.* Thanks, Wanlong Gao Wanlong Gao (11): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add "-numa mem," options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp.c | 57 +++ hmp.h | 1 + hw/i386/pc.c| 4 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 ++- monitor.c | 21 +-- numa.c | 395 qapi-schema.json| 112 ++ qemu-options.hx | 6 +- qmp-commands.hx | 48 ++ vl.c| 160 +++- 13 files changed, 654 insertions(+), 185 deletions(-) create mode 100644 numa.c -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 02/11] NUMA: check if the total numa memory size is equal to ram_size
If the total amount of memory assigned to the numa nodes is not equal to the assigned ram size, QEMU will write wrong data to the ACPI table; the guest will then ignore the wrong ACPI table and treat all memory as belonging to one node. This is a bug: we should check the total to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- numa.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index ce7736a..beda80e 100644 --- a/numa.c +++ b/numa.c @@ -150,6 +150,16 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 08/11] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +++ numa.c | 18 ++ qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 995cf3b..9707195 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -139,6 +139,9 @@ extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index c676c5e..da4dbbd 100644 --- a/numa.c +++ b/numa.c @@ -78,6 +78,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -96,6 +97,23 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +fprintf(stderr, "qemu: node number %" PRIu16 " is bigger than %d\n", +nodes->value, MAX_NODES); +continue; +} +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index d291f28..3f5a97a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3935,6 +3935,26 @@ '*mem':'str' }} ## 
+# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -3943,9 +3963,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index 626bf62..c69f5b8 100644 --- a/vl.c +++ b/vl.c @@ -2814,6 +2814,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 11/11] NUMA: convert hmp command info_numa to use qmp command query_numa
Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- hmp.c | 57 + hmp.h | 1 + monitor.c | 21 + 3 files changed, 59 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index 5891507..64cc570 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,10 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1566,3 +1570,56 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMANodeList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMANodeList(node_list); +} diff --git a/hmp.h b/hmp.h index 54cf71f..4f8d39b 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index be34488..ce9dfe7 100644 --- a/monitor.c +++ b/monitor.c @@ -1995,25 +1995,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -CPU_FOREACH(cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2781,7 +2762,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 03/11] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 4 ++-- include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0c313fe..b0fddd0 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -652,14 +652,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index e58ef3f..3f3764d 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -134,8 +135,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 74f3f1b..be34488 100644 --- a/monitor.c +++ b/monitor.c @@ -2010,7 +2010,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } 
monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index beda80e..1bc0fad 100644 --- a/numa.c +++ b/numa.c @@ -61,7 +61,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } -bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -101,7 +101,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { -node_mem[nodenr] = 0; +numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -109,7 +109,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } -node_mem[nodenr] = sval; +numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -134,7 +134,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */ for (i = 0; i < nb_numa_nodes; i++) { -if (node_mem[i] != 0) +if (numa_info[i].node_mem != 0) break; } if (i == nb_numa_nodes) { @@ -144,15 +144,16 @@ void set_numa_nodes(void) * the final node gets the rest. */ for (i = 0; i < nb_numa_nodes - 1; i++) { -node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); -usedmem += node_mem[i]; +numa_info[i].node_mem = (ram_size / nb_numa_nodes) & +~((1 << 23UL) - 1); +usedmem += numa_info[i].node_mem; } -node_mem[i] = ram_size - usedmem; +numa_info[i].node_mem = ram_size - usedmem; } uint64_t numa_total = 0; for (i = 0; i < nb_numa_nodes; i++) { -numa_total += node_mem[i]; +numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { fprintf(stderr, "qemu: numa nodes total memory size " @@ -161,7 +162,7 @@ void set_numa_nodes(void) } for (i = 0; i
[Qemu-devel] [PATCH V14 09/11] NUMA: set guest numa nodes memory policy
Set the guest numa nodes memory policies using the mbind(2) system call node by node. After this patch, we are able to set guest nodes memory policies through the QEMU options; this aims to solve the guest cross-node memory access performance issue. And as you all know, if PCI-passthrough is used, the direct-attached device uses DMA transfer between the device and the qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page will be +1 and page migration will not work. AutoNUMA won't work either. So, we should set the guest nodes memory allocation policies before the pages are really mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- numa.c | 86 ++ 1 file changed, 86 insertions(+) diff --git a/numa.c b/numa.c index da4dbbd..915a67a 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ? 
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ +void *ram_ptr; +RAMBlock *block; +ram_addr_t len, ram_offset = 0; +int bind_mode; +int i; + +QTAILQ_FOREACH(block, &ram_list.blocks, next) { +if (!strcmp(block->mr->name, "pc.ram")) { +break; +} +} + +if (block->host == NULL) { +return -1; +} + +ram_ptr = block->host; +for (i = 0; i < nodeid; i++) { +len = numa_info[i].node_mem; +ram_offset += len; +} + +len = numa_info[nodeid].node_mem; +bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, +nodes, maxnode + 2, 0)) { +perror("mbind"); +return -1; +} +#endif + +return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -240,4 +319,11 @@ void set_numa_modes(void) } } } + +for (i = 0; i < nb_numa_nodes; i++) { +if (set_node_mem_policy(i) == -1) { +fprintf(stderr, +"qemu: can not set host memory policy for node%d\n", i); +} +} } -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 06/11] NUMA: add "-numa mem," options
Add a "-numa mem," option like the following, as Paolo suggested: -numa mem,nodeid=0,size=1G This new option will make the upcoming memory hotplug work easier. We will use the new option to specify node memory info, and keep "-numa node,mem=xx" only as a legacy option. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 1 + numa.c | 36 qemu-options.hx | 6 -- vl.c| 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index c9fb2c7..861cd77 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -135,6 +135,7 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); diff --git a/numa.c b/numa.c index c4fa665..c676c5e 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,31 @@ static int numa_node_parse(NumaNodeOptions *opts) return 0; } +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +} else { +nodenr = nb_numa_mem_nodes; +} + +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} + +if (opts->has_size) { +mem_size = opts->size; +numa_info[nodenr].node_mem = mem_size; +} + +return 0; +} + int numa_init_func(QemuOpts *opts, void *opaque) { NumaOptions *object = NULL; @@ -101,6 +126,13 @@ int numa_init_func(QemuOpts *opts, void *opaque) } nb_numa_nodes++; break; +case NUMA_OPTIONS_KIND_MEM: +ret = numa_mem_parse(object->mem); +if (ret) { +goto error; +} +nb_numa_mem_nodes++; +break; default: fprintf(stderr, "qemu: Invalid NUMA options type.\n"); ret = -1; @@ -119,6 +151,10 @@ error: void set_numa_nodes(void) { +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + if (nb_numa_nodes > 0) { int i; diff --git a/qemu-options.hx 
b/qemu-options.hx index 5dc8b75..98fa25d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,11 +95,13 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, -"-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) +"-numa node[,nodeid=node][,cpus=cpu[-cpu]]\n" +"-numa mem[,nodeid=node][,size=size]\n" +, QEMU_ARCH_ALL) STEXI @item -numa @var{opts} @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources +Simulate a multi node NUMA system. If @var{size} and @var{cpus} are omitted, resources are split equally. ETEXI diff --git a/vl.c b/vl.c index 0d644db..626bf62 100644 --- a/vl.c +++ b/vl.c @@ -250,6 +250,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; +int nb_numa_mem_nodes; NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; @@ -2816,6 +2817,7 @@ int main(int argc, char **argv, char **envp) } nb_numa_nodes = 0; +nb_numa_mem_nodes = 0; nb_nics = 0; bdrv_init_with_whitelist(); -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 05/11] NUMA: introduce NumaMemOptions
Signed-off-by: Wanlong Gao --- qapi-schema.json | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index 8cf0179..d291f28 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3912,7 +3912,8 @@ ## { 'union': 'NumaOptions', 'data': { -'node': 'NumaNodeOptions' }} +'node': 'NumaNodeOptions', +'mem' : 'NumaMemOptions' }} ## # @NumaNodeOptions @@ -3932,3 +3933,19 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.4.474.g128a96c
[Qemu-devel] [PATCH V14 07/11] NUMA: expand MAX_NODES from 64 to 128
libnuma chose 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 861cd77..995cf3b 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -132,7 +132,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -- 1.8.4.474.g128a96c
Re: [Qemu-devel] [PATCH V13 10/13] NUMA: add qmp command set-mem-policy to set memory policy for NUMA node
On 10/04/2013 04:13 PM, Paolo Bonzini wrote: > Il 04/10/2013 02:04, Marcelo Tosatti ha scritto: >>>>>> This QMP command allows user set guest node's memory policy >>>>>> through the QMP protocol. The qmp-shell command is like: >>>>>> set-mem-policy nodeid=0 policy=membind relative=true host-nodes=0-1 >>>>>> >>>>>> Reviewed-by: Luiz Capitulino >>>>>> Signed-off-by: Wanlong Gao >>>> >>>> Wanlong Gao, >>>> >>>> 1) >>>> >>>> Exposing mbind via QMP/HMP on a live guest is interesting because, >>>> see mbind manpage: >>>> >>>> "By default, mbind() only has an effect for new allocations; >>>> if the pages inside the range have been already touched before >>>> setting the policy, then the policy has no effect. This default >>>> behavior may be overridden by the MPOL_MF_MOVE and >>>> MPOL_MF_MOVE_ALL flags described below." >>>> >>>> This means that executing set-mem-policy on a live guest is >>>> unpredictable: it depends on which pages have been faulted in already. >>>> >>>> Should the command be restricted to offline guests? >> In fact, unless there is a missing point, it should be removed: to solve >> the device assignment case (memory pinning), mbind must be executed before >> the memory regions are registered. >> > > Right. We can add the command back later as memory-add, together with > memory hotplug. OK, will remove the command in this patch set. Thanks, Wanlong Gao > > Paolo >
Re: [Qemu-devel] [PATCH V13 00/13] Add support for binding guest numa nodes to host numa nodes
Hi folks, Any comments on this version? Thanks, Wanlong Gao > As you know, QEMU can't direct it's memory allocation now, this may cause > guest cross node access performance regression. > And, the worse thing is that if PCI-passthrough is used, > direct-attached-device uses DMA transfer between device and qemu process. > All pages of the guest will be pinned by get_user_pages(). > > KVM_ASSIGN_PCI_DEVICE ioctl > kvm_vm_ioctl_assign_device() > =>kvm_assign_device() > => kvm_iommu_map_memslots() > => kvm_iommu_map_pages() >=> kvm_pin_pages() > > So, with direct-attached-device, all guest page's page count will be +1 and > any page migration will not work. AutoNUMA won't too. > > So, we should set the guest nodes memory allocation policy before > the pages are really mapped. > > According to this patch set, we are able to set guest nodes memory policy > like following: > > -numa node,nodeid=0,cpus=0, \ > -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ > -numa node,nodeid=1,cpus=1 \ > -numa mem,size=1024M,policy=interleave,host-nodes=1 > > This supports > "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N" > like format. > > > Also add "set-mem-policy" QMP and hmp command to set memory policy. > > And add a QMP command "query-numa" to show numa info through > this API. > > And convert the "info numa" monitor command to use this > QMP command "query-numa". 
> > > V1->V2: > change to use QemuOpts in numa options (Paolo) > handle Error in mpol parser (Paolo) > change qmp command format to mem-policy=membind,mem-hostnode=0-1 like > (Paolo) > V2->V3: > also handle Error in cpus parser (5/10) > split out common parser from cpus and hostnode parser (Bandan 6/10) > V3-V4: > rebase to request for comments > V4->V5: > use OptVisitor and split -numa option (Paolo) > - s/set-mpol/set-mem-policy (Andreas) > - s/mem-policy/policy > - s/mem-hostnode/host-nodes > fix hmp command process after error (Luiz) > add qmp command query-numa and convert info numa to it (Luiz) > V5->V6: > remove tabs in json file (Laszlo, Paolo) > add back "-numa node,mem=xxx" as legacy (Paolo) > change cpus and host-nodes to array (Laszlo, Eric) > change "nodeid" to "uint16" > add NumaMemPolicy enum type (Eric) > rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for > repeating options" patch set, thanks for Laszlo's help > V6-V7: > change UInt16 to uint16 (Laszlo) > fix a typo in adding qmp command set-mem-policy > V7-V8: > rebase to current master with Laszlo's V2 of OptsVisitor patch set > fix an adding white space line error > V8->V9: > rebase to current master > check if total numa memory size is equal to ram_size (Paolo) > add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) > replace the use of numa_num_configured_nodes() (Andrew) > avoid abusing the fact i==nodeid (Andrew) > V9->V10: > rebase to current master > remove libnuma (Andrew) > MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) > use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) > remove a useless clear_bit() operation (Andrew) > V10->V11: > rebase to current master > fix "maxnode" argument of mbind(2) > V11->V12: > rebase to current master > split patch 02/11 of V11 (Eduardo) > add some max value check (Eduardo) > split MAX_NODES change patch (Eduardo) > V12->V13: > rebase to current master > thanks for Luiz's review 
(Luiz) > doc hmp command set-mem-policy (Luiz) > rename: NUMAInfo -> NUMANode (Luiz) > > > *I hope this can catch up the train of 1.7.* > > Thanks, > Wanlong Gao > > Wanlong Gao (13): > NUMA: move numa related code to new file numa.c > NUMA: check if the total numa memory size is equal to ram_size > NUMA: Add numa_info structure to contain numa nodes info > NUMA: convert -numa option to use OptsVisitor > NUMA: introduce NumaMemOptions > NUMA: add "-numa mem," options > NUMA: expand MAX_NODES from 64 to 128 > NUMA: parse guest numa nodes memory policy > NUMA: set guest numa nodes memory policy > NUMA: add qmp command set-mem-policy to set memory policy for NUMA > node > NUMA: add hmp command set-mem-policy > NUMA: add qmp command query-numa > NUMA: convert hmp command i
[Qemu-devel] [PATCH V13 01/13] NUMA: move numa related code to new file numa.c
Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- cpus.c | 14 include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 182 vl.c| 139 +--- 6 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index 9a49852..7e1fddf 100644 --- a/Makefile.target +++ b/Makefile.target @@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index e566297..2ca0cd9 100644 --- a/cpus.c +++ b/cpus.c @@ -1225,20 +1225,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -CPU_FOREACH(cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index b1aa059..31e5a5b 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -131,6 +131,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 
000..ce7736a --- /dev/null +++ b/numa.c @@ -0,0 +1,182 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. + * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ +char *endptr; +unsigned long long value, endvalue; + +/* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. 
+ */ +if (!*cpus) { +return; +} + +if (parse_uint(cpus, &value, &endptr, 10) < 0) { +goto error; +} +if (*endptr == '-') { +if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { +goto error; +} +} else if (*endptr == '\0') { +endvalue = value; +} else { +goto error; +} + +if (endvalue >= MAX_CPUMASK_BITS) { +endvalue = MAX_CPUMASK_BITS - 1; +fprintf(stderr, +"qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); +} + +if (endvalue < value) { +goto error; +} + +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +return; + +error: +fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); +exit(1); +} + +void numa_add(const char *optarg) +{ +char option[128]; +char *endptr; +unsigned long long nodenr; + +optarg = get_opt_name(option, 128, optarg, ','); +if (*optarg == ',') { +
[Qemu-devel] [PATCH V13 04/13] NUMA: convert -numa option to use OptsVisitor
Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +- numa.c | 148 +++- qapi-schema.json| 30 ++ vl.c| 11 +++- 4 files changed, 114 insertions(+), 78 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 662c6fb..797490e 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -135,9 +135,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index 1bc0fad..c4fa665 100644 --- a/numa.c +++ b/numa.c @@ -24,101 +24,97 @@ */ #include "sysemu/sysemu.h" - -static void numa_node_parse_cpus(int nodenr, const char *cpus) +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) { -char *endptr; -unsigned long long value, endvalue; - -/* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. 
- */ -if (!*cpus) { -return; -} +uint16_t nodenr; +uint16List *cpus = NULL; -if (parse_uint(cpus, &value, &endptr, 10) < 0) { -goto error; -} -if (*endptr == '-') { -if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { -goto error; -} -} else if (*endptr == '\0') { -endvalue = value; +if (opts->has_nodeid) { +nodenr = opts->nodeid; } else { -goto error; +nodenr = nb_numa_nodes; } -if (endvalue >= MAX_CPUMASK_BITS) { -endvalue = MAX_CPUMASK_BITS - 1; -fprintf(stderr, -"qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; } -if (endvalue < value) { -goto error; +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +if (cpus->value > MAX_CPUMASK_BITS) { +fprintf(stderr, "qemu: cpu number %" PRIu16 " is bigger than %d", +cpus->value, MAX_CPUMASK_BITS); +continue; +} +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } -bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); -return; +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +numa_info[nodenr].node_mem = mem_size; +} -error: -fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); -exit(1); +return 0; } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { -char option[128]; -char *endptr; -unsigned long long nodenr; - -optarg = get_opt_name(option, 128, optarg, ','); -if (*optarg == ',') { -optarg++; +NumaOptions *object = NULL; +Error *err = NULL; +int ret = 0; + +{ +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); +opts_visitor_cleanup(ov); } -if (!strcmp(option, "node")) { - -if (nb_numa_nodes >= MAX_NODES) { -fprintf(stderr, "qemu: too many NUMA nodes\n"); -exit(1); -} -if (get_param_value(option, 128, 
"nodeid", optarg) == 0) { -nodenr = nb_numa_nodes; -} else { -if (parse_uint_full(option, &nodenr, 10) < 0) { -fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); -exit(1); -} -} - -if (nodenr >= MAX_NODES) { -fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); -exit(1); -} +if (error_is_set(&err)) { +fprintf(stderr, "qemu: %s\n", error_get_pretty(err)); +error_free(err); +ret = -1; +goto er
[Qemu-devel] [PATCH V13 13/13] NUMA: convert hmp command info_numa to use qmp command query_numa
Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- hmp.c | 54 ++ hmp.h | 1 + monitor.c | 21 + 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index 84990d7..3efc1ac 100644 --- a/hmp.c +++ b/hmp.c @@ -27,6 +27,7 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1609,3 +1610,56 @@ out: hmp_handle_error(mon, &local_err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMANodeList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMANodeList(node_list); +} diff --git a/hmp.h b/hmp.h index ae09525..56a5efd 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index be34488..ce9dfe7 100644 --- a/monitor.c +++ b/monitor.c @@ -1995,25 +1995,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -CPU_FOREACH(cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2781,7 +2762,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 06/13] NUMA: add "-numa mem," options
Add "-numa mem," option like following as Paolo suggested: -numa mem,nodeid=0,size=1G This new option will make later coming memory hotplug better. We will use the new options to specify nodes memory info, and just remain "-numa node,mem=xx" as legacy. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 1 + numa.c | 36 qemu-options.hx | 6 -- vl.c| 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 797490e..58c728c 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -130,6 +130,7 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); diff --git a/numa.c b/numa.c index c4fa665..c676c5e 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,31 @@ static int numa_node_parse(NumaNodeOptions *opts) return 0; } +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +} else { +nodenr = nb_numa_mem_nodes; +} + +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} + +if (opts->has_size) { +mem_size = opts->size; +numa_info[nodenr].node_mem = mem_size; +} + +return 0; +} + int numa_init_func(QemuOpts *opts, void *opaque) { NumaOptions *object = NULL; @@ -101,6 +126,13 @@ int numa_init_func(QemuOpts *opts, void *opaque) } nb_numa_nodes++; break; +case NUMA_OPTIONS_KIND_MEM: +ret = numa_mem_parse(object->mem); +if (ret) { +goto error; +} +nb_numa_mem_nodes++; +break; default: fprintf(stderr, "qemu: Invalid NUMA options type.\n"); ret = -1; @@ -119,6 +151,10 @@ error: void set_numa_nodes(void) { +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + if (nb_numa_nodes > 0) { int i; diff --git a/qemu-options.hx 
b/qemu-options.hx index 5dc8b75..98fa25d 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,11 +95,13 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, -"-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) +"-numa node[,nodeid=node][,cpus=cpu[-cpu]]\n" +"-numa mem[,nodeid=node][,size=size]\n" +, QEMU_ARCH_ALL) STEXI @item -numa @var{opts} @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources +Simulate a multi node NUMA system. If @var{size} and @var{cpus} are omitted, resources are split equally. ETEXI diff --git a/vl.c b/vl.c index efd0e53..341c096 100644 --- a/vl.c +++ b/vl.c @@ -250,6 +250,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; +int nb_numa_mem_nodes; NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; @@ -2816,6 +2817,7 @@ int main(int argc, char **argv, char **envp) } nb_numa_nodes = 0; +nb_numa_mem_nodes = 0; nb_nics = 0; bdrv_init_with_whitelist(); -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 11/13] NUMA: add hmp command set-mem-policy
Add the hmp command set-mem-policy to set the host memory policy for a guest NUMA node. We can then also set a node's memory policy from the monitor, for example: (qemu) set-mem-policy 0 policy=membind,relative=false,host-nodes=0-1 Signed-off-by: Wanlong Gao --- hmp-commands.hx | 28 + hmp.c | 65 + hmp.h | 1 + 3 files changed, 94 insertions(+) diff --git a/hmp-commands.hx b/hmp-commands.hx index 65b7f60..3cf1d5c 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1587,6 +1587,34 @@ Executes a qemu-io command on the given block device. ETEXI { +.name = "set-mem-policy", +.args_type = "nodeid:i,args:s?", +.params = "nodeid [args]", +.help = "set host memory policy for a guest NUMA node", +.mhandler.cmd = hmp_set_mem_policy, +}, + +STEXI +@item set-mem-policy @var{nodeid} @var{args} +@findex set-mem-policy + +Set host memory policy for a guest NUMA node + +@var{args} is optional. If not set, the policy of @var{nodeid} will +be set to @var{default}. Its syntax is: +[policy=@var{policy}][,relative=@var{relative}][,host-nodes=@var{host-nodes}], +Here @var{policy} can be @var{default}, @var{membind}, @var{preferred} or +@var{interleave}, @var{relative} is a bool value, @var{host-nodes} is a +set of host node ids. 
For example: + +@example +(qemu) set-mem-policy 0 policy=membind,relative=yes,host-nodes=0-1 +@end example + + +ETEXI + +{ .name = "info", .args_type = "item:s?", .params = "[subcommand]", diff --git a/hmp.c b/hmp.c index b4a6422..84990d7 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,9 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1544,3 +1547,65 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_set_mem_policy(Monitor *mon, const QDict *qdict) +{ +Error *local_err = NULL; +bool has_policy = true; +bool has_relative = true; +bool has_host_nodes = true; +QemuOpts *opts; +NumaMemOptions *object = NULL; +NumaNodePolicy policy = NUMA_NODE_POLICY_DEFAULT; +bool relative = false; +uint16List *host_nodes = NULL; + +uint64_t nodeid = qdict_get_int(qdict, "nodeid"); +const char *args = qdict_get_try_str(qdict, "args"); + +if (args == NULL) { +has_policy = false; +has_relative = false; +has_host_nodes = false; +} else { +opts = qemu_opts_parse(qemu_find_opts("numa"), args, 1); +if (opts == NULL) { +monitor_printf(mon, "Parsing memory policy args failed\n"); +return; +} else { +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaMemOptions(opts_get_visitor(ov), &object, NULL, + &local_err); +opts_visitor_cleanup(ov); + +if (error_is_set(&local_err)) { +goto out; +} + +has_policy = object->has_policy; +if (has_policy) { +policy = object->policy; +} +has_relative = object->has_relative; +if (has_relative) { +relative = object->relative; +} +has_host_nodes = object->has_host_nodes; +if (has_host_nodes) { +host_nodes = object->host_nodes; +} +} +} + +qmp_set_mem_policy(nodeid, has_policy, policy, has_relative, relative, + has_host_nodes, host_nodes, &local_err); +out: +if (object) { +QapiDeallocVisitor *dv = qapi_dealloc_visitor_new(); 
+visit_type_NumaMemOptions(qapi_dealloc_get_visitor(dv), + &object, NULL, NULL); +qapi_dealloc_visitor_cleanup(dv); +} + +hmp_handle_error(mon, &local_err); +} diff --git a/hmp.h b/hmp.h index 6c3bdcd..ae09525 100644 --- a/hmp.h +++ b/hmp.h @@ -87,5 +87,6 @@ void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); +void hmp_set_mem_policy(Monitor *mon, const QDict *qdict); #endif -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 00/13] Add support for binding guest numa nodes to host numa nodes
As you know, QEMU cannot currently direct its memory allocation, which may cause a performance regression from guest cross-node memory accesses. Worse, if PCI passthrough is used, the direct-attached device uses DMA transfers between the device and the QEMU process, so all pages of the guest are pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page is incremented by one and page migration will not work. AutoNUMA will not work either. We should therefore set the guest nodes' memory allocation policy before the pages are actually mapped. With this patch set, we are able to set the guest nodes' memory policy as follows: -numa node,nodeid=0,cpus=0, \ -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ -numa node,nodeid=1,cpus=1 \ -numa mem,size=1024M,policy=interleave,host-nodes=1 This supports a format like "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N". The series also adds the "set-mem-policy" QMP and hmp commands to set the memory policy, adds a QMP command "query-numa" to show NUMA information, and converts the "info numa" monitor command to use the "query-numa" QMP command. 
V1->V2: change to use QemuOpts in numa options (Paolo) handle Error in mpol parser (Paolo) change qmp command format to mem-policy=membind,mem-hostnode=0-1 like (Paolo) V2->V3: also handle Error in cpus parser (5/10) split out common parser from cpus and hostnode parser (Bandan 6/10) V3-V4: rebase to request for comments V4->V5: use OptVisitor and split -numa option (Paolo) - s/set-mpol/set-mem-policy (Andreas) - s/mem-policy/policy - s/mem-hostnode/host-nodes fix hmp command process after error (Luiz) add qmp command query-numa and convert info numa to it (Luiz) V5->V6: remove tabs in json file (Laszlo, Paolo) add back "-numa node,mem=xxx" as legacy (Paolo) change cpus and host-nodes to array (Laszlo, Eric) change "nodeid" to "uint16" add NumaMemPolicy enum type (Eric) rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for repeating options" patch set, thanks for Laszlo's help V6-V7: change UInt16 to uint16 (Laszlo) fix a typo in adding qmp command set-mem-policy V7-V8: rebase to current master with Laszlo's V2 of OptsVisitor patch set fix an adding white space line error V8->V9: rebase to current master check if total numa memory size is equal to ram_size (Paolo) add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) replace the use of numa_num_configured_nodes() (Andrew) avoid abusing the fact i==nodeid (Andrew) V9->V10: rebase to current master remove libnuma (Andrew) MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) remove a useless clear_bit() operation (Andrew) V10->V11: rebase to current master fix "maxnode" argument of mbind(2) V11->V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) V12->V13: rebase to current master thanks for Luiz's review (Luiz) doc hmp command set-mem-policy (Luiz) rename: NUMAInfo -> NUMANode (Luiz) *I hope this can catch up the 
train of 1.7.* Thanks, Wanlong Gao Wanlong Gao (13): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add "-numa mem," options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command set-mem-policy to set memory policy for NUMA node NUMA: add hmp command set-mem-policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp-commands.hx | 28 +++ hmp.c | 119 + hmp.h | 2 + hw/i386/pc.c| 4 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 +- monitor.c | 21 +-- numa.c | 460 qapi-schema.json| 133 ++ qemu-options.hx | 6 +- qmp-commands.hx | 90 ++ vl.c| 160 ++--- 14 files changed, 873 insertions(+), 185 deletions(-) create mode 100644 numa.c -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 07/13] NUMA: expand MAX_NODES from 64 to 128
libnuma chose 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 58c728c..12529a1 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -127,7 +127,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 08/13] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +++ numa.c | 18 ++ qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 12529a1..7495e01 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -134,6 +134,9 @@ extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index c676c5e..da4dbbd 100644 --- a/numa.c +++ b/numa.c @@ -78,6 +78,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -96,6 +97,23 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +fprintf(stderr, "qemu: node number %" PRIu16 " is bigger than %d\n", +nodes->value, MAX_NODES); +continue; +} +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index 950d0f5..dbe7088 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3871,6 +3871,26 @@ '*mem':'str' }} ## 
+# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -3879,9 +3899,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index 341c096..d09b6ef 100644 --- a/vl.c +++ b/vl.c @@ -2814,6 +2814,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 09/13] NUMA: set guest numa nodes memory policy
Set the guest numa nodes' memory policies using the mbind(2) system call, node by node. After this patch, we are able to set guest nodes' memory policies through the QEMU options; this aims to solve the guest cross-node memory access performance issue. And as you all know, if PCI-passthrough is used, the direct-attached device uses DMA transfers between the device and the qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, every guest page's page count will be +1 and any page migration will not work. AutoNUMA will not work either. So, we should set the guest nodes' memory allocation policies before the pages are really mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- numa.c | 86 ++ 1 file changed, 86 insertions(+) diff --git a/numa.c b/numa.c index da4dbbd..915a67a 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ? 
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ +void *ram_ptr; +RAMBlock *block; +ram_addr_t len, ram_offset = 0; +int bind_mode; +int i; + +QTAILQ_FOREACH(block, &ram_list.blocks, next) { +if (!strcmp(block->mr->name, "pc.ram")) { +break; +} +} + +if (block->host == NULL) { +return -1; +} + +ram_ptr = block->host; +for (i = 0; i < nodeid; i++) { +len = numa_info[i].node_mem; +ram_offset += len; +} + +len = numa_info[nodeid].node_mem; +bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, +nodes, maxnode + 2, 0)) { +perror("mbind"); +return -1; +} +#endif + +return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -240,4 +319,11 @@ void set_numa_modes(void) } } } + +for (i = 0; i < nb_numa_nodes; i++) { +if (set_node_mem_policy(i) == -1) { +fprintf(stderr, +"qemu: can not set host memory policy for node%d\n", i); +} +} } -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 05/13] NUMA: introduce NumaMemOptions
Signed-off-by: Wanlong Gao --- qapi-schema.json | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index ca31ca6..950d0f5 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3848,7 +3848,8 @@ ## { 'union': 'NumaOptions', 'data': { -'node': 'NumaNodeOptions' }} +'node': 'NumaNodeOptions', +'mem' : 'NumaMemOptions' }} ## # @NumaNodeOptions @@ -3868,3 +3869,19 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 12/13] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- numa.c | 65 qapi-schema.json | 36 +++ qmp-commands.hx | 49 ++ 3 files changed, 150 insertions(+) diff --git a/numa.c b/numa.c index 19ee7f7..289a350 100644 --- a/numa.c +++ b/numa.c @@ -393,3 +393,68 @@ error: numa_info[nodeid].relative = old_relative; return; } + +NUMANodeList *qmp_query_numa(Error **errp) +{ +NUMANodeList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMANodeList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +CPU_FOREACH(cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json b/qapi-schema.json index 914c0c0..0f5ef69 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3935,3 +3935,39 @@ { 'command': 
'set-mem-policy', 'data': {'nodeid': 'uint16', '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] } } + +## +# @NUMANode: +# +# Information of guest NUMA node +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMANode', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMANode for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMANode'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index fc7b804..9844bcb 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3193,3 +3193,52 @@ Notes: to host node 0. EQMP + +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-array of contained VCPUs +- "memory": amount of memory in each node in bytes (json-int) +- "policy": memory policy of this node (json-string) +- "relative": if host nodes are relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "execute": "query-numa" } +<- { "return":[ +{ +"nodeid": 0, +"cpus": [0, 1], +"memory": 536870912, +"policy": "membind", +"relative": false, +"host-nodes": [0, 1] +}, +{ +"nodeid": 1, +"cpus": [2, 3], +"memory": 536870912, +"policy": "interleave", +"relative": false, +"host-nodes": [1] +} + ] + } + +EQMP -- 1.8.4.99.gd2dbd39
[Qemu-devel] [PATCH V13 10/13] NUMA: add qmp command set-mem-policy to set memory policy for NUMA node
This QMP command allows user set guest node's memory policy through the QMP protocol. The qmp-shell command is like: set-mem-policy nodeid=0 policy=membind relative=true host-nodes=0-1 Reviewed-by: Luiz Capitulino Signed-off-by: Wanlong Gao --- numa.c | 66 qapi-schema.json | 21 ++ qmp-commands.hx | 41 +++ 3 files changed, 128 insertions(+) diff --git a/numa.c b/numa.c index 915a67a..19ee7f7 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -327,3 +328,68 @@ void set_numa_modes(void) } } } + +void qmp_set_mem_policy(uint16_t nodeid, bool has_policy, NumaNodePolicy policy, +bool has_relative, bool relative, +bool has_host_nodes, uint16List *host_nodes, +Error **errp) +{ +NumaNodePolicy old_policy; +bool old_relative; +DECLARE_BITMAP(host_mem, MAX_NODES); +uint16List *nodes; + +if (nodeid >= nb_numa_nodes) { +error_setg(errp, "Only has '%d' NUMA nodes", nb_numa_nodes); +return; +} + +bitmap_copy(host_mem, numa_info[nodeid].host_mem, MAX_NODES); +old_policy = numa_info[nodeid].policy; +old_relative = numa_info[nodeid].relative; + +numa_info[nodeid].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[nodeid].relative = false; +bitmap_zero(numa_info[nodeid].host_mem, MAX_NODES); + +if (!has_policy) { +if (set_node_mem_policy(nodeid) == -1) { +error_setg(errp, "Failed to set memory policy for node%" PRIu16, + nodeid); +goto error; +} +return; +} + +numa_info[nodeid].policy = policy; + +if (has_relative) { +numa_info[nodeid].relative = relative; +} + +if (!has_host_nodes) { +bitmap_empty(numa_info[nodeid].host_mem, MAX_NODES); +bitmap_set(numa_info[nodeid].host_mem, 0, 1); +} + +for (nodes = host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +continue; +} +bitmap_set(numa_info[nodeid].host_mem, nodes->value, 1); +} + +if (set_node_mem_policy(nodeid) == -1) { +error_setg(errp, "Failed to set memory policy for 
node%" PRIu16, + nodeid); +goto error; +} + +return; + +error: +bitmap_copy(numa_info[nodeid].host_mem, host_mem, MAX_NODES); +numa_info[nodeid].policy = old_policy; +numa_info[nodeid].relative = old_relative; +return; +} diff --git a/qapi-schema.json b/qapi-schema.json index dbe7088..914c0c0 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3914,3 +3914,24 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @set-mem-policy: +# +# Set the host memory binding policy for guest NUMA node. +# +# @nodeid: The node ID of guest NUMA node to set memory policy to. +# +# @policy: #optional The memory policy to be set (default 'default'). +# +# @relative: #optional If the specified nodes are relative (default 'false') +# +# @host-nodes: #optional The host nodes range for memory policy. +# +# Returns: Nothing on success +# +# Since: 1.7 +## +{ 'command': 'set-mem-policy', + 'data': {'nodeid': 'uint16', '*policy': 'NumaNodePolicy', + '*relative': 'bool', '*host-nodes': ['uint16'] } } diff --git a/qmp-commands.hx b/qmp-commands.hx index 008cad9..fc7b804 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3089,6 +3089,7 @@ Example: <- { "return": {} } EQMP + { .name = "query-rx-filter", .args_type = "name:s?", @@ -3152,3 +3153,43 @@ Example: } EQMP + +{ +.name = "set-mem-policy", +.args_type = "nodeid:i,policy:s?,relative:b?,host-nodes:q?", +.help = "Set the host memory binding policy for guest NUMA node", +.mhandler.cmd_new = qmp_marshal_input_set_mem_policy, +}, + +SQMP +set-mem-policy +-- + +Set the host memory binding policy for guest NUMA node + +Arguments: + +- "nodeid": The nodeid of guest NUMA node to set memory policy to. +(json-int) +- "policy": The memory policy to set. +(json-string, optional) +- "relative": If the specified nodes are relative. + (json-bool, optional) +- "host-nodes": The host nodes contained to this memory policy. +(a
[Qemu-devel] [PATCH V13 03/13] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 4 ++-- include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0c313fe..b0fddd0 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -652,14 +652,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 31e5a5b..662c6fb 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -129,8 +130,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 74f3f1b..be34488 100644 --- a/monitor.c +++ b/monitor.c @@ -2010,7 +2010,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } 
monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index beda80e..1bc0fad 100644 --- a/numa.c +++ b/numa.c @@ -61,7 +61,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } -bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -101,7 +101,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { -node_mem[nodenr] = 0; +numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -109,7 +109,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } -node_mem[nodenr] = sval; +numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -134,7 +134,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */ for (i = 0; i < nb_numa_nodes; i++) { -if (node_mem[i] != 0) +if (numa_info[i].node_mem != 0) break; } if (i == nb_numa_nodes) { @@ -144,15 +144,16 @@ void set_numa_nodes(void) * the final node gets the rest. */ for (i = 0; i < nb_numa_nodes - 1; i++) { -node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); -usedmem += node_mem[i]; +numa_info[i].node_mem = (ram_size / nb_numa_nodes) & +~((1 << 23UL) - 1); +usedmem += numa_info[i].node_mem; } -node_mem[i] = ram_size - usedmem; +numa_info[i].node_mem = ram_size - usedmem; } uint64_t numa_total = 0; for (i = 0; i < nb_numa_nodes; i++) { -numa_total += node_mem[i]; +numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { fprintf(stderr, "qemu: numa nodes total memory size " @@ -161,7 +162,7 @@ void set_numa_nodes(void) } for (i = 0; i
[Qemu-devel] [PATCH V13 02/13] NUMA: check if the total numa memory size is equal to ram_size
If the total memory size assigned to the numa nodes is not equal to the assigned ram size, QEMU will write wrong data to the ACPI table; the guest will then ignore the wrong ACPI table and assign all memory to one node. This is a bug, so we should check the total to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- numa.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index ce7736a..beda80e 100644 --- a/numa.c +++ b/numa.c @@ -150,6 +150,16 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.4.99.gd2dbd39
Re: [Qemu-devel] [PATCH V12 00/13] Add support for binding guest numa nodes to host numa nodes
Hi folks, Any comments? ;-P Wanlong Gao > As you know, QEMU can't direct its memory allocation now; this may cause > guest cross node access performance regression. > And, the worse thing is that if PCI-passthrough is used, > direct-attached-device uses DMA transfer between device and qemu process. > All pages of the guest will be pinned by get_user_pages(). > > KVM_ASSIGN_PCI_DEVICE ioctl > kvm_vm_ioctl_assign_device() > =>kvm_assign_device() > => kvm_iommu_map_memslots() > => kvm_iommu_map_pages() >=> kvm_pin_pages() > > So, with direct-attached-device, all guest page's page count will be +1 and > any page migration will not work. AutoNUMA won't work either. > > So, we should set the guest nodes memory allocation policy before > the pages are really mapped. > > With this patch set, we are able to set guest nodes memory policy > like the following: > > -numa node,nodeid=0,cpus=0, \ > -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ > -numa node,nodeid=1,cpus=1 \ > -numa mem,size=1024M,policy=interleave,host-nodes=1 > > This supports > "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N" > like format. > > > Also add "set-mem-policy" QMP and hmp command to set memory policy. > > And add a QMP command "query-numa" to show numa info through > this API. > > And convert the "info numa" monitor command to use this > QMP command "query-numa". 
> > > V1->V2: > change to use QemuOpts in numa options (Paolo) > handle Error in mpol parser (Paolo) > change qmp command format to mem-policy=membind,mem-hostnode=0-1 like > (Paolo) > V2->V3: > also handle Error in cpus parser (5/10) > split out common parser from cpus and hostnode parser (Bandan 6/10) > V3-V4: > rebase to request for comments > V4->V5: > use OptVisitor and split -numa option (Paolo) > - s/set-mpol/set-mem-policy (Andreas) > - s/mem-policy/policy > - s/mem-hostnode/host-nodes > fix hmp command process after error (Luiz) > add qmp command query-numa and convert info numa to it (Luiz) > V5->V6: > remove tabs in json file (Laszlo, Paolo) > add back "-numa node,mem=xxx" as legacy (Paolo) > change cpus and host-nodes to array (Laszlo, Eric) > change "nodeid" to "uint16" > add NumaMemPolicy enum type (Eric) > rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for > repeating options" patch set, thanks for Laszlo's help > V6-V7: > change UInt16 to uint16 (Laszlo) > fix a typo in adding qmp command set-mem-policy > V7-V8: > rebase to current master with Laszlo's V2 of OptsVisitor patch set > fix an adding white space line error > V8->V9: > rebase to current master > check if total numa memory size is equal to ram_size (Paolo) > add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) > replace the use of numa_num_configured_nodes() (Andrew) > avoid abusing the fact i==nodeid (Andrew) > V9->V10: > rebase to current master > remove libnuma (Andrew) > MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) > use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) > remove a useless clear_bit() operation (Andrew) > V10->V11: > rebase to current master > fix "maxnode" argument of mbind(2) > V11->V12: > rebase to current master > split patch 02/11 of V11 (Eduardo) > add some max value check (Eduardo) > split MAX_NODES change patch (Eduardo) > > > *I hope this can catch up the train of 1.7.* > > Thanks, > 
Wanlong Gao > > Wanlong Gao (13): > NUMA: move numa related code to new file numa.c > NUMA: check if the total numa memory size is equal to ram_size > NUMA: Add numa_info structure to contain numa nodes info > NUMA: convert -numa option to use OptsVisitor > NUMA: introduce NumaMemOptions > NUMA: add "-numa mem," options > NUMA: expand MAX_NODES from 64 to 128 > NUMA: parse guest numa nodes memory policy > NUMA: set guest numa nodes memory policy > NUMA: add qmp command set-mem-policy to set memory policy for NUMA > node > NUMA: add hmp command set-mem-policy > NUMA: add qmp command query-numa > NUMA: convert hmp command info_numa to use qmp command query_numa > > Makefile.target | 2 +- > cpus.c | 14 -- > hmp-commands.hx | 16 ++ > hmp.c | 119 + > hmp.h | 2 + > hw/i386/pc.c| 4 +- > include/sysemu/cpus.h | 1 - > include/sysemu/sysemu.h | 18 +- > monitor.c | 21 +-- > numa.c | 460 > > qapi-schema.json| 133 ++ > qemu-options.hx | 6 +- > qmp-commands.hx | 90 ++ > vl.c| 160 ++--- > 14 files changed, 861 insertions(+), 185 deletions(-) > create mode 100644 numa.c >
[Qemu-devel] [PATCH V12 03/13] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 4 ++-- include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 0c313fe..b0fddd0 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -652,14 +652,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 31e5a5b..662c6fb 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -129,8 +130,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); diff --git a/monitor.c b/monitor.c index 74f3f1b..be34488 100644 --- a/monitor.c +++ b/monitor.c @@ -2010,7 +2010,7 @@ static void do_info_numa(Monitor *mon, const QDict *qdict) } 
monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index beda80e..1bc0fad 100644 --- a/numa.c +++ b/numa.c @@ -61,7 +61,7 @@ static void numa_node_parse_cpus(int nodenr, const char *cpus) goto error; } -bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); return; error: @@ -101,7 +101,7 @@ void numa_add(const char *optarg) } if (get_param_value(option, 128, "mem", optarg) == 0) { -node_mem[nodenr] = 0; +numa_info[nodenr].node_mem = 0; } else { int64_t sval; sval = strtosz(option, &endptr); @@ -109,7 +109,7 @@ void numa_add(const char *optarg) fprintf(stderr, "qemu: invalid numa mem size: %s\n", optarg); exit(1); } -node_mem[nodenr] = sval; +numa_info[nodenr].node_mem = sval; } if (get_param_value(option, 128, "cpus", optarg) != 0) { numa_node_parse_cpus(nodenr, option); @@ -134,7 +134,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */ for (i = 0; i < nb_numa_nodes; i++) { -if (node_mem[i] != 0) +if (numa_info[i].node_mem != 0) break; } if (i == nb_numa_nodes) { @@ -144,15 +144,16 @@ void set_numa_nodes(void) * the final node gets the rest. */ for (i = 0; i < nb_numa_nodes - 1; i++) { -node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); -usedmem += node_mem[i]; +numa_info[i].node_mem = (ram_size / nb_numa_nodes) & +~((1 << 23UL) - 1); +usedmem += numa_info[i].node_mem; } -node_mem[i] = ram_size - usedmem; +numa_info[i].node_mem = ram_size - usedmem; } uint64_t numa_total = 0; for (i = 0; i < nb_numa_nodes; i++) { -numa_total += node_mem[i]; +numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { fprintf(stderr, "qemu: numa nodes total memory size " @@ -161,7 +162,7 @@ void set_numa_nodes(void) } for (i = 0; i
[Qemu-devel] [PATCH V12 08/13] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +++ numa.c | 18 ++ qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 12529a1..7495e01 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -134,6 +134,9 @@ extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; void set_numa_nodes(void); diff --git a/numa.c b/numa.c index c676c5e..da4dbbd 100644 --- a/numa.c +++ b/numa.c @@ -78,6 +78,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -96,6 +97,23 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +fprintf(stderr, "qemu: node number %" PRIu16 " is bigger than %d\n", +nodes->value, MAX_NODES); +continue; +} +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index 3ca742e..2fba592 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3806,6 +3806,26 @@ '*mem':'str' }} ## 
+# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -3814,9 +3834,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index fd2afc7..31b4091 100644 --- a/vl.c +++ b/vl.c @@ -2888,6 +2888,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 10/13] NUMA: add qmp command set-mem-policy to set memory policy for NUMA node
This QMP command allows user set guest node's memory policy through the QMP protocol. The qmp-shell command is like: set-mem-policy nodeid=0 policy=membind relative=true host-nodes=0-1 Signed-off-by: Wanlong Gao --- numa.c | 66 qapi-schema.json | 21 ++ qmp-commands.hx | 41 +++ 3 files changed, 128 insertions(+) diff --git a/numa.c b/numa.c index 915a67a..19ee7f7 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -327,3 +328,68 @@ void set_numa_modes(void) } } } + +void qmp_set_mem_policy(uint16_t nodeid, bool has_policy, NumaNodePolicy policy, +bool has_relative, bool relative, +bool has_host_nodes, uint16List *host_nodes, +Error **errp) +{ +NumaNodePolicy old_policy; +bool old_relative; +DECLARE_BITMAP(host_mem, MAX_NODES); +uint16List *nodes; + +if (nodeid >= nb_numa_nodes) { +error_setg(errp, "Only has '%d' NUMA nodes", nb_numa_nodes); +return; +} + +bitmap_copy(host_mem, numa_info[nodeid].host_mem, MAX_NODES); +old_policy = numa_info[nodeid].policy; +old_relative = numa_info[nodeid].relative; + +numa_info[nodeid].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[nodeid].relative = false; +bitmap_zero(numa_info[nodeid].host_mem, MAX_NODES); + +if (!has_policy) { +if (set_node_mem_policy(nodeid) == -1) { +error_setg(errp, "Failed to set memory policy for node%" PRIu16, + nodeid); +goto error; +} +return; +} + +numa_info[nodeid].policy = policy; + +if (has_relative) { +numa_info[nodeid].relative = relative; +} + +if (!has_host_nodes) { +bitmap_empty(numa_info[nodeid].host_mem, MAX_NODES); +bitmap_set(numa_info[nodeid].host_mem, 0, 1); +} + +for (nodes = host_nodes; nodes; nodes = nodes->next) { +if (nodes->value > MAX_NODES) { +continue; +} +bitmap_set(numa_info[nodeid].host_mem, nodes->value, 1); +} + +if (set_node_mem_policy(nodeid) == -1) { +error_setg(errp, "Failed to set memory policy for node%" PRIu16, + nodeid); +goto 
error; +} + +return; + +error: +bitmap_copy(numa_info[nodeid].host_mem, host_mem, MAX_NODES); +numa_info[nodeid].policy = old_policy; +numa_info[nodeid].relative = old_relative; +return; +} diff --git a/qapi-schema.json b/qapi-schema.json index 2fba592..7a8cf6a 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3849,3 +3849,24 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @set-mem-policy: +# +# Set the host memory binding policy for guest NUMA node. +# +# @nodeid: The node ID of guest NUMA node to set memory policy to. +# +# @policy: #optional The memory policy to be set (default 'default'). +# +# @relative: #optional If the specified nodes are relative (default 'false') +# +# @host-nodes: #optional The host nodes range for memory policy. +# +# Returns: Nothing on success +# +# Since: 1.7 +## +{ 'command': 'set-mem-policy', + 'data': {'nodeid': 'uint16', '*policy': 'NumaNodePolicy', + '*relative': 'bool', '*host-nodes': ['uint16'] } } diff --git a/qmp-commands.hx b/qmp-commands.hx index 8a8f342..67a9dd2 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3061,6 +3061,7 @@ Example: <- { "return": {} } EQMP + { .name = "query-rx-filter", .args_type = "name:s?", @@ -3124,3 +3125,43 @@ Example: } EQMP + +{ +.name = "set-mem-policy", +.args_type = "nodeid:i,policy:s?,relative:b?,host-nodes:q?", +.help = "Set the host memory binding policy for guest NUMA node", +.mhandler.cmd_new = qmp_marshal_input_set_mem_policy, +}, + +SQMP +set-mem-policy +-- + +Set the host memory binding policy for guest NUMA node + +Arguments: + +- "nodeid": The nodeid of guest NUMA node to set memory policy to. +(json-int) +- "policy": The memory policy to set. +(json-string, optional) +- "relative": If the specified nodes are relative. + (json-bool, optional) +- "host-nodes": The host nodes contained to this memory policy. +(a json-array of int, optional)
[Qemu-devel] [PATCH V12 13/13] NUMA: convert hmp command info_numa to use qmp command query_numa
Signed-off-by: Wanlong Gao --- hmp.c | 54 ++ hmp.h | 1 + monitor.c | 21 + 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index ae695b0..2d878c6 100644 --- a/hmp.c +++ b/hmp.c @@ -27,6 +27,7 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1579,3 +1580,56 @@ error: hmp_handle_error(mon, &local_err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMAInfoList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMAInfoList(node_list); +} diff --git a/hmp.h b/hmp.h index ae09525..56a5efd 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index be34488..ce9dfe7 100644 --- a/monitor.c +++ b/monitor.c @@ -1995,25 +1995,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -CPU_FOREACH(cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2781,7 +2762,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 01/13] NUMA: move numa related code to new file numa.c
Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- cpus.c | 14 include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 3 + numa.c | 182 vl.c| 139 +--- 6 files changed, 187 insertions(+), 154 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index 9a49852..7e1fddf 100644 --- a/Makefile.target +++ b/Makefile.target @@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/cpus.c b/cpus.c index e566297..2ca0cd9 100644 --- a/cpus.c +++ b/cpus.c @@ -1225,20 +1225,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -CPU_FOREACH(cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index b1aa059..31e5a5b 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -131,6 +131,9 @@ extern QEMUClockType rtc_clock; extern int nb_numa_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +void numa_add(const char *optarg); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 
000..ce7736a --- /dev/null +++ b/numa.c @@ -0,0 +1,182 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. + * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "sysemu/sysemu.h" + +static void numa_node_parse_cpus(int nodenr, const char *cpus) +{ +char *endptr; +unsigned long long value, endvalue; + +/* Empty CPU range strings will be considered valid, they will simply + * not set any bit in the CPU bitmap. 
+ */ +if (!*cpus) { +return; +} + +if (parse_uint(cpus, &value, &endptr, 10) < 0) { +goto error; +} +if (*endptr == '-') { +if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { +goto error; +} +} else if (*endptr == '\0') { +endvalue = value; +} else { +goto error; +} + +if (endvalue >= MAX_CPUMASK_BITS) { +endvalue = MAX_CPUMASK_BITS - 1; +fprintf(stderr, +"qemu: NUMA: A max of %d VCPUs are supported\n", + MAX_CPUMASK_BITS); +} + +if (endvalue < value) { +goto error; +} + +bitmap_set(node_cpumask[nodenr], value, endvalue-value+1); +return; + +error: +fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); +exit(1); +} + +void numa_add(const char *optarg) +{ +char option[128]; +char *endptr; +unsigned long long nodenr; + +optarg = get_opt_name(option, 128, optarg, ','); +if (*optarg == ',') { +
[Qemu-devel] [PATCH V12 11/13] NUMA: add hmp command set-mem-policy
Add hmp command set-mem-policy to set host memory policy for a guest NUMA node. Then we can also set node's memory policy using the monitor command like: (qemu) set-mem-policy 0 policy=membind,relative=false,host-nodes=0-1 Signed-off-by: Wanlong Gao --- hmp-commands.hx | 16 ++ hmp.c | 65 + hmp.h | 1 + 3 files changed, 82 insertions(+) diff --git a/hmp-commands.hx b/hmp-commands.hx index 65b7f60..b7f6049 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1587,6 +1587,22 @@ Executes a qemu-io command on the given block device. ETEXI { +.name = "set-mem-policy", +.args_type = "nodeid:i,args:s?", +.params = "nodeid [args]", +.help = "set host memory policy for a guest NUMA node", +.mhandler.cmd = hmp_set_mem_policy, +}, + +STEXI +@item set-mem-policy @var{nodeid} @var{args} +@findex set-mem-policy + +Set host memory policy for a guest NUMA node + +ETEXI + +{ .name = "info", .args_type = "item:s?", .params = "[subcommand]", diff --git a/hmp.c b/hmp.c index fcca6ae..ae695b0 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,9 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1514,3 +1517,65 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_set_mem_policy(Monitor *mon, const QDict *qdict) +{ +Error *local_err = NULL; +bool has_policy = true; +bool has_relative = true; +bool has_host_nodes = true; +QemuOpts *opts; +NumaMemOptions *object = NULL; +NumaNodePolicy policy = NUMA_NODE_POLICY_DEFAULT; +bool relative = false; +uint16List *host_nodes = NULL; + +uint64_t nodeid = qdict_get_int(qdict, "nodeid"); +const char *args = qdict_get_try_str(qdict, "args"); + +if (args == NULL) { +has_policy = false; +has_relative = false; +has_host_nodes = false; +} else { +opts = qemu_opts_parse(qemu_find_opts("numa"), args, 1); +if (opts == NULL) { +monitor_printf(mon, 
"Parsing memory policy args failed\n"); +return; +} else { +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaMemOptions(opts_get_visitor(ov), &object, NULL, + &local_err); +opts_visitor_cleanup(ov); + +if (error_is_set(&local_err)) { +goto error; +} + +has_policy = object->has_policy; +if (has_policy) { +policy = object->policy; +} +has_relative = object->has_relative; +if (has_relative) { +relative = object->relative; +} +has_host_nodes = object->has_host_nodes; +if (has_host_nodes) { +host_nodes = object->host_nodes; +} +} +} + +qmp_set_mem_policy(nodeid, has_policy, policy, has_relative, relative, + has_host_nodes, host_nodes, &local_err); +error: +if (object) { +QapiDeallocVisitor *dv = qapi_dealloc_visitor_new(); +visit_type_NumaMemOptions(qapi_dealloc_get_visitor(dv), + &object, NULL, NULL); +qapi_dealloc_visitor_cleanup(dv); +} + +hmp_handle_error(mon, &local_err); +} diff --git a/hmp.h b/hmp.h index 6c3bdcd..ae09525 100644 --- a/hmp.h +++ b/hmp.h @@ -87,5 +87,6 @@ void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); +void hmp_set_mem_policy(Monitor *mon, const QDict *qdict); #endif -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 00/13] Add support for binding guest numa nodes to host numa nodes
As you know, QEMU currently cannot direct its memory allocation, which may cause a performance regression from guest cross-node memory access. Worse, if PCI passthrough is used, the direct-attached device uses DMA transfers between the device and the qemu process, and all pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page is incremented by one and page migration will not work. AutoNUMA will not work either. Therefore, we should set the memory allocation policy of the guest nodes before the pages are actually mapped. With this patch set, we are able to set the memory policy of guest nodes as follows: -numa node,nodeid=0,cpus=0, \ -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ -numa node,nodeid=1,cpus=1 \ -numa mem,size=1024M,policy=interleave,host-nodes=1 This supports a format like "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N". The series also adds the "set-mem-policy" QMP and HMP commands to set the memory policy, adds a QMP command "query-numa" to show NUMA information, and converts the "info numa" monitor command to use the QMP command "query-numa".
V1->V2: change to use QemuOpts in numa options (Paolo) handle Error in mpol parser (Paolo) change qmp command format to mem-policy=membind,mem-hostnode=0-1 like (Paolo) V2->V3: also handle Error in cpus parser (5/10) split out common parser from cpus and hostnode parser (Bandan 6/10) V3-V4: rebase to request for comments V4->V5: use OptVisitor and split -numa option (Paolo) - s/set-mpol/set-mem-policy (Andreas) - s/mem-policy/policy - s/mem-hostnode/host-nodes fix hmp command process after error (Luiz) add qmp command query-numa and convert info numa to it (Luiz) V5->V6: remove tabs in json file (Laszlo, Paolo) add back "-numa node,mem=xxx" as legacy (Paolo) change cpus and host-nodes to array (Laszlo, Eric) change "nodeid" to "uint16" add NumaMemPolicy enum type (Eric) rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for repeating options" patch set, thanks for Laszlo's help V6-V7: change UInt16 to uint16 (Laszlo) fix a typo in adding qmp command set-mem-policy V7-V8: rebase to current master with Laszlo's V2 of OptsVisitor patch set fix an adding white space line error V8->V9: rebase to current master check if total numa memory size is equal to ram_size (Paolo) add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) replace the use of numa_num_configured_nodes() (Andrew) avoid abusing the fact i==nodeid (Andrew) V9->V10: rebase to current master remove libnuma (Andrew) MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) remove a useless clear_bit() operation (Andrew) V10->V11: rebase to current master fix "maxnode" argument of mbind(2) V11->V12: rebase to current master split patch 02/11 of V11 (Eduardo) add some max value check (Eduardo) split MAX_NODES change patch (Eduardo) *I hope this can catch up the train of 1.7.* Thanks, Wanlong Gao Wanlong Gao (13): NUMA: move numa related code to new file numa.c NUMA: check if the total numa memory 
size is equal to ram_size NUMA: Add numa_info structure to contain numa nodes info NUMA: convert -numa option to use OptsVisitor NUMA: introduce NumaMemOptions NUMA: add "-numa mem," options NUMA: expand MAX_NODES from 64 to 128 NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command set-mem-policy to set memory policy for NUMA node NUMA: add hmp command set-mem-policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp-commands.hx | 16 ++ hmp.c | 119 + hmp.h | 2 + hw/i386/pc.c| 4 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 +- monitor.c | 21 +-- numa.c | 460 qapi-schema.json| 133 ++ qemu-options.hx | 6 +- qmp-commands.hx | 90 ++ vl.c| 160 ++--- 14 files changed, 861 insertions(+), 185 deletions(-) create mode 100644 numa.c -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 05/13] NUMA: introduce NumaMemOptions
Signed-off-by: Wanlong Gao --- qapi-schema.json | 19 ++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/qapi-schema.json b/qapi-schema.json index fe12ea5..3ca742e 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3783,7 +3783,8 @@ ## { 'union': 'NumaOptions', 'data': { -'node': 'NumaNodeOptions' }} +'node': 'NumaNodeOptions', +'mem' : 'NumaMemOptions' }} ## # @NumaNodeOptions @@ -3803,3 +3804,19 @@ '*nodeid': 'uint16', '*cpus': ['uint16'], '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 12/13] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Signed-off-by: Wanlong Gao --- numa.c | 65 qapi-schema.json | 36 +++ qmp-commands.hx | 49 ++ 3 files changed, 150 insertions(+) diff --git a/numa.c b/numa.c index 19ee7f7..d0e3ebc 100644 --- a/numa.c +++ b/numa.c @@ -393,3 +393,68 @@ error: numa_info[nodeid].relative = old_relative; return; } + +NUMAInfoList *qmp_query_numa(Error **errp) +{ +NUMAInfoList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMAInfoList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +CPU_FOREACH(cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json b/qapi-schema.json index 7a8cf6a..088b0d0 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3870,3 +3870,39 @@ { 'command': 'set-mem-policy', 'data': 
{'nodeid': 'uint16', '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] } } + +## +# @NUMAInfo: +# +# Information about guest NUMA nodes +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMAInfo', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMAInfo for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMAInfo'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index 67a9dd2..bc0eb41 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3165,3 +3165,52 @@ Notes: to host node 0. EQMP + +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-arry of contained VCPUs +- "memory": amount of memory in each node in Byte (json-int) +- "policy": memory policy of this node (json-string) +- "relative": if host nodes is relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "excute": "query-numa" } +<- { "return":[ +{ +"nodeid": 0, +"cpus": [0, 1], +"memory": 536870912, +"policy": "membind", +"relative": false, +"host-nodes": [0, 1] +}, +{ +"nodeid": 1, +"cpus": [2, 3], +"memory": 536870912, +"policy": "interleave", +"relative": false, +"host-nodes": [1] +} + ] + } + +EQMP -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 06/13] NUMA: add "-numa mem," options
Add "-numa mem," option like following as Paolo suggested: -numa mem,nodeid=0,size=1G This new option will make later coming memory hotplug better. We will use the new options to specify nodes memory info, and just remain "-numa node,mem=xx" as legacy. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 1 + numa.c | 36 qemu-options.hx | 6 -- vl.c| 2 ++ 4 files changed, 43 insertions(+), 2 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 797490e..58c728c 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -130,6 +130,7 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); diff --git a/numa.c b/numa.c index c4fa665..c676c5e 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,31 @@ static int numa_node_parse(NumaNodeOptions *opts) return 0; } +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +} else { +nodenr = nb_numa_mem_nodes; +} + +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} + +if (opts->has_size) { +mem_size = opts->size; +numa_info[nodenr].node_mem = mem_size; +} + +return 0; +} + int numa_init_func(QemuOpts *opts, void *opaque) { NumaOptions *object = NULL; @@ -101,6 +126,13 @@ int numa_init_func(QemuOpts *opts, void *opaque) } nb_numa_nodes++; break; +case NUMA_OPTIONS_KIND_MEM: +ret = numa_mem_parse(object->mem); +if (ret) { +goto error; +} +nb_numa_mem_nodes++; +break; default: fprintf(stderr, "qemu: Invalid NUMA options type.\n"); ret = -1; @@ -119,6 +151,10 @@ error: void set_numa_nodes(void) { +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + if (nb_numa_nodes > 0) { int i; diff --git a/qemu-options.hx 
b/qemu-options.hx index d15338e..e9123b8 100644 --- a/qemu-options.hx +++ b/qemu-options.hx @@ -95,11 +95,13 @@ specifies the maximum number of hotpluggable CPUs. ETEXI DEF("numa", HAS_ARG, QEMU_OPTION_numa, -"-numa node[,mem=size][,cpus=cpu[-cpu]][,nodeid=node]\n", QEMU_ARCH_ALL) +"-numa node[,nodeid=node][,cpus=cpu[-cpu]]\n" +"-numa mem[,nodeid=node][,size=size]\n" +, QEMU_ARCH_ALL) STEXI @item -numa @var{opts} @findex -numa -Simulate a multi node NUMA system. If mem and cpus are omitted, resources +Simulate a multi node NUMA system. If @var{size} and @var{cpus} are omitted, resources are split equally. ETEXI diff --git a/vl.c b/vl.c index bbded8f..fd2afc7 100644 --- a/vl.c +++ b/vl.c @@ -250,6 +250,7 @@ static QTAILQ_HEAD(, FWBootEntry) fw_boot_order = QTAILQ_HEAD_INITIALIZER(fw_boot_order); int nb_numa_nodes; +int nb_numa_mem_nodes; NodeInfo numa_info[MAX_NODES]; uint8_t qemu_uuid[16]; @@ -2890,6 +2891,7 @@ int main(int argc, char **argv, char **envp) } nb_numa_nodes = 0; +nb_numa_mem_nodes = 0; nb_nics = 0; bdrv_init_with_whitelist(); -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 07/13] NUMA: expand MAX_NODES from 64 to 128
libnuma chose 128 for MAX_NODES, so we follow libnuma here. Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 58c728c..12529a1 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -127,7 +127,7 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 04/13] NUMA: convert -numa option to use OptsVisitor
Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 3 +- numa.c | 148 +++- qapi-schema.json| 30 ++ vl.c| 11 +++- 4 files changed, 114 insertions(+), 78 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 662c6fb..797490e 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -135,9 +135,10 @@ typedef struct node_info { DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; -void numa_add(const char *optarg); void set_numa_nodes(void); void set_numa_modes(void); +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index 1bc0fad..c4fa665 100644 --- a/numa.c +++ b/numa.c @@ -24,101 +24,97 @@ */ #include "sysemu/sysemu.h" - -static void numa_node_parse_cpus(int nodenr, const char *cpus) +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) { -char *endptr; -unsigned long long value, endvalue; - -/* Empty CPU range strings will be considered valid, they will simply - * not set any bit in the CPU bitmap. 
- */ -if (!*cpus) { -return; -} +uint16_t nodenr; +uint16List *cpus = NULL; -if (parse_uint(cpus, &value, &endptr, 10) < 0) { -goto error; -} -if (*endptr == '-') { -if (parse_uint_full(endptr + 1, &endvalue, 10) < 0) { -goto error; -} -} else if (*endptr == '\0') { -endvalue = value; +if (opts->has_nodeid) { +nodenr = opts->nodeid; } else { -goto error; +nodenr = nb_numa_nodes; } -if (endvalue >= MAX_CPUMASK_BITS) { -endvalue = MAX_CPUMASK_BITS - 1; -fprintf(stderr, -"qemu: NUMA: A max of %d VCPUs are supported\n", - MAX_CPUMASK_BITS); +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; } -if (endvalue < value) { -goto error; +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +if (cpus->value > MAX_CPUMASK_BITS) { +fprintf(stderr, "qemu: cpu number %" PRIu16 " is bigger than %d", +cpus->value, MAX_CPUMASK_BITS); +continue; +} +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } -bitmap_set(numa_info[nodenr].node_cpu, value, endvalue-value+1); -return; +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +numa_info[nodenr].node_mem = mem_size; +} -error: -fprintf(stderr, "qemu: Invalid NUMA CPU range: %s\n", cpus); -exit(1); +return 0; } -void numa_add(const char *optarg) +int numa_init_func(QemuOpts *opts, void *opaque) { -char option[128]; -char *endptr; -unsigned long long nodenr; - -optarg = get_opt_name(option, 128, optarg, ','); -if (*optarg == ',') { -optarg++; +NumaOptions *object = NULL; +Error *err = NULL; +int ret = 0; + +{ +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaOptions(opts_get_visitor(ov), &object, NULL, &err); +opts_visitor_cleanup(ov); } -if (!strcmp(option, "node")) { - -if (nb_numa_nodes >= MAX_NODES) { -fprintf(stderr, "qemu: too many NUMA nodes\n"); -exit(1); -} -if (get_param_value(option, 128, 
"nodeid", optarg) == 0) { -nodenr = nb_numa_nodes; -} else { -if (parse_uint_full(option, &nodenr, 10) < 0) { -fprintf(stderr, "qemu: Invalid NUMA nodeid: %s\n", option); -exit(1); -} -} - -if (nodenr >= MAX_NODES) { -fprintf(stderr, "qemu: invalid NUMA nodeid: %llu\n", nodenr); -exit(1); -} +if (error_is_set(&err)) { +fprintf(stderr, "qemu: %s\n", error_get_pretty(err)); +error_free(err); +ret = -1; +goto er
[Qemu-devel] [PATCH V12 02/13] NUMA: check if the total numa memory size is equal to ram_size
If the total memory assigned to the NUMA nodes is not equal to the assigned RAM size, wrong data is written to the ACPI table; the guest then ignores the invalid ACPI table and treats all memory as belonging to one node. This is a bug, so we should check the sizes to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- numa.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/numa.c b/numa.c index ce7736a..beda80e 100644 --- a/numa.c +++ b/numa.c @@ -150,6 +150,16 @@ void set_numa_nodes(void) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.4.21.g992c386
[Qemu-devel] [PATCH V12 09/13] NUMA: set guest numa nodes memory policy
Set the memory policies of the guest NUMA nodes using the mbind(2) system call, node by node. After this patch, we are able to set the memory policies of guest nodes through the QEMU options; this aims to solve the performance issue of guest cross-node memory access. And as you all know, if PCI passthrough is used, the direct-attached device uses DMA transfers between the device and the qemu process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page is incremented by one and page migration will not work. AutoNUMA will not work either. Therefore, we should set the memory allocation policies of the guest nodes before the pages are actually mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- numa.c | 86 ++ 1 file changed, 86 insertions(+) diff --git a/numa.c b/numa.c index da4dbbd..915a67a 100644 --- a/numa.c +++ b/numa.c @@ -27,6 +27,16 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif + QemuOptsList qemu_numa_opts = { .name = "numa", .implied_opt_name = "type", @@ -228,6 +238,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ? 
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ +void *ram_ptr; +RAMBlock *block; +ram_addr_t len, ram_offset = 0; +int bind_mode; +int i; + +QTAILQ_FOREACH(block, &ram_list.blocks, next) { +if (!strcmp(block->mr->name, "pc.ram")) { +break; +} +} + +if (block->host == NULL) { +return -1; +} + +ram_ptr = block->host; +for (i = 0; i < nodeid; i++) { +len = numa_info[i].node_mem; +ram_offset += len; +} + +len = numa_info[nodeid].node_mem; +bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, +nodes, maxnode + 2, 0)) { +perror("mbind"); +return -1; +} +#endif + +return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -240,4 +319,11 @@ void set_numa_modes(void) } } } + +for (i = 0; i < nb_numa_nodes; i++) { +if (set_node_mem_policy(i) == -1) { +fprintf(stderr, +"qemu: can not set host memory policy for node%d\n", i); +} +} } -- 1.8.4.21.g992c386
Re: [Qemu-devel] [PATCH V11 02/11] NUMA: split -numa option
On 09/04/2013 09:49 AM, Eduardo Habkost wrote: > On Fri, Aug 30, 2013 at 11:10:41AM +0800, Wanlong Gao wrote: >> Change -numa option like following as Paolo suggested: >> -numa node,nodeid=0,cpus=0-1 \ >> -numa mem,nodeid=0,size=1G >> >> This new option will make later coming memory hotplug better. >> And this new option is implemented using OptsVisitor. >> >> And just remain "-numa node,mem=xx" as legacy. >> >> Reviewed-by: Laszlo Ersek >> Signed-off-by: Wanlong Gao > > Would it be possible to first move the existing code as-is to numa.c, > then introduce qemu_numa_opts, and then introduce "-numa mem"? It would > make the patch much easier to review. I thought this patch is straightforward, but if you like I can split as you said. ;) > >> --- >> Makefile.target | 2 +- >> include/sysemu/sysemu.h | 3 + >> numa.c | 144 >> >> qemu-options.hx | 6 +- >> vl.c| 113 ++--- >> 5 files changed, 168 insertions(+), 100 deletions(-) >> create mode 100644 numa.c >> >> diff --git a/Makefile.target b/Makefile.target >> index 9a49852..7e1fddf 100644 >> --- a/Makefile.target >> +++ b/Makefile.target >> @@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER >> # >> # System emulator target >> ifdef CONFIG_SOFTMMU >> -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o >> +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o >> obj-y += qtest.o >> obj-y += hw/ >> obj-$(CONFIG_FDT) += device_tree.o >> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h >> index b1aa059..489b4b6 100644 >> --- a/include/sysemu/sysemu.h >> +++ b/include/sysemu/sysemu.h >> @@ -129,8 +129,11 @@ extern QEMUClockType rtc_clock; >> #define MAX_NODES 64 >> #define MAX_CPUMASK_BITS 255 >> extern int nb_numa_nodes; >> +extern int nb_numa_mem_nodes; >> extern uint64_t node_mem[MAX_NODES]; >> extern unsigned long *node_cpumask[MAX_NODES]; >> +extern QemuOptsList qemu_numa_opts; >> +int numa_init_func(QemuOpts *opts, void *opaque); >> >> #define MAX_OPTION_ROMS 16 >> typedef struct 
QEMUOptionRom { >> diff --git a/numa.c b/numa.c >> new file mode 100644 >> index 000..e6924f4 >> --- /dev/null >> +++ b/numa.c >> @@ -0,0 +1,144 @@ >> +/* >> + * QEMU System Emulator >> + * >> + * Copyright (c) 2013 Fujitsu Ltd. >> + * Author: Wanlong Gao >> + * >> + * Permission is hereby granted, free of charge, to any person obtaining a >> copy >> + * of this software and associated documentation files (the "Software"), to >> deal >> + * in the Software without restriction, including without limitation the >> rights >> + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell >> + * copies of the Software, and to permit persons to whom the Software is >> + * furnished to do so, subject to the following conditions: >> + * >> + * The above copyright notice and this permission notice shall be included >> in >> + * all copies or substantial portions of the Software. >> + * >> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS >> OR >> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, >> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL >> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR >> OTHER >> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING >> FROM, >> + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN >> + * THE SOFTWARE. >> + */ >> + >> +#include "sysemu/sysemu.h" >> +#include "qemu/bitmap.h" >> +#include "qapi-visit.h" >> +#include "qapi/opts-visitor.h" >> +#include "qapi/dealloc-visitor.h" >> + >> +QemuOptsList qemu_numa_opts = { >> +.name = "numa", >> +.implied_opt_name = "type", >> +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), >> +.desc = { { 0 } } /* validated with OptsVisitor */ >> +}; >> + >> +static int numa_node_parse(NumaNodeOptions *opts) >> +{ >> +uint16_t nodenr; >> +uint16List *cpus = NULL; >> + >> +if (opts->has
Re: [Qemu-devel] [PATCH V11 06/11] NUMA: parse guest numa nodes memory policy
On 09/04/2013 10:28 AM, Eduardo Habkost wrote: > On Fri, Aug 30, 2013 at 11:10:45AM +0800, Wanlong Gao wrote: >> The memory policy setting format is like: >> >> policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N >> And we are adding this setting as a suboption of "-numa mem,", >> the memory policy then can be set like following: >> -numa node,nodeid=0,cpus=0 \ >> -numa node,nodeid=1,cpus=1 \ >> -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ >> -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 >> >> Signed-off-by: Wanlong Gao >> --- >> include/sysemu/sysemu.h | 5 - >> numa.c | 13 + >> qapi-schema.json| 33 +++-- >> vl.c| 3 +++ >> 4 files changed, 51 insertions(+), 3 deletions(-) >> >> diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h >> index e1e4320..2d04bad 100644 >> --- a/include/sysemu/sysemu.h >> +++ b/include/sysemu/sysemu.h >> @@ -127,13 +127,16 @@ extern size_t boot_splash_filedata_size; >> extern uint8_t qemu_extra_params_fw[2]; >> extern QEMUClockType rtc_clock; >> >> -#define MAX_NODES 64 >> +#define MAX_NODES 128 > > Can you please include this in a separate patch? OK, thank you. Regards, Wanlong Gao >
[Qemu-devel] [PATCH V11 10/11] NUMA: add qmp command query-numa
Add qmp command query-numa to show guest NUMA information. Signed-off-by: Wanlong Gao --- numa.c | 65 qapi-schema.json | 36 +++ qmp-commands.hx | 49 ++ 3 files changed, 150 insertions(+) diff --git a/numa.c b/numa.c index b911ffb..b0472c0 100644 --- a/numa.c +++ b/numa.c @@ -379,3 +379,68 @@ error: numa_info[nodeid].relative = old_relative; return; } + +NUMAInfoList *qmp_query_numa(Error **errp) +{ +NUMAInfoList *head = NULL, *cur_item = NULL; +CPUState *cpu; +int i; + +for (i = 0; i < nb_numa_nodes; i++) { +NUMAInfoList *info; +uint16List *cur_cpu_item = NULL; +info = g_malloc0(sizeof(*info)); +info->value = g_malloc0(sizeof(*info->value)); +info->value->nodeid = i; +for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { +if (cpu->numa_node == i) { +uint16List *node_cpu = g_malloc0(sizeof(*node_cpu)); +node_cpu->value = cpu->cpu_index; + +if (!cur_cpu_item) { +info->value->cpus = cur_cpu_item = node_cpu; +} else { +cur_cpu_item->next = node_cpu; +cur_cpu_item = node_cpu; +} +} +} +info->value->memory = numa_info[i].node_mem; + +#ifdef __linux__ +info->value->policy = numa_info[i].policy; +info->value->relative = numa_info[i].relative; + +unsigned long first, next; +next = first = find_first_bit(numa_info[i].host_mem, MAX_NODES); +if (first == MAX_NODES) { +goto end; +} +uint16List *cur_node_item = g_malloc0(sizeof(*cur_node_item)); +cur_node_item->value = first; +info->value->host_nodes = cur_node_item; +do { +next = find_next_bit(numa_info[i].host_mem, MAX_NODES, + next + 1); +if (next == MAX_NODES) { +break; +} + +uint16List *host_node = g_malloc0(sizeof(*host_node)); +host_node->value = next; +cur_node_item->next = host_node; +cur_node_item = host_node; +} while (true); +end: +#endif + +if (!cur_item) { +head = cur_item = info; +} else { +cur_item->next = info; +cur_item = info; +} +} + +return head; +} diff --git a/qapi-schema.json b/qapi-schema.json index 5e04b58..fb58c1d 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3870,3 +3870,39 @@ { 
'command': 'set-mem-policy', 'data': {'nodeid': 'uint16', '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] } } + +## +# @NUMAInfo: +# +# Information about guest NUMA nodes +# +# @nodeid: NUMA node ID +# +# @cpus: VCPUs contained in this node +# +# @memory: memory size of this node +# +# @policy: memory policy of this node +# +# @relative: if host nodes are relative for memory policy +# +# @host-nodes: host nodes for its memory policy +# +# Since: 1.7 +# +## +{ 'type': 'NUMAInfo', + 'data': {'nodeid': 'uint16', 'cpus': ['uint16'], 'memory': 'uint64', + 'policy': 'NumaNodePolicy', 'relative': 'bool', + 'host-nodes': ['uint16'] }} + +## +# @query-numa: +# +# Returns a list of information about each guest node. +# +# Returns: a list of @NUMAInfo for each guest node +# +# Since: 1.7 +## +{ 'command': 'query-numa', 'returns': ['NUMAInfo'] } diff --git a/qmp-commands.hx b/qmp-commands.hx index 52e6ff3..20f1e74 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3165,3 +3165,52 @@ Notes: to all available host nodes. EQMP + +{ +.name = "query-numa", +.args_type = "", +.mhandler.cmd_new = qmp_marshal_input_query_numa, +}, + +SQMP +query-numa +- + +Show NUMA information. + +Return a json-array. Each NUMA node is represented by a json-object, +which contains: + +- "nodeid": NUMA node ID (json-int) +- "cpus": a json-array of contained VCPUs +- "memory": amount of memory in each node in bytes (json-int) +- "policy": memory policy of this node (json-string) +- "relative": if host nodes is relative for its memory policy (json-bool) +- "host-nodes": a json-array of host nodes for its memory policy + +Arguments: + +Example: + +-> { "execute": "query-numa" } +<- { "return":[ +{ +"nodeid": 0, +"cpus": [0, 1], +"memory": 536870912, +"policy": "membind", +"relative": false, +"host-nodes": [0, 1] +}, +{ +"nodeid": 1, +"cpus": [2, 3], +"memory": 536870912, +"policy": "interleave", +"relative": false, +"host-nodes": [1] +} + ] + } + +EQMP -- 1.8.4
[Qemu-devel] [PATCH V11 01/11] NUMA: add NumaOptions, NumaNodeOptions and NumaMemOptions
These are used to generate stuff for OptsVisitor. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- qapi-schema.json | 47 +++ 1 file changed, 47 insertions(+) diff --git a/qapi-schema.json b/qapi-schema.json index a51f7d2..11851a1 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3773,3 +3773,50 @@ ## { 'command': 'query-rx-filter', 'data': { '*name': 'str' }, 'returns': ['RxFilterInfo'] } + +## +# @NumaOptions +# +# A discriminated record of NUMA options. (for OptsVisitor) +# +# Since 1.7 +## +{ 'union': 'NumaOptions', + 'data': { +'node': 'NumaNodeOptions', +'mem': 'NumaMemOptions' }} + +## +# @NumaNodeOptions +# +# Create a guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @cpus: #optional VCPUs belong to this node +# +# @mem: #optional memory size of this node (remain as legacy) +# +# Since: 1.7 +## +{ 'type': 'NumaNodeOptions', + 'data': { + '*nodeid': 'uint16', + '*cpus': ['uint16'], + '*mem':'str' }} + +## +# @NumaMemOptions +# +# Set memory information of guest NUMA node. (for OptsVisitor) +# +# @nodeid: #optional NUMA node ID +# +# @size: #optional memory size of this node +# +# Since 1.7 +## +{ 'type': 'NumaMemOptions', + 'data': { + '*nodeid': 'uint16', + '*size': 'size' }} -- 1.8.4
[Qemu-devel] [PATCH V11 05/11] NUMA: Add numa_info structure to contain numa nodes info
Add the numa_info structure to contain the numa nodes memory, VCPUs information and the future added numa nodes host memory policies. Reviewed-by: Eduardo Habkost Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- hw/i386/pc.c| 4 ++-- include/sysemu/sysemu.h | 8 ++-- monitor.c | 2 +- numa.c | 23 --- vl.c| 7 +++ 5 files changed, 24 insertions(+), 20 deletions(-) diff --git a/hw/i386/pc.c b/hw/i386/pc.c index 3a620a1..2243184 100644 --- a/hw/i386/pc.c +++ b/hw/i386/pc.c @@ -653,14 +653,14 @@ static FWCfgState *bochs_bios_init(void) unsigned int apic_id = x86_cpu_apic_id_from_index(i); assert(apic_id < apic_id_limit); for (j = 0; j < nb_numa_nodes; j++) { -if (test_bit(i, node_cpumask[j])) { +if (test_bit(i, numa_info[j].node_cpu)) { numa_fw_cfg[apic_id + 1] = cpu_to_le64(j); break; } } } for (i = 0; i < nb_numa_nodes; i++) { -numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(node_mem[i]); +numa_fw_cfg[apic_id_limit + 1 + i] = cpu_to_le64(numa_info[i].node_mem); } fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA, numa_fw_cfg, (1 + apic_id_limit + nb_numa_nodes) * diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 9440edb..e1e4320 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -9,6 +9,7 @@ #include "qapi-types.h" #include "qemu/notify.h" #include "qemu/main-loop.h" +#include "qemu/bitmap.h" /* vl.c */ @@ -130,8 +131,11 @@ extern QEMUClockType rtc_clock; #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; -extern uint64_t node_mem[MAX_NODES]; -extern unsigned long *node_cpumask[MAX_NODES]; +typedef struct node_info { +uint64_t node_mem; +DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +} NodeInfo; +extern NodeInfo numa_info[MAX_NODES]; extern QemuOptsList qemu_numa_opts; int numa_init_func(QemuOpts *opts, void *opaque); void set_numa_nodes(void); diff --git a/monitor.c b/monitor.c index ee9744c..e5fc240 100644 --- a/monitor.c +++ b/monitor.c @@ -1826,7 +1826,7 @@ static void do_info_numa(Monitor *mon, 
const QDict *qdict) } monitor_printf(mon, "\n"); monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -node_mem[i] >> 20); +numa_info[i].node_mem >> 20); } } diff --git a/numa.c b/numa.c index 035fb86..3e2dfc1 100644 --- a/numa.c +++ b/numa.c @@ -53,7 +53,7 @@ static int numa_node_parse(NumaNodeOptions *opts) } for (cpus = opts->cpus; cpus; cpus = cpus->next) { -bitmap_set(node_cpumask[nodenr], cpus->value, 1); +bitmap_set(numa_info[nodenr].node_cpu, cpus->value, 1); } if (opts->has_mem) { @@ -64,7 +64,7 @@ static int numa_node_parse(NumaNodeOptions *opts) fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); return -1; } -node_mem[nodenr] = mem_size; +numa_info[nodenr].node_mem = mem_size; } return 0; @@ -88,7 +88,7 @@ static int numa_mem_parse(NumaMemOptions *opts) if (opts->has_size) { mem_size = opts->size; -node_mem[nodenr] = mem_size; +numa_info[nodenr].node_mem = mem_size; } return 0; @@ -160,7 +160,7 @@ void set_numa_nodes(void) * and distribute the available memory equally across all nodes */ for (i = 0; i < nb_numa_nodes; i++) { -if (node_mem[i] != 0) { +if (numa_info[i].node_mem != 0) { break; } } @@ -172,15 +172,16 @@ void set_numa_nodes(void) * the final node gets the rest. */ for (i = 0; i < nb_numa_nodes - 1; i++) { -node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); -usedmem += node_mem[i]; +numa_info[i].node_mem = (ram_size / nb_numa_nodes) & +~((1 << 23UL) - 1); +usedmem += numa_info[i].node_mem; } -node_mem[i] = ram_size - usedmem; +numa_info[i].node_mem = ram_size - usedmem; } uint64_t numa_total = 0; for (i = 0; i < nb_numa_nodes; i++) { -numa_total += node_mem[i]; +numa_total += numa_info[i].node_mem; } if (numa_total != ram_size) { fprintf(stderr, "qemu: numa nodes total memory size " @@ -189,7 +190,7 @@ void set_numa_nodes(void) } for (i = 0; i < nb_numa_nodes; i++) { -if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { +if (!bitmap_empty(
[Qemu-devel] [PATCH V11 07/11] NUMA: set guest numa nodes memory policy
Set the guest NUMA nodes' memory policies using the mbind(2) system call, node by node. After this patch, we are able to set the guest nodes' memory policies through the QEMU options; this aims to solve the guest cross-node memory access performance issue. And as you all know, if PCI-passthrough is used, the direct-attached device uses DMA transfers between the device and the QEMU process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page will be incremented by one and page migration will not work. AutoNUMA won't work either. So, we should set the guest nodes' memory allocation policies before the pages are really mapped. Signed-off-by: Andre Przywara Signed-off-by: Wanlong Gao --- numa.c | 85 ++ 1 file changed, 85 insertions(+) diff --git a/numa.c b/numa.c index 4ccc6cb..89be03d 100644 --- a/numa.c +++ b/numa.c @@ -28,6 +28,15 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "exec/memory.h" + +#ifdef __linux__ +#include +#ifndef MPOL_F_RELATIVE_NODES +#define MPOL_F_RELATIVE_NODES (1 << 14) +#define MPOL_F_STATIC_NODES (1 << 15) +#endif +#endif QemuOptsList qemu_numa_opts = { .name = "numa", @@ -219,6 +228,75 @@ void set_numa_nodes(void) } } +#ifdef __linux__ +static int node_parse_bind_mode(unsigned int nodeid) +{ +int bind_mode; + +switch (numa_info[nodeid].policy) { +case NUMA_NODE_POLICY_DEFAULT: +case NUMA_NODE_POLICY_PREFERRED: +case NUMA_NODE_POLICY_MEMBIND: +case NUMA_NODE_POLICY_INTERLEAVE: +bind_mode = numa_info[nodeid].policy; +break; +default: +bind_mode = NUMA_NODE_POLICY_DEFAULT; +return bind_mode; +} + +bind_mode |= numa_info[nodeid].relative ?
+MPOL_F_RELATIVE_NODES : MPOL_F_STATIC_NODES; + +return bind_mode; +} +#endif + +static int set_node_mem_policy(int nodeid) +{ +#ifdef __linux__ +void *ram_ptr; +RAMBlock *block; +ram_addr_t len, ram_offset = 0; +int bind_mode; +int i; + +QTAILQ_FOREACH(block, &ram_list.blocks, next) { +if (!strcmp(block->mr->name, "pc.ram")) { +break; +} +} + +if (block->host == NULL) { +return -1; +} + +ram_ptr = block->host; +for (i = 0; i < nodeid; i++) { +len = numa_info[i].node_mem; +ram_offset += len; +} + +len = numa_info[nodeid].node_mem; +bind_mode = node_parse_bind_mode(nodeid); +unsigned long *nodes = numa_info[nodeid].host_mem; + +/* This is a workaround for a long standing bug in Linux' + * mbind implementation, which cuts off the last specified + * node. To stay compatible should this bug be fixed, we + * specify one more node and zero this one out. + */ +unsigned long maxnode = find_last_bit(nodes, MAX_NODES); +if (syscall(SYS_mbind, ram_ptr + ram_offset, len, bind_mode, +nodes, maxnode + 2, 0)) { +perror("mbind"); +return -1; +} +#endif + +return 0; +} + void set_numa_modes(void) { CPUState *cpu; @@ -231,4 +309,11 @@ void set_numa_modes(void) } } } + +for (i = 0; i < nb_numa_nodes; i++) { +if (set_node_mem_policy(i) == -1) { +fprintf(stderr, +"qemu: can not set host memory policy for node%d\n", i); +} +} } -- 1.8.4
[Qemu-devel] [PATCH V11 00/11] Add support for binding guest numa nodes to host numa nodes
As you know, QEMU currently cannot direct its memory allocation, which may cause a performance regression for guest cross-node memory access. Worse, if PCI-passthrough is used, the direct-attached device uses DMA transfers between the device and the QEMU process. All pages of the guest will be pinned by get_user_pages(). KVM_ASSIGN_PCI_DEVICE ioctl kvm_vm_ioctl_assign_device() =>kvm_assign_device() => kvm_iommu_map_memslots() => kvm_iommu_map_pages() => kvm_pin_pages() So, with a direct-attached device, the page count of every guest page will be incremented by one and page migration will not work. AutoNUMA won't work either. So, we should set the guest nodes' memory allocation policy before the pages are really mapped. With this patch set, we are able to set the guest nodes' memory policy as follows: -numa node,nodeid=0,cpus=0, \ -numa mem,size=1024M,policy=membind,host-nodes=0-1 \ -numa node,nodeid=1,cpus=1 \ -numa mem,size=1024M,policy=interleave,host-nodes=1 This supports a format like "policy={default|membind|interleave|preferred},relative=true,host-nodes=N-N". The series also adds the "set-mem-policy" QMP and HMP commands to set the memory policy, adds a QMP command "query-numa" to show NUMA information, and converts the "info numa" monitor command to use the "query-numa" QMP command.
V1->V2: change to use QemuOpts in numa options (Paolo) handle Error in mpol parser (Paolo) change qmp command format to mem-policy=membind,mem-hostnode=0-1 like (Paolo) V2->V3: also handle Error in cpus parser (5/10) split out common parser from cpus and hostnode parser (Bandan 6/10) V3-V4: rebase to request for comments V4->V5: use OptVisitor and split -numa option (Paolo) - s/set-mpol/set-mem-policy (Andreas) - s/mem-policy/policy - s/mem-hostnode/host-nodes fix hmp command process after error (Luiz) add qmp command query-numa and convert info numa to it (Luiz) V5->V6: remove tabs in json file (Laszlo, Paolo) add back "-numa node,mem=xxx" as legacy (Paolo) change cpus and host-nodes to array (Laszlo, Eric) change "nodeid" to "uint16" add NumaMemPolicy enum type (Eric) rebased on Laszlo's "OptsVisitor: support / flatten integer ranges for repeating options" patch set, thanks for Laszlo's help V6-V7: change UInt16 to uint16 (Laszlo) fix a typo in adding qmp command set-mem-policy V7-V8: rebase to current master with Laszlo's V2 of OptsVisitor patch set fix an adding white space line error V8->V9: rebase to current master check if total numa memory size is equal to ram_size (Paolo) add comments to the OptsVisitor stuff in qapi-schema.json (Eric, Laszlo) replace the use of numa_num_configured_nodes() (Andrew) avoid abusing the fact i==nodeid (Andrew) V9->V10: rebase to current master remove libnuma (Andrew) MAX_NODES=64 -> MAX_NODES=128 since libnuma selected 128 (Andrew) use MAX_NODES instead of MAX_CPUMASK_BITS for host_mem bitmap (Andrew) remove a useless clear_bit() operation (Andrew) V10->V11: rebase to current master fix "maxnode" argument of mbind(2) *I hope this can catch up the train of 1.7.* Thanks, Wanlong Gao Wanlong Gao (11): NUMA: add NumaOptions, NumaNodeOptions and NumaMemOptions NUMA: split -numa option NUMA: check if the total numa memory size is equal to ram_size NUMA: move numa related code to numa.c NUMA: Add numa_info structure to contain numa 
nodes info NUMA: parse guest numa nodes memory policy NUMA: set guest numa nodes memory policy NUMA: add qmp command set-mem-policy to set memory policy for NUMA node NUMA: add hmp command set-mem-policy NUMA: add qmp command query-numa NUMA: convert hmp command info_numa to use qmp command query_numa Makefile.target | 2 +- cpus.c | 14 -- hmp-commands.hx | 16 ++ hmp.c | 119 + hmp.h | 2 + hw/i386/pc.c| 4 +- include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 18 +- monitor.c | 21 +-- numa.c | 446 qapi-schema.json| 133 +++ qemu-options.hx | 6 +- qmp-commands.hx | 90 ++ vl.c| 160 ++--- 14 files changed, 847 insertions(+), 185 deletions(-) create mode 100644 numa.c -- 1.8.4
[Qemu-devel] [PATCH V11 11/11] NUMA: convert hmp command info_numa to use qmp command query_numa
Signed-off-by: Wanlong Gao --- hmp.c | 54 ++ hmp.h | 1 + monitor.c | 21 + 3 files changed, 56 insertions(+), 20 deletions(-) diff --git a/hmp.c b/hmp.c index ae695b0..2d878c6 100644 --- a/hmp.c +++ b/hmp.c @@ -27,6 +27,7 @@ #include "qapi-visit.h" #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" +#include "sysemu/sysemu.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1579,3 +1580,56 @@ error: hmp_handle_error(mon, &local_err); } + +void hmp_info_numa(Monitor *mon, const QDict *qdict) +{ +NUMAInfoList *node_list, *node; +uint16List *head; +int nodeid; +char *policy_str = NULL; + +node_list = qmp_query_numa(NULL); + +monitor_printf(mon, "%d nodes\n", nb_numa_nodes); +for (node = node_list; node; node = node->next) { +nodeid = node->value->nodeid; +monitor_printf(mon, "node %d cpus:", nodeid); +head = node->value->cpus; +for (head = node->value->cpus; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +monitor_printf(mon, "node %d size: %" PRId64 " MB\n", + nodeid, node->value->memory >> 20); +switch (node->value->policy) { +case NUMA_NODE_POLICY_DEFAULT: +policy_str = g_strdup("default"); +break; +case NUMA_NODE_POLICY_PREFERRED: +policy_str = g_strdup("preferred"); +break; +case NUMA_NODE_POLICY_MEMBIND: +policy_str = g_strdup("membind"); +break; +case NUMA_NODE_POLICY_INTERLEAVE: +policy_str = g_strdup("interleave"); +break; +default: +break; +} +monitor_printf(mon, "node %d policy: %s\n", + nodeid, policy_str ? : " "); +if (policy_str) { +free(policy_str); +} +monitor_printf(mon, "node %d relative: %s\n", nodeid, + node->value->relative ? 
"true" : "false"); +monitor_printf(mon, "node %d host-nodes:", nodeid); +for (head = node->value->host_nodes; head != NULL; head = head->next) { +monitor_printf(mon, " %d", (int)head->value); +} +monitor_printf(mon, "\n"); +} + +qapi_free_NUMAInfoList(node_list); +} diff --git a/hmp.h b/hmp.h index ae09525..56a5efd 100644 --- a/hmp.h +++ b/hmp.h @@ -37,6 +37,7 @@ void hmp_info_balloon(Monitor *mon, const QDict *qdict); void hmp_info_pci(Monitor *mon, const QDict *qdict); void hmp_info_block_jobs(Monitor *mon, const QDict *qdict); void hmp_info_tpm(Monitor *mon, const QDict *qdict); +void hmp_info_numa(Monitor *mon, const QDict *qdict); void hmp_quit(Monitor *mon, const QDict *qdict); void hmp_stop(Monitor *mon, const QDict *qdict); void hmp_system_reset(Monitor *mon, const QDict *qdict); diff --git a/monitor.c b/monitor.c index e5fc240..3904f48 100644 --- a/monitor.c +++ b/monitor.c @@ -1811,25 +1811,6 @@ static void do_info_mtree(Monitor *mon, const QDict *qdict) mtree_info((fprintf_function)monitor_printf, mon); } -static void do_info_numa(Monitor *mon, const QDict *qdict) -{ -int i; -CPUState *cpu; - -monitor_printf(mon, "%d nodes\n", nb_numa_nodes); -for (i = 0; i < nb_numa_nodes; i++) { -monitor_printf(mon, "node %d cpus:", i); -for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { -if (cpu->numa_node == i) { -monitor_printf(mon, " %d", cpu->cpu_index); -} -} -monitor_printf(mon, "\n"); -monitor_printf(mon, "node %d size: %" PRId64 " MB\n", i, -numa_info[i].node_mem >> 20); -} -} - #ifdef CONFIG_PROFILER int64_t qemu_time; @@ -2597,7 +2578,7 @@ static mon_cmd_t info_cmds[] = { .args_type = "", .params = "", .help = "show NUMA information", -.mhandler.cmd = do_info_numa, +.mhandler.cmd = hmp_info_numa, }, { .name = "usb", -- 1.8.4
[Qemu-devel] [PATCH V11 04/11] NUMA: move numa related code to numa.c
Signed-off-by: Wanlong Gao --- cpus.c | 14 - include/sysemu/cpus.h | 1 - include/sysemu/sysemu.h | 2 ++ numa.c | 76 + vl.c| 57 + 5 files changed, 79 insertions(+), 71 deletions(-) diff --git a/cpus.c b/cpus.c index b9e5685..2f0a750 100644 --- a/cpus.c +++ b/cpus.c @@ -1235,20 +1235,6 @@ static void tcg_exec_all(void) exit_request = 0; } -void set_numa_modes(void) -{ -CPUState *cpu; -int i; - -for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { -for (i = 0; i < nb_numa_nodes; i++) { -if (test_bit(cpu->cpu_index, node_cpumask[i])) { -cpu->numa_node = i; -} -} -} -} - void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg) { /* XXX: implement xxx_cpu_list for targets that still miss it */ diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h index 6502488..4f79081 100644 --- a/include/sysemu/cpus.h +++ b/include/sysemu/cpus.h @@ -23,7 +23,6 @@ extern int smp_threads; #define smp_threads 1 #endif -void set_numa_modes(void); void list_cpus(FILE *f, fprintf_function cpu_fprintf, const char *optarg); #endif diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index 489b4b6..9440edb 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -134,6 +134,8 @@ extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; extern QemuOptsList qemu_numa_opts; int numa_init_func(QemuOpts *opts, void *opaque); +void set_numa_nodes(void); +void set_numa_modes(void); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c index e6924f4..035fb86 100644 --- a/numa.c +++ b/numa.c @@ -142,3 +142,79 @@ error: return ret; } + +void set_numa_nodes(void) +{ +if (nb_numa_mem_nodes > nb_numa_nodes) { +nb_numa_nodes = nb_numa_mem_nodes; +} + +if (nb_numa_nodes > 0) { +int i; + +if (nb_numa_nodes > MAX_NODES) { +nb_numa_nodes = MAX_NODES; +} + +/* If no memory size if given for any node, assume the default case + * and distribute the available memory equally across all nodes + */ +for (i = 0; 
i < nb_numa_nodes; i++) { +if (node_mem[i] != 0) { +break; +} +} + +if (i == nb_numa_nodes) { +uint64_t usedmem = 0; + +/* On Linux, the each node's border has to be 8MB aligned, + * the final node gets the rest. + */ +for (i = 0; i < nb_numa_nodes - 1; i++) { +node_mem[i] = (ram_size / nb_numa_nodes) & ~((1 << 23UL) - 1); +usedmem += node_mem[i]; +} +node_mem[i] = ram_size - usedmem; +} + +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + +for (i = 0; i < nb_numa_nodes; i++) { +if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { +break; +} +} +/* assigning the VCPUs round-robin is easier to implement, guest OSes + * must cope with this anyway, because there are BIOSes out there in + * real machines which also use this scheme. + */ +if (i == nb_numa_nodes) { +for (i = 0; i < max_cpus; i++) { +set_bit(i, node_cpumask[i % nb_numa_nodes]); +} +} +} +} + +void set_numa_modes(void) +{ +CPUState *cpu; +int i; + +for (cpu = first_cpu; cpu != NULL; cpu = cpu->next_cpu) { +for (i = 0; i < nb_numa_nodes; i++) { +if (test_bit(cpu->cpu_index, node_cpumask[i])) { +cpu->numa_node = i; +} +} +} +} diff --git a/vl.c b/vl.c index 92aacc1..0828aa3 100644 --- a/vl.c +++ b/vl.c @@ -4143,62 +4143,7 @@ int main(int argc, char **argv, char **envp) exit(1); } -if (nb_numa_mem_nodes > nb_numa_nodes) { -nb_numa_nodes = nb_numa_mem_nodes; -} - -if (nb_numa_nodes > 0) { -int i; - -if (nb_numa_nodes > MAX_NODES) { -nb_numa_nodes = MAX_NODES; -} - -/* If no memory size if given for any node, assume the default case - * and distribute the available memory equally across all nodes - */ -for (i = 0; i < nb_numa_nodes; i++) { -if (node_mem[i] != 0) -break; -} -if (i == nb_numa_nodes) { -uint64_t usedmem = 0
[Qemu-devel] [PATCH V11 08/11] NUMA: add qmp command set-mem-policy to set memory policy for NUMA node
This QMP command allows user set guest node's memory policy through the QMP protocol. The qmp-shell command is like: set-mem-policy nodeid=0 policy=membind relative=true host-nodes=0-1 Signed-off-by: Wanlong Gao --- numa.c | 62 qapi-schema.json | 21 +++ qmp-commands.hx | 41 + 3 files changed, 124 insertions(+) diff --git a/numa.c b/numa.c index 89be03d..b911ffb 100644 --- a/numa.c +++ b/numa.c @@ -29,6 +29,7 @@ #include "qapi/opts-visitor.h" #include "qapi/dealloc-visitor.h" #include "exec/memory.h" +#include "qmp-commands.h" #ifdef __linux__ #include @@ -317,3 +318,64 @@ void set_numa_modes(void) } } } + +void qmp_set_mem_policy(uint16_t nodeid, bool has_policy, NumaNodePolicy policy, +bool has_relative, bool relative, +bool has_host_nodes, uint16List *host_nodes, +Error **errp) +{ +NumaNodePolicy old_policy; +bool old_relative; +DECLARE_BITMAP(host_mem, MAX_NODES); +uint16List *nodes; + +if (nodeid >= nb_numa_nodes) { +error_setg(errp, "Only has '%d' NUMA nodes", nb_numa_nodes); +return; +} + +bitmap_copy(host_mem, numa_info[nodeid].host_mem, MAX_NODES); +old_policy = numa_info[nodeid].policy; +old_relative = numa_info[nodeid].relative; + +numa_info[nodeid].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[nodeid].relative = false; +bitmap_zero(numa_info[nodeid].host_mem, MAX_NODES); + +if (!has_policy) { +if (set_node_mem_policy(nodeid) == -1) { +error_setg(errp, "Failed to set memory policy for node%" PRIu16, + nodeid); +goto error; +} +return; +} + +numa_info[nodeid].policy = policy; + +if (has_relative) { +numa_info[nodeid].relative = relative; +} + +if (!has_host_nodes) { +bitmap_fill(numa_info[nodeid].host_mem, MAX_NODES); +} + +for (nodes = host_nodes; nodes; nodes = nodes->next) { +bitmap_set(numa_info[nodeid].host_mem, nodes->value, 1); +} + +if (set_node_mem_policy(nodeid) == -1) { +error_setg(errp, "Failed to set memory policy for node%" PRIu16, + nodeid); +goto error; +} + +return; + +error: +bitmap_copy(numa_info[nodeid].host_mem, host_mem, MAX_NODES); 
+numa_info[nodeid].policy = old_policy; +numa_info[nodeid].relative = old_relative; +return; +} diff --git a/qapi-schema.json b/qapi-schema.json index 2160130..5e04b58 100644 --- a/qapi-schema.json +++ b/qapi-schema.json @@ -3849,3 +3849,24 @@ '*policy': 'NumaNodePolicy', '*relative': 'bool', '*host-nodes': ['uint16'] }} + +## +# @set-mem-policy: +# +# Set the host memory binding policy for guest NUMA node. +# +# @nodeid: The node ID of guest NUMA node to set memory policy to. +# +# @policy: #optional The memory policy to be set (default 'default'). +# +# @relative: #optional If the specified nodes are relative (default 'false') +# +# @host-nodes: #optional The host nodes range for memory policy. +# +# Returns: Nothing on success +# +# Since: 1.7 +## +{ 'command': 'set-mem-policy', + 'data': {'nodeid': 'uint16', '*policy': 'NumaNodePolicy', + '*relative': 'bool', '*host-nodes': ['uint16'] } } diff --git a/qmp-commands.hx b/qmp-commands.hx index cf47e3f..52e6ff3 100644 --- a/qmp-commands.hx +++ b/qmp-commands.hx @@ -3061,6 +3061,7 @@ Example: <- { "return": {} } EQMP + { .name = "query-rx-filter", .args_type = "name:s?", @@ -3124,3 +3125,43 @@ Example: } EQMP + +{ +.name = "set-mem-policy", +.args_type = "nodeid:i,policy:s?,relative:b?,host-nodes:q?", +.help = "Set the host memory binding policy for guest NUMA node", +.mhandler.cmd_new = qmp_marshal_input_set_mem_policy, +}, + +SQMP +set-mem-policy +-- + +Set the host memory binding policy for guest NUMA node + +Arguments: + +- "nodeid": The nodeid of guest NUMA node to set memory policy to. +(json-int) +- "policy": The memory policy to set. +(json-string, optional) +- "relative": If the specified nodes are relative. + (json-bool, optional) +- "host-nodes": The host nodes contained to this memory policy. +(a json-array of int, optional) + +Example: + +-> { "execute": "set-mem-policy", "arguments": { "nodeid": 0, +
[Qemu-devel] [PATCH V11 06/11] NUMA: parse guest numa nodes memory policy
The memory policy setting format is like: policy={default|membind|interleave|preferred}[,relative=true],host-nodes=N-N And we are adding this setting as a suboption of "-numa mem,", the memory policy then can be set like following: -numa node,nodeid=0,cpus=0 \ -numa node,nodeid=1,cpus=1 \ -numa mem,nodeid=0,size=1G,policy=membind,host-nodes=0-1 \ -numa mem,nodeid=1,size=1G,policy=interleave,relative=true,host-nodes=1 Signed-off-by: Wanlong Gao --- include/sysemu/sysemu.h | 5 - numa.c | 13 + qapi-schema.json| 33 +++-- vl.c| 3 +++ 4 files changed, 51 insertions(+), 3 deletions(-) diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index e1e4320..2d04bad 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -127,13 +127,16 @@ extern size_t boot_splash_filedata_size; extern uint8_t qemu_extra_params_fw[2]; extern QEMUClockType rtc_clock; -#define MAX_NODES 64 +#define MAX_NODES 128 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; extern int nb_numa_mem_nodes; typedef struct node_info { uint64_t node_mem; DECLARE_BITMAP(node_cpu, MAX_CPUMASK_BITS); +DECLARE_BITMAP(host_mem, MAX_NODES); +NumaNodePolicy policy; +bool relative; } NodeInfo; extern NodeInfo numa_info[MAX_NODES]; extern QemuOptsList qemu_numa_opts; diff --git a/numa.c b/numa.c index 3e2dfc1..4ccc6cb 100644 --- a/numa.c +++ b/numa.c @@ -74,6 +74,7 @@ static int numa_mem_parse(NumaMemOptions *opts) { uint16_t nodenr; uint64_t mem_size; +uint16List *nodes; if (opts->has_nodeid) { nodenr = opts->nodeid; @@ -91,6 +92,18 @@ static int numa_mem_parse(NumaMemOptions *opts) numa_info[nodenr].node_mem = mem_size; } +if (opts->has_policy) { +numa_info[nodenr].policy = opts->policy; +} + +if (opts->has_relative) { +numa_info[nodenr].relative = opts->relative; +} + +for (nodes = opts->host_nodes; nodes; nodes = nodes->next) { +bitmap_set(numa_info[nodenr].host_mem, nodes->value, 1); +} + return 0; } diff --git a/qapi-schema.json b/qapi-schema.json index 11851a1..2160130 100644 --- 
a/qapi-schema.json +++ b/qapi-schema.json @@ -3806,6 +3806,26 @@ '*mem':'str' }} ## +# @NumaNodePolicy +# +# NUMA node policy types +# +# @default: restore default policy, remove any nondefault policy +# +# @preferred: set the preferred node for allocation +# +# @membind: a strict policy that restricts memory allocation to the +# nodes specified +# +# @interleave: the page allocations is interleaved across the set +# of nodes specified +# +# Since 1.7 +## +{ 'enum': 'NumaNodePolicy', + 'data': [ 'default', 'preferred', 'membind', 'interleave' ] } + +## # @NumaMemOptions # # Set memory information of guest NUMA node. (for OptsVisitor) @@ -3814,9 +3834,18 @@ # # @size: #optional memory size of this node # +# @policy: #optional memory policy of this node +# +# @relative: #optional if the nodes specified are relative +# +# @host-nodes: #optional host nodes for its memory policy +# # Since 1.7 ## { 'type': 'NumaMemOptions', 'data': { - '*nodeid': 'uint16', - '*size': 'size' }} + '*nodeid': 'uint16', + '*size': 'size', + '*policy': 'NumaNodePolicy', + '*relative': 'bool', + '*host-nodes': ['uint16'] }} diff --git a/vl.c b/vl.c index 99667a0..c4cc623 100644 --- a/vl.c +++ b/vl.c @@ -2888,6 +2888,9 @@ int main(int argc, char **argv, char **envp) for (i = 0; i < MAX_NODES; i++) { numa_info[i].node_mem = 0; bitmap_zero(numa_info[i].node_cpu, MAX_CPUMASK_BITS); +bitmap_zero(numa_info[i].host_mem, MAX_NODES); +numa_info[i].policy = NUMA_NODE_POLICY_DEFAULT; +numa_info[i].relative = false; } nb_numa_nodes = 0; -- 1.8.4
[Qemu-devel] [PATCH V11 02/11] NUMA: split -numa option
Change -numa option like following as Paolo suggested: -numa node,nodeid=0,cpus=0-1 \ -numa mem,nodeid=0,size=1G This new option will make later coming memory hotplug better. And this new option is implemented using OptsVisitor. And just remain "-numa node,mem=xx" as legacy. Reviewed-by: Laszlo Ersek Signed-off-by: Wanlong Gao --- Makefile.target | 2 +- include/sysemu/sysemu.h | 3 + numa.c | 144 qemu-options.hx | 6 +- vl.c| 113 ++--- 5 files changed, 168 insertions(+), 100 deletions(-) create mode 100644 numa.c diff --git a/Makefile.target b/Makefile.target index 9a49852..7e1fddf 100644 --- a/Makefile.target +++ b/Makefile.target @@ -113,7 +113,7 @@ endif #CONFIG_BSD_USER # # System emulator target ifdef CONFIG_SOFTMMU -obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o +obj-y += arch_init.o cpus.o monitor.o gdbstub.o balloon.o ioport.o numa.o obj-y += qtest.o obj-y += hw/ obj-$(CONFIG_FDT) += device_tree.o diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h index b1aa059..489b4b6 100644 --- a/include/sysemu/sysemu.h +++ b/include/sysemu/sysemu.h @@ -129,8 +129,11 @@ extern QEMUClockType rtc_clock; #define MAX_NODES 64 #define MAX_CPUMASK_BITS 255 extern int nb_numa_nodes; +extern int nb_numa_mem_nodes; extern uint64_t node_mem[MAX_NODES]; extern unsigned long *node_cpumask[MAX_NODES]; +extern QemuOptsList qemu_numa_opts; +int numa_init_func(QemuOpts *opts, void *opaque); #define MAX_OPTION_ROMS 16 typedef struct QEMUOptionRom { diff --git a/numa.c b/numa.c new file mode 100644 index 000..e6924f4 --- /dev/null +++ b/numa.c @@ -0,0 +1,144 @@ +/* + * QEMU System Emulator + * + * Copyright (c) 2013 Fujitsu Ltd. 
+ * Author: Wanlong Gao + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "sysemu/sysemu.h" +#include "qemu/bitmap.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" + +QemuOptsList qemu_numa_opts = { +.name = "numa", +.implied_opt_name = "type", +.head = QTAILQ_HEAD_INITIALIZER(qemu_numa_opts.head), +.desc = { { 0 } } /* validated with OptsVisitor */ +}; + +static int numa_node_parse(NumaNodeOptions *opts) +{ +uint16_t nodenr; +uint16List *cpus = NULL; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} +} else { +nodenr = nb_numa_nodes; +} + +for (cpus = opts->cpus; cpus; cpus = cpus->next) { +bitmap_set(node_cpumask[nodenr], cpus->value, 1); +} + +if (opts->has_mem) { +int64_t mem_size; +char *endptr; +mem_size = strtosz(opts->mem, &endptr); +if (mem_size < 0 || *endptr) { +fprintf(stderr, "qemu: invalid numa mem size: %s\n", opts->mem); +return -1; +} +node_mem[nodenr] = mem_size; +} + +return 0; +} + +static int numa_mem_parse(NumaMemOptions *opts) +{ +uint16_t nodenr; +uint64_t mem_size; + +if (opts->has_nodeid) { +nodenr = opts->nodeid; +if (nodenr >= MAX_NODES) { +fprintf(stderr, "qemu: Max number of NUMA nodes reached: %" +PRIu16 "\n", nodenr); +return -1; +} +} else { +nodenr = nb_numa_mem_nodes; +} + +if (opts->has_size) { +mem_size = opts->size; +node_mem[nodenr] = mem_size; +} + +r
[Qemu-devel] [PATCH V11 03/11] NUMA: check if the total numa memory size is equal to ram_size
If the total memory assigned to the NUMA nodes is not equal to the assigned RAM size, wrong data is written to the ACPI table; the guest then ignores the invalid ACPI table and assigns all memory to a single node. This is a bug: we should check the sizes to ensure that we write the right data to the ACPI table. Signed-off-by: Wanlong Gao --- vl.c | 10 ++ 1 file changed, 10 insertions(+) diff --git a/vl.c b/vl.c index 0ef5c5a..92aacc1 100644 --- a/vl.c +++ b/vl.c @@ -4174,6 +4174,16 @@ int main(int argc, char **argv, char **envp) node_mem[i] = ram_size - usedmem; } +uint64_t numa_total = 0; +for (i = 0; i < nb_numa_nodes; i++) { +numa_total += node_mem[i]; +} +if (numa_total != ram_size) { +fprintf(stderr, "qemu: numa nodes total memory size " +"should equal to ram_size\n"); +exit(1); +} + for (i = 0; i < nb_numa_nodes; i++) { if (!bitmap_empty(node_cpumask[i], MAX_CPUMASK_BITS)) { break; -- 1.8.4
[Qemu-devel] [PATCH V11 09/11] NUMA: add hmp command set-mem-policy
Add the HMP command set-mem-policy to set the host memory policy for a guest NUMA node. A node's memory policy can then also be set from the monitor, for example: (qemu) set-mem-policy 0 policy=membind,relative=false,host-nodes=0-1 Signed-off-by: Wanlong Gao --- hmp-commands.hx | 16 ++ hmp.c | 65 + hmp.h | 1 + 3 files changed, 82 insertions(+) diff --git a/hmp-commands.hx b/hmp-commands.hx index 8c6b91a..fe3a26f 100644 --- a/hmp-commands.hx +++ b/hmp-commands.hx @@ -1587,6 +1587,22 @@ Executes a qemu-io command on the given block device. ETEXI { +.name = "set-mem-policy", +.args_type = "nodeid:i,args:s?", +.params = "nodeid [args]", +.help = "set host memory policy for a guest NUMA node", +.mhandler.cmd = hmp_set_mem_policy, +}, + +STEXI +@item set-mem-policy @var{nodeid} @var{args} +@findex set-mem-policy + +Set host memory policy for a guest NUMA node + +ETEXI + +{ .name = "info", .args_type = "item:s?", .params = "[subcommand]", diff --git a/hmp.c b/hmp.c index fcca6ae..ae695b0 100644 --- a/hmp.c +++ b/hmp.c @@ -24,6 +24,9 @@ #include "ui/console.h" #include "block/qapi.h" #include "qemu-io.h" +#include "qapi-visit.h" +#include "qapi/opts-visitor.h" +#include "qapi/dealloc-visitor.h" static void hmp_handle_error(Monitor *mon, Error **errp) { @@ -1514,3 +1517,65 @@ void hmp_qemu_io(Monitor *mon, const QDict *qdict) hmp_handle_error(mon, &err); } + +void hmp_set_mem_policy(Monitor *mon, const QDict *qdict) +{ +Error *local_err = NULL; +bool has_policy = true; +bool has_relative = true; +bool has_host_nodes = true; +QemuOpts *opts; +NumaMemOptions *object = NULL; +NumaNodePolicy policy = NUMA_NODE_POLICY_DEFAULT; +bool relative = false; +uint16List *host_nodes = NULL; + +uint64_t nodeid = qdict_get_int(qdict, "nodeid"); +const char *args = qdict_get_try_str(qdict, "args"); + +if (args == NULL) { +has_policy = false; +has_relative = false; +has_host_nodes = false; +} else { +opts = qemu_opts_parse(qemu_find_opts("numa"), args, 1); +if (opts == NULL) { +monitor_printf(mon, 
"Parsing memory policy args failed\n"); +return; +} else { +OptsVisitor *ov = opts_visitor_new(opts); +visit_type_NumaMemOptions(opts_get_visitor(ov), &object, NULL, + &local_err); +opts_visitor_cleanup(ov); + +if (error_is_set(&local_err)) { +goto error; +} + +has_policy = object->has_policy; +if (has_policy) { +policy = object->policy; +} +has_relative = object->has_relative; +if (has_relative) { +relative = object->relative; +} +has_host_nodes = object->has_host_nodes; +if (has_host_nodes) { +host_nodes = object->host_nodes; +} +} +} + +qmp_set_mem_policy(nodeid, has_policy, policy, has_relative, relative, + has_host_nodes, host_nodes, &local_err); +error: +if (object) { +QapiDeallocVisitor *dv = qapi_dealloc_visitor_new(); +visit_type_NumaMemOptions(qapi_dealloc_get_visitor(dv), + &object, NULL, NULL); +qapi_dealloc_visitor_cleanup(dv); +} + +hmp_handle_error(mon, &local_err); +} diff --git a/hmp.h b/hmp.h index 6c3bdcd..ae09525 100644 --- a/hmp.h +++ b/hmp.h @@ -87,5 +87,6 @@ void hmp_nbd_server_stop(Monitor *mon, const QDict *qdict); void hmp_chardev_add(Monitor *mon, const QDict *qdict); void hmp_chardev_remove(Monitor *mon, const QDict *qdict); void hmp_qemu_io(Monitor *mon, const QDict *qdict); +void hmp_set_mem_policy(Monitor *mon, const QDict *qdict); #endif -- 1.8.4