numaX: cpus=<id[-id]>,memory=<mb>[[,hostnodes=<id[-id]>][,policy=<preferred|bind|interleave>]]
example: ------- sockets:4 cores:2 memory:4096 numa: 1 numa0: cpus=0-1,memory=1024,hostnodes=0-1,policy=interleave numa1: cpus=2-3,memory=3072,hostnodes=2,policy=bind qemu command line ----------------- -object memory-backend-ram,size=1024M,policy=interleave,host-nodes=0-1,id=ram-node0 -numa node,nodeid=0,cpus=0-1,memdev=ram-node0 -object memory-backend-ram,size=3072M,policy=bind,host-nodes=2,id=ram-node1 -numa node,nodeid=1,cpus=2-3,memdev=ram-node1 Signed-off-by: Alexandre Derumier <aderum...@odiso.com> --- PVE/QemuServer.pm | 114 +++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 107 insertions(+), 7 deletions(-) diff --git a/PVE/QemuServer.pm b/PVE/QemuServer.pm index 94fd523..b758016 100644 --- a/PVE/QemuServer.pm +++ b/PVE/QemuServer.pm @@ -489,6 +489,19 @@ my $MAX_UNUSED_DISKS = 8; my $MAX_HOSTPCI_DEVICES = 4; my $MAX_SERIAL_PORTS = 4; my $MAX_PARALLEL_PORTS = 3; +my $MAX_NUMA = 8; + +my $numadesc = { + optional => 1, + type => 'string', format => 'pve-qm-numanode', + typetext => "cpus=<id[-id],memory=<mb>[[,hostnodes=<id[-id]>][,policy=<preferred|bind|interleave>]]", + description => "numa topology", +}; +PVE::JSONSchema::register_standard_option("pve-qm-numanode", $numadesc); + +for (my $i = 0; $i < $MAX_NUMA; $i++) { + $confdesc->{"numa$i"} = $numadesc; +} my $nic_model_list = ['rtl8139', 'ne2k_pci', 'e1000', 'pcnet', 'virtio', 'ne2k_isa', 'i82551', 'i82557b', 'i82559er', 'vmxnet3']; @@ -1278,6 +1291,31 @@ sub drive_is_cdrom { } +sub parse_numa { + my ($data) = @_; + + my $res = {}; + + foreach my $kvp (split(/,/, $data)) { + + if ($kvp =~ m/^memory=(\S+)$/) { + $res->{memory} = $1; + } elsif ($kvp =~ m/^policy=(preferred|bind|interleave)$/) { + $res->{policy} = $1; + } elsif ($kvp =~ m/^cpus=(\d+)(-(\d+))?$/) { + $res->{cpus}->{start} = $1; + $res->{cpus}->{end} = $3; + } elsif ($kvp =~ m/^hostnodes=(\d+)(-(\d+))?$/) { + $res->{hostnodes}->{start} = $1; + $res->{hostnodes}->{end} = $3; + } else { + return undef; + } + } + + return 
$res; +} + sub parse_hostpci { my ($value) = @_; @@ -1458,6 +1496,17 @@ sub verify_bootdisk { die "invalid boot disk '$value'\n"; } +PVE::JSONSchema::register_format('pve-qm-numanode', \&verify_numa); +sub verify_numa { + my ($value, $noerr) = @_; + + return $value if parse_numa($value); + + return undef if $noerr; + + die "unable to parse numa options\n"; +} + PVE::JSONSchema::register_format('pve-qm-net', \&verify_net); sub verify_net { my ($value, $noerr) = @_; @@ -2697,18 +2746,69 @@ sub config_to_command { if($conf->{numa}){ - my $numa_memory = ($memory / $sockets)."M"; + my $numa_totalmemory = undef; + for (my $i = 0; $i < $MAX_NUMA; $i++) { + next if !$conf->{"numa$i"}; + my $numa = parse_numa($conf->{"numa$i"}); + next if !$numa; + #memory + die "missing numa node$i memory value" if !$numa->{memory}; + my $numa_memory = $numa->{memory}; + $numa_totalmemory += $numa_memory; + my $numa_object = "memory-backend-ram,id=ram-node$i,size=$numa_memory"."M"; + + #cpus + my $cpus_start = $numa->{cpus}->{start}; + die "missing numa node$i cpus" if !defined($cpus_start); + my $cpus_end = $numa->{cpus}->{end} if defined($numa->{cpus}->{end}); + my $cpus = $cpus_start; + if (defined($cpus_end)) { + $cpus .= "-$cpus_end"; + die "numa node$i : cpu range $cpus is incorrect" if $cpus_end <= $cpus_start; + } - for (my $i = 0; $i < $sockets; $i++) { + #hostnodes + my $hostnodes_start = $numa->{hostnodes}->{start}; + if (defined($hostnodes_start)) { + my $hostnodes_end = $numa->{hostnodes}->{end} if defined($numa->{hostnodes}->{end}); + my $hostnodes = $hostnodes_start; + if (defined($hostnodes_end)) { + $hostnodes .= "-$hostnodes_end"; + die "host node $hostnodes range is incorrect" if $hostnodes_end <= $hostnodes_start; + } - my $cpustart = ($cores * $i); - my $cpuend = ($cpustart + $cores - 1) if $cores && $cores > 1; - my $cpus = $cpustart; - $cpus .= "-$cpuend" if $cpuend; + my $hostnodes_end_range = defined($hostnodes_end) ? 
$hostnodes_end : $hostnodes_start; + for (my $i = $hostnodes_start; $i <= $hostnodes_end_range; $i++ ) { + die "host numa node$i don't exist" if !(-d "/sys/devices/system/node/node$i/"); + } - push @$cmd, '-object', "memory-backend-ram,size=$numa_memory,id=ram-node$i"; + #policy + my $policy = $numa->{policy}; + die "you need to define a policy for hostnode $hostnodes" if !$policy; + $numa_object .= ",host-nodes=$hostnodes,policy=$policy"; + } + + push @$cmd, '-object', $numa_object; push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i"; } + die "total memory for NUMA nodes must be equal to vm memory" if $numa_totalmemory && $numa_totalmemory != $memory; + + #if no custom tology, we split memory and cores across numa nodes + if(!$numa_totalmemory) { + + my $numa_memory = ($memory / $sockets)."M"; + + for (my $i = 0; $i < $sockets; $i++) { + + my $cpustart = ($cores * $i); + my $cpuend = ($cpustart + $cores - 1) if $cores && $cores > 1; + my $cpus = $cpustart; + $cpus .= "-$cpuend" if $cpuend; + + push @$cmd, '-object', "memory-backend-ram,size=$numa_memory,id=ram-node$i"; + push @$cmd, '-numa', "node,nodeid=$i,cpus=$cpus,memdev=ram-node$i"; + } + } } push @$cmd, '-S' if $conf->{freeze}; -- 1.7.10.4 _______________________________________________ pve-devel mailing list pve-devel@pve.proxmox.com http://pve.proxmox.com/cgi-bin/mailman/listinfo/pve-devel