This is an automated email from the ASF dual-hosted git repository. jfthomps pushed a commit to branch VCL-1136_KVM_NUMA_and_huge_pages in repository https://gitbox.apache.org/repos/asf/vcl.git
The following commit(s) were added to refs/heads/VCL-1136_KVM_NUMA_and_huge_pages by this push: new 65678bf2 VCL-1136 - NUMA and huge page performance improvement for KVM 65678bf2 is described below commit 65678bf2f3f50b0c5f3c3bf9feb322a174b2d0a2 Author: Josh Thompson <jftho...@ncsu.edu> AuthorDate: Mon Dec 11 17:13:39 2023 -0500 VCL-1136 - NUMA and huge page performance improvement for KVM Linux.pm: (cleaned up some whitespace) -added get_cpu_numa_data -added get_memory_huge_pages libvirt.pm: modified generate_domain_xml: added code that evaluates host and VM core and RAM specifications and if sufficient resources are needed by the VM and available on the host, configures NUMA and/or huge pages on the VM to allow it to run more efficiently --- managementnode/lib/VCL/Module/OS/Linux.pm | 147 ++++++++++++++++++++- .../lib/VCL/Module/Provisioning/libvirt.pm | 127 +++++++++++++++++- 2 files changed, 262 insertions(+), 12 deletions(-) diff --git a/managementnode/lib/VCL/Module/OS/Linux.pm b/managementnode/lib/VCL/Module/OS/Linux.pm index 8ddc6848..4df1fea4 100644 --- a/managementnode/lib/VCL/Module/OS/Linux.pm +++ b/managementnode/lib/VCL/Module/OS/Linux.pm @@ -495,7 +495,7 @@ sub post_load { my $image_name = $self->data->get_image_name(); my $computer_node_name = $self->data->get_computer_node_name(); my $image_os_install_type = $self->data->get_image_os_install_type(); - + notify($ERRORS{'OK'}, 0, "beginning Linux post_load tasks, image: $image_name, computer: $computer_node_name"); # Wait for computer to respond to SSH @@ -578,7 +578,7 @@ sub post_load { } } } - + return $self->SUPER::post_load(); } @@ -1286,11 +1286,11 @@ sub reserve { # Add a local vcl user group if it doesn't already exist # Do this before OS.pm::reserve calls add_user_accounts $self->add_vcl_usergroup(); - + # Configure sshd to only listen on the private interface and add ext_sshd service listening on the public interface # This needs to be done after update_public_ip_address is called from OS.pm::reserve $self->configure_ext_sshd() || return; - + # Call OS.pm's reserve subroutine $self->SUPER::reserve() || return; @@ -3101,7 +3101,7 @@ sub delete_user { if ($home_directory_on_local_disk) { $delete_home_directory = 1; - + # Fetch exclude_list my @exclude_list = $self->get_exclude_list(); if ((grep(/\/home\/$username/, @exclude_list))) { @@ -3151,7 +3151,7 @@ sub delete_user { } } } - + if ($delete_home_directory) { notify($ERRORS{'DEBUG'}, 0, "home directory will be deleted: $home_directory_path"); $userdel_command .= ' -r'; @@ -4188,6 +4188,59 @@ sub get_cpu_speed { #////////////////////////////////////////////////////////////////////////////// +=head2 get_cpu_numa_data + + Parameters : none + Returns : array + Description : Retrieves numa information of computer's CPUs + +=cut + +sub get_cpu_numa_data { + my $self = shift; + if (ref($self) !~ /VCL::Module/i) { + notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method"); + return; + } + + my $computer_node_name = $self->data->get_computer_node_name(); + + my $command = "lscpu"; + my ($exit_status, $output) = $self->execute($command); + + if (!defined($output)) { + notify($ERRORS{'WARNING'}, 0, "failed to retrieve CPU NUMA info from $computer_node_name"); + return; + } + + my ($numacnt) = map {$_ =~ /NUMA node\(s\):\s*(\d+)/} @$output; + if ($numacnt) { + $numacnt = int($numacnt); + notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name CPU NUMA nodes: $numacnt"); + } + else { + notify($ERRORS{'WARNING'}, 0, "failed to determine $computer_node_name CPU NUMA nodes, 'NUMA node(s):' line does not exist in the cpuinfo output:\n" . join("\n", @$output)); + return (); + } + + my @numanodes; + + for (my $i = 0; $i < $numacnt; $i++) { + ($numanodes[$i]) = map {$_ =~ /NUMA node$i CPU\(s\):\s*([-,0-9]+)/} @$output; + if ($numanodes[$i]) { + notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name NUMA node $i CPU(s): $numanodes[$i]"); + } + else { + notify($ERRORS{'WARNING'}, 0, "failed to determine $computer_node_name NUMA node $i CPU(s), 'NUMA node$i CPU(s):' line does not exist in the cpuinfo output:\n" . join("\n", @$output)); + return (); + } + } + notify($ERRORS{'DEBUG'}, 0, "NUMA CPU data for $computer_node_name:\n" . format_data(@numanodes)); + return @numanodes; +} + +#////////////////////////////////////////////////////////////////////////////// + =head2 get_total_memory Parameters : none @@ -4229,6 +4282,88 @@ sub get_total_memory { #////////////////////////////////////////////////////////////////////////////// +=head2 get_memory_huge_pages + + Parameters : none + Returns : hash reference + Description : Retrieves information about the computer's huge pages + +=cut + +sub get_memory_huge_pages { + my $self = shift; + if (ref($self) !~ /VCL::Module/i) { + notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method"); + return; + } + + my $computer_node_name = $self->data->get_computer_node_name(); + + my $command = "cat /proc/meminfo"; + my ($exit_status, $output) = $self->execute($command); + + if (!defined($output)) { + notify($ERRORS{'WARNING'}, 0, "failed to retrieve huge page size from $computer_node_name"); + return; + } + + my ($hugepagesize) = map {$_ =~ /Hugepagesize:\s*(\d+) kB/} @$output; + if ($hugepagesize) { + notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name huge page size: $hugepagesize kB"); + } + else { + notify($ERRORS{'WARNING'}, 0, "failed to determine $computer_node_name huge page size from command: '$command', output:\n" . join("\n", @$output)); + return; + } + my ($freemem) = map {$_ =~ /MemFree:\s*(\d+) kB/} @$output; + if ($freemem) { + notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name MemFree: $freemem kB"); + } + my ($hugefree) = map {$_ =~ /HugePages_Free:\s*(\d+)/} @$output; + if ($hugefree) { + $hugefree = $hugefree * $hugepagesize; + notify($ERRORS{'DEBUG'}, 0, "retrieved $computer_node_name huge memory free $hugefree kB"); + } + + $command = "numastat -m"; + ($exit_status, $output) = $self->execute($command); + + if (!defined($output)) { + notify($ERRORS{'WARNING'}, 0, "failed to retrieve NUMA memory info from $computer_node_name"); + return; + } + + my @nodes = (); + my $nodecnt = 0; + + for my $line (@$output) { + if($line =~ /Node/) { + my @parts = split(' ', $line); + my $len = scalar(@parts); + $nodecnt = $parts[$len - 2] + 1; + notify($ERRORS{'DEBUG'}, 0, "NUMA node count on $computer_node_name: $nodecnt"); + next; + } + if($line =~ /(MemFree|HugePages_Total|HugePages_Free)/) { + my @parts = split(' ', $line); + for(my $i = 0; $i < $nodecnt; $i++) { + $nodes[$i]{$parts[0]} = $parts[$i + 1] * 1024; # convert to kB + } + } + } + notify($ERRORS{'DEBUG'}, 0, "NUMA data for $computer_node_name\n" . format_data(@nodes)); + if($nodecnt == 0 || scalar(@nodes) == 0) { + notify($ERRORS{'WARNING'}, 0, "failed to determine $computer_node_name NUMA memory data from command: '$command', output:\n" . join("\n", @$output)); + return (); + } + return ( hugepagesize => $hugepagesize, # in kB + numapagedata => \@nodes, + totalfreemem => $freemem, + totalfreehugemem => $hugefree ); +} + +#////////////////////////////////////////////////////////////////////////////// + =head2 get_exclude_list Parameters : none diff --git a/managementnode/lib/VCL/Module/Provisioning/libvirt.pm b/managementnode/lib/VCL/Module/Provisioning/libvirt.pm index 6f2dea8d..58c0bbe3 100644 --- a/managementnode/lib/VCL/Module/Provisioning/libvirt.pm +++ b/managementnode/lib/VCL/Module/Provisioning/libvirt.pm @@ -1844,11 +1844,83 @@ EOF # Windows, however, expects it to be in so called 'localtime'." my $clock_offset = ($image_os_type =~ /windows/) ? 'localtime' : 'utc'; - my $cpusockets = $cpu_count; - my $cpucores = 1; - if($cpu_count > 2) { - $cpusockets = 2; - $cpucores = ($cpu_count - ($cpu_count % 2)) / 2; + my $cpusockets; + my $cpucores; + my $use_numa = 0; + my @numacpu; + my $optimize_memory = 0; + my $use_huge_pages = 0; + my $pernode_memory = 0; + my $cpuset = ''; + my %numa_memory = $self->vmhost_os->get_memory_huge_pages(); + + if($cpu_count < 32) { + $cpusockets = $cpu_count; + $cpucores = 1; + if($cpu_count > 2) { + $cpusockets = 2; + $cpucores = ($cpu_count + ($cpu_count % 2)) / 2; + } + # use regular memory if available to leave huge pages for NUMA optomized VMs but + # use huge pages if not enough regular memory free + if ($memory_kb > $numa_memory{totalfreemem} && + $memory_kb <= $numa_memory{totalfreehugemem}) { + $use_huge_pages = 1; + notify($ERRORS{'DEBUG'}, 0, "Using huge pages for memory backing"); + } + } + else { + notify($ERRORS{'DEBUG'}, 0, "CPU count >= 32, using NUMA settings"); + $use_numa = 1; + @numacpu = $self->vmhost_os->get_cpu_numa_data(); + my $numacnt = scalar(@numacpu); + $cpusockets = $numacnt; + if($cpu_count % $numacnt != 0) { + # increase cpu_count to be evenly divisable by numa node count + $cpu_count = $cpu_count + ($numacnt - $cpu_count % $numacnt); + notify($ERRORS{'DEBUG'}, 0, "CPU count does not divide equally among NUMA nodes ($numacnt), increased CPU count to $cpu_count"); + } + $cpucores = $cpu_count / $numacnt; + + my $huge_memory_kb = $memory_kb; + if ($memory_kb % ($numa_memory{hugepagesize} * $numacnt)) { + # $memory_kb does not evenly divide into huge pages across all NUMA nodes, need to adjust $memory_kb + $huge_memory_kb = $memory_kb + ($numa_memory{hugepagesize} * $numacnt) - ($memory_kb % ($numa_memory{hugepagesize} * $numacnt)); + notify($ERRORS{'DEBUG'}, 0, "required memory does not divide evenly among huge page size and NUMA nodes, increasing to $huge_memory_kb kB if huge pages are used"); + notify($ERRORS{'DEBUG'}, 0, "----> huge page size: $numa_memory{hugepagesize}"); + notify($ERRORS{'DEBUG'}, 0, "----> NUMA nodes: $numacnt"); + notify($ERRORS{'DEBUG'}, 0, "----> memory per node: " . $huge_memory_kb / $numacnt); + } + $optimize_memory = 1; + $use_huge_pages = 1; + $pernode_memory = $memory_kb / $numacnt; + my $huge_pernode_memory = $huge_memory_kb / $numacnt; + + # memory needed by VM: $memory_kb + # divide that by number of NUMA nodes: $numacnt + # check that each NUMA node has that much memory free + my @tmp_cpuset; + for (my $i = 0; $i < $numacnt; $i++) { + if($numa_memory{numapagedata}[$i]{MemFree} < $pernode_memory) { + notify($ERRORS{'DEBUG'}, 0, "not enough memory free to evenly split among NUMA nodes; memory will not be NUMA optimized") if ($optimize_memory); + $optimize_memory = 0; + } + if($numa_memory{numapagedata}[$i]{HugePages_Free} < $huge_pernode_memory) { + notify($ERRORS{'DEBUG'}, 0, "not enough huge pages free to evenly split among NUMA nodes; memory will not use huge pages") if ($use_huge_pages); + $use_huge_pages = 0; + } + push @tmp_cpuset, $numacpu[$i]; + $cpuset = "$cpuset," . $numacpu[$i]; + } + if ($use_huge_pages) { + $optimize_memory = 1; + } + $cpuset = join(",", @tmp_cpuset); + if ($use_huge_pages) { + notify($ERRORS{'DEBUG'}, 0, "Using huge pages for memory backing"); + $memory_kb = $huge_memory_kb; + $pernode_memory = $huge_pernode_memory; + } } my $xml_hashref = { @@ -1962,7 +2034,50 @@ EOF notify($ERRORS{'DEBUG'}, 0, "vmpath ($vmhost_vmpath) is on NFS; setting disk cache to none"); $xml_hashref->{'devices'}[0]{'disk'}[0]{'driver'}{'cache'} = 'none'; } - + + if ($use_numa) { + my $host_numacells = scalar(@numacpu); + # vcpu section + $xml_hashref->{'vcpu'} = [ {'placement' => 'static', 'cpuset' => $cpuset, 'content' => $cpu_count}]; + + # cputune section + my @pins = (); + for (my $i = 0, my $index, my $cpusetval; $i < $cpu_count; $i++) { + $index = int($i / $cpucores); + $cpusetval = $numacpu[$index]; + $pins[$i] = {'vcpu' => $i, 'cpuset' => $cpusetval}; + } + $xml_hashref->{'cputune'} = { 'vcpupin' => \@pins }; + + if ($optimize_memory) { + # numatune section + my @memnodes = (); + for (my $i = 0; $i < $host_numacells; $i++) { + $memnodes[$i] = { 'cellid' => $i, 'mode' => 'strict', 'nodeset' => $i }; + } + my $nodeset = "0-" . ($host_numacells - 1); + $xml_hashref->{'numatune'} = { 'memory' => {'node' => 'strict', 'nodeset' => $nodeset}, 'memnode' => \@memnodes}; + + # cpu numa section + my @numacells = (); + for (my $i = 0, + my $cores_per_cell = $cpu_count / $host_numacells, + my $lower, + my $upper; + $i < $host_numacells; + $i++) { + $lower = $i * $cores_per_cell; + $upper = $lower + $cores_per_cell - 1; + $numacells[$i] = { 'id' => $i, 'cpus' => "$lower-$upper", 'memory' => $pernode_memory, 'unit' => 'KiB' }; + } + $xml_hashref->{'cpu'}[0]{'numa'} = {'cell' => \@numacells}; + } + } + + if ($use_huge_pages) { + $xml_hashref->{'memoryBacking'} = { 'hugepages' => {} }; + } + notify($ERRORS{'DEBUG'}, 0, "generated domain XML:\n" . format_data($xml_hashref)); return hash_to_xml_string($xml_hashref, 'domain'); }