According to the host node map given on the command line, the VCPUs are pinned to their respective host nodes (which still allows scheduling among the cores belonging to that node). The mmap'ed guest memory is bound to the corresponding host nodes (this will, of course, not take effect until the memory is actually faulted in).
The presence of libnuma will be auto-detected.

Signed-off-by: Andre Przywara <[EMAIL PROTECTED]>

--
Andre Przywara
AMD-Operating System Research Center (OSRC), Dresden, Germany
Tel: +49 351 277-84917
----to satisfy European Law for business letters:
AMD Saxony Limited Liability Company & Co. KG,
Wilschdorfer Landstr. 101, 01109 Dresden, Germany
Register Court Dresden: HRA 4896, General Partner authorized
to represent: AMD Saxony LLC (Wilmington, Delaware, US)
General Manager of AMD Saxony LLC: Dr. Hans-R. Deppe, Thomas McCoy
commit 0bc93b19ba3132140d5b34746523d0c7c8169093
Author: Andre Przywara <[EMAIL PROTECTED]>
Date:   Fri Dec 5 14:05:44 2008 +0100

    allocate guest resources from different host NUMA nodes

diff --git a/qemu/Makefile.target b/qemu/Makefile.target
index 05ace8e..690903e 100644
--- a/qemu/Makefile.target
+++ b/qemu/Makefile.target
@@ -698,6 +698,10 @@ LIBS += -lkvm
 DEPLIBS += ../libkvm/libkvm.a
 endif
 
+ifdef CONFIG_NUMA
+LIBS += -lnuma
+endif
+
 ifdef CONFIG_VNC_TLS
 CPPFLAGS += $(CONFIG_VNC_TLS_CFLAGS)
 LIBS += $(CONFIG_VNC_TLS_LIBS)
diff --git a/qemu/configure b/qemu/configure
index 63a85d6..3e2c9f9 100755
--- a/qemu/configure
+++ b/qemu/configure
@@ -121,6 +121,7 @@ bluez="yes"
 kvm="yes"
 kvm_cap_pit="no"
 kvm_cap_device_assignment="no"
+getcpu="no"
 kerneldir=""
 aix="no"
 blobs="yes"
@@ -391,6 +392,8 @@ for opt do
   ;;
   --enable-mixemu) mixemu="yes"
   ;;
+  --disable-numa) numa="no"
+  ;;
   --disable-aio) aio="no"
   ;;
   --disable-blobs) blobs="no"
@@ -489,6 +492,7 @@ echo "                           Available drivers: $audio_possible_drivers"
 echo "  --audio-card-list=LIST   set list of additional emulated audio cards"
 echo "                           Available cards: ac97 adlib cs4231a gus"
 echo "  --enable-mixemu          enable mixer emulation"
+echo "  --disable-numa           disable NUMA support (host side)"
 echo "  --disable-brlapi         disable BrlAPI"
 echo "  --disable-vnc-tls        disable TLS encryption for VNC server"
 echo "  --disable-curses         disable curses output"
@@ -985,6 +989,29 @@ for drv in $audio_drv_list; do
 done
 
 ##########################################
+# libnuma probe
+
+if test -z "$numa" ; then
+    numa=no
+
+    cat > $TMPC << EOF
+#include <numa.h>
+int main(void) { return numa_available(); }
+EOF
+    if $cc ${ARCH_CFLAGS} -o $TMPE ${OS_CFLAGS} $TMPC -lnuma 2> /dev/null ; then
+	    numa=yes
+    fi
+fi
+
+cat > $TMPC << EOF
+#include <sched.h>
+int main(void) { return sched_getcpu(); }
+EOF
+if $cc ${ARCH_CFLAGS} -o $TMPE ${OS_CFLAGS} $TMPC 2> /dev/null ; then
+    getcpu=yes
+fi
+
+##########################################
 # BrlAPI probe
 
 if test -z "$brlapi" ; then
@@ -1181,6 +1208,7 @@ echo "mingw32 support   $mingw32"
 echo "Audio drivers     $audio_drv_list"
 echo "Extra audio cards $audio_card_list"
 echo "Mixer emulation   $mixemu"
+echo "NUMA support      $numa"
 echo "VNC TLS support   $vnc_tls"
 if test "$vnc_tls" = "yes" ; then
     echo "    TLS CFLAGS    $vnc_tls_cflags"
@@ -1415,6 +1443,13 @@ if test "$mixemu" = "yes" ; then
   echo "CONFIG_MIXEMU=yes" >> $config_mak
   echo "#define CONFIG_MIXEMU 1" >> $config_h
 fi
+if test "$numa" = "yes" ; then
+  echo "CONFIG_NUMA=yes" >> $config_mak
+  echo "#define CONFIG_NUMA 1" >> $config_h
+fi
+if test "$getcpu" = "yes" ; then
+  echo "#define HAVE_GETCPU 1" >> $config_h
+fi
 if test "$vnc_tls" = "yes" ; then
   echo "CONFIG_VNC_TLS=yes" >> $config_mak
   echo "CONFIG_VNC_TLS_CFLAGS=$vnc_tls_cflags" >> $config_mak
diff --git a/qemu/hw/fw_cfg.h b/qemu/hw/fw_cfg.h
index ef8f378..b370e4e 100644
--- a/qemu/hw/fw_cfg.h
+++ b/qemu/hw/fw_cfg.h
@@ -8,6 +8,9 @@
 #define FW_CFG_NOGRAPHIC        0x04
 #define FW_CFG_NB_CPUS          0x05
 #define FW_CFG_MACHINE_ID       0x06
+#define FW_CFG_NUMA_NODES       0x07
+#define FW_CFG_NUMA_NODE_CPUS   0x08
+#define FW_CFG_NUMA_NODE_MEM    0x09
 #define FW_CFG_MAX_ENTRY        0x10
 
 #define FW_CFG_WRITE_CHANNEL    0x4000
diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
index 6de460c..b723125 100644
--- a/qemu/hw/pc.c
+++ b/qemu/hw/pc.c
@@ -439,6 +439,12 @@ static void bochs_bios_init(void)
     fw_cfg = fw_cfg_init(BIOS_CFG_IOPORT, BIOS_CFG_IOPORT + 1, 0, 0);
     fw_cfg_add_i32(fw_cfg, FW_CFG_ID, 1);
     fw_cfg_add_i64(fw_cfg, FW_CFG_RAM_SIZE, (uint64_t)ram_size);
+    fw_cfg_add_i16(fw_cfg, FW_CFG_NUMA_NODES, numnumanodes);
+
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA_NODE_MEM, (uint8_t*)node_mem,
+        sizeof(node_mem[0]) * numnumanodes);
+    fw_cfg_add_bytes(fw_cfg, FW_CFG_NUMA_NODE_CPUS, (uint8_t*)node_to_cpus,
+        sizeof(node_to_cpus[0]) * numnumanodes);
 }
 
 /* Generate an initial boot sector which sets state and jump to
diff --git a/qemu/qemu-kvm.c b/qemu/qemu-kvm.c
index a7cfa24..63afe85 100644
--- a/qemu/qemu-kvm.c
+++ b/qemu/qemu-kvm.c
@@ -28,6 +28,10 @@ int kvm_pit = 1;
 #include <sys/syscall.h>
 #include <sys/mman.h>
 
+#ifdef CONFIG_NUMA
+#include <numa.h>
+#endif
+
 #define false 0
 #define true 1
 
@@ -424,6 +428,36 @@ static int kvm_main_loop_cpu(CPUState *env)
     return 0;
 }
 
+#ifdef CONFIG_NUMA
+
+#ifndef HAVE_GETCPU
+/* Without sched_getcpu() the current host node is unknown: always report -1. */
+static int get_cur_node (void)
+{
+    return -1;
+}
+#else /* HAVE_GETCPU */
+#define NUMA_MASK_SIZE 16
+/* Return the host NUMA node of the CPU we run on, or 0 if it cannot be found. */
+static int get_cur_node (void)
+{
+    unsigned long maskbuf[NUMA_MASK_SIZE];
+    int maskbits = sizeof(maskbuf[0]) * 8;
+    int node, cpunr = sched_getcpu();
+    /* -1 means sched_getcpu() failed; a negative shift count is undefined. */
+    if (cpunr < 0 || cpunr >= maskbits * NUMA_MASK_SIZE) return 0;
+    for (node = 0; node <= numa_max_node(); node++) {
+        /* On failure maskbuf holds no valid mask for this node: skip it. */
+        if (numa_node_to_cpus (node, maskbuf, sizeof(maskbuf)) < 0) continue;
+        /* 1UL, not 1: the mask words are unsigned long, wider than int. */
+        if (maskbuf[cpunr / maskbits] & (1UL << (cpunr % maskbits))) return node;
+    }
+    return 0;
+}
+#endif /* HAVE_GETCPU */
+
+#endif /* CONFIG_NUMA */
+
 static void *ap_main_loop(void *_env)
 {
     CPUState *env = _env;
@@ -432,6 +466,32 @@ static void *ap_main_loop(void *_env)
 
     current_env = env;
     env->thread_id = kvm_get_thread_id();
+
+#ifdef CONFIG_NUMA
+    /* Pin this VCPU thread to the host node assigned to its guest NUMA node. */
+    if (numnumanodes > 0 && numa_available() != -1) {
+        int i, j;
+        for (i = 0; i < numnumanodes; i++) {
+            /* 64-bit one: a plain int shift overflows for cpu_index >= 31. */
+            if (!(node_to_cpus[i] & ((uint64_t)1 << env->cpu_index))) continue;
+            if (hostnodes[i] == (uint64_t)-1) {
+                unsigned long offset = 0;
+                /* No host node recorded yet: adopt the one we are running on. */
+                hostnodes[i] = get_cur_node();
+                if (hostnodes[i] != (uint64_t)-1) {
+                    /* Guest node i starts after the memory of nodes 0..i-1. */
+                    for (j = 0; j < i; ++j) offset += node_mem[j];
+                    numa_tonode_memory (phys_ram_base + offset,
+                        node_mem[i], hostnodes[i] % (numa_max_node() + 1));
+                }
+            }
+            if (hostnodes[i] != (uint64_t)-1)
+                numa_run_on_node (hostnodes[i] % (numa_max_node() + 1));
+            break;
+        }
+    }
+#endif
+
     sigfillset(&signals);
     sigprocmask(SIG_BLOCK, &signals, NULL);
     kvm_create_vcpu(kvm_context, env->cpu_index);
@@ -840,6 +900,21 @@ int kvm_setup_guest_memory(void *area, unsigned long size)
     if (ret)
         perror ("madvise");
 
+#ifdef CONFIG_NUMA
+    if (numnumanodes > 0 && numa_available() != -1) {
+        unsigned long offset = 0;
+        int i;
+
+        for (i = 0; i < numnumanodes; ++i) {
+            if (hostnodes[i] != (uint64_t)-1) {
+                numa_tonode_memory ((char*)area + offset,
+                    node_mem[i], hostnodes[i] % (numa_max_node() + 1));
+            }
+            offset += node_mem[i];
+        }
+    }
+#endif
+
     return ret;
 }
 

Reply via email to