Attached is a patch to qemu which allows preallocation of huge
pages at startup time.  Also in cases where sufficient huge
pages are not available, the user may request to selectively fall
back to 4K pages for those portions of phys_mem which couldn't be
backed by huge pages.  This patch is relative to kvm-70.

The motivation for this arose from odd behavior seen in qemu
when access to huge page backed phys_mem failed during startup
(eg: loading the bios), and during runtime where a guest will
terminate via signal if a free hpage isn't available to satisfy
a guest page fault.

Two flags are introduced:

-mem-prealloc [0|1]

    Attempt to fault-in all hpages and if all can be allocated
    proceed as normal.  If not all hpages can be faulted in and
    -mem-fallback behavior wasn't requested, qemu will exit
    with an error.  This behavior is enabled by default when
    -mem-path is specified.

-mem-fallback [0|1]

    In the case some hpages can't be allocated, mapping a
    series of 4K pages will be attempted in place of them.
    In the case zero hpages were allocated a diagnostic to
    this effect will be printed (flagging scenarios such
    as /proc/sys/vm/nr_hugepages not having been set up) and
    the normal non-hpage 4K allocation will be attempted.
    This behavior is disabled by default.  Recognition of
    -mem-fallback will only occur when -mem-path is
    specified and -mem-prealloc behavior is enabled.

This is believed to be an interim band-aid until such support
is available from within the kernel+kvm, ideally allowing
fallback to 4K pages on a dynamic basis and ultimately allowing
huge page faults to reclaim huge pages from other users in the
system.

-john


Signed-off-by: john cooper <john.cooper <at> redhat.com>


--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -45,6 +45,7 @@
 #include <signal.h>
 #include <time.h>
 #include <errno.h>
+#include <asm/param.h>
 #include <sys/time.h>
 #include <zlib.h>
 
@@ -160,6 +161,15 @@ int inet_aton(const char *cp, struct in_
 /* XXX: use a two level table to limit memory usage */
 #define MAX_IOPORTS 65536
 
+/* assertion checking: on failure report the condition and location, then
+ * send SIGABRT to dump core (raw signal 12 is SIGUSR2 on Linux/x86: no core) */
+#define ASSERT(c) do { if (!(c)) _assert(#c, __FILE__, __LINE__); } while (0)
+
+static inline void _assert(const char *text, const char *file, int line) {
+    fprintf(stderr, "ASSERTION FAILED: [%s] %s:%d\n", text, file, line);
+    kill(getpid(), SIGABRT);    /* default action dumps core */
+}
+
 const char *bios_dir = CONFIG_QEMU_SHAREDIR;
 const char *bios_name = NULL;
 void *ioport_opaque[MAX_IOPORTS];
@@ -234,6 +244,8 @@ int autostart = 1;
 int time_drift_fix = 0;
 unsigned int kvm_shadow_memory = 0;
 const char *mem_path = NULL;
+int mem_prealloc = 1;	/* nonzero: fault in all guest RAM hpages at startup */
+int mem_fallback = 0;	/* nonzero: permit conventional pages where hpages fail */
 int hpagesize = 0;
 const char *cpu_vendor_string;
 #ifdef TARGET_ARM
@@ -7809,7 +7821,12 @@ static void help(int exitcode)
 #endif
            "-tdf            inject timer interrupts that got lost\n"
            "-kvm-shadow-memory megs set the amount of shadow pages to be allocated\n"
-           "-mem-path       set the path to hugetlbfs/tmpfs mounted directory, also enables allocation of guest memory with huge pages\n"
+           "-mem-path       set the path to hugetlbfs/tmpfs mounted directory, also\n"
+           "                enables allocation of guest memory with huge pages\n"
+           "-mem-prealloc   forces preallocation of huge page physical memory\n"
+           "                at startup when -mem-path is specified.  Default 1 (on)\n"
+           "-mem-fallback   enables fallback to conventional pages as needed\n"
+           "                when -mem-prealloc is enabled.  Default: 0 (off)\n" 
 	   "-option-rom rom load a file, rom, into the option ROM space\n"
 #ifdef TARGET_SPARC
            "-prom-env variable=value  set OpenBIOS nvram variables\n"
@@ -7932,6 +7949,8 @@ enum {
     QEMU_OPTION_tdf,
     QEMU_OPTION_kvm_shadow_memory,
     QEMU_OPTION_mempath,
+    QEMU_OPTION_mem_prealloc,
+    QEMU_OPTION_mem_fallback,
 };
 
 typedef struct QEMUOption {
@@ -8059,6 +8078,8 @@ const QEMUOption qemu_options[] = {
     { "startdate", HAS_ARG, QEMU_OPTION_startdate },
     { "tb-size", HAS_ARG, QEMU_OPTION_tb_size },
     { "mem-path", HAS_ARG, QEMU_OPTION_mempath },
+    { "mem-prealloc", HAS_ARG, QEMU_OPTION_mem_prealloc },
+    { "mem-fallback", HAS_ARG, QEMU_OPTION_mem_fallback },
     { NULL },
 };
 
@@ -8276,11 +8297,36 @@ static int gethugepagesize(void)
     return hugepagesize;
 }
 
-void *alloc_mem_area(unsigned long memory, const char *path)
+/* fault in the page at *a by letting gettimeofday() store a timeval in it;
+ * returns 0 if that call fails with EFAULT (taken as: no free page), else 1 */
+static inline int fault_in(void *a)
+{
+    return (gettimeofday(a, NULL) < 0 ? errno != EFAULT : 1);
+}
+
+/* remap [a, a+sz) as private anonymous memory; returns 1 if ok, else 0 */
+int remap_hpage(void *a, int sz)
+{
+    ASSERT(!(sz & (EXEC_PAGESIZE - 1)));
+    if (munmap(a, sz) < 0) {
+	perror("remap_hpage: munmap");
+	return (0);
+    }
+    if (mmap(a, sz, PROT_READ|PROT_WRITE,
+	MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) != MAP_FAILED)
+	return (1);
+    perror("remap_hpage: mmap");
+    return (0);
+}
+
+/* attempt to allocate memory mmap'ed to mem_path
+ */
+void *alloc_hpage_mem(unsigned long memory, const char *path)
 {
     char *filename;
-    void *area;
-    int fd;
+    void *area, *a;
+    int fd, fail;
+    long i;
 
     if (asprintf(&filename, "%s/kvm.XXXXXX", path) == -1)
 	return NULL;
@@ -8308,27 +8354,80 @@ void *alloc_mem_area(unsigned long memor
      */
     ftruncate(fd, memory);
 
+    fail = 0;
     area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
-    if (area == MAP_FAILED) {
-	perror("mmap");
-	close(fd);
-	return NULL;
-    }
+    if (area == MAP_FAILED)
+	++fail, perror("alloc_hpage_mem: hugetlbfs mmap");
 
-    return area;
+    /* success of mmap() doesn't guarantee phys pages are present in the
+     * map.  if requested we attempt to fault in all associated huge pages
+     * and, where unable, if requested selectively revert back to a mapping
+     * of conventional sized pages */
+    ASSERT(!((unsigned long)area & (hpagesize - 1)));
+    if (mem_prealloc)
+	for (a = area, i = memory / hpagesize; !fail && 0 < i--; a += hpagesize)
+	    if (fault_in(a))
+		;	/* hpage is mapped */
+	    else if (!mem_fallback)
+		++fail;
+	    else if (!remap_hpage(a, hpagesize))
+		++fail;
+    if (!fail)
+        return (area);
+    if (area != MAP_FAILED)
+	munmap(area, memory);
+    close(fd);
+    return (NULL);
 }
 
+/* allocate guest RAM.  Without -mem-path use conventional pages; with it use
+ * hpages from mem_path and, only if those fail and mem_fallback is set, fall
+ * back to conventional pages.  Returns NULL if hpages fail without fallback. */
 void *qemu_alloc_physram(unsigned long memory)
 {
     void *area = NULL;
 
-    if (mem_path)
-	area = alloc_mem_area(memory, mem_path);
-    if (!area)
-	area = qemu_vmalloc(memory);
+    if (mem_path && !(area = alloc_hpage_mem(memory, mem_path)))
+	fprintf(stderr, "warning: couldn't alloc memory as requested from %s\n",
+	    mem_path);
+    return (!area && (!mem_path || mem_fallback) ? qemu_vmalloc(memory) : area);
+}
 
-    return area;
+/* match string *s against pattern *p, a list of alternatives separated by
+ * '|' (any of which may be empty), e.g. "y|yes|1".  *s must equal one whole
+ * alternative; a '|' inside *s never matches.  Returns a pointer into *p just
+ * past the matched alternative for success, NULL otherwise. */
+char const *altmatch(const char *p, const char *s)
+{
+    const char *bp, *q;
+
+    for (bp = p, q = s; ; )
+        if ((!*p || *p == '|') && !*q)  /* alternative and *s both used up */
+            return (q == s && bp != p ? NULL : p);
+        else if (!*p)
+            return (NULL);
+        else if (*p != '|' && *p == *q) /* '|' is a separator, never data */
+            ++p, ++q;
+        else {   /* failed, try next field */
+            for (q = s; *p && *p != '|'; ++p)
+                ;
+            if (*p == '|')
+                bp = ++p;
+        }
+}
+
+/* parse a boolean option argument: y/yes/1 -> 1, n/no/0 -> 0; anything else
+ * (including a missing argument) prints a usage line and exits with 1 */
+int bool_arg(const char *optarg, const char *flag_name)
+{
+    if (optarg && altmatch("y|yes|1", optarg))
+        return (1);
+    if (optarg && altmatch("n|no|0", optarg))
+        return (0);
+    fprintf(stderr, "usage: %s [0|1]\n", flag_name);
+    exit(1);
 }
+		
 
 int main(int argc, char **argv)
 {
@@ -8962,6 +9061,12 @@ int main(int argc, char **argv)
             case QEMU_OPTION_mempath:
 		mem_path = optarg;
 		break;
+            case QEMU_OPTION_mem_prealloc:
+		mem_prealloc = bool_arg(optarg, "mem_prealloc");
+		break;
+            case QEMU_OPTION_mem_fallback:
+		mem_fallback = bool_arg(optarg, "mem_fallback");
+		break;
             case QEMU_OPTION_name:
                 qemu_name = optarg;
                 break;

Reply via email to