john cooper wrote:
Attached is a patch to qemu which allows preallocation of huge
pages at startup time. Also in cases where sufficient huge
pages are not available, the user may request to selectively fall
back to 4K pages for those portions of phys_mem which couldn't be
backed by huge pages. This patch is relative to kvm-70.
The motivation for this arose from odd behavior seen in qemu
when access to huge page backed phys_mem failed during startup
(eg: loading the bios), and during runtime where a guest will
terminate via signal if a free hpage isn't available to satisfy
a guest page fault.
Two flags are introduced:
-mem-prealloc [0|1]
Attempt to fault-in all hpages and if all can be allocated
proceed as normal. If all hpages can't be resolved and
-mem-fallback behavior wasn't is requested, qemu will error
exit. This behavior is enabled by default when -mem-path
is specified.
-mem-fallback [0|1]
In the case some hpages can't be allocated, mapping a
series of 4K pages will be attempted in place of them.
In the case zero hpages were allocated a diagnostic to
this effect will be printed (flagging scenarios such
as /proc/sys/vm/nr_hugepages failing to be setup) and
the normal non-hpage 4K allocation will be attempted.
This behavior by default is disabled. Recognition of
-mem-fallback will only occur when -mem-path is
specified and -mem-prealloc behavior is enabled.
This is believed to be an interim bandaid until such support
is available from within the kernel+kvm, ideally allowing
fallback to 4K pages on a dynamic basis and ultimately allowing
huge pages faults to reclaim huge pages from other users in the
system.
-john
Signed-off-by: john cooper <john.cooper <at> redhat.com>
--- a/qemu/vl.c
+++ b/qemu/vl.c
@@ -45,6 +45,7 @@
#include <signal.h>
#include <time.h>
#include <errno.h>
+#include <asm/param.h>
#include <sys/time.h>
#include <zlib.h>
@@ -160,6 +161,15 @@ int inet_aton(const char *cp, struct in_
/* XXX: use a two level table to limit memory usage */
#define MAX_IOPORTS 65536
+/* assertion checking
+ */
+#define ASSERT(c) if (!(c)) _assert(#c, __FILE__, __LINE__)
+
+static inline void _assert(char *text, char *file, int line) {
+ fprintf(stderr, "ASSERTION FAILED: [%s] %s:%d\n", text, file, line);
+ kill(getpid(), 12); /* dump core */
+}
+
Is it really necessary to add this as part of this patch?
const char *bios_dir = CONFIG_QEMU_SHAREDIR;
const char *bios_name = NULL;
void *ioport_opaque[MAX_IOPORTS];
@@ -234,6 +244,8 @@ int autostart = 1;
int time_drift_fix = 0;
unsigned int kvm_shadow_memory = 0;
const char *mem_path = NULL;
+int mem_prealloc = {1}; /* force preallocation of physical target
memory */
+int mem_fallback = {0}; /* allow alloc from alternate page size */
This is a bizarre way to initialize an integer. I had no idea you could
even do this for a scalar. Just initialize to 1 and 0.
int hpagesize = 0;
const char *cpu_vendor_string;
#ifdef TARGET_ARM
@@ -7809,7 +7821,12 @@ static void help(int exitcode)
#endif
"-tdf inject timer interrupts that got lost\n"
"-kvm-shadow-memory megs set the amount of shadow pages to be
allocated\n"
- "-mem-path set the path to hugetlbfs/tmpfs mounted directory, also
enables allocation of guest memory with huge pages\n"
+ "-mem-path set the path to hugetlbfs/tmpfs mounted directory,
also\n"
+ " enables allocation of guest memory with huge
pages\n"
+ "-mem-prealloc forces preallocation of huge page physical
memory\n"
+ " at startup when -mem-path is specified. Default 1
(on)\n"
+ "-mem-fallback enables fallback to conventional pages as needed\n"
+ " when -mem-prealloc is enabled. Default: 0 (off)\n"
"-option-rom rom load a file, rom, into the option ROM space\n"
#ifdef TARGET_SPARC
"-prom-env variable=value set OpenBIOS nvram variables\n"
@@ -7932,6 +7949,8 @@ enum {
QEMU_OPTION_tdf,
QEMU_OPTION_kvm_shadow_memory,
QEMU_OPTION_mempath,
+ QEMU_OPTION_mem_prealloc,
+ QEMU_OPTION_mem_fallback,
};
typedef struct QEMUOption {
@@ -8059,6 +8078,8 @@ const QEMUOption qemu_options[] = {
{ "startdate", HAS_ARG, QEMU_OPTION_startdate },
{ "tb-size", HAS_ARG, QEMU_OPTION_tb_size },
{ "mem-path", HAS_ARG, QEMU_OPTION_mempath },
+ { "mem-prealloc", HAS_ARG, QEMU_OPTION_mem_prealloc },
+ { "mem-fallback", HAS_ARG, QEMU_OPTION_mem_fallback },
{ NULL },
};
@@ -8276,11 +8297,36 @@ static int gethugepagesize(void)
return hugepagesize;
}
-void *alloc_mem_area(unsigned long memory, const char *path)
+/* fault in associated page, fail syscall when free page is unavailable
+ */
+static inline int fault_in(void *a)
+{
+ return (gettimeofday(a, NULL) < 0 ? errno != EFAULT : 1);
+}
I don't like this very much. Why not just MAP_POPULATE?
+/* we failed to fault in hpage *a, fall back to conventional page mapping
+ */
+int remap_hpage(void *a, int sz)
+{
+ ASSERT(!(sz & (EXEC_PAGESIZE - 1)));
+ if (munmap(a, sz) < 0)
+ perror("remap_hpage: munmap");
+ else if (mmap(a, sz, PROT_READ|PROT_WRITE,
+ MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0) == MAP_FAILED)
+ perror("remap_hpage: mmap");
+ else
+ return (1);
+ return (0);
+}
I think this would be simplified with MAP_POPULATE since you can fail in
large chunks of memory instead of potentially having a highly fragmented
set of VMAs.
+/* attempt to allocate memory mmap'ed to mem_path
+ */
+void *alloc_hpage_mem(unsigned long memory, const char *path)
{
char *filename;
- void *area;
- int fd;
+ void *area, *a;
+ int fd, fail;
+ long i;
if (asprintf(&filename, "%s/kvm.XXXXXX", path) == -1)
return NULL;
@@ -8308,27 +8354,80 @@ void *alloc_mem_area(unsigned long memor
*/
ftruncate(fd, memory);
+ fail = 0;
area = mmap(0, memory, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
- if (area == MAP_FAILED) {
- perror("mmap");
- close(fd);
- return NULL;
- }
+ if (area == MAP_FAILED)
+ ++fail, perror("alloc_hpage_mem: hugetlbfs mmap");
- return area;
+ /* success of mmap() doesn't guarantee phys pages are present in the
+ * map. if requested we attempt to fault in all associated huge pages
* and where unable if requested, selectively revert back to a mapping
+ * of conventional sized pages
+ */
+ ASSERT(!((unsigned long)area & (hpagesize - 1)));
+ if (mem_prealloc)
+ for (a = area, i = memory / hpagesize; !fail && 0 < i--; a += hpagesize)
+ if (fault_in(a))
+ ; /* hpage is mapped */
+ else if (!mem_fallback)
+ ++fail;
+ else if (!remap_hpage(a, hpagesize))
+ ++fail;
+ if (!fail)
+ return (area);
+ if (area != MAP_FAILED)
+ munmap(area, memory);
+ close(fd);
+ return (NULL);
}
+/* allocate from hpage mem if requested and to the extent possible, else
+ * revert to conventional page allocation.
+ */
void *qemu_alloc_physram(unsigned long memory)
{
void *area = NULL;
- if (mem_path)
- area = alloc_mem_area(memory, mem_path);
- if (!area)
- area = qemu_vmalloc(memory);
+ if (mem_path && !(area = alloc_hpage_mem(memory, mem_path)))
+ fprintf(stderr, "warning: couldn't alloc memory as requested from %s\n",
+ mem_path);
+ return (!area && mem_fallback ? qemu_vmalloc(memory) : area);
+}
- return area;
+/* match string *s in pattern *p. *p is of the form: [<str> | [<str> '|']*]
+ * and <str> is a string to be matched which may be empty.
+ * Return pointer to tail of matched *p for success, NULL otherwise.
+ */
+char const *altmatch(const char *p, const char *s)
+{
+ const char *bp, *q;
+
+ for (bp = p, q = s; ; )
+ if ((!*p || *p == '|') && !*q)
+ return (q == s && bp != p ? NULL : p);
+ else if (!*p)
+ return (0);
+ else if (*p == *q)
+ ++p, ++q;
+ else { /* failed, try next field */
+ for (q = s; *p && *p != '|'; ++p)
+ ;
+ if (*p == '|')
+ bp = ++p;
+ }
+}
+
+int bool_arg(const char *optarg, const char *flag_name)
+{
+ if (!optarg)
+ ;
+ else if (altmatch("y|yes|1", optarg))
+ return (1);
+ else if (altmatch("n|no|0", optarg))
+ return (0);
+ fprintf(stderr, "usage: %s [0|1]\n", flag_name);
+ exit(1);
}
+
This is introducing too much infrastructure. Just follow the
conventions in the rest of the code. No need to make the options take a
boolean argument. The options themselves are boolean arguments.
Regards,
Anthony Liguori
int main(int argc, char **argv)
{
@@ -8962,6 +9061,12 @@ int main(int argc, char **argv)
case QEMU_OPTION_mempath:
mem_path = optarg;
break;
+ case QEMU_OPTION_mem_prealloc:
+ mem_prealloc = bool_arg(optarg, "mem_prealloc");
+ break;
+ case QEMU_OPTION_mem_fallback:
+ mem_fallback = bool_arg(optarg, "mem_fallback");
+ break;
case QEMU_OPTION_name:
qemu_name = optarg;
break;
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at http://vger.kernel.org/majordomo-info.html