Marcelo Tosatti wrote: > On Thu, Jan 24, 2008 at 04:29:51PM -0600, Anthony Liguori wrote: > >> Anthony Liguori wrote: >> >>> This patch adds support to QEMU for Rusty's recently introduce virtio >>> balloon >>> driver. The user-facing portions of this are the introduction of a >>> "balloon" >>> and "info balloon" command in the monitor. >>> >>> I think using madvise unconditionally is okay but I am not sure. >>> >> Looks like it's not. I just hung my host system after doing a bunch of >> ballooning with a kernel that doesn't have MM notifiers. >> > > Thats strange, lack of MMU notifiers will crash the guest (by use of a > stale shadow entry), but not the host. > > What are the symptoms? >
It's only happened to me once because I stopped testing with madvise afterward. The guest spontaneously restarted, and a few seconds later, the machine hung. It shouldn't be hard to reproduce by just repeatedly ballooning up and down a guest. Do others expect KVM to just cope with the virtual mapping being changed out from underneath of it? >> I'm inclined to think that we should have a capability check for MM >> notifiers and just not do madvise if they aren't present. I don't think >> the ioctl approach that Marcelo took is sufficient as a malicious guest >> could possibly hose the host. >> > > How's that? The ioctl damage is contained to the guest (other than CPU > processing time, which the guest can cause in other ways). > > Anyway, don't see the need for back compat with older hosts. > Well, I'm unsure if this is a bug or expected behavior. If it's the later, then the ioctl approach just introduces a race condition. If the guest can fault in a page after the ioctl but before the madvise(), then it can trigger the bug. Regards, Anthony Liguori >> Having the guest allocate and not touch memory means that it should >> eventually be removed from the shadow page cache and eventually swapped >> out so ballooning isn't totally useless in the absence of MM notifiers. >> >> Regards, >> >> Anthony Liguori >> >> >>> If madvise >>> is called on memory that is essentially locked (which is what pre-MM >>> notifiers >>> is like) then it should just be a nop right? >>> >>> Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]> >>> >>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target >>> index bb7be0f..d6b4f46 100644 >>> --- a/qemu/Makefile.target >>> +++ b/qemu/Makefile.target >>> @@ -464,7 +464,7 @@ VL_OBJS += rtl8139.o >>> VL_OBJS+= hypercall.o >>> >>> # virtio devices >>> -VL_OBJS += virtio.o virtio-net.o virtio-blk.o >>> +VL_OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o >>> >>> ifeq ($(TARGET_BASE_ARCH), i386) >>> # Hardware support >>> diff --git a/qemu/balloon.h b/qemu/balloon.h >>> new file mode 100644 >>> index 0000000..ffce1fa >>> --- /dev/null >>> +++ b/qemu/balloon.h >>> @@ -0,0 +1,14 @@ >>> +#ifndef _QEMU_BALLOON_H >>> +#define _QEMU_BALLOON_H >>> + >>> +#include "cpu-defs.h" >>> + >>> +typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target); >>> + >>> +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque); >>> + >>> +void qemu_balloon(ram_addr_t target); >>> + >>> +ram_addr_t qemu_balloon_status(void); >>> + >>> +#endif >>> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c >>> index 652b263..6c8d907 100644 >>> --- a/qemu/hw/pc.c >>> +++ b/qemu/hw/pc.c >>> @@ -1122,6 +1122,9 @@ static void pc_init1(ram_addr_t ram_size, int >>> vga_ram_size, >>> } >>> } >>> >>> + if (pci_enabled) >>> + virtio_balloon_init(pci_bus, 0x1AF4, 0x1002); >>> + >>> if (extboot_drive != -1) { >>> DriveInfo *info = &drives_table[extboot_drive]; >>> int cyls, heads, secs; >>> diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h >>> index f640395..1899c11 100644 >>> --- a/qemu/hw/pc.h >>> +++ b/qemu/hw/pc.h >>> @@ -152,6 +152,9 @@ void virtio_net_poll(void); >>> void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device, >>> BlockDriverState *bs); >>> >>> +/* virtio-balloon.h */ >>> +void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device); >>> + >>> /* extboot.c */ >>> >>> void extboot_init(BlockDriverState *bs, int cmd); >>> diff --git a/qemu/hw/virtio-balloon.c b/qemu/hw/virtio-balloon.c >>> new file mode 100644 >>> index 0000000..1b5a689 >>> --- /dev/null >>> +++ b/qemu/hw/virtio-balloon.c >>> @@ -0,0 +1,142 @@ >>> +/* >>> + * Virtio Block Device >>> + * >>> + * Copyright IBM, Corp. 2008 >>> + * >>> + * Authors: >>> + * Anthony Liguori <[EMAIL PROTECTED]> >>> + * >>> + * This work is licensed under the terms of the GNU GPL, version 2. See >>> + * the COPYING file in the top-level directory. >>> + * >>> + */ >>> + >>> +#include "virtio.h" >>> +#include "block.h" >>> +#include "pc.h" >>> +#include "balloon.h" >>> +#include "sysemu.h" >>> + >>> +#include <sys/mman.h> >>> + >>> +/* from Linux's linux/virtio_blk.h */ >>> + >>> +/* The ID for virtio_balloon */ >>> +#define VIRTIO_ID_BALLOON 5 >>> + >>> +/* The feature bitmap for virtio balloon */ >>> +#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming >>> pages */ >>> + >>> +struct virtio_balloon_config >>> +{ >>> + /* Number of pages host wants Guest to give up. */ >>> + uint32_t num_pages; >>> + /* Number of pages we've actually got in balloon. */ >>> + uint32_t actual; >>> +}; >>> + >>> +typedef struct VirtIOBalloon >>> +{ >>> + VirtIODevice vdev; >>> + VirtQueue *ivq, *dvq; >>> + uint32_t num_pages; >>> + uint32_t actual; >>> +} VirtIOBalloon; >>> + >>> +static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev) >>> +{ >>> + return (VirtIOBalloon *)vdev; >>> +} >>> + >>> +static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue >>> *vq) >>> +{ >>> + VirtIOBalloon *s = to_virtio_balloon(vdev); >>> + VirtQueueElement elem; >>> + unsigned int count; >>> + >>> + while ((count = virtqueue_pop(vq, &elem)) != 0) { >>> + int i; >>> + unsigned int wlen = 0; >>> + >>> + for (i = 0; i < elem.out_num; i++) { >>> + int flags; >>> + uint32_t *pfns = elem.out_sg[i].iov_base; >>> + unsigned int n_pfns = elem.out_sg[i].iov_len / 4; >>> + int j; >>> + >>> + for (j = 0; j < n_pfns; j++) { >>> + if (vq == s->dvq) /* deflate */ >>> + flags = MADV_WILLNEED; >>> + else /* inflate */ >>> + flags = MADV_DONTNEED; >>> + >>> + madvise(phys_ram_base + (pfns[j] << TARGET_PAGE_BITS), >>> + TARGET_PAGE_SIZE, flags); >>> + } >>> + >>> + wlen += elem.out_sg[i].iov_len; >>> + } >>> + >>> + virtqueue_push(vq, &elem, wlen); >>> + virtio_notify(vdev, vq); >>> + } >>> +} >>> + >>> +static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t >>> *config_data) >>> +{ >>> + VirtIOBalloon *dev = to_virtio_balloon(vdev); >>> + struct virtio_balloon_config config; >>> + >>> + config.num_pages = dev->num_pages; >>> + config.actual = dev->actual; >>> + >>> + memcpy(config_data, &config, 8); >>> +} >>> + >>> +static void virtio_balloon_set_config(VirtIODevice *vdev, >>> + const uint8_t *config_data) >>> +{ >>> + VirtIOBalloon *dev = to_virtio_balloon(vdev); >>> + struct virtio_balloon_config config; >>> + memcpy(&config, config_data, 8); >>> + dev->actual = config.actual; >>> +} >>> + >>> +static uint32_t virtio_balloon_get_features(VirtIODevice *vdev) >>> +{ >>> + return 0; >>> +} >>> + >>> +static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t >>> target) >>> +{ >>> + VirtIOBalloon *dev = opaque; >>> + >>> + if (target) { >>> + dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS; >>> + virtio_notify_config(&dev->vdev); >>> + } >>> + >>> + return ram_size - (dev->actual << TARGET_PAGE_BITS); >>> +} >>> + >>> +void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device) >>> +{ >>> + VirtIOBalloon *s; >>> + >>> + s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon", >>> + vendor, device, >>> + 0, VIRTIO_ID_BALLOON, >>> + 0x05, 0x00, 0x00, >>> + 8, sizeof(VirtIOBalloon)); >>> + >>> + s->vdev.get_config = virtio_balloon_get_config; >>> + s->vdev.set_config = virtio_balloon_set_config; >>> + s->vdev.get_features = virtio_balloon_get_features; >>> + >>> + s->ivq = virtio_add_queue(&s->vdev, 128, >>> virtio_balloon_handle_output); >>> + s->dvq = virtio_add_queue(&s->vdev, 128, >>> virtio_balloon_handle_output); >>> + >>> + qemu_add_balloon_handler(virtio_balloon_to_target, s); >>> + >>> + return &s->vdev; >>> +} >>> diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c >>> index 301b5a1..c8c3233 100644 >>> --- a/qemu/hw/virtio-blk.c >>> +++ b/qemu/hw/virtio-blk.c >>> @@ -153,7 +153,7 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, >>> uint16_t device, >>> 0x01, 0x80, 0x00, >>> 16, sizeof(VirtIOBlock)); >>> >>> - s->vdev.update_config = virtio_blk_update_config; >>> + s->vdev.get_config = virtio_blk_update_config; >>> s->vdev.get_features = virtio_blk_get_features; >>> s->bs = bs; >>> >>> diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c >>> index 86f9e5a..8c1f953 100644 >>> --- a/qemu/hw/virtio-net.c >>> +++ b/qemu/hw/virtio-net.c >>> @@ -288,7 +288,7 @@ void *virtio_net_init(PCIBus *bus, NICInfo *nd, int >>> devfn) >>> 0x02, 0x00, 0x00, >>> 6, sizeof(VirtIONet)); >>> >>> - n->vdev.update_config = virtio_net_update_config; >>> + n->vdev.get_config = virtio_net_update_config; >>> n->vdev.get_features = virtio_net_get_features; >>> n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx); >>> n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx); >>> diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c >>> index bbcb44c..1e8fb44 100644 >>> --- a/qemu/hw/virtio.c >>> +++ b/qemu/hw/virtio.c >>> @@ -304,6 +304,9 @@ static void virtio_config_writeb(void *opaque, >>> uint32_t addr, uint32_t data) >>> return; >>> >>> memcpy(vdev->config + addr, &val, sizeof(val)); >>> + >>> + if (vdev->set_config) >>> + vdev->set_config(vdev, vdev->config); >>> } >>> >>> static void virtio_config_writew(void *opaque, uint32_t addr, uint32_t >>> data) >>> @@ -316,6 +319,9 @@ static void virtio_config_writew(void *opaque, >>> uint32_t addr, uint32_t data) >>> return; >>> >>> memcpy(vdev->config + addr, &val, sizeof(val)); >>> + >>> + if (vdev->set_config) >>> + vdev->set_config(vdev, vdev->config); >>> } >>> >>> static void virtio_config_writel(void *opaque, uint32_t addr, uint32_t >>> data) >>> @@ -328,6 +334,9 @@ static void virtio_config_writel(void *opaque, >>> uint32_t addr, uint32_t data) >>> return; >>> >>> memcpy(vdev->config + addr, &val, sizeof(val)); >>> + >>> + if (vdev->set_config) >>> + vdev->set_config(vdev, vdev->config); >>> } >>> >>> static void virtio_map(PCIDevice *pci_dev, int region_num, >>> @@ -356,7 +365,7 @@ static void virtio_map(PCIDevice *pci_dev, int >>> region_num, >>> register_ioport_read(addr + 20, vdev->config_len, 4, >>> virtio_config_readl, vdev); >>> >>> - vdev->update_config(vdev, vdev->config); >>> + vdev->get_config(vdev, vdev->config); >>> } >>> } >>> >>> @@ -380,6 +389,14 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int >>> queue_size, >>> return &vdev->vq[i]; >>> } >>> >>> +void virtio_notify_config(VirtIODevice *vdev) >>> +{ >>> + /* make sure we have the latest config */ >>> + vdev->get_config(vdev, vdev->config); >>> + vdev->isr = 3; >>> + virtio_update_irq(vdev); >>> +} >>> + >>> void virtio_notify(VirtIODevice *vdev, VirtQueue *vq) >>> { >>> /* Always notify when queue is empty */ >>> diff --git a/qemu/hw/virtio.h b/qemu/hw/virtio.h >>> index dee97ba..ed8bebd 100644 >>> --- a/qemu/hw/virtio.h >>> +++ b/qemu/hw/virtio.h >>> @@ -118,7 +118,8 @@ struct VirtIODevice >>> void *config; >>> uint32_t (*get_features)(VirtIODevice *vdev); >>> void (*set_features)(VirtIODevice *vdev, uint32_t val); >>> - void (*update_config)(VirtIODevice *vdev, uint8_t *config); >>> + void (*get_config)(VirtIODevice *vdev, uint8_t *config); >>> + void (*set_config)(VirtIODevice *vdev, const uint8_t *config); >>> VirtQueue vq[VIRTIO_PCI_QUEUE_MAX]; >>> }; >>> >>> @@ -140,4 +141,6 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement >>> *elem); >>> >>> void virtio_notify(VirtIODevice *vdev, VirtQueue *vq); >>> >>> +void virtio_notify_config(VirtIODevice *vdev); >>> + >>> #endif >>> diff --git a/qemu/monitor.c b/qemu/monitor.c >>> index e03c473..e05f717 100644 >>> --- a/qemu/monitor.c >>> +++ b/qemu/monitor.c >>> @@ -35,6 +35,7 @@ >>> #include "audio/audio.h" >>> #include "disas.h" >>> #include "migration.h" >>> +#include "balloon.h" >>> #include <dirent.h> >>> >>> #include "qemu-kvm.h" >>> @@ -1262,6 +1263,23 @@ static void do_wav_capture (const char *path, >>> } >>> #endif >>> >>> +static void do_balloon(int value) >>> +{ >>> + ram_addr_t target = value; >>> + qemu_balloon(target << 20); >>> +} >>> + >>> +static void do_info_balloon(void) >>> +{ >>> + ram_addr_t actual; >>> + >>> + actual = qemu_balloon_status(); >>> + if (actual == 0) >>> + term_printf("Ballooning not activated in VM\n"); >>> + else >>> + term_printf("balloon: actual=%d\n", (int)(actual >> 20)); >>> +} >>> + >>> static term_cmd_t term_cmds[] = { >>> { "help|?", "s?", do_help, >>> "[cmd]", "show the help" }, >>> @@ -1339,6 +1357,8 @@ static term_cmd_t term_cmds[] = { >>> "", "cancel the current VM migration" }, >>> { "migrate_set_speed", "s", do_migrate_set_speed, >>> "value", "set maximum speed (in bytes) for migrations" }, >>> + { "balloon", "i", do_balloon, >>> + "target", "request VM to change it's memory allocation (in MB)" }, >>> { NULL, NULL, }, >>> }; >>> >>> @@ -1401,6 +1421,8 @@ static term_cmd_t info_cmds[] = { >>> #endif >>> { "migration", "", do_info_migration, >>> "", "show migration information" }, >>> + { "balloon", "", do_info_balloon, >>> + "", "show balloon information" }, >>> { NULL, NULL, }, >>> }; >>> >>> diff --git a/qemu/vl.c b/qemu/vl.c >>> index 756e13d..d339eb2 100644 >>> --- a/qemu/vl.c >>> +++ b/qemu/vl.c >>> @@ -38,6 +38,7 @@ >>> #include "block.h" >>> #include "audio/audio.h" >>> #include "migration.h" >>> +#include "balloon.h" >>> #include "qemu-kvm.h" >>> >>> #include <unistd.h> >>> @@ -511,6 +512,31 @@ void hw_error(const char *fmt, ...) >>> abort(); >>> } >>> >>> +/***************/ >>> +/* ballooning */ >>> + >>> +static QEMUBalloonEvent *qemu_balloon_event; >>> +void *qemu_balloon_event_opaque; >>> + >>> +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque) >>> +{ >>> + qemu_balloon_event = func; >>> + qemu_balloon_event_opaque = opaque; >>> +} >>> + >>> +void qemu_balloon(ram_addr_t target) >>> +{ >>> + if (qemu_balloon_event) >>> + qemu_balloon_event(qemu_balloon_event_opaque, target); >>> +} >>> + >>> +ram_addr_t qemu_balloon_status(void) >>> +{ >>> + if (qemu_balloon_event) >>> + return qemu_balloon_event(qemu_balloon_event_opaque, 0); >>> + return 0; >>> +} >>> + >>> /***********************************************************/ >>> /* keyboard/mouse */ >>> >>> >>> ------------------------------------------------------------------------- This SF.net email is sponsored by: Microsoft Defy all challenges. Microsoft(R) Visual Studio 2008. http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/ _______________________________________________ kvm-devel mailing list kvm-devel@lists.sourceforge.net https://lists.sourceforge.net/lists/listinfo/kvm-devel