Marcelo Tosatti wrote:
> On Thu, Jan 24, 2008 at 04:29:51PM -0600, Anthony Liguori wrote:
>
>> Anthony Liguori wrote:
>>
>>> This patch adds support to QEMU for Rusty's recently introduce virtio
>>> balloon
>>> driver. The user-facing portions of this are the introduction of a
>>> "balloon"
>>> and "info balloon" command in the monitor.
>>>
>>> I think using madvise unconditionally is okay but I am not sure.
>>>
>> Looks like it's not. I just hung my host system after doing a bunch of
>> ballooning with a kernel that doesn't have MM notifiers.
>>
>
> Thats strange, lack of MMU notifiers will crash the guest (by use of a
> stale shadow entry), but not the host.
>
> What are the symptoms?
>
It's only happened to me once because I stopped testing with madvise
afterward. The guest spontaneously restarted, and a few seconds later,
the machine hung. It shouldn't be hard to reproduce by just repeatedly
ballooning up and down a guest.
Do others expect KVM to just cope with the virtual mapping being changed
out from underneath of it?
>> I'm inclined to think that we should have a capability check for MM
>> notifiers and just not do madvise if they aren't present. I don't think
>> the ioctl approach that Marcelo took is sufficient as a malicious guest
>> could possibly hose the host.
>>
>
> How's that? The ioctl damage is contained to the guest (other than CPU
> processing time, which the guest can cause in other ways).
>
> Anyway, don't see the need for back compat with older hosts.
>
Well, I'm unsure if this is a bug or expected behavior. If it's the
later, then the ioctl approach just introduces a race condition. If the
guest can fault in a page after the ioctl but before the madvise(), then
it can trigger the bug.
Regards,
Anthony Liguori
>> Having the guest allocate and not touch memory means that it should
>> eventually be removed from the shadow page cache and eventually swapped
>> out so ballooning isn't totally useless in the absence of MM notifiers.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>
>>> If madvise
>>> is called on memory that is essentially locked (which is what pre-MM
>>> notifiers
>>> is like) then it should just be a nop right?
>>>
>>> Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]>
>>>
>>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
>>> index bb7be0f..d6b4f46 100644
>>> --- a/qemu/Makefile.target
>>> +++ b/qemu/Makefile.target
>>> @@ -464,7 +464,7 @@ VL_OBJS += rtl8139.o
>>> VL_OBJS+= hypercall.o
>>>
>>> # virtio devices
>>> -VL_OBJS += virtio.o virtio-net.o virtio-blk.o
>>> +VL_OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o
>>>
>>> ifeq ($(TARGET_BASE_ARCH), i386)
>>> # Hardware support
>>> diff --git a/qemu/balloon.h b/qemu/balloon.h
>>> new file mode 100644
>>> index 0000000..ffce1fa
>>> --- /dev/null
>>> +++ b/qemu/balloon.h
>>> @@ -0,0 +1,14 @@
>>> +#ifndef _QEMU_BALLOON_H
>>> +#define _QEMU_BALLOON_H
>>> +
>>> +#include "cpu-defs.h"
>>> +
>>> +typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
>>> +
>>> +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque);
>>> +
>>> +void qemu_balloon(ram_addr_t target);
>>> +
>>> +ram_addr_t qemu_balloon_status(void);
>>> +
>>> +#endif
>>> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
>>> index 652b263..6c8d907 100644
>>> --- a/qemu/hw/pc.c
>>> +++ b/qemu/hw/pc.c
>>> @@ -1122,6 +1122,9 @@ static void pc_init1(ram_addr_t ram_size, int
>>> vga_ram_size,
>>> }
>>> }
>>>
>>> + if (pci_enabled)
>>> + virtio_balloon_init(pci_bus, 0x1AF4, 0x1002);
>>> +
>>> if (extboot_drive != -1) {
>>> DriveInfo *info = &drives_table[extboot_drive];
>>> int cyls, heads, secs;
>>> diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h
>>> index f640395..1899c11 100644
>>> --- a/qemu/hw/pc.h
>>> +++ b/qemu/hw/pc.h
>>> @@ -152,6 +152,9 @@ void virtio_net_poll(void);
>>> void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
>>> BlockDriverState *bs);
>>>
>>> +/* virtio-balloon.h */
>>> +void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device);
>>> +
>>> /* extboot.c */
>>>
>>> void extboot_init(BlockDriverState *bs, int cmd);
>>> diff --git a/qemu/hw/virtio-balloon.c b/qemu/hw/virtio-balloon.c
>>> new file mode 100644
>>> index 0000000..1b5a689
>>> --- /dev/null
>>> +++ b/qemu/hw/virtio-balloon.c
>>> @@ -0,0 +1,142 @@
>>> +/*
>>> + * Virtio Block Device
>>> + *
>>> + * Copyright IBM, Corp. 2008
>>> + *
>>> + * Authors:
>>> + * Anthony Liguori <[EMAIL PROTECTED]>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2. See
>>> + * the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#include "virtio.h"
>>> +#include "block.h"
>>> +#include "pc.h"
>>> +#include "balloon.h"
>>> +#include "sysemu.h"
>>> +
>>> +#include <sys/mman.h>
>>> +
>>> +/* from Linux's linux/virtio_blk.h */
>>> +
>>> +/* The ID for virtio_balloon */
>>> +#define VIRTIO_ID_BALLOON 5
>>> +
>>> +/* The feature bitmap for virtio balloon */
>>> +#define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming
>>> pages */
>>> +
>>> +struct virtio_balloon_config
>>> +{
>>> + /* Number of pages host wants Guest to give up. */
>>> + uint32_t num_pages;
>>> + /* Number of pages we've actually got in balloon. */
>>> + uint32_t actual;
>>> +};
>>> +
>>> +typedef struct VirtIOBalloon
>>> +{
>>> + VirtIODevice vdev;
>>> + VirtQueue *ivq, *dvq;
>>> + uint32_t num_pages;
>>> + uint32_t actual;
>>> +} VirtIOBalloon;
>>> +
>>> +static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev)
>>> +{
>>> + return (VirtIOBalloon *)vdev;
>>> +}
>>> +
>>> +static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue
>>> *vq)
>>> +{
>>> + VirtIOBalloon *s = to_virtio_balloon(vdev);
>>> + VirtQueueElement elem;
>>> + unsigned int count;
>>> +
>>> + while ((count = virtqueue_pop(vq, &elem)) != 0) {
>>> + int i;
>>> + unsigned int wlen = 0;
>>> +
>>> + for (i = 0; i < elem.out_num; i++) {
>>> + int flags;
>>> + uint32_t *pfns = elem.out_sg[i].iov_base;
>>> + unsigned int n_pfns = elem.out_sg[i].iov_len / 4;
>>> + int j;
>>> +
>>> + for (j = 0; j < n_pfns; j++) {
>>> + if (vq == s->dvq) /* deflate */
>>> + flags = MADV_WILLNEED;
>>> + else /* inflate */
>>> + flags = MADV_DONTNEED;
>>> +
>>> + madvise(phys_ram_base + (pfns[j] << TARGET_PAGE_BITS),
>>> + TARGET_PAGE_SIZE, flags);
>>> + }
>>> +
>>> + wlen += elem.out_sg[i].iov_len;
>>> + }
>>> +
>>> + virtqueue_push(vq, &elem, wlen);
>>> + virtio_notify(vdev, vq);
>>> + }
>>> +}
>>> +
>>> +static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t
>>> *config_data)
>>> +{
>>> + VirtIOBalloon *dev = to_virtio_balloon(vdev);
>>> + struct virtio_balloon_config config;
>>> +
>>> + config.num_pages = dev->num_pages;
>>> + config.actual = dev->actual;
>>> +
>>> + memcpy(config_data, &config, 8);
>>> +}
>>> +
>>> +static void virtio_balloon_set_config(VirtIODevice *vdev,
>>> + const uint8_t *config_data)
>>> +{
>>> + VirtIOBalloon *dev = to_virtio_balloon(vdev);
>>> + struct virtio_balloon_config config;
>>> + memcpy(&config, config_data, 8);
>>> + dev->actual = config.actual;
>>> +}
>>> +
>>> +static uint32_t virtio_balloon_get_features(VirtIODevice *vdev)
>>> +{
>>> + return 0;
>>> +}
>>> +
>>> +static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t
>>> target)
>>> +{
>>> + VirtIOBalloon *dev = opaque;
>>> +
>>> + if (target) {
>>> + dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS;
>>> + virtio_notify_config(&dev->vdev);
>>> + }
>>> +
>>> + return ram_size - (dev->actual << TARGET_PAGE_BITS);
>>> +}
>>> +
>>> +void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device)
>>> +{
>>> + VirtIOBalloon *s;
>>> +
>>> + s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon",
>>> + vendor, device,
>>> + 0, VIRTIO_ID_BALLOON,
>>> + 0x05, 0x00, 0x00,
>>> + 8, sizeof(VirtIOBalloon));
>>> +
>>> + s->vdev.get_config = virtio_balloon_get_config;
>>> + s->vdev.set_config = virtio_balloon_set_config;
>>> + s->vdev.get_features = virtio_balloon_get_features;
>>> +
>>> + s->ivq = virtio_add_queue(&s->vdev, 128,
>>> virtio_balloon_handle_output);
>>> + s->dvq = virtio_add_queue(&s->vdev, 128,
>>> virtio_balloon_handle_output);
>>> +
>>> + qemu_add_balloon_handler(virtio_balloon_to_target, s);
>>> +
>>> + return &s->vdev;
>>> +}
>>> diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
>>> index 301b5a1..c8c3233 100644
>>> --- a/qemu/hw/virtio-blk.c
>>> +++ b/qemu/hw/virtio-blk.c
>>> @@ -153,7 +153,7 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor,
>>> uint16_t device,
>>> 0x01, 0x80, 0x00,
>>> 16, sizeof(VirtIOBlock));
>>>
>>> - s->vdev.update_config = virtio_blk_update_config;
>>> + s->vdev.get_config = virtio_blk_update_config;
>>> s->vdev.get_features = virtio_blk_get_features;
>>> s->bs = bs;
>>>
>>> diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
>>> index 86f9e5a..8c1f953 100644
>>> --- a/qemu/hw/virtio-net.c
>>> +++ b/qemu/hw/virtio-net.c
>>> @@ -288,7 +288,7 @@ void *virtio_net_init(PCIBus *bus, NICInfo *nd, int
>>> devfn)
>>> 0x02, 0x00, 0x00,
>>> 6, sizeof(VirtIONet));
>>>
>>> - n->vdev.update_config = virtio_net_update_config;
>>> + n->vdev.get_config = virtio_net_update_config;
>>> n->vdev.get_features = virtio_net_get_features;
>>> n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
>>> n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
>>> diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c
>>> index bbcb44c..1e8fb44 100644
>>> --- a/qemu/hw/virtio.c
>>> +++ b/qemu/hw/virtio.c
>>> @@ -304,6 +304,9 @@ static void virtio_config_writeb(void *opaque,
>>> uint32_t addr, uint32_t data)
>>> return;
>>>
>>> memcpy(vdev->config + addr, &val, sizeof(val));
>>> +
>>> + if (vdev->set_config)
>>> + vdev->set_config(vdev, vdev->config);
>>> }
>>>
>>> static void virtio_config_writew(void *opaque, uint32_t addr, uint32_t
>>> data)
>>> @@ -316,6 +319,9 @@ static void virtio_config_writew(void *opaque,
>>> uint32_t addr, uint32_t data)
>>> return;
>>>
>>> memcpy(vdev->config + addr, &val, sizeof(val));
>>> +
>>> + if (vdev->set_config)
>>> + vdev->set_config(vdev, vdev->config);
>>> }
>>>
>>> static void virtio_config_writel(void *opaque, uint32_t addr, uint32_t
>>> data)
>>> @@ -328,6 +334,9 @@ static void virtio_config_writel(void *opaque,
>>> uint32_t addr, uint32_t data)
>>> return;
>>>
>>> memcpy(vdev->config + addr, &val, sizeof(val));
>>> +
>>> + if (vdev->set_config)
>>> + vdev->set_config(vdev, vdev->config);
>>> }
>>>
>>> static void virtio_map(PCIDevice *pci_dev, int region_num,
>>> @@ -356,7 +365,7 @@ static void virtio_map(PCIDevice *pci_dev, int
>>> region_num,
>>> register_ioport_read(addr + 20, vdev->config_len, 4,
>>> virtio_config_readl, vdev);
>>>
>>> - vdev->update_config(vdev, vdev->config);
>>> + vdev->get_config(vdev, vdev->config);
>>> }
>>> }
>>>
>>> @@ -380,6 +389,14 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int
>>> queue_size,
>>> return &vdev->vq[i];
>>> }
>>>
>>> +void virtio_notify_config(VirtIODevice *vdev)
>>> +{
>>> + /* make sure we have the latest config */
>>> + vdev->get_config(vdev, vdev->config);
>>> + vdev->isr = 3;
>>> + virtio_update_irq(vdev);
>>> +}
>>> +
>>> void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>>> {
>>> /* Always notify when queue is empty */
>>> diff --git a/qemu/hw/virtio.h b/qemu/hw/virtio.h
>>> index dee97ba..ed8bebd 100644
>>> --- a/qemu/hw/virtio.h
>>> +++ b/qemu/hw/virtio.h
>>> @@ -118,7 +118,8 @@ struct VirtIODevice
>>> void *config;
>>> uint32_t (*get_features)(VirtIODevice *vdev);
>>> void (*set_features)(VirtIODevice *vdev, uint32_t val);
>>> - void (*update_config)(VirtIODevice *vdev, uint8_t *config);
>>> + void (*get_config)(VirtIODevice *vdev, uint8_t *config);
>>> + void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
>>> VirtQueue vq[VIRTIO_PCI_QUEUE_MAX];
>>> };
>>>
>>> @@ -140,4 +141,6 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement
>>> *elem);
>>>
>>> void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
>>>
>>> +void virtio_notify_config(VirtIODevice *vdev);
>>> +
>>> #endif
>>> diff --git a/qemu/monitor.c b/qemu/monitor.c
>>> index e03c473..e05f717 100644
>>> --- a/qemu/monitor.c
>>> +++ b/qemu/monitor.c
>>> @@ -35,6 +35,7 @@
>>> #include "audio/audio.h"
>>> #include "disas.h"
>>> #include "migration.h"
>>> +#include "balloon.h"
>>> #include <dirent.h>
>>>
>>> #include "qemu-kvm.h"
>>> @@ -1262,6 +1263,23 @@ static void do_wav_capture (const char *path,
>>> }
>>> #endif
>>>
>>> +static void do_balloon(int value)
>>> +{
>>> + ram_addr_t target = value;
>>> + qemu_balloon(target << 20);
>>> +}
>>> +
>>> +static void do_info_balloon(void)
>>> +{
>>> + ram_addr_t actual;
>>> +
>>> + actual = qemu_balloon_status();
>>> + if (actual == 0)
>>> + term_printf("Ballooning not activated in VM\n");
>>> + else
>>> + term_printf("balloon: actual=%d\n", (int)(actual >> 20));
>>> +}
>>> +
>>> static term_cmd_t term_cmds[] = {
>>> { "help|?", "s?", do_help,
>>> "[cmd]", "show the help" },
>>> @@ -1339,6 +1357,8 @@ static term_cmd_t term_cmds[] = {
>>> "", "cancel the current VM migration" },
>>> { "migrate_set_speed", "s", do_migrate_set_speed,
>>> "value", "set maximum speed (in bytes) for migrations" },
>>> + { "balloon", "i", do_balloon,
>>> + "target", "request VM to change it's memory allocation (in MB)" },
>>> { NULL, NULL, },
>>> };
>>>
>>> @@ -1401,6 +1421,8 @@ static term_cmd_t info_cmds[] = {
>>> #endif
>>> { "migration", "", do_info_migration,
>>> "", "show migration information" },
>>> + { "balloon", "", do_info_balloon,
>>> + "", "show balloon information" },
>>> { NULL, NULL, },
>>> };
>>>
>>> diff --git a/qemu/vl.c b/qemu/vl.c
>>> index 756e13d..d339eb2 100644
>>> --- a/qemu/vl.c
>>> +++ b/qemu/vl.c
>>> @@ -38,6 +38,7 @@
>>> #include "block.h"
>>> #include "audio/audio.h"
>>> #include "migration.h"
>>> +#include "balloon.h"
>>> #include "qemu-kvm.h"
>>>
>>> #include <unistd.h>
>>> @@ -511,6 +512,31 @@ void hw_error(const char *fmt, ...)
>>> abort();
>>> }
>>>
>>> +/***************/
>>> +/* ballooning */
>>> +
>>> +static QEMUBalloonEvent *qemu_balloon_event;
>>> +void *qemu_balloon_event_opaque;
>>> +
>>> +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque)
>>> +{
>>> + qemu_balloon_event = func;
>>> + qemu_balloon_event_opaque = opaque;
>>> +}
>>> +
>>> +void qemu_balloon(ram_addr_t target)
>>> +{
>>> + if (qemu_balloon_event)
>>> + qemu_balloon_event(qemu_balloon_event_opaque, target);
>>> +}
>>> +
>>> +ram_addr_t qemu_balloon_status(void)
>>> +{
>>> + if (qemu_balloon_event)
>>> + return qemu_balloon_event(qemu_balloon_event_opaque, 0);
>>> + return 0;
>>> +}
>>> +
>>> /***********************************************************/
>>> /* keyboard/mouse */
>>>
>>>
>>>
-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
[email protected]
https://lists.sourceforge.net/lists/listinfo/kvm-devel