Marcelo Tosatti wrote:
> On Thu, Jan 24, 2008 at 04:29:51PM -0600, Anthony Liguori wrote:
>   
>> Anthony Liguori wrote:
>>     
>>> This patch adds support to QEMU for Rusty's recently introduce virtio 
>>> balloon
>>> driver.  The user-facing portions of this are the introduction of a 
>>> "balloon"
>>> and "info balloon" command in the monitor.
>>>
>>> I think using madvise unconditionally is okay but I am not sure.
>>>       
>> Looks like it's not.  I just hung my host system after doing a bunch of 
>> ballooning with a kernel that doesn't have MM notifiers.
>>     
>
> Thats strange, lack of MMU notifiers will crash the guest (by use of a
> stale shadow entry), but not the host.
>
> What are the symptoms?
>   

It's only happened to me once because I stopped testing with madvise 
afterward.  The guest spontaneously restarted, and a few seconds later, 
the machine hung.  It shouldn't be hard to reproduce by just repeatedly 
ballooning up and down a guest.

Do others expect KVM to just cope with the virtual mapping being changed 
out from underneath of it?

>> I'm inclined to think that we should have a capability check for MM 
>> notifiers and just not do madvise if they aren't present.  I don't think 
>> the ioctl approach that Marcelo took is sufficient as a malicious guest 
>> could possibly hose the host.
>>     
>
> How's that? The ioctl damage is contained to the guest (other than CPU
> processing time, which the guest can cause in other ways).
>
> Anyway, don't see the need for back compat with older hosts.
>   

Well, I'm unsure if this is a bug or expected behavior.  If it's the 
later, then the ioctl approach just introduces a race condition.  If the 
guest can fault in a page after the ioctl but before the madvise(), then 
it can trigger the bug.

Regards,

Anthony Liguori

>> Having the guest allocate and not touch memory means that it should 
>> eventually be removed from the shadow page cache and eventually swapped 
>> out so ballooning isn't totally useless in the absence of MM notifiers.
>>
>> Regards,
>>
>> Anthony Liguori
>>
>>     
>>>  If madvise
>>> is called on memory that is essentially locked (which is what pre-MM 
>>> notifiers
>>> is like) then it should just be a nop right?
>>>
>>> Signed-off-by: Anthony Liguori <[EMAIL PROTECTED]>
>>>
>>> diff --git a/qemu/Makefile.target b/qemu/Makefile.target
>>> index bb7be0f..d6b4f46 100644
>>> --- a/qemu/Makefile.target
>>> +++ b/qemu/Makefile.target
>>> @@ -464,7 +464,7 @@ VL_OBJS += rtl8139.o
>>> VL_OBJS+= hypercall.o
>>>
>>> # virtio devices
>>> -VL_OBJS += virtio.o virtio-net.o virtio-blk.o
>>> +VL_OBJS += virtio.o virtio-net.o virtio-blk.o virtio-balloon.o
>>>
>>> ifeq ($(TARGET_BASE_ARCH), i386)
>>> # Hardware support
>>> diff --git a/qemu/balloon.h b/qemu/balloon.h
>>> new file mode 100644
>>> index 0000000..ffce1fa
>>> --- /dev/null
>>> +++ b/qemu/balloon.h
>>> @@ -0,0 +1,14 @@
>>> +#ifndef _QEMU_BALLOON_H
>>> +#define _QEMU_BALLOON_H
>>> +
>>> +#include "cpu-defs.h"
>>> +
>>> +typedef ram_addr_t (QEMUBalloonEvent)(void *opaque, ram_addr_t target);
>>> +
>>> +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque);
>>> +
>>> +void qemu_balloon(ram_addr_t target);
>>> +
>>> +ram_addr_t qemu_balloon_status(void);
>>> +
>>> +#endif
>>> diff --git a/qemu/hw/pc.c b/qemu/hw/pc.c
>>> index 652b263..6c8d907 100644
>>> --- a/qemu/hw/pc.c
>>> +++ b/qemu/hw/pc.c
>>> @@ -1122,6 +1122,9 @@ static void pc_init1(ram_addr_t ram_size, int 
>>> vga_ram_size,
>>>     }
>>>     }
>>>
>>> +    if (pci_enabled)
>>> +   virtio_balloon_init(pci_bus, 0x1AF4, 0x1002);
>>> +
>>>     if (extboot_drive != -1) {
>>>     DriveInfo *info = &drives_table[extboot_drive];
>>>     int cyls, heads, secs;
>>> diff --git a/qemu/hw/pc.h b/qemu/hw/pc.h
>>> index f640395..1899c11 100644
>>> --- a/qemu/hw/pc.h
>>> +++ b/qemu/hw/pc.h
>>> @@ -152,6 +152,9 @@ void virtio_net_poll(void);
>>> void *virtio_blk_init(PCIBus *bus, uint16_t vendor, uint16_t device,
>>>                   BlockDriverState *bs);
>>>
>>> +/* virtio-balloon.h */
>>> +void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device);
>>> +
>>> /* extboot.c */
>>>
>>> void extboot_init(BlockDriverState *bs, int cmd);
>>> diff --git a/qemu/hw/virtio-balloon.c b/qemu/hw/virtio-balloon.c
>>> new file mode 100644
>>> index 0000000..1b5a689
>>> --- /dev/null
>>> +++ b/qemu/hw/virtio-balloon.c
>>> @@ -0,0 +1,142 @@
>>> +/*
>>> + * Virtio Block Device
>>> + *
>>> + * Copyright IBM, Corp. 2008
>>> + *
>>> + * Authors:
>>> + *  Anthony Liguori   <[EMAIL PROTECTED]>
>>> + *
>>> + * This work is licensed under the terms of the GNU GPL, version 2.  See
>>> + * the COPYING file in the top-level directory.
>>> + *
>>> + */
>>> +
>>> +#include "virtio.h"
>>> +#include "block.h"
>>> +#include "pc.h"
>>> +#include "balloon.h"
>>> +#include "sysemu.h"
>>> +
>>> +#include <sys/mman.h>
>>> +
>>> +/* from Linux's linux/virtio_blk.h */
>>> +
>>> +/* The ID for virtio_balloon */
>>> +#define VIRTIO_ID_BALLOON  5
>>> +
>>> +/* The feature bitmap for virtio balloon */
>>> +#define VIRTIO_BALLOON_F_MUST_TELL_HOST    0 /* Tell before reclaiming 
>>> pages */
>>> +
>>> +struct virtio_balloon_config
>>> +{
>>> +    /* Number of pages host wants Guest to give up. */
>>> +    uint32_t num_pages;
>>> +    /* Number of pages we've actually got in balloon. */
>>> +    uint32_t actual;
>>> +};
>>> +
>>> +typedef struct VirtIOBalloon
>>> +{
>>> +    VirtIODevice vdev;
>>> +    VirtQueue *ivq, *dvq;
>>> +    uint32_t num_pages;
>>> +    uint32_t actual;
>>> +} VirtIOBalloon;
>>> +
>>> +static VirtIOBalloon *to_virtio_balloon(VirtIODevice *vdev)
>>> +{
>>> +    return (VirtIOBalloon *)vdev;
>>> +}
>>> +
>>> +static void virtio_balloon_handle_output(VirtIODevice *vdev, VirtQueue 
>>> *vq)
>>> +{
>>> +    VirtIOBalloon *s = to_virtio_balloon(vdev);
>>> +    VirtQueueElement elem;
>>> +    unsigned int count;
>>> +
>>> +    while ((count = virtqueue_pop(vq, &elem)) != 0) {
>>> +   int i;
>>> +   unsigned int wlen = 0;
>>> +
>>> +   for (i = 0; i < elem.out_num; i++) {
>>> +       int flags;
>>> +       uint32_t *pfns = elem.out_sg[i].iov_base;
>>> +       unsigned int n_pfns = elem.out_sg[i].iov_len / 4;
>>> +       int j;
>>> +
>>> +       for (j = 0; j < n_pfns; j++) {
>>> +           if (vq == s->dvq) /* deflate */
>>> +               flags = MADV_WILLNEED;
>>> +           else /* inflate */
>>> +               flags = MADV_DONTNEED;
>>> +
>>> +           madvise(phys_ram_base + (pfns[j] << TARGET_PAGE_BITS),
>>> +                   TARGET_PAGE_SIZE, flags);
>>> +       }
>>> +
>>> +       wlen += elem.out_sg[i].iov_len;
>>> +   }
>>> +
>>> +   virtqueue_push(vq, &elem, wlen);
>>> +   virtio_notify(vdev, vq);
>>> +    }
>>> +}
>>> +
>>> +static void virtio_balloon_get_config(VirtIODevice *vdev, uint8_t 
>>> *config_data)
>>> +{
>>> +    VirtIOBalloon *dev = to_virtio_balloon(vdev);
>>> +    struct virtio_balloon_config config;
>>> +
>>> +    config.num_pages = dev->num_pages;
>>> +    config.actual = dev->actual;
>>> +
>>> +    memcpy(config_data, &config, 8);
>>> +}
>>> +
>>> +static void virtio_balloon_set_config(VirtIODevice *vdev,
>>> +                                 const uint8_t *config_data)
>>> +{
>>> +    VirtIOBalloon *dev = to_virtio_balloon(vdev);
>>> +    struct virtio_balloon_config config;
>>> +    memcpy(&config, config_data, 8);
>>> +    dev->actual = config.actual;
>>> +}
>>> +
>>> +static uint32_t virtio_balloon_get_features(VirtIODevice *vdev)
>>> +{
>>> +    return 0;
>>> +}
>>> +
>>> +static ram_addr_t virtio_balloon_to_target(void *opaque, ram_addr_t 
>>> target)
>>> +{
>>> +    VirtIOBalloon *dev = opaque;
>>> +
>>> +    if (target) {
>>> +   dev->num_pages = (ram_size - target) >> TARGET_PAGE_BITS;
>>> +   virtio_notify_config(&dev->vdev);
>>> +    }
>>> +
>>> +    return ram_size - (dev->actual << TARGET_PAGE_BITS);
>>> +}
>>> +
>>> +void *virtio_balloon_init(PCIBus *bus, uint16_t vendor, uint16_t device)
>>> +{
>>> +    VirtIOBalloon *s;
>>> +
>>> +    s = (VirtIOBalloon *)virtio_init_pci(bus, "virtio-balloon",
>>> +                                    vendor, device,
>>> +                                    0, VIRTIO_ID_BALLOON,
>>> +                                    0x05, 0x00, 0x00,
>>> +                                    8, sizeof(VirtIOBalloon));
>>> +
>>> +    s->vdev.get_config = virtio_balloon_get_config;
>>> +    s->vdev.set_config = virtio_balloon_set_config;
>>> +    s->vdev.get_features = virtio_balloon_get_features;
>>> +
>>> +    s->ivq = virtio_add_queue(&s->vdev, 128, 
>>> virtio_balloon_handle_output);
>>> +    s->dvq = virtio_add_queue(&s->vdev, 128, 
>>> virtio_balloon_handle_output);
>>> +
>>> +    qemu_add_balloon_handler(virtio_balloon_to_target, s);
>>> +
>>> +    return &s->vdev;
>>> +}
>>> diff --git a/qemu/hw/virtio-blk.c b/qemu/hw/virtio-blk.c
>>> index 301b5a1..c8c3233 100644
>>> --- a/qemu/hw/virtio-blk.c
>>> +++ b/qemu/hw/virtio-blk.c
>>> @@ -153,7 +153,7 @@ void *virtio_blk_init(PCIBus *bus, uint16_t vendor, 
>>> uint16_t device,
>>>                                    0x01, 0x80, 0x00,
>>>                                    16, sizeof(VirtIOBlock));
>>>
>>> -    s->vdev.update_config = virtio_blk_update_config;
>>> +    s->vdev.get_config = virtio_blk_update_config;
>>>     s->vdev.get_features = virtio_blk_get_features;
>>>     s->bs = bs;
>>>
>>> diff --git a/qemu/hw/virtio-net.c b/qemu/hw/virtio-net.c
>>> index 86f9e5a..8c1f953 100644
>>> --- a/qemu/hw/virtio-net.c
>>> +++ b/qemu/hw/virtio-net.c
>>> @@ -288,7 +288,7 @@ void *virtio_net_init(PCIBus *bus, NICInfo *nd, int 
>>> devfn)
>>>                                  0x02, 0x00, 0x00,
>>>                                  6, sizeof(VirtIONet));
>>>
>>> -    n->vdev.update_config = virtio_net_update_config;
>>> +    n->vdev.get_config = virtio_net_update_config;
>>>     n->vdev.get_features = virtio_net_get_features;
>>>     n->rx_vq = virtio_add_queue(&n->vdev, 512, virtio_net_handle_rx);
>>>     n->tx_vq = virtio_add_queue(&n->vdev, 128, virtio_net_handle_tx);
>>> diff --git a/qemu/hw/virtio.c b/qemu/hw/virtio.c
>>> index bbcb44c..1e8fb44 100644
>>> --- a/qemu/hw/virtio.c
>>> +++ b/qemu/hw/virtio.c
>>> @@ -304,6 +304,9 @@ static void virtio_config_writeb(void *opaque, 
>>> uint32_t addr, uint32_t data)
>>>     return;
>>>
>>>     memcpy(vdev->config + addr, &val, sizeof(val));
>>> +
>>> +    if (vdev->set_config)
>>> +   vdev->set_config(vdev, vdev->config);
>>> }
>>>
>>> static void virtio_config_writew(void *opaque, uint32_t addr, uint32_t 
>>> data)
>>> @@ -316,6 +319,9 @@ static void virtio_config_writew(void *opaque, 
>>> uint32_t addr, uint32_t data)
>>>     return;
>>>
>>>     memcpy(vdev->config + addr, &val, sizeof(val));
>>> +
>>> +    if (vdev->set_config)
>>> +   vdev->set_config(vdev, vdev->config);
>>> }
>>>
>>> static void virtio_config_writel(void *opaque, uint32_t addr, uint32_t 
>>> data)
>>> @@ -328,6 +334,9 @@ static void virtio_config_writel(void *opaque, 
>>> uint32_t addr, uint32_t data)
>>>     return;
>>>
>>>     memcpy(vdev->config + addr, &val, sizeof(val));
>>> +
>>> +    if (vdev->set_config)
>>> +   vdev->set_config(vdev, vdev->config);
>>> }
>>>
>>> static void virtio_map(PCIDevice *pci_dev, int region_num,
>>> @@ -356,7 +365,7 @@ static void virtio_map(PCIDevice *pci_dev, int 
>>> region_num,
>>>     register_ioport_read(addr + 20, vdev->config_len, 4,
>>>                          virtio_config_readl, vdev);
>>>
>>> -   vdev->update_config(vdev, vdev->config);
>>> +   vdev->get_config(vdev, vdev->config);
>>>     }
>>> }
>>>
>>> @@ -380,6 +389,14 @@ VirtQueue *virtio_add_queue(VirtIODevice *vdev, int 
>>> queue_size,
>>>     return &vdev->vq[i];
>>> }
>>>
>>> +void virtio_notify_config(VirtIODevice *vdev)
>>> +{
>>> +    /* make sure we have the latest config */
>>> +    vdev->get_config(vdev, vdev->config);
>>> +    vdev->isr = 3;
>>> +    virtio_update_irq(vdev);
>>> +}
>>> +
>>> void virtio_notify(VirtIODevice *vdev, VirtQueue *vq)
>>> {
>>>     /* Always notify when queue is empty */
>>> diff --git a/qemu/hw/virtio.h b/qemu/hw/virtio.h
>>> index dee97ba..ed8bebd 100644
>>> --- a/qemu/hw/virtio.h
>>> +++ b/qemu/hw/virtio.h
>>> @@ -118,7 +118,8 @@ struct VirtIODevice
>>>     void *config;
>>>     uint32_t (*get_features)(VirtIODevice *vdev);
>>>     void (*set_features)(VirtIODevice *vdev, uint32_t val);
>>> -    void (*update_config)(VirtIODevice *vdev, uint8_t *config);
>>> +    void (*get_config)(VirtIODevice *vdev, uint8_t *config);
>>> +    void (*set_config)(VirtIODevice *vdev, const uint8_t *config);
>>>     VirtQueue vq[VIRTIO_PCI_QUEUE_MAX];
>>> };
>>>
>>> @@ -140,4 +141,6 @@ int virtqueue_pop(VirtQueue *vq, VirtQueueElement 
>>> *elem);
>>>
>>> void virtio_notify(VirtIODevice *vdev, VirtQueue *vq);
>>>
>>> +void virtio_notify_config(VirtIODevice *vdev);
>>> +
>>> #endif
>>> diff --git a/qemu/monitor.c b/qemu/monitor.c
>>> index e03c473..e05f717 100644
>>> --- a/qemu/monitor.c
>>> +++ b/qemu/monitor.c
>>> @@ -35,6 +35,7 @@
>>> #include "audio/audio.h"
>>> #include "disas.h"
>>> #include "migration.h"
>>> +#include "balloon.h"
>>> #include <dirent.h>
>>>
>>> #include "qemu-kvm.h"
>>> @@ -1262,6 +1263,23 @@ static void do_wav_capture (const char *path,
>>> }
>>> #endif
>>>
>>> +static void do_balloon(int value)
>>> +{
>>> +    ram_addr_t target = value;
>>> +    qemu_balloon(target << 20);
>>> +}
>>> +
>>> +static void do_info_balloon(void)
>>> +{
>>> +    ram_addr_t actual;
>>> +
>>> +    actual = qemu_balloon_status();
>>> +    if (actual == 0)
>>> +   term_printf("Ballooning not activated in VM\n");
>>> +    else
>>> +   term_printf("balloon: actual=%d\n", (int)(actual >> 20));
>>> +}
>>> +
>>> static term_cmd_t term_cmds[] = {
>>>     { "help|?", "s?", do_help,
>>>       "[cmd]", "show the help" },
>>> @@ -1339,6 +1357,8 @@ static term_cmd_t term_cmds[] = {
>>>       "", "cancel the current VM migration" },
>>>     { "migrate_set_speed", "s", do_migrate_set_speed,
>>>       "value", "set maximum speed (in bytes) for migrations" },
>>> +    { "balloon", "i", do_balloon,
>>> +      "target", "request VM to change it's memory allocation (in MB)" },
>>>     { NULL, NULL, },
>>> };
>>>
>>> @@ -1401,6 +1421,8 @@ static term_cmd_t info_cmds[] = {
>>> #endif
>>>     { "migration", "", do_info_migration,
>>>       "", "show migration information" },
>>> +    { "balloon", "", do_info_balloon,
>>> +      "", "show balloon information" },
>>>     { NULL, NULL, },
>>> };
>>>
>>> diff --git a/qemu/vl.c b/qemu/vl.c
>>> index 756e13d..d339eb2 100644
>>> --- a/qemu/vl.c
>>> +++ b/qemu/vl.c
>>> @@ -38,6 +38,7 @@
>>> #include "block.h"
>>> #include "audio/audio.h"
>>> #include "migration.h"
>>> +#include "balloon.h"
>>> #include "qemu-kvm.h"
>>>
>>> #include <unistd.h>
>>> @@ -511,6 +512,31 @@ void hw_error(const char *fmt, ...)
>>>     abort();
>>> }
>>>
>>> +/***************/
>>> +/* ballooning */
>>> +
>>> +static QEMUBalloonEvent *qemu_balloon_event;
>>> +void *qemu_balloon_event_opaque;
>>> +
>>> +void qemu_add_balloon_handler(QEMUBalloonEvent *func, void *opaque)
>>> +{
>>> +    qemu_balloon_event = func;
>>> +    qemu_balloon_event_opaque = opaque;
>>> +}
>>> +
>>> +void qemu_balloon(ram_addr_t target)
>>> +{
>>> +    if (qemu_balloon_event)
>>> +   qemu_balloon_event(qemu_balloon_event_opaque, target);
>>> +}
>>> +
>>> +ram_addr_t qemu_balloon_status(void)
>>> +{
>>> +    if (qemu_balloon_event)
>>> +   return qemu_balloon_event(qemu_balloon_event_opaque, 0);
>>> +    return 0;
>>> +}
>>> +
>>> /***********************************************************/
>>> /* keyboard/mouse */
>>>
>>>  
>>>       


-------------------------------------------------------------------------
This SF.net email is sponsored by: Microsoft
Defy all challenges. Microsoft(R) Visual Studio 2008.
http://clk.atdmt.com/MRT/go/vse0120000070mrt/direct/01/
_______________________________________________
kvm-devel mailing list
kvm-devel@lists.sourceforge.net
https://lists.sourceforge.net/lists/listinfo/kvm-devel

Reply via email to