On Tue, Apr 09, 2013 at 05:43:43PM +0200, Paolo Bonzini wrote:
> Using qemu_memalign only leaves the RAM zero by chance, because libc
> will usually use mmap to satisfy our huge requests.  But memory will
> not be zero when using MALLOC_PERTURB_ with a nonzero value.  In the
> case of incoming migration, this breaks a recently-introduced
> invariant (commit f1c7279, migration: do not sent zero pages in
> bulk stage, 2013-03-26).
> 
> To fix this, use mmap ourselves to get a well-aligned, always zero
> block for the RAM.  Mmap-ed memory is easy to "trim" at the sides.
> 
> This also removes the need to do something special on valgrind
> (see commit c2a8238a, Support running QEMU on Valgrind, 2011-10-31),
> thus effectively reverts that patch.

Hi Paolo,

This patch introduced a regression bug in hot-removing device. (reproduce: 100%)

(gdb) r  --enable-kvm -mon chardev=qmp,mode=control,pretty=on -chardev 
socket,id=qmp,host=localhost,port=1234,server,nowait -vnc :0 -monitor stdio 
-vnc :0 -m 2000 /images/RHEL-Server-6.4-64-virtio.qcow2 -device 
virtio-net-pci,netdev=ndev1,id=id1 -netdev tap,id=ndev1 -device 
e1000,netdev=ndev2,id=id2 -netdev tap,id=ndev2
Starting program: /home/devel/qemu/x86_64-softmmu/qemu-system-x86_64 
--enable-kvm -mon chardev=qmp,mode=control,pretty=on -chardev 
socket,id=qmp,host=localhost,port=1234,server,nowait -vnc :0 -monitor stdio 
-vnc :0 -m 2000 /images/RHEL-Server-6.4-64-virtio.qcow2 -device 
virtio-net-pci,netdev=ndev1,id=id1 -netdev tap,id=ndev1 -device 
e1000,netdev=ndev2,id=id2 -netdev tap,id=ndev2
(qemu) info network
id1: index=0,type=nic,model=virtio-net-pci,macaddr=52:54:00:12:34:56
 \ ndev1: 
index=0,type=tap,ifname=tap0,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown
id2: index=0,type=nic,model=e1000,macaddr=52:54:00:12:34:57
 \ ndev2: 
index=0,type=tap,ifname=tap1,script=/etc/qemu-ifup,downscript=/etc/qemu-ifdown
(qemu) netdev_del ndev1
/etc/qemu-ifdown: could not launch network script
(qemu) device_del id1
Program received signal SIGSEGV, Segmentation fault.
[Switching to Thread 0x7ffff2adc700 (LWP 9766)]
0x00007ffff50cc6ec in free () from /lib64/libc.so.6

(gdb) bt full
#0  0x00007ffff50cc6ec in free () from /lib64/libc.so.6
No symbol table info available.
#1  0x00007ffff7efbb1b in qemu_vfree (ptr=0x7ffff1200000) at 
util/oslib-posix.c:135
No locals.
#2  0x00007ffff7de7358 in qemu_ram_free (addr=2105868288) at 
/home/devel/qemu/exec.c:1161
        block = 0x7ffff8ddc370
#3  0x00007ffff7e5270e in memory_region_destructor_ram (mr=0x7ffff8de3e00) at 
/home/devel/qemu/memory.c:763
No locals.
#4  0x00007ffff7e53269 in memory_region_destroy (mr=0x7ffff8de3e00) at 
/home/devel/qemu/memory.c:1025
        __PRETTY_FUNCTION__ = "memory_region_destroy"
#5  0x00007ffff7cdcf35 in pci_del_option_rom (pdev=0x7ffff8de3830) at 
hw/pci/pci.c:1975
No locals.
#6  0x00007ffff7cda333 in pci_unregister_device (dev=0x7ffff8de3830) at 
hw/pci/pci.c:901
        pci_dev = 0x7ffff8de3830
        pc = 0x7ffff8de3770
#7  0x00007ffff7c644a9 in device_unrealize (dev=0x7ffff8de3830, 
errp=0x7ffff2adb710) at hw/core/qdev.c:190
        rc = 32767
        dc = 0x7ffff8de3770
#8  0x00007ffff7c65dfb in device_set_realized (obj=0x7ffff8de3830, value=false, 
err=0x0) at hw/core/qdev.c:714
        dev = 0x7ffff8de3830
        dc = 0x7ffff8de3770
        local_err = 0x0
#9  0x00007ffff7d8a807 in property_set_bool (obj=0x7ffff8de3830, 
v=0x7fffe4024240, opaque=0x7ffff8de36e0, name=0x7ffff7f2074e "realized", 
errp=0x0) at qom/object.c:1235
        prop = 0x7ffff8de36e0
        value = false
        local_err = 0x0
#10 0x00007ffff7d892a3 in object_property_set (obj=0x7ffff8de3830, 
v=0x7fffe4024240, name=0x7ffff7f2074e "realized", errp=0x0) at qom/object.c:715
        prop = 0x7ffff8de60d0
#11 0x00007ffff7d8aac2 in object_property_set_qobject (obj=0x7ffff8de3830, 
value=0x7fffe40086e0, name=0x7ffff7f2074e "realized", errp=0x0) at 
qom/qom-qobject.c:24
        mi = 0x7fffe4024240
#12 0x00007ffff7d895aa in object_property_set_bool (obj=0x7ffff8de3830, 
value=false, name=0x7ffff7f2074e "realized", errp=0x0) at qom/object.c:778
        qbool = 0x7fffe40086e0
#13 0x00007ffff7c66133 in device_unparent (obj=0x7ffff8de3830) at 
hw/core/qdev.c:792
        dev = 0x7ffff8de3830
        bus = 0x7ffff7f1bbd0
        event_data = 0x7ffff7d8886b
        have_realized = true
#14 0x00007ffff7d88420 in object_unparent (obj=0x7ffff8de3830) at 
qom/object.c:367
No locals.
#15 0x00007ffff7c648d9 in qdev_free (dev=0x7ffff8de3830) at hw/core/qdev.c:285
No locals.
#16 0x00007ffff7c30f99 in acpi_piix_eject_slot (s=0x7ffff8dddd90, slots=8) at 
hw/acpi/piix4.c:306
        qdev = 0x7ffff8de3830
        dev = 0x7ffff8de3830
        pc = 0x7ffff8de3770
        kid = 0x7ffff8dee780
        next = 0x7ffff8de11b0
        bus = 0x7ffff8d6ab10
        slot = 3
        slot_free = true
#17 0x00007ffff7c31b5c in pci_write (opaque=0x7ffff8dddd90, addr=8, data=8, 
size=4) at hw/acpi/piix4.c:569
No locals.
#18 0x00007ffff7e500ff in memory_region_write_accessor (opaque=0x7ffff8dde598, 
addr=8, value=0x7ffff2adbb00, size=4, shift=0, mask=4294967295) at 
/home/devel/qemu/memory.c:334
        mr = 0x7ffff8dde598
        tmp = 8
#19 0x00007ffff7e501e1 in access_with_adjusted_size (addr=8, 
value=0x7ffff2adbb00, size=4, access_size_min=1, access_size_max=4, access=
    0x7ffff7e50073 <memory_region_write_accessor>, opaque=0x7ffff8dde598) at 
/home/devel/qemu/memory.c:364
        access_mask = 4294967295
        access_size = 4
        i = 0
#20 0x00007ffff7e50669 in memory_region_iorange_write (iorange=0x7ffff8dcf940, 
offset=8, width=4, data=8) at /home/devel/qemu/memory.c:439
        mrio = 0x7ffff8dcf940
        mr = 0x7ffff8dde598
        __PRETTY_FUNCTION__ = "memory_region_iorange_write"
#21 0x00007ffff7e48a18 in ioport_writel_thunk (opaque=0x7ffff8dcf940, 
addr=44552, data=8) at /home/devel/qemu/ioport.c:226
        ioport = 0x7ffff8dcf940
#22 0x00007ffff7e482e3 in ioport_write (index=2, address=44552, data=8) at 
/home/devel/qemu/ioport.c:83
        default_func = {0x7ffff7e48337 <default_ioport_writeb>, 0x7ffff7e483da 
<default_ioport_writew>, 0x7ffff7e48485 <default_ioport_writel>}
        func = 0x7ffff7e489be <ioport_writel_thunk>
#23 0x00007ffff7e48f93 in cpu_outl (addr=44552, val=8) at 
/home/devel/qemu/ioport.c:303
No locals.
#24 0x00007ffff7e4c9e7 in kvm_handle_io (port=44552, data=0x7ffff20d9000, 
direction=1, size=4, count=1) at /home/devel/qemu/kvm-all.c:1430
        i = 0
        ptr = 0x7ffff20d9000 "\b"
#25 0x00007ffff7e4d001 in kvm_cpu_exec (env=0x7ffff8d4be80) at 
/home/devel/qemu/kvm-all.c:1575
        cpu = 0x7ffff8d4bd70
        run = 0x7ffff20d8000
        ret = 0
        run_ret = 0
...
 
> Reviewed-by: Juan Quintela <quint...@redhat.com>
> Signed-off-by: Paolo Bonzini <pbonz...@redhat.com>
> ---
>       v2->v3: use MAP_FAILED.  You learn something every day.
> 
>  util/oslib-posix.c | 35 ++++++++++++++++++-----------------
>  1 file changed, 18 insertions(+), 17 deletions(-)
> 
> diff --git a/util/oslib-posix.c b/util/oslib-posix.c
> index 4e4b819..bda62c0 100644
> --- a/util/oslib-posix.c
> +++ b/util/oslib-posix.c
> @@ -40,7 +40,6 @@ extern int daemon(int, int);
>        Valgrind does not support alignments larger than 1 MiB,
>        therefore we need special code which handles running on Valgrind. */
>  #  define QEMU_VMALLOC_ALIGN (512 * 4096)
> -#  define CONFIG_VALGRIND
>  #elif defined(__linux__) && defined(__s390x__)
>     /* Use 1 MiB (segment size) alignment so gmap can be used by KVM. */
>  #  define QEMU_VMALLOC_ALIGN (256 * 4096)
> @@ -52,12 +51,8 @@ extern int daemon(int, int);
>  #include "sysemu/sysemu.h"
>  #include "trace.h"
>  #include "qemu/sockets.h"
> +#include <sys/mman.h>
>  
> -#if defined(CONFIG_VALGRIND)
> -static int running_on_valgrind = -1;
> -#else
> -#  define running_on_valgrind 0
> -#endif
>  #ifdef CONFIG_LINUX
>  #include <sys/syscall.h>
>  #endif
> @@ -108,22 +103,28 @@ void *qemu_memalign(size_t alignment, size_t size)
>  /* alloc shared memory pages */
>  void *qemu_vmalloc(size_t size)
>  {
> -    void *ptr;
>      size_t align = QEMU_VMALLOC_ALIGN;
> +    size_t total = size + align - getpagesize();
> +    void *ptr = mmap(0, total, PROT_READ | PROT_WRITE,
> +                     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
> +    size_t offset = QEMU_ALIGN_UP((uintptr_t)ptr, align) - (uintptr_t)ptr;
>  
> -#if defined(CONFIG_VALGRIND)
> -    if (running_on_valgrind < 0) {
> -        /* First call, test whether we are running on Valgrind.
> -           This is a substitute for RUNNING_ON_VALGRIND from valgrind.h. */
> -        const char *ld = getenv("LD_PRELOAD");
> -        running_on_valgrind = (ld != NULL && strstr(ld, "vgpreload"));
> +    if (ptr == MAP_FAILED) {
> +        fprintf(stderr, "Failed to allocate %zu B: %s\n",
> +                size, strerror(errno));
> +        abort();
>      }
> -#endif
>  
> -    if (size < align || running_on_valgrind) {
> -        align = getpagesize();
> +    ptr += offset;
> +    total -= offset;
> +
> +    if (offset > 0) {
> +        munmap(ptr - offset, offset);
>      }
> -    ptr = qemu_memalign(align, size);
> +    if (total > size) {
> +        munmap(ptr + size, total - size);
> +    }
> +
>      trace_qemu_vmalloc(size, ptr);
>      return ptr;
>  }
> -- 
> 1.8.1.4
> 

-- 
                        Amos.

Reply via email to