Hi! I am hitting a race condition with migration.
I migrate a guest from one machine to another using libvirt: virsh migrate --live --persistent --undefinesource --copy-storage-all --verbose --desturi qemu+ssh://legkvm/system --domain chig1 I.e. it copies the full disk, which is qcow2, 20GB virtual, 10GB of real disk space. When migration finishes, process_incoming_migration_co() calls bdrv_invalidate_cache_all(), which calls qcow2_invalidate_cache(), which does qcow2_close(), and the latter destroys l2_table_cache and refcount_block_cache. It also calls qcow2_cache_flush(). All good. However, somehow after migration has completed as described above, qcow2_co_flush_to_os() is called again and I either get a crash in qcow2_cache_flush() (as @c==NULL) or I hit an assert because s->lock is not set; the backtrace of the crash is below. In qcow2_co_flush_to_os(), @bs points to valid data and @s is empty. The XML I used is at the end of this email, but it does not seem essential to the problem - it only happens on two particular machines and I cannot reproduce it on the machines I have locally. It sounds to me like qcow2_close() is called while there is still a qcow2_co_flush_to_os() "coroutine" in flight (there is always one, I believe). Why is this happening, and how can it be fixed properly? Thanks. Program received signal SIGSEGV, Segmentation fault. 
0x0000000010504204 in qcow2_cache_flush (bs=0x10019aab420, c=0x0) at /home/alexey/p/qemu/block/qcow2-cache.c:174 174 for (i = 0; i < c->size; i++) { (gdb) bt #0 0x0000000010504204 in qcow2_cache_flush (bs=0x10019aab420, c=0x0) at /home/alexey/p/qemu/block/qcow2-cache.c:174 #1 0x00000000104f557c in qcow2_co_flush_to_os (bs=0x10019aab420) at /home/alexey/p/qemu/block/qcow2.c:2162 #2 0x00000000104c126c in bdrv_co_flush (bs=0x10019aab420) at /home/alexey/p/qemu/block.c:4971 #3 0x00000000104b2000 in nbd_trip (opaque=0x10019cf75c0) at /home/alexey/p/qemu/nbd.c:1259 #4 0x00000000104d17d4 in coroutine_trampoline (i0=0x100, i1=0x19cd5c00) at /home/alexey/p/qemu/coroutine-ucontext.c:118 #5 0x00003fff94ff099c in .__makecontext () from /usr/lib64/libc.so.6 #6 0x0eeabf8ea4edc5a2 in ?? () Backtrace stopped: previous frame inner to this frame (corrupt stack?) (gdb) up #1 0x00000000104f557c in qcow2_co_flush_to_os (bs=0x10019aab420) at /home/alexey/p/qemu/block/qcow2.c:2162 2162 ret = qcow2_cache_flush(bs, s->l2_table_cache); (gdb) p s $1 = (BDRVQcowState *) 0x10019aaf300 (gdb) p s->l2_table_cache $2 = (Qcow2Cache *) 0x0 (gdb) p *s $3 = { cluster_bits = 0x0, cluster_size = 0x0, cluster_sectors = 0x0, l2_bits = 0x0, l2_size = 0x0, l1_size = 0x0, l1_vm_state_index = 0x0, csize_shift = 0x0, csize_mask = 0x0, cluster_offset_mask = 0x0, l1_table_offset = 0x0, l1_table = 0x0, l2_table_cache = 0x0, refcount_block_cache = 0x0, cluster_cache = 0x0, cluster_data = 0x0, cluster_cache_offset = 0x0, cluster_allocs = { lh_first = 0x0 }, refcount_table = 0x0, refcount_table_offset = 0x0, refcount_table_size = 0x0, free_cluster_index = 0x0, free_byte_offset = 0x0, lock = { locked = 0x1, queue = { entries = { tqh_first = 0x0, tqh_last = 0x0 } } }, crypt_method = 0x0, crypt_method_header = 0x0, aes_encrypt_key = { rd_key = {0x0 <repeats 60 times>}, rounds = 0x0 }, aes_decrypt_key = { rd_key = {0x0 <repeats 60 times>}, rounds = 0x0 }, snapshots_offset = 0x0, snapshots_size = 0x0, nb_snapshots = 0x0, 
snapshots = 0x0, flags = 0x0, qcow_version = 0x0, use_lazy_refcounts = 0x0, refcount_order = 0x0, discard_passthrough = {0x0, 0x0, 0x0, 0x0, 0x0}, overlap_check = 0x0, incompatible_features = 0x0, compatible_features = 0x0, autoclear_features = 0x0, unknown_header_fields_size = 0x0, unknown_header_fields = 0x0, unknown_header_ext = { lh_first = 0x0 }, discards = { tqh_first = 0x0, tqh_last = 0x0 }, cache_discards = 0x0 } (gdb) p *bs $4 = { total_sectors = 0x2800000, read_only = 0x0, open_flags = 0x2062, encrypted = 0x0, valid_key = 0x0, sg = 0x0, copy_on_read = 0x0, drv = 0x1078d440 <bdrv_qcow2>, opaque = 0x10019aaf300, dev = 0x10019a90b38, dev_ops = 0x105b49b0 <virtio_block_ops>, dev_opaque = 0x10019a90b38, aio_context = 0x10019a7f270, aio_notifiers = { lh_first = 0x10019cf74f0 }, filename = "/var/lib/libvirt/images/chig1.qcow2", backing_file = "", backing_format = "", full_open_options = 0x10019c2a030, exact_filename = "/var/lib/libvirt/images/chig1.qcow2", backing_hd = 0x0, file = 0x10019aae3b0, close_notifiers = { notifiers = { lh_first = 0x10019cf6460 } }, before_write_notifiers = { notifiers = { lh_first = 0x0 } }, serialising_in_flight = 0x0, throttle_state = { cfg = { buckets = {{ avg = 0, max = 0, level = 0 }, { avg = 0, max = 0, level = 0 }, { avg = 0, max = 0, level = 0 }, { avg = 0, max = 0, level = 0 }, { avg = 0, max = 0, level = 0 }, { avg = 0, max = 0, level = 0 }}, op_size = 0x0 }, previous_leak = 0x0, timers = {0x0, 0x0}, clock_type = QEMU_CLOCK_REALTIME, read_timer_cb = 0x0, write_timer_cb = 0x0, timer_opaque = 0x0 }, throttled_reqs = {{ entries = { tqh_first = 0x0, tqh_last = 0x10019aac188 } }, { entries = { tqh_first = 0x0, tqh_last = 0x10019aac198 } }}, io_limits_enabled = 0x0, nr_bytes = {0x0, 0x0, 0x0}, nr_ops = {0x0, 0x0, 0x0}, total_time_ns = {0x0, 0x0, 0x0}, wr_highest_sector = 0x27fffff, bl = { max_discard = 0x0, discard_alignment = 0x0, max_write_zeroes = 0x0, write_zeroes_alignment = 0x80, opt_transfer_length = 0x0, opt_mem_alignment = 
0x1000 }, growable = 0x0, zero_beyond_eof = 0x1, request_alignment = 0x200, guest_block_size = 0x200, enable_write_cache = 0x1, on_read_error = BLOCKDEV_ON_ERROR_REPORT, on_write_error = BLOCKDEV_ON_ERROR_ENOSPC, iostatus_enabled = 0x1, iostatus = BLOCK_DEVICE_IO_STATUS_OK, node_name = "", node_list = { tqe_next = 0x0, tqe_prev = 0x0 }, device_name = "drive-virtio-disk0", device_list = { tqe_next = 0x0, tqe_prev = 0x1078b328 <bdrv_states> }, dirty_bitmaps = { lh_first = 0x0 }, refcnt = 0x2, tracked_requests = { lh_first = 0x0 }, op_blockers = {{ lh_first = 0x0 } <repeats 14 times>}, job = 0x0, options = 0x10019aa9db0, detect_zeroes = BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF, backing_blocker = 0x0 } [root@chikvm ~]# cat chig1-aik.xml <domain type='kvm'> <name>chig1</name> <uuid>bbf91237-3c78-489e-b426-ab593806c78b</uuid> <memory unit='KiB'>4194304</memory> <currentMemory unit='KiB'>4194304</currentMemory> <vcpu placement='static'>1</vcpu> <resource> <partition>/machine</partition> </resource> <os> <type arch='ppc64' machine='pseries'>hvm</type> <boot dev='hd'/> <boot dev='network'/> <bootmenu enable='yes'/> </os> <features> <acpi/> <apic/> </features> <cpu> </cpu> <clock offset='utc'/> <on_poweroff>destroy</on_poweroff> <on_reboot>restart</on_reboot> <on_crash>restart</on_crash> <devices> <emulator>/usr/bin/qemu-system-ppc64.aik</emulator> <disk type='file' device='disk'> <driver name='qemu' type='qcow2' cache='none'/> <source file='/var/lib/libvirt/images/chig1.qcow2'/> <target dev='vda' bus='virtio'/> <address type='pci' domain='0x0000' bus='0x00' slot='0x04' function='0x0'/> </disk> <controller type='pci' index='0' model='pci-root'/> <controller type='usb' index='0'> <address type='pci' domain='0x0000' bus='0x00' slot='0x03' function='0x0'/> </controller> <interface type='bridge'> <mac address='52:54:00:27:70:6f'/> <source bridge='brenP1p9s0f0'/> <driver name='qemu'/> <model type='virtio'/> <address type='pci' domain='0x0000' bus='0x00' slot='0x02' function='0x0'/> 
</interface> <serial type='pty'> <target port='0'/> <address type='spapr-vio' reg='0x30000000'/> </serial> <console type='pty'> <target type='serial' port='0'/> <address type='spapr-vio' reg='0x30000000'/> </console> <video> <model type='vga' vram='9216' heads='1'/> <address type='pci' domain='0x0000' bus='0x00' slot='0x01' function='0x0'/> </video> <memballoon model='virtio'> <address type='pci' domain='0x0000' bus='0x00' slot='0x05' function='0x0'/> </memballoon> </devices> <seclabel type='dynamic' model='selinux' relabel='yes'/> </domain> -- Alexey